In [1]:
import os

import pandas as pd

from IPython.display import display, Markdown
from recommender import RecommendationEngine
from utils import *


In [11]:
def count_good_recommendations(bought_items, recommendation, cluster_recommendations):
    """Count recommended categories that also appear among the bought categories.

    Each entry of a recommendation list is checked for membership in
    ``bought_items``; duplicates in a recommendation list are counted once
    per occurrence.

    Args:
        bought_items: Categories the customer bought. ``None`` is treated as
            an empty collection.
        recommendation: Categories recommended without clustering. ``None``
            or an empty iterable yields a count of 0.
        cluster_recommendations: Categories recommended with clustering.
            ``None`` or an empty iterable yields a count of 0.

    Returns:
        A ``(reco_counter, cluster_counter)`` tuple: hits for the
        non-clustered and the clustered recommendations, in that order.
    """
    # Normalise a missing purchase history so the membership test below
    # cannot fail on None.
    bought = bought_items if bought_items is not None else []

    def _count_hits(recommended):
        # Clustering doesn't always return recommendations; a missing result
        # simply contributes no hits instead of raising.
        if recommended is None:
            return 0
        return sum(1 for cat in recommended if cat in bought)

    return _count_hits(recommendation), _count_hits(cluster_recommendations)


In [2]:
# Loading necessary csvs into Pandas
# The dataset location can be overridden through the OLIST_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original local path is used.
data_folder = os.environ.get(
    "OLIST_DATA_DIR", "/Users/alex/Workspace/Datasets/OlistEcommercePublicDataset"
)

# Names under which the loaded frames are referred to later
# (e.g. dataframes["products_df"]).
# NOTE(review): presumably paired by position with df_files — verify against
# read_dataframes.
df_names = [
    "orders_df",
    "reviews_df",
    "products_df",
    "order_items_df",
    "customer_df",
    "cat_name_translation",
]
df_files = [
    "olist_orders_dataset.csv",
    "olist_order_reviews_dataset.csv",
    "olist_products_dataset.csv",
    "olist_order_items_dataset.csv",
    "olist_customers_dataset.csv",
    "product_category_name_translation.csv",
]

# Loading dataframes
dataframes = preprocess_dataframes(read_dataframes(df_names, df_files, data_folder))

# Filtering final dataframe by most active users and bought items
final_df = filter_dataframe(
    join_dataframes(dataframes), item_number=500, user_number=1000
)

# Loading translation dictionary [Portuguese -> English]
translate_dict = get_translation_dict(dataframes["cat_name_translation"])

# Initializing our custom recommendation engine
recommendationengine = RecommendationEngine(
    final_df, dataframes["products_df"], dataframes["order_items_df"], translate_dict
)


In [13]:
# NOTE: 'normal' means recommendations without clustering
# Running tallies. "good" counts recommended categories that the customer also
# bought; "total" counts every recommendation returned by each strategy.
good_cluster_reco = 0
good_normal_reco = 0

total_cluster_reco = 0
total_normal_reco = 0

# NOTE(review): this visits customer indices 0..998 (999 customers) while the
# load cell filters with user_number=1000 — confirm the last index is skipped
# on purpose.
for customer_idx in range(999):
    # Categories the customer actually bought.
    bought_frame = recommendationengine.get_bought_items(
        customer_idx=customer_idx, nr_of_items=2
    )
    bought_items_list = bought_frame["product_category_name"].to_list()

    # Recommendations without clustering.
    normal_frame = recommendationengine.get_recommendation(
        customer_idx=customer_idx, nr_of_items=2
    )
    no_cluster_reco = normal_frame["product_category_name"].to_list()

    # Recommendations with clustering enabled.
    cluster_frame = recommendationengine.get_recommendation(
        customer_idx=customer_idx, nr_of_items=2, cluster=True
    )
    cluster_reco = cluster_frame["product_category_name"].to_list()

    # Size of each recommendation list feeds the denominators.
    total_normal_reco += len(no_cluster_reco)
    total_cluster_reco += len(cluster_reco)

    # Hits per strategy feed the numerators.
    normal_hits, cluster_hits = count_good_recommendations(
        bought_items_list, no_cluster_reco, cluster_reco
    )
    good_normal_reco += normal_hits
    good_cluster_reco += cluster_hits

print(f"No cluster good recommendations: {good_normal_reco} out of {total_normal_reco} recommendations")
print(f"Clustered good recommendations: {good_cluster_reco} out of {total_cluster_reco} recommendations")


No cluster good recommendations: 88 out of 1998 recommendations
Clustered good recommendations: 601 out of 1009 recommendations
