In [1]:
import os

import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display, Math
from sklearn.metrics.pairwise import cosine_similarity
from recommender import RecommendationEngine
from utils import get_translation_dict

In [2]:
# Loading necessary csvs into Pandas
data_folder = "/Users/alex/Workspace/Datasets/OlistEcommercePublicDataset"

orders_df = pd.read_csv(os.path.join(data_folder, "olist_orders_dataset.csv"))
reviews_df = pd.read_csv(os.path.join(data_folder, "olist_order_reviews_dataset.csv"))
products_df = pd.read_csv(os.path.join(data_folder, "olist_products_dataset.csv"))
order_items_df = pd.read_csv(os.path.join(data_folder, "olist_order_items_dataset.csv"))
customer_df = pd.read_csv(os.path.join(data_folder, "olist_customers_dataset.csv"))
cat_name_translation = pd.read_csv(
    os.path.join(data_folder, "product_category_name_translation.csv")
)

dfs = [orders_df, reviews_df, products_df, order_items_df, customer_df]

# Converting ID columns from 'object' type to string
for df in dfs:
    for column, ctype in zip(df.columns, df.dtypes):
        if ctype == "object":
            df[column] = df[column].astype("string")

unique_id_df = pd.merge(
    orders_df[["order_id", "customer_id"]],
    customer_df[["customer_id", "customer_unique_id"]],
    on=["customer_id"],
    how="inner",
)

product_and_order_id_df = pd.merge(
    orders_df[["order_id", "customer_id"]],
    order_items_df[["order_id", "product_id"]],
    on=["order_id"],
    how="inner",
)
user_product_order_id_df = pd.merge(
    unique_id_df, product_and_order_id_df, on=["order_id", "customer_id"], how="inner"
)
final_df = pd.merge(
    user_product_order_id_df,
    reviews_df[["order_id", "review_score"]],
    on=["order_id"],
    how="inner",
).drop(["customer_id", "order_id"], axis=1)

### Filtering dataframes so we have a smaller subset
data = final_df.copy()

# get total counts of no. of occurence of product
data["count"] = data.groupby("product_id").transform("count")["customer_unique_id"]

# fetch top 100 movies based on count
product_id = (
    data.drop_duplicates("product_id")
    .sort_values("count", ascending=False)
    .iloc[:500]["product_id"]
)

# filter out data as per the product_id
data = data[data["product_id"].isin(product_id)].reset_index(drop=True)

# get total counts of no. of occurence of customer
data["count"] = data.groupby("customer_unique_id").transform("count")["product_id"]

# fetch top 1000 products based on count
customer_id = (
        data.drop_duplicates("customer_unique_id")
        .sort_values("count", ascending=False)
        .iloc[:1000]["customer_unique_id"]
    )

data = data[data["customer_unique_id"].isin(customer_id)].reset_index(drop=True)

In [3]:
# Pivoting DataFrame to obtain a User-Item matrix
df = data.pivot_table(index="customer_unique_id", columns="product_id", values="review_score")

In [31]:
customer_idx = 200
# customer_idx = 0
customer_id = df.iloc[customer_idx].name
customer_id

'36cfec707344b75d20e6c7ef583c3b8c'

In [32]:
items_bought = list(final_df[final_df['customer_unique_id'] == customer_id].drop_duplicates('product_id')['product_id'].values)
users_with_same_items = final_df[final_df['product_id'].isin(items_bought)]
df_same_items = users_with_same_items.pivot_table(index="customer_unique_id", columns="product_id", values="review_score")
df_same_items

product_id,368c6c730842d78016ad823897a372db,389d119b48cf3043d311335e499d9c6b
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
002feefec5af0a3b26ee7839c66d205e,1.0,
0064a1b5f5cddd047c987b988a90c8c1,5.0,
00bbfe2f540d0cfccbb098d92c503eca,,5.0
00ec23a308504080697e5204d3dbcb2c,,5.0
026e135c007282454338db204e44381f,,5.0
...,...,...
fd4fbaea0fdd24464c6e1738ce21a5ea,5.0,
fdbc9e3e8ff4d5fa66c92c652912aef4,,1.0
fe3ff2b93571c00e1f2fa3bd25c845f7,3.0,
ff5f0e6c9223056cd0e65b534856f5d0,,3.0


In [34]:
df_same_items_full = pd.merge(users_with_same_items['customer_unique_id'], final_df, on='customer_unique_id', how='inner').drop_duplicates()
df_same_items_full = df_same_items_full.pivot_table(index="customer_unique_id", columns="product_id", values="review_score")

In [35]:
# Loading translation dictionary [Portugues -> English]
translate_dict = get_translation_dict(cat_name_translation)

# Initializing our custom recommendation engine
recommendationengine = RecommendationEngine(df_same_items_full, products_df, order_items_df, translate_dict)

In [36]:
display(recommendationengine.get_recommendation(customer_idx=customer_idx, nr_of_items=2))
display(recommendationengine.get_bought_items(customer_idx=customer_idx, nr_of_items=2))

Unnamed: 0,product_id,product_category_name,score,price
1,6cdd53843498f92890544667809f1595,health_beauty,4.998688,350.834615
0,3b213ba02fcd0799a85c9c3580a3033d,baby,4.997168,54.9


Unnamed: 0,product_id,rating,product_category_name,price
0,368c6c730842d78016ad823897a372db,4.0,garden_tools,49.9
1,b0961721fd839e9982420e807758a2a6,4.0,garden_tools,53.9
