In [1]:
import os

import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display, Math
from sklearn.metrics.pairwise import cosine_similarity
from recommender import RecommendationEngine
from utils import get_translation_dict

In [2]:
# Loading necessary csvs into Pandas
data_folder = "/Users/alex/Workspace/Datasets/OlistEcommercePublicDataset"

orders_df = pd.read_csv(os.path.join(data_folder, "olist_orders_dataset.csv"))
reviews_df = pd.read_csv(os.path.join(data_folder, "olist_order_reviews_dataset.csv"))
products_df = pd.read_csv(os.path.join(data_folder, "olist_products_dataset.csv"))
order_items_df = pd.read_csv(os.path.join(data_folder, "olist_order_items_dataset.csv"))
customer_df = pd.read_csv(os.path.join(data_folder, "olist_customers_dataset.csv"))
cat_name_translation = pd.read_csv(
    os.path.join(data_folder, "product_category_name_translation.csv")
)

dfs = [orders_df, reviews_df, products_df, order_items_df, customer_df]

# Converting ID columns from 'object' type to string
for df in dfs:
    for column, ctype in zip(df.columns, df.dtypes):
        if ctype == "object":
            df[column] = df[column].astype("string")

unique_id_df = pd.merge(
    orders_df[["order_id", "customer_id"]],
    customer_df[["customer_id", "customer_unique_id"]],
    on=["customer_id"],
    how="inner",
)

product_and_order_id_df = pd.merge(
    orders_df[["order_id", "customer_id"]],
    order_items_df[["order_id", "product_id"]],
    on=["order_id"],
    how="inner",
)
user_product_order_id_df = pd.merge(
    unique_id_df, product_and_order_id_df, on=["order_id", "customer_id"], how="inner"
)
final_df = pd.merge(
    user_product_order_id_df,
    reviews_df[["order_id", "review_score"]],
    on=["order_id"],
    how="inner",
).drop(["customer_id", "order_id"], axis=1)

### Filtering dataframes so we have a smaller subset
data = final_df.copy()

# get total counts of no. of occurence of product
data["count"] = data.groupby("product_id").transform("count")["customer_unique_id"]

# fetch top 100 movies based on count
product_id = (
    data.drop_duplicates("product_id")
    .sort_values("count", ascending=False)
    .iloc[:500]["product_id"]
)

# filter out data as per the product_id
data = data[data["product_id"].isin(product_id)].reset_index(drop=True)

# get total counts of no. of occurence of customer
data["count"] = data.groupby("customer_unique_id").transform("count")["product_id"]

# fetch top 1000 products based on count
customer_id = (
        data.drop_duplicates("customer_unique_id")
        .sort_values("count", ascending=False)
        .iloc[:1000]["customer_unique_id"]
    )

data = data[data["customer_unique_id"].isin(customer_id)].reset_index(drop=True)

In [3]:
# Pivoting DataFrame to obtain a User-Item matrix
df = data.pivot_table(index="customer_unique_id", columns="product_id", values="review_score")

In [18]:
customer_idx = 280
# customer_idx = 0
customer_id = df.iloc[customer_idx].name
customer_id

'4c4584a82633ac90e58fe2d141d7433e'

In [19]:
items_bought = list(final_df[final_df['customer_unique_id'] == customer_id].drop_duplicates('product_id')['product_id'].values)
users_with_same_items = final_df[final_df['product_id'].isin(items_bought)]
df_same_items = users_with_same_items.pivot_table(index="customer_unique_id", columns="product_id", values="review_score")
df_same_items

product_id,634256dbcd184b3757ffd7632c9fe52a
customer_unique_id,Unnamed: 1_level_1
00b1066a65657ee1759544f6b61a4e2c,5
08371b8fa87d33d519335b836d8abf12,5
0f1057714bc593f65f125ca25d140d17,5
10b442e762ba8a5ad22de5ba2207e5f1,5
161a927c39ab276c394039d5f64c9c99,5
17cf7606dd2c75de6761b0a30652c8ce,5
197884ee7c9574b7d8c907d911035aa4,1
1e728b530adb1a64bd4e2a3f728e65e5,1
327a48c6f5aa9ae7cec1faa4576bad5d,4
33959d22e93db0d397bfb42624e1ba92,5


In [20]:
df_same_items_full = pd.merge(users_with_same_items['customer_unique_id'], final_df, on='customer_unique_id', how='inner').drop_duplicates()
# df_same_items_full = df_same_items_full.pivot_table(index="customer_unique_id", columns="product_id", values="review_score")

In [21]:
# Loading translation dictionary [Portugues -> English]
translate_dict = get_translation_dict(cat_name_translation)

# Initializing our custom recommendation engine
recommendationengine = RecommendationEngine(df_same_items_full, products_df, order_items_df, translate_dict)

In [22]:
display(recommendationengine.get_recommendation(customer_idx=customer_idx, nr_of_items=2))
display(recommendationengine.get_bought_items(customer_idx=customer_idx, nr_of_items=2))

IndexError: index 280 is out of bounds for axis 0 with size 50

In [11]:
recommendationengine_2 = RecommendationEngine(data, products_df, order_items_df, translate_dict)

In [12]:
display(recommendationengine_2.get_recommendation(customer_idx=customer_idx, nr_of_items=2))
display(recommendationengine_2.get_bought_items(customer_idx=customer_idx, nr_of_items=2))

Unnamed: 0,product_id,product_category_name,score,price
0,349ce46a0e2e20054aa9d80c48af8816,construction_tools_lights,4.99943,150.470238
1,b38b25d838ae0b8385e8cc68b9017644,health_beauty,4.998256,160.28125


Unnamed: 0,product_id,rating,product_category_name,price
0,368c6c730842d78016ad823897a372db,5.0,garden_tools,49.9
1,389d119b48cf3043d311335e499d9c6b,5.0,garden_tools,49.9


In [24]:
df_same_items_full = pd.merge(users_with_same_items['customer_unique_id'], final_df, on='customer_unique_id', how='inner').drop_duplicates()
df_same_items_full_pivot = df_same_items_full.pivot_table(index="customer_unique_id", columns="product_id", values="review_score")

df_same_items_full_imputed = df_same_items_full_pivot.fillna(df_same_items_full_pivot.mean(axis=0))
similarity_matrix = cosine_similarity(df_same_items_full_imputed)

In [25]:
similarity_matrix

array([[1.        , 1.        , 1.        , ..., 1.        , 0.9701425 ,
        0.83205029],
       [1.        , 1.        , 1.        , ..., 1.        , 0.9701425 ,
        0.83205029],
       [1.        , 1.        , 1.        , ..., 1.        , 0.9701425 ,
        0.83205029],
       ...,
       [1.        , 1.        , 1.        , ..., 1.        , 0.9701425 ,
        0.83205029],
       [0.9701425 , 0.9701425 , 0.9701425 , ..., 0.9701425 , 1.        ,
        0.94174191],
       [0.83205029, 0.83205029, 0.83205029, ..., 0.83205029, 0.94174191,
        1.        ]])

In [26]:
customer_idx = 280
# customer_idx = 0
customer_id = df.iloc[customer_idx].name
customer_id

'4c4584a82633ac90e58fe2d141d7433e'

In [54]:
df_same_items_full_imputed[df_same_items_full_imputed.index == customer_id]

product_id,634256dbcd184b3757ffd7632c9fe52a,72ef87b1953a2844ed089bd6dcf73913
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4c4584a82633ac90e58fe2d141d7433e,1.0,5.0


In [68]:
df_same_items_full_imputed.index.get_loc(customer_id)

12

In [69]:
df_same_items_full_imputed.iloc[12]

product_id
634256dbcd184b3757ffd7632c9fe52a    1.0
72ef87b1953a2844ed089bd6dcf73913    5.0
Name: 4c4584a82633ac90e58fe2d141d7433e, dtype: float64

In [70]:
type(df_same_items_full_imputed)

pandas.core.frame.DataFrame