In [2]:
import os

import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display, Math
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
def normalize_matrix(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))


def get_recommendation(customer_index, df, df_imputed, similarity_matrix, metadata, translate_dict, top_n=10):
    idx = customer_index
    user_id = df.iloc[idx].name

    # similarity_matrix = normalize_matrix(similarity_matrix)

    sim_scores = list(enumerate(similarity_matrix[idx]))

    # get unrated products indices
    unrated_products = df.iloc[idx][df.iloc[idx].isna()].index

    product_ratings = (
        df_imputed.iloc[similarity_matrix[idx]][unrated_products].T
        * [x[1] for x in sim_scores]
    ).T

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:101]

    product_ratings = product_ratings.iloc[[x[0] for x in sim_scores]].mean()

    recommended_products = metadata[
        metadata["product_id"].isin(
            product_ratings.reset_index().sort_values(0, ascending=False)["product_id"]
        )
    ][["product_category_name", "product_id"]]

    assumed_ratings = sorted(product_ratings, reverse=True)

    return_df = pd.DataFrame(
        {
            "product_id": recommended_products[:top_n]["product_id"],
            "product_category_name": recommended_products[:top_n]["product_category_name"].map(translate_dict),
            "inferred_rating": assumed_ratings[:top_n],
        }
    )
    return_df.style.set_caption(f'User id: {user_id}')

    return return_df


def get_bought_items(customer_idx, rating_matrix, products_df, translate_dict, order_items_df, top_n=None):
    rated_items_df = rating_matrix.iloc[customer_idx][rating_matrix.iloc[customer_idx].notnull()].reset_index()
    rated_items_df.columns = ['product_id', 'rating']
    rated_items_df = rated_items_df.sort_values(by='rating', ascending=False)

    filtered_products = products_df[products_df['product_id'].isin(rated_items_df['product_id'].values)][['product_id', 'product_category_name']]
    filtered_products['product_category_name'] = filtered_products['product_category_name'].map(translate_dict)

    rated_items_df = pd.merge(rated_items_df, filtered_products, on='product_id', how='inner')
    rated_items_df = pd.merge(rated_items_df, order_items_df[['product_id', 'price']], on='product_id', how='inner').drop_duplicates(
    subset=['product_id'], keep='first').reset_index(drop=True)
    
    if top_n:
        return rated_items_df.head(top_n)
    else:
        return rated_items_df

In [4]:
data_folder = '/Users/alex/Workspace/Datasets/OlistEcommercePublicDataset'

orders_df = pd.read_csv(os.path.join(data_folder, 'olist_orders_dataset.csv'))
reviews_df = pd.read_csv(os.path.join(data_folder, 'olist_order_reviews_dataset.csv'))
products_df = pd.read_csv(os.path.join(data_folder, 'olist_products_dataset.csv'))
order_items_df = pd.read_csv(os.path.join(data_folder, 'olist_order_items_dataset.csv'))
customer_df = pd.read_csv(os.path.join(data_folder, 'olist_customers_dataset.csv'))
cat_name_translation =  pd.read_csv(os.path.join(data_folder, 'product_category_name_translation.csv'))

dfs = [orders_df, reviews_df, products_df, order_items_df, customer_df]

In [5]:
# Converting ID columns from 'object' type to string
for df in dfs:
    for column, ctype in zip(df.columns, df.dtypes):
        if ctype == 'object':
            df[column] = df[column].astype("string")

unique_id_df = pd.merge(orders_df[['order_id', 'customer_id']], customer_df[['customer_id', 'customer_unique_id']], on=['customer_id'], how='inner')
product_and_order_id_df = pd.merge(orders_df[['order_id', 'customer_id']], order_items_df[['order_id', 'product_id']], on=['order_id'], how='inner')
user_product_order_id_df = pd.merge(unique_id_df, product_and_order_id_df, on=['order_id', 'customer_id'], how='inner')
final_df = pd.merge(user_product_order_id_df, reviews_df[['order_id', 'review_score']], on=['order_id'], how='inner').drop(['customer_id', 'order_id'], axis=1)

In [6]:
tmp_short_df = final_df.head(2000)
selection = tmp_short_df.customer_unique_id.unique().tolist()
final_df_short = final_df[pd.DataFrame(final_df.customer_unique_id.tolist()).isin(selection).any(1).values]

rating_matrix = final_df_short.pivot_table(index='customer_unique_id', columns='product_id', values='review_score')

rows, cols = rating_matrix.shape
for i in range(rows):
    col_places = [random.randint(0, cols-1) for _ in range(random.randint(10, int(rows/2)))]
    for col in col_places:
        rating_matrix.iloc[i][col] = random.randint(1, 5)

In [7]:
df_imputed = rating_matrix.fillna(rating_matrix.mean(axis=0))
similarity_matrix = cosine_similarity(df_imputed.values)

In [8]:
customer_idx = 50

portuguese_cat_names = cat_name_translation.to_dict()['product_category_name']
english_cat_names = cat_name_translation.to_dict()['product_category_name_english']
translate_dict = {}

for p_key in portuguese_cat_names:
    if portuguese_cat_names[p_key] not in translate_dict:
        translate_dict[portuguese_cat_names[p_key]] = english_cat_names[p_key]

recommendations = get_recommendation(customer_idx, rating_matrix, df_imputed, similarity_matrix, products_df, translate_dict, top_n=10)
# recommendations_with_price = pd.merge(recommendations, order_items_df[['product_id', 'price']], on='product_id', how='inner').drop_duplicates(subset=['product_id'], keep='first').reset_index(drop=True)

display(recommendations)
# display(recommendations_with_price)

bought_items = get_bought_items(customer_idx, rating_matrix, products_df, translate_dict, order_items_df)
display(bought_items)

Unnamed: 0,product_id,product_category_name,inferred_rating
22,e3e020af31d4d89d2602272b315c3f6e,health_beauty,4.800942
34,ce5b91848b91118daffb3af53b747475,sports_leisure,4.800942
39,680874c570dad71c0a2844cfbf417054,furniture_decor,4.800942
76,278b3c6462e86b4556b99989513ddf73,small_appliances,4.800942
160,1ffe365aa7583189e63f2fa4f060b269,sports_leisure,4.800942
161,3014e35fd70fce29095ced5cdc89f4ce,telephony,4.800942
202,47969dd948e918289f809be899ddfb4c,stationery,4.800942
207,1b37262ebd1307a6c9aab9aca8584e76,pet_shop,4.800942
275,c9fe02894880831f6f28d5ac212d036f,construction_tools_safety,4.800942
295,2e511b5741ab14e7f5294df6f1310b03,computers_accessories,4.800942


Unnamed: 0,product_id,rating,product_category_name,price
0,bbdf7b7e937d9526d3acd9854d7e939b,5.0,stationery,19.99
1,c384e8f1f1c380ef1dbafaf080ba25fc,5.0,sports_leisure,89.99
2,c1c4271977e10331523b010496ab2f9b,5.0,toys,299.00
3,c063e7cfa2a5277de17cde1837399dc5,5.0,furniture_decor,119.90
4,bf5d132b4d30a18970b8ee7798725af1,5.0,kitchen_dining_laundry_garden_furniture,140.00
...,...,...,...,...
592,cd3f4942e46362adc32589ff1a90e3dc,1.0,housewares,29.98
593,a526c4bc0a48c0e4345f97af496d87cd,1.0,sports_leisure,144.90
594,a5215a7a9f46c4185b12f38e9ddf2abc,1.0,computers,1340.00
595,a50acd33ba7a8da8e9db65094fa990a4,1.0,auto,117.30


In [9]:
customer_idx = 1200

portuguese_cat_names = cat_name_translation.to_dict()['product_category_name']
english_cat_names = cat_name_translation.to_dict()['product_category_name_english']
translate_dict = {}

for p_key in portuguese_cat_names:
    if portuguese_cat_names[p_key] not in translate_dict:
        translate_dict[portuguese_cat_names[p_key]] = english_cat_names[p_key]

recommendations = get_recommendation(customer_idx, rating_matrix, df_imputed, similarity_matrix, products_df, translate_dict, top_n=10)
recommendations_with_price = pd.merge(recommendations, order_items_df[['product_id', 'price']], on='product_id', how='inner').drop_duplicates(subset=['product_id'], keep='first').reset_index(drop=True)

display(recommendations)

bought_items = get_bought_items(customer_idx, rating_matrix, products_df, translate_dict, order_items_df)
display(bought_items)

Unnamed: 0,product_id,product_category_name,inferred_rating
22,e3e020af31d4d89d2602272b315c3f6e,health_beauty,4.966299
34,ce5b91848b91118daffb3af53b747475,sports_leisure,4.966299
39,680874c570dad71c0a2844cfbf417054,furniture_decor,4.966299
76,278b3c6462e86b4556b99989513ddf73,small_appliances,4.966299
141,bcd58b852a3b0152b6b384c4e4474ba2,cool_stuff,4.966299
153,b521fddf2c8a7e7d1791bfb3c1b2f278,sports_leisure,4.966299
160,1ffe365aa7583189e63f2fa4f060b269,sports_leisure,4.966299
161,3014e35fd70fce29095ced5cdc89f4ce,telephony,4.966299
173,d458378a178cd7cae60052319cebf235,stationery,4.966299
198,a5bc1334f1762ce0a844f3f694dc08e1,bed_bath_table,4.966299


Unnamed: 0,product_id,rating,product_category_name,price
0,73a6530caef9511c04711d12dcef551c,5.0,office_furniture,129.0
1,7612afaa8218e79b3011243ed9edea9c,5.0,sports_leisure,389.0
2,785c2986ce8232a718aaad761ac2a321,5.0,housewares,197.0
3,6d59a3fe3140425fb175cdd1e688dc2e,5.0,furniture_decor,29.9
4,7ba8971d032709efa2bf76c413975d2c,5.0,luggage_accessories,71.9
5,82a61259a621866c4ba63743da29a342,5.0,sports_leisure,119.8
6,58b0b0b3e6b7f8ecd3874db9c141250d,5.0,toys,198.0
7,952670bc019a73f0933157ec82ff6e4f,5.0,bed_bath_table,205.0
8,a0b7d5a992ccda646f2d34e418fff5a0,5.0,furniture_decor,69.9
9,425db55cb3b0f5b18a2d9964da31c3c0,5.0,stationery,49.9


In [11]:
# tmp_short_df = final_df.head(2000)
# selection = tmp_short_df.customer_unique_id.unique().tolist()
# final_df_short = final_df[pd.DataFrame(final_df.customer_unique_id.tolist()).isin(selection).any(1).values]

copy_df = final_df.copy()
copy_df["count"] = copy_df.groupby("customer_unique_id").transform("count")["product_id"]

userId = (
        copy_df.drop_duplicates("customer_unique_id")
        .sort_values("count", ascending=False)
        .iloc[:500]["customer_unique_id"]
    )

copy_df = copy_df[copy_df["customer_unique_id"].isin(userId)].reset_index(drop=True)
# copy_df.sort_values(by='count', ascending=False)[['customer_unique_id', 'count']]
# copy_df

rating_matrix = copy_df.pivot_table(index="customer_unique_id", columns="product_id", values="review_score")

# rows, cols = rating_matrix.shape
# for i in range(rows):
#     col_places = [random.randint(0, cols-1) for _ in range(random.randint(10, int(rows/2)))]
#     for col in col_places:
#         rating_matrix.iloc[i][col] = random.randint(1, 5)

df_imputed = rating_matrix.fillna(rating_matrix.mean(axis=0))
similarity_matrix = cosine_similarity(df_imputed.values)

In [23]:
customer_idx = 240
recommendations = get_recommendation(customer_idx, rating_matrix, df_imputed, similarity_matrix, products_df, translate_dict, top_n=10)
recommendations

Unnamed: 0,product_id,product_category_name,inferred_rating
7,2548af3e6e77a690cf3eb6368e9ab61e,furniture_decor,4.928181
19,f53103a77d9cf245e579ea37e5ec51f0,bed_bath_table,4.928181
33,1eba879220bd0981a0e2fbab499ed4e1,office_furniture,4.928181
180,df473738565b52f77b4e22b328b41576,costruction_tools_tools,4.928181
197,5fb61f482620cb672f5e586bb132eae9,,4.928181
268,23ab7bb9eac81e85ec43ed71064cf7ce,bed_bath_table,4.928181
295,2e511b5741ab14e7f5294df6f1310b03,computers_accessories,4.928181
331,f0737b524fe6b57f2ad4f174ce23a62c,sports_leisure,4.928181
357,f2a1b32f85cad59ff2a8444154ac25f0,air_conditioning,4.928181
403,09c3d39641970009c198caed304ccfc4,baby,4.928181


In [75]:
customer_idx = 0
user_id = df.iloc[customer_idx].name
print(f"User with id {customer_idx} has id {user_id}")

unrated_products = rating_matrix.iloc[customer_idx][rating_matrix.iloc[customer_idx].isna()].index
similarity_matrix[customer_idx]


User with id 0 has id 000fbf0473c10fc1ab6f8d2d286ce20c


array([1.        , 0.99997705, 0.99997639, ..., 1.        , 1.        ,
       1.        ])

In [112]:
data = final_df.copy()

# get total counts of no. of occurence of movie
data["count"] = data.groupby("product_id").transform("count")["customer_unique_id"]

# fetch top 100 movies based on count
movieId = (
    data.drop_duplicates("product_id")
    .sort_values("count", ascending=False)
    .iloc[:100]["product_id"]
)

# filter out data as per the movieId
data = data[data["product_id"].isin(movieId)].reset_index(drop=True)
data

Unnamed: 0,customer_unique_id,product_id,review_score,count
0,af07308b275d755c9edb36a90c618231,595fac2a385ac33a80bd5114aec74eb8,4,106
1,f2a85dec752b8517b5e58a06ff3cd937,08574b074924071f4e201e151b152b4e,1,112
2,f2a85dec752b8517b5e58a06ff3cd937,08574b074924071f4e201e151b152b4e,1,112
3,68954feaafe4dd638f3bd3e2afa174ec,2b4609f8948be18874494203496bc318,5,269
4,c796780c7daeab9e94cc052b1f103b21,8c591ab0ca519558779df02023177f44,5,141
...,...,...,...,...
14287,40163091ee73ffaa57448cb8e21a2615,4fcb3d9a5f4871e8362dfedbdb02b064,5,90
14288,9e4accf95024aa6565ca1efcadb96752,0a57f7d2c983bcf8188589a5fea4a8da,5,102
14289,1a3b8f1d0782ebedbcf220a96cbc1655,595fac2a385ac33a80bd5114aec74eb8,5,106
14290,5097a5312c8b157bb7be58ae360ef43c,d1c427060a0f73f6b889a5c7c61f2ac4,2,340


In [111]:
final_df

Unnamed: 0,customer_unique_id,product_id,review_score
0,7c396fd4830fd04220f754e42b4e5bff,87285b34884572647811a353c7ac498a,4
1,af07308b275d755c9edb36a90c618231,595fac2a385ac33a80bd5114aec74eb8,4
2,3a653a41f6f9fc3d2a113cf8398680e8,aa4383b373c6aca5d8797843e5594415,5
3,7c142cf63193a1473d2e66489a9ae977,d0b61bfb1de832b15ba9d266ca96e5b0,5
4,72632f0f9dd73dfee390c9b22eb56dd6,65266b2da20d04dbe00c5c2d3bb7859e,5
...,...,...,...
112367,da62f9e57a76d978d02ab5362c509660,f1d4ce8c6dd66c47bbaa8c6781c2a923,4
112368,737520a9aad80b3fbbdad19b66b37b30,b80910977a37536adeddd63663f916ad,5
112369,5097a5312c8b157bb7be58ae360ef43c,d1c427060a0f73f6b889a5c7c61f2ac4,2
112370,5097a5312c8b157bb7be58ae360ef43c,d1c427060a0f73f6b889a5c7c61f2ac4,2
