In [4]:
import os

import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display, Math
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
def get_recommendation(customer_index, df, df_imputed, similarity_matrix, metadata, translate_dict):
    idx = customer_index
    user_id = df.iloc[customer_idx].name

    sim_scores = list(enumerate(similarity_matrix[idx]))

    # get unrated products indices
    unrated_products = df.iloc[idx][df.iloc[idx].isna()].index

    product_ratings = (
        df_imputed.iloc[similarity_matrix[idx]][unrated_products].T
        * [x[1] for x in sim_scores]
    ).T

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:101]

    product_ratings = product_ratings.iloc[[x[0] for x in sim_scores]].mean()

    recommended_products = metadata[
        metadata["product_id"].isin(
            product_ratings.reset_index().sort_values(0, ascending=False)["product_id"]
        )
    ][["product_category_name", "product_id"]]

    assumed_ratings = sorted(product_ratings, reverse=True)

    return pd.DataFrame(
        {
            "ProductId": recommended_products[:10]["product_id"],
            "ProductCategory": recommended_products[:10]["product_category_name"].map(translate_dict),
            "Assumed Rating": assumed_ratings[:10],
        }
    ).style.set_caption(f'User id: {user_id}')


def get_bought_items(customer_idx, rating_matrix, products_df, translate_dict, order_items_df, top_n=None):
    """Return the products a customer has rated, best rating first, with category and price.

    Parameters
    ----------
    customer_idx : int
        Positional (``iloc``) index of the customer in ``rating_matrix``.
    rating_matrix : pandas.DataFrame
        Customer x product rating matrix; NaN marks "not rated".
    products_df : pandas.DataFrame
        Product table with ``product_id`` and ``product_category_name`` columns.
        It is not modified.
    translate_dict : dict
        Maps ``product_category_name`` values to the category label to display.
    order_items_df : pandas.DataFrame
        Order-item table with ``product_id`` and ``price`` columns. The price of the
        first row found for each product is used.
    top_n : int, optional
        Maximum number of rows to return. ``None`` (default) returns all rated products.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_id``, ``rating``, ``product_category_name`` (translated) and
        ``price``; one row per product that appears in both ``products_df`` and
        ``order_items_df``.
    """
    customer_ratings = rating_matrix.iloc[customer_idx]
    rated_items_df = customer_ratings[customer_ratings.notnull()].reset_index()
    rated_items_df.columns = ['product_id', 'rating']
    rated_items_df = rated_items_df.sort_values(by='rating', ascending=False)

    # .loc + .assign builds a new frame, so the translated column is never written into a
    # slice of products_df (which is what raises SettingWithCopyWarning).
    is_rated_product = products_df['product_id'].isin(rated_items_df['product_id'].values)
    filtered_products = products_df.loc[
        is_rated_product, ['product_id', 'product_category_name']
    ].assign(
        product_category_name=lambda frame: frame['product_category_name'].map(translate_dict)
    )

    # Keep one price row per product *before* merging: the result is the same first-seen
    # price, without multiplying rows by the number of times a product was ordered.
    first_prices = order_items_df[['product_id', 'price']].drop_duplicates(
        subset=['product_id'], keep='first'
    )

    rated_items_df = (
        rated_items_df
        .merge(filtered_products, on='product_id', how='inner')
        .merge(first_prices, on='product_id', how='inner')
        # products_df itself may list a product more than once
        .drop_duplicates(subset=['product_id'], keep='first')
        .reset_index(drop=True)
    )

    # Compare against None so that top_n=0 yields an empty frame rather than everything.
    if top_n is None:
        return rated_items_df
    return rated_items_df.head(top_n)

In [1]:
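# NOTE: disabled draft, kept for reference only; get_bought_items (defined above) returns
# the rated items together with their category and price.
# def get_bought_items_and_ratings(customer_index, df):
#     idx = customer_index
#     return df.iloc[idx][df.iloc[idx].notnull()]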



In [6]:
# Folder holding the Olist CSV files. Set the OLIST_DATA_DIR environment variable to run
# the notebook on another machine; the original author's location is only the fallback.
data_folder = os.environ.get(
    'OLIST_DATA_DIR',
    '/Users/alex/Workspace/Datasets/OlistEcommercePublicDataset',
)

# One DataFrame per CSV file of the dataset.
orders_df = pd.read_csv(os.path.join(data_folder, 'olist_orders_dataset.csv'))
reviews_df = pd.read_csv(os.path.join(data_folder, 'olist_order_reviews_dataset.csv'))
products_df = pd.read_csv(os.path.join(data_folder, 'olist_products_dataset.csv'))
order_items_df = pd.read_csv(os.path.join(data_folder, 'olist_order_items_dataset.csv'))
customer_df = pd.read_csv(os.path.join(data_folder, 'olist_customers_dataset.csv'))
cat_name_translation = pd.read_csv(os.path.join(data_folder, 'product_category_name_translation.csv'))

# Frames that get the same dtype clean-up later on (the translation table is not part of it).
dfs = [orders_df, reviews_df, products_df, order_items_df, customer_df]

In [7]:
# Cast every object-typed column (the ID columns among them) to pandas' "string" dtype.
# The frames listed in `dfs` are updated in place.
for df in dfs:
    for column, ctype in df.dtypes.items():
        if ctype != 'object':
            continue
        df[column] = df[column].astype("string")

# order -> customer_unique_id
unique_id_df = orders_df[['order_id', 'customer_id']].merge(
    customer_df[['customer_id', 'customer_unique_id']],
    how='inner',
    on=['customer_id'],
)

# order -> purchased product (one row per order item)
product_and_order_id_df = orders_df[['order_id', 'customer_id']].merge(
    order_items_df[['order_id', 'product_id']],
    how='inner',
    on=['order_id'],
)

# customer_unique_id + product for every order item
user_product_order_id_df = unique_id_df.merge(
    product_and_order_id_df,
    how='inner',
    on=['order_id', 'customer_id'],
)

# Attach the review score of the order, then drop the join keys that are no longer needed.
final_df = (
    user_product_order_id_df
    .merge(reviews_df[['order_id', 'review_score']], how='inner', on=['order_id'])
    .drop(columns=['customer_id', 'order_id'])
)

In [8]:
# Seed for the synthetic ratings generated below, so Restart & Run All is reproducible.
RANDOM_SEED = 42

# Work on a subset: every row of the customers that show up in the first 2000 rows.
tmp_short_df = final_df.head(2000)
selection = tmp_short_df.customer_unique_id.unique().tolist()
final_df_short = final_df[final_df.customer_unique_id.isin(selection)]

# Customer x product matrix of review scores (pivot_table averages repeated pairs).
rating_matrix = final_df_short.pivot_table(index='customer_unique_id', columns='product_id', values='review_score')

# Densify the matrix with synthetic data: every customer gets random 1-5 ratings at
# randomly chosen product positions. A position that already holds a real review score
# can be picked too, in which case the real score is overwritten.
random.seed(RANDOM_SEED)
rows, cols = rating_matrix.shape
max_fill = cols // 2
min_fill = min(10, max_fill)  # keeps randint's range valid when there are fewer than 20 products
for i in range(rows):
    col_places = [random.randint(0, cols-1) for _ in range(random.randint(min_fill, max_fill))]
    for col in col_places:
        # Single .iloc[row, col] assignment: chained .iloc[i][col] may write to a temporary copy.
        rating_matrix.iloc[i, col] = random.randint(1, 5)

product_id,008cff0e5792219fae03e570f980b330,00989337a1916a0055eedd1fdb35eb53,009c09f439988bc06a93d6b8186dce73,00ae7076313576f94d9107599d79a978,00baba5b58e274d0332a0c8a0a66f877,00d2add85b1f5aba6bb3d5d977314e25,010f24a605e8431fc68562e597f80d63,011967a30ceeaa86acb72e79664544ad,0134eb6b933b7ebc0f9e24ae5abefaa0,013ee64977aaa6b2b25475095162e0e9,...,fe6a9515d655fa7936b8a7c841039f34,fe75529a267b76ac6e96f0c322a899a4,fe83af233315b04d9093c7edbcf789dd,fed7ed9a2c9239558d696bfb40fc96a9,feeed1b9f26cce6eb3cb81af65eddd9e,ff26009ac6b838dc6cffa2d589cdbefb,ff5d7f21d255435967dfbe72b5dcdb57,ff922797a6771cab4e0c51d482285ec3,ffa7e0cbe11656d11a117b534bb1db27,ffd4bf4306745865e5692f69bd237893
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0019e8c501c85848ac0966d45226fa1d,,,,,,,,,,,...,,,,,,,,,,
0037eb1ff742ee2e71ba887f65f3c6f3,,,,,,,,,,,...,,,,,,,,,,
007953b3182d5ccc9189a694f5c68163,,,2.0,,,,,,,,...,,,3.0,,,3.0,,3.0,,
00ac9cd5c4ad19e16e7c6f6864711737,,,,,,,,,,1.0,...,,,,,,,,,,
00b2ca23369b68c4d4105ecea9c0cb93,,,3.0,,3.0,,4.0,,,,...,1.0,,,,,1.0,,,,


In [44]:
# Replace each missing rating with the mean rating of its product (column),
# then compute the cosine similarity between the rows of the imputed matrix.
column_means = rating_matrix.mean(axis=0)
df_imputed = rating_matrix.fillna(value=column_means)
similarity_matrix = cosine_similarity(df_imputed.to_numpy())

In [45]:
# Positional index (row of rating_matrix) of the customer to recommend for.
customer_idx = 1200

# Column-wise view of the translation table: {column name: {row label: value}}.
translation_columns = cat_name_translation.to_dict()
portuguese_cat_names = translation_columns['product_category_name']
english_cat_names = translation_columns['product_category_name_english']

# Portuguese category name -> English category name; the first row wins on duplicates.
translate_dict = {}
for p_key, portuguese_name in portuguese_cat_names.items():
    translate_dict.setdefault(portuguese_name, english_cat_names[p_key])

get_recommendation(
    customer_idx,
    rating_matrix,
    df_imputed,
    similarity_matrix,
    products_df,
    translate_dict,
)

Unnamed: 0,ProductId,ProductCategory,Assumed Rating
22,e3e020af31d4d89d2602272b315c3f6e,health_beauty,4.884621
34,ce5b91848b91118daffb3af53b747475,sports_leisure,4.884621
39,680874c570dad71c0a2844cfbf417054,furniture_decor,4.884621
76,278b3c6462e86b4556b99989513ddf73,small_appliances,4.884621
141,bcd58b852a3b0152b6b384c4e4474ba2,cool_stuff,4.884621
153,b521fddf2c8a7e7d1791bfb3c1b2f278,sports_leisure,4.884621
161,3014e35fd70fce29095ced5cdc89f4ce,telephony,4.884621
173,d458378a178cd7cae60052319cebf235,stationery,4.884621
198,a5bc1334f1762ce0a844f3f694dc08e1,bed_bath_table,4.884621
207,1b37262ebd1307a6c9aab9aca8584e76,pet_shop,4.884621
