In [40]:
import os

import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

pd.options.mode.chained_assignment = None

In [3]:
data_folder = '/Users/alex/Workspace/Datasets/OlistEcommercePublicDataset'

orders_df = pd.read_csv(os.path.join(data_folder, 'olist_orders_dataset.csv'))
reviews_df = pd.read_csv(os.path.join(data_folder, 'olist_order_reviews_dataset.csv'))
products_df = pd.read_csv(os.path.join(data_folder, 'olist_products_dataset.csv'))
order_items_df = pd.read_csv(os.path.join(data_folder, 'olist_order_items_dataset.csv'))
customer_df = pd.read_csv(os.path.join(data_folder, 'olist_customers_dataset.csv'))
cat_name_translation =  pd.read_csv(os.path.join(data_folder, 'product_category_name_translation.csv'))

dfs = [orders_df, reviews_df, products_df, order_items_df, customer_df]

In [4]:
# Converting ID columns from 'object' type to string
for df in dfs:
    for column, ctype in zip(df.columns, df.dtypes):
        if ctype == 'object':
            df[column] = df[column].astype("string")

In [5]:
unique_id_df = pd.merge(orders_df[['order_id', 'customer_id']], customer_df[['customer_id', 'customer_unique_id']], on=['customer_id'], how='inner')
product_and_order_id_df = pd.merge(orders_df[['order_id', 'customer_id']], order_items_df[['order_id', 'product_id']], on=['order_id'], how='inner')
user_product_order_id_df = pd.merge(unique_id_df, product_and_order_id_df, on=['order_id', 'customer_id'], how='inner')
final_df = pd.merge(user_product_order_id_df, reviews_df[['order_id', 'review_score']], on=['order_id'], how='inner').drop(['customer_id', 'order_id'], axis=1)

In [6]:
tmp_short_df = final_df.head(2000)
selection = tmp_short_df.customer_unique_id.unique().tolist()
final_df_short = final_df[pd.DataFrame(final_df.customer_unique_id.tolist()).isin(selection).any(1).values]

In [7]:
rating_matrix = final_df_short.pivot_table(index='customer_unique_id', columns='product_id', values='review_score')
df_imputed = rating_matrix.fillna(rating_matrix.mean(axis=0))

In [8]:
similiarty_index = cosine_similarity(df_imputed.values)

In [9]:
def get_recommendation(user_index, df, df_imputed, similarity_matrix, metadata, translate_dict):
    idx = user_index
    sim_scores = list(enumerate(similarity_matrix[idx]))

    # get unrated movie indices
    unrated_products = df.iloc[idx][df.iloc[idx].isna()].index

    product_ratings = (
        df_imputed.iloc[similarity_matrix[idx]][unrated_products].T
        * [x[1] for x in sim_scores]
    ).T

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:101]

    product_ratings = product_ratings.iloc[[x[0] for x in sim_scores]].mean()

    recommended_products = metadata[
        metadata["product_id"].isin(
            product_ratings.reset_index().sort_values(0, ascending=False)["product_id"]
        )
    ][["product_category_name", "product_id"]]

    assumed_ratings = sorted(product_ratings, reverse=True)

    return pd.DataFrame(
        {
            "ProductId": recommended_products[:10]["product_id"],
            "ProductCategory": recommended_products[:10]["product_category_name"].map(translate_dict),
            "Assumed Rating": assumed_ratings[:10],
        }
    )

In [10]:
customer_id = "00b2ca23369b68c4d4105ecea9c0cb93"
customer_idx = 1748

portuguese_cat_names = cat_name_translation.to_dict()['product_category_name']
english_cat_names = cat_name_translation.to_dict()['product_category_name_english']
translate_dict = {}

for p_key in portuguese_cat_names:
    if portuguese_cat_names[p_key] not in translate_dict:
        translate_dict[portuguese_cat_names[p_key]] = english_cat_names[p_key]

get_recommendation(customer_idx, rating_matrix, df_imputed, similiarty_index, products_df, translate_dict)

Unnamed: 0,ProductId,ProductCategory,Assumed Rating
22,e3e020af31d4d89d2602272b315c3f6e,health_beauty,5.0
34,ce5b91848b91118daffb3af53b747475,sports_leisure,5.0
39,680874c570dad71c0a2844cfbf417054,furniture_decor,5.0
76,278b3c6462e86b4556b99989513ddf73,small_appliances,5.0
141,bcd58b852a3b0152b6b384c4e4474ba2,cool_stuff,5.0
153,b521fddf2c8a7e7d1791bfb3c1b2f278,sports_leisure,5.0
160,1ffe365aa7583189e63f2fa4f060b269,sports_leisure,5.0
161,3014e35fd70fce29095ced5cdc89f4ce,telephony,5.0
173,d458378a178cd7cae60052319cebf235,stationery,5.0
198,a5bc1334f1762ce0a844f3f694dc08e1,bed_bath_table,5.0


In [11]:
rating_matrix_2 = rating_matrix.copy()

rows, cols = rating_matrix_2.shape

for i in range(rows):
    col_places = [random.randint(0, cols-1) for _ in range(random.randint(10, int(cols/2)))]
    for col in col_places:
        rating_matrix_2.iloc[i][col] = random.randint(1, 5)

df_imputed_2 = rating_matrix_2.fillna(rating_matrix.mean(axis=0))
similiarty_index_2 = cosine_similarity(df_imputed_2.values)


In [12]:
def get_recommendation(user_index, df, df_imputed, similarity_matrix, metadata, translate_dict):
    idx = user_index
    sim_scores = list(enumerate(similarity_matrix[idx]))

    # get unrated movie indices
    unrated_products = df.iloc[idx][df.iloc[idx].isna()].index

    product_ratings = (
        df_imputed.iloc[similarity_matrix[idx]][unrated_products].T
        * [x[1] for x in sim_scores]
    ).T

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:101]

    product_ratings = product_ratings.iloc[[x[0] for x in sim_scores]].mean()

    recommended_products = metadata[
        metadata["product_id"].isin(
            product_ratings.reset_index().sort_values(0, ascending=False)["product_id"]
        )
    ][["product_category_name", "product_id"]]

    assumed_ratings = sorted(product_ratings, reverse=True)

    return pd.DataFrame(
        {
            "ProductId": recommended_products[:10]["product_id"],
            "ProductCategory": recommended_products[:10]["product_category_name"].map(translate_dict),
            "Assumed Rating": assumed_ratings[:10],
        }
    )

In [13]:
customer_idx = 1000

portuguese_cat_names = cat_name_translation.to_dict()['product_category_name']
english_cat_names = cat_name_translation.to_dict()['product_category_name_english']
translate_dict = {}

for p_key in portuguese_cat_names:
    if portuguese_cat_names[p_key] not in translate_dict:
        translate_dict[portuguese_cat_names[p_key]] = english_cat_names[p_key]

get_recommendation(customer_idx, rating_matrix_2, df_imputed_2, similiarty_index_2, products_df, translate_dict)

Unnamed: 0,ProductId,ProductCategory,Assumed Rating
22,e3e020af31d4d89d2602272b315c3f6e,health_beauty,4.963594
34,ce5b91848b91118daffb3af53b747475,sports_leisure,4.963594
76,278b3c6462e86b4556b99989513ddf73,small_appliances,4.963594
141,bcd58b852a3b0152b6b384c4e4474ba2,cool_stuff,4.963594
153,b521fddf2c8a7e7d1791bfb3c1b2f278,sports_leisure,4.963594
160,1ffe365aa7583189e63f2fa4f060b269,sports_leisure,4.963594
161,3014e35fd70fce29095ced5cdc89f4ce,telephony,4.963594
173,d458378a178cd7cae60052319cebf235,stationery,4.963594
198,a5bc1334f1762ce0a844f3f694dc08e1,bed_bath_table,4.963594
202,47969dd948e918289f809be899ddfb4c,stationery,4.963594


In [39]:
prod_id = "ce5b91848b91118daffb3af53b747475"
display(products_df[products_df["product_id"] == prod_id])
display(order_items_df[order_items_df["product_id"] == prod_id])

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
34,ce5b91848b91118daffb3af53b747475,esporte_lazer,50.0,699.0,4.0,1388.0,34.0,9.0,31.0


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
19132,2bfb6645f99e561b82bbdd84c98da7e1,1,ce5b91848b91118daffb3af53b747475,2ff97219cb8622eaf3cd89b7d9c09824,2018-05-16 23:11:15,19.9,9.44


In [26]:
customer_df.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [41]:
prod_value_df = order_items_df[['product_id', 'price', 'freight_value']]
prod_value_df['product_value'] = prod_value_df['price'] - prod_value_df['freight_value']
prod_value_df.head()

Unnamed: 0,product_id,price,freight_value,product_value
0,4244733e06e7ecb4970a6e2683c13e61,58.9,13.29,45.61
1,e5f2d52b802189ee658865ca93d83a8f,239.9,19.93,219.97
2,c777355d18b72b67abbeef9df44fd0fd,199.0,17.87,181.13
3,7634da152a4610f1595efa32f14722fc,12.99,12.79,0.2
4,ac6c3623068f30de03045865e4e10089,199.9,18.14,181.76
