In [1]:
import os

import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display, Math
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# def normalize_matrix(data):
#     return (data - np.min(data)) / (np.max(data) - np.min(data))


# def get_recommendation(customer_index, df, df_imputed, similarity_matrix, metadata, translate_dict, top_n=10):
#     idx = customer_index
#     user_id = df.iloc[idx].name

#     similarity_matrix = normalize_matrix(similarity_matrix)

#     sim_scores = list(enumerate(similarity_matrix[idx]))

#     # get unrated products indices
#     unrated_products = df.iloc[idx][df.iloc[idx].isna()].index

#     product_ratings = (
#         df_imputed.iloc[similarity_matrix[idx]][unrated_products].T
#         * [x[1] for x in sim_scores]
#     ).T

#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:101]

#     product_ratings = product_ratings.iloc[[x[0] for x in sim_scores]].mean()

#     recommended_products = metadata[
#         metadata["product_id"].isin(
#             product_ratings.reset_index().sort_values(0, ascending=False)["product_id"]
#         )
#     ][["product_category_name", "product_id"]]

#     assumed_ratings = sorted(product_ratings, reverse=True)

#     return_df = pd.DataFrame(
#         {
#             "product_id": recommended_products[:top_n]["product_id"],
#             "product_category_name": recommended_products[:top_n]["product_category_name"].map(translate_dict),
#             "inferred_rating": assumed_ratings[:top_n],
#         }
#     )
#     return_df.style.set_caption(f'User id: {user_id}')

#     return return_df


# def get_bought_items(customer_idx, rating_matrix, products_df, translate_dict, order_items_df, top_n=None):
#     rated_items_df = rating_matrix.iloc[customer_idx][rating_matrix.iloc[customer_idx].notnull()].reset_index()
#     rated_items_df.columns = ['product_id', 'rating']
#     rated_items_df = rated_items_df.sort_values(by='rating', ascending=False)

#     filtered_products = products_df[products_df['product_id'].isin(rated_items_df['product_id'].values)][['product_id', 'product_category_name']]
#     filtered_products['product_category_name'] = filtered_products['product_category_name'].map(translate_dict)

#     rated_items_df = pd.merge(rated_items_df, filtered_products, on='product_id', how='inner')
#     rated_items_df = pd.merge(rated_items_df, order_items_df[['product_id', 'price']], on='product_id', how='inner').drop_duplicates(
#     subset=['product_id'], keep='first').reset_index(drop=True)
    
#     if top_n:
#         return rated_items_df.head(top_n)
#     else:
#         return rated_items_df

In [15]:
class RecommendationEngine:
    """User-based collaborative-filtering recommender over a rating matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating matrix with one row per customer and one column per product;
        NaN marks a product the customer has not rated.
    products_metadata : pandas.DataFrame
        Must contain the columns ``product_id`` and ``product_category_name``.
    order_information : pandas.DataFrame
        Must contain the columns ``product_id`` and ``price``.
    translate_dict : dict
        Mapping applied to ``product_category_name`` before results are
        returned; categories missing from the mapping become NaN.

    Attributes
    ----------
    df_imputed : pandas.DataFrame
        ``df`` with every missing rating replaced by that product's mean.
    similarity_matrix : numpy.ndarray
        Pairwise cosine similarity between the rows of ``df_imputed``.
    """

    def __init__(self, df, products_metadata, order_information, translate_dict):
        self.df = df
        self.products_metadata = products_metadata
        self.order_information = order_information
        # Cosine similarity needs dense rows, so fill gaps with the column
        # (product) mean before comparing customers.
        self.df_imputed = self.df.fillna(self.df.mean(axis=0))
        self.similarity_matrix = cosine_similarity(self.df_imputed.values)
        self.translate_dict = translate_dict

    def get_recommendation(self, customer_idx, nr_of_items=2):
        """Return the best-scoring products the customer has not rated yet.

        Every rating of an unrated product is multiplied by the similarity
        between its author and the target customer; a product's score is the
        plain mean of those weighted ratings (NaN ratings are skipped).
        NOTE(review): the mean is not divided by the sum of the weights, so
        the score is not a true similarity-weighted average.

        Parameters
        ----------
        customer_idx : int
            Positional row index of the customer in ``self.df``.
        nr_of_items : int, default 2
            Maximum number of products to return.

        Returns
        -------
        pandas.DataFrame
            Columns ``product_category_name`` (translated), ``product_id`` and
            ``score``, sorted by ``score`` descending. Products absent from
            ``products_metadata`` are dropped by the inner merge.
        """
        customer_ratings = self.df.iloc[customer_idx]
        unrated_products = customer_ratings[customer_ratings.isna()].index

        # One similarity weight per customer row, applied row-wise.
        weights = self.similarity_matrix[customer_idx]
        weighted_ratings = self.df[unrated_products].mul(weights, axis=0)
        product_scores = weighted_ratings.mean()
        recommendations = product_scores.sort_values(ascending=False).iloc[:nr_of_items]

        # Build a fresh frame (.loc + .assign) instead of writing into a
        # slice of products_metadata, which would be a chained assignment.
        is_recommended = self.products_metadata["product_id"].isin(recommendations.index)
        recommended_metadata = self.products_metadata.loc[
            is_recommended, ["product_category_name", "product_id"]
        ].assign(
            product_category_name=lambda meta: meta["product_category_name"].map(
                self.translate_dict
            )
        )

        scores_df = pd.DataFrame({'product_id': recommendations.index,
                                  'score': recommendations.values})

        return pd.merge(
            recommended_metadata, scores_df, on='product_id', how='inner'
        ).sort_values(by='score', ascending=False)

    def get_bought_items(self, customer_idx, nr_of_items=2):
        """Return the products the customer has rated, best rating first.

        Parameters
        ----------
        customer_idx : int
            Positional row index of the customer in ``self.df``.
        nr_of_items : int, default 2
            Maximum number of rows to return.

        Returns
        -------
        pandas.DataFrame
            Columns ``product_id``, ``rating``, ``product_category_name``
            (translated) and ``price``. Only products present in both
            ``products_metadata`` and ``order_information`` survive the inner
            merges; when several prices exist for a product, the first one
            encountered is kept.
        """
        customer_ratings = self.df.iloc[customer_idx]
        rated_items_df = customer_ratings[customer_ratings.notnull()].reset_index()
        rated_items_df.columns = ['product_id', 'rating']
        rated_items_df = rated_items_df.sort_values(by='rating', ascending=False)

        # Build a fresh frame (.loc + .assign) instead of writing into a
        # slice of products_metadata, which would be a chained assignment.
        is_rated = self.products_metadata['product_id'].isin(
            rated_items_df['product_id'].values)
        filtered_products = self.products_metadata.loc[
            is_rated, ['product_id', 'product_category_name']
        ].assign(
            product_category_name=lambda meta: meta['product_category_name'].map(
                self.translate_dict
            )
        )

        rated_items_df = pd.merge(rated_items_df, filtered_products,
                                  on='product_id', how='inner')
        # A product can appear on many order lines; keep a single price row.
        rated_items_df = (
            pd.merge(rated_items_df, self.order_information[['product_id', 'price']],
                     on='product_id', how='inner')
            .drop_duplicates(subset=['product_id'], keep='first')
            .reset_index(drop=True)
        )

        return rated_items_df[:nr_of_items]


def get_translation_dict(cat_name_translation):
    """Build a lookup from original category names to their English names.

    Parameters
    ----------
    cat_name_translation : pandas.DataFrame
        Must contain the columns ``product_category_name`` and
        ``product_category_name_english``.

    Returns
    -------
    dict
        ``{product_category_name: product_category_name_english}``. When a
        category name occurs more than once, the first row's translation wins.
    """
    columns = cat_name_translation.to_dict()
    original_names = columns['product_category_name']
    english_names = columns['product_category_name_english']

    translate_dict = {}
    for row_label, original_name in original_names.items():
        # setdefault leaves an existing entry untouched, so the first row wins
        translate_dict.setdefault(original_name, english_names[row_label])

    return translate_dict

In [3]:
# Folder holding the Olist CSV files. Set the OLIST_DATA_DIR environment
# variable to run the notebook on another machine; the original local path is
# kept as the fallback so existing setups keep working unchanged.
data_folder = os.environ.get(
    'OLIST_DATA_DIR',
    '/Users/alex/Workspace/Datasets/OlistEcommercePublicDataset',
)

orders_df = pd.read_csv(os.path.join(data_folder, 'olist_orders_dataset.csv'))
reviews_df = pd.read_csv(os.path.join(data_folder, 'olist_order_reviews_dataset.csv'))
products_df = pd.read_csv(os.path.join(data_folder, 'olist_products_dataset.csv'))
order_items_df = pd.read_csv(os.path.join(data_folder, 'olist_order_items_dataset.csv'))
customer_df = pd.read_csv(os.path.join(data_folder, 'olist_customers_dataset.csv'))
cat_name_translation = pd.read_csv(os.path.join(data_folder, 'product_category_name_translation.csv'))

# Frames whose columns get dtype-normalised later on (the translation table
# is deliberately not part of this list).
dfs = [orders_df, reviews_df, products_df, order_items_df, customer_df]

In [4]:
# Cast every 'object'-dtype column of the loaded frames to pandas' "string"
# dtype (in place), so the ID columns used as merge keys share one dtype.
for df in dfs:
    for column, ctype in df.dtypes.items():
        if ctype == 'object':
            df[column] = df[column].astype("string")

# order -> unique customer
unique_id_df = orders_df[['order_id', 'customer_id']].merge(
    customer_df[['customer_id', 'customer_unique_id']],
    on=['customer_id'],
    how='inner',
)
# order -> purchased products (one row per order item)
product_and_order_id_df = orders_df[['order_id', 'customer_id']].merge(
    order_items_df[['order_id', 'product_id']],
    on=['order_id'],
    how='inner',
)
# combine both views on the shared order/customer keys
user_product_order_id_df = unique_id_df.merge(
    product_and_order_id_df,
    on=['order_id', 'customer_id'],
    how='inner',
)
# attach the order's review score, then drop the keys that were only needed for joining
final_df = (
    user_product_order_id_df
    .merge(reviews_df[['order_id', 'review_score']], on=['order_id'], how='inner')
    .drop(columns=['customer_id', 'order_id'])
)

In [7]:
### filtering on both criteria
# Keep only the most frequent products, then the most active customers among
# the remaining rows, so the resulting rating matrix is less sparse.
TOP_N_PRODUCTS = 500
TOP_N_CUSTOMERS = 1000

data = final_df.copy()

# number of rows per product (selecting the column first avoids counting
# every other column only to throw the result away)
data["count"] = data.groupby("product_id")["customer_unique_id"].transform("count")

# ids of the TOP_N_PRODUCTS products with the most rows
product_id = (
    data.drop_duplicates("product_id")
    .sort_values("count", ascending=False)
    .iloc[:TOP_N_PRODUCTS]["product_id"]
)

# keep only rows that belong to those products
data = data[data["product_id"].isin(product_id)].reset_index(drop=True)

# number of remaining rows per customer (overwrites the per-product count)
data["count"] = data.groupby("customer_unique_id")["product_id"].transform("count")

# ids of the TOP_N_CUSTOMERS customers with the most remaining rows
customer_id = (
    data.drop_duplicates("customer_unique_id")
    .sort_values("count", ascending=False)
    .iloc[:TOP_N_CUSTOMERS]["customer_unique_id"]
)

# keep only rows that belong to those customers
data = data[data["customer_unique_id"].isin(customer_id)].reset_index(drop=True)

data

Unnamed: 0,customer_unique_id,product_id,review_score,count
0,7973a6ba9c81ecaeb3d628c33c7c7c48,7c1bd920dbdf22470b68bde975dd3ccf,5,3
1,831a032a3327e2b8325faf9d37953870,216bb0e0cd43ffd832e0973d35e0377e,5,6
2,831a032a3327e2b8325faf9d37953870,216bb0e0cd43ffd832e0973d35e0377e,5,6
3,831a032a3327e2b8325faf9d37953870,216bb0e0cd43ffd832e0973d35e0377e,5,6
4,831a032a3327e2b8325faf9d37953870,216bb0e0cd43ffd832e0973d35e0377e,5,6
...,...,...,...,...
3096,5bc738a48bdb9ca1f798d56de2cce84c,53759a2ecddad2bb87a079a1f1519f73,5,5
3097,5bc738a48bdb9ca1f798d56de2cce84c,53759a2ecddad2bb87a079a1f1519f73,5,5
3098,5bc738a48bdb9ca1f798d56de2cce84c,53759a2ecddad2bb87a079a1f1519f73,5,5
3099,5097a5312c8b157bb7be58ae360ef43c,d1c427060a0f73f6b889a5c7c61f2ac4,2,2


In [19]:
# Customer x product matrix of review scores (NaN = customer has no review for the product)
df = data.pivot_table(values="review_score", index="customer_unique_id", columns="product_id")
translate_dict = get_translation_dict(cat_name_translation)

recommendationengine = RecommendationEngine(
    df=df,
    products_metadata=products_df,
    order_information=order_items_df,
    translate_dict=translate_dict,
)

# First customer: recommendations, then what they already rated
customer_idx = 0
display(
    recommendationengine.get_recommendation(customer_idx=customer_idx, nr_of_items=2),
    recommendationengine.get_bought_items(customer_idx=customer_idx, nr_of_items=2),
)
print()
# Another customer: rated items first, then recommendations
customer_idx = 10
display(
    recommendationengine.get_bought_items(customer_idx=customer_idx, nr_of_items=2),
    recommendationengine.get_recommendation(customer_idx=customer_idx, nr_of_items=2),
)

Unnamed: 0,product_category_name,product_id,score
1,construction_tools_lights,349ce46a0e2e20054aa9d80c48af8816,5.0
0,sports_leisure,6a2909ac21d16b721e4795e7e8ff3e68,4.998252


Unnamed: 0,product_id,rating,product_category_name,price
0,368c6c730842d78016ad823897a372db,5.0,garden_tools,49.9





Unnamed: 0,product_id,rating,product_category_name,price
0,aca2eb7d00ea1a7b8ebd4e68314663af,4.0,furniture_decor,69.9


Unnamed: 0,product_category_name,product_id,score
0,cool_stuff,fe6a9515d655fa7936b8a7c841039f34,4.999903
1,health_beauty,67bd616e1ba0d3d3e8545f3113b0140d,4.999903


In [65]:
# df = data.pivot_table(index="customer_unique_id", columns="product_id", values="review_score")
# df_imputed = df.fillna(df.mean(axis=0))
# similarity_matrix = cosine_similarity(df_imputed.values)

# portuguese_cat_names = cat_name_translation.to_dict()['product_category_name']
# english_cat_names = cat_name_translation.to_dict()['product_category_name_english']
# translate_dict = {}

# for p_key in portuguese_cat_names:
#     if portuguese_cat_names[p_key] not in translate_dict:
#         translate_dict[portuguese_cat_names[p_key]] = english_cat_names[p_key]

In [67]:
# customer_idx = 0
# recommendations = get_recommendation(customer_idx, df, df_imputed, similarity_matrix, products_df, translate_dict, top_n=10)
# bought_items = get_bought_items(customer_idx, df, products_df, translate_dict, order_items_df, top_n=10)

# display(bought_items)
# display(recommendations)

Unnamed: 0,product_id,rating,product_category_name,price
0,368c6c730842d78016ad823897a372db,5.0,garden_tools,49.9


Unnamed: 0,product_id,product_category_name,inferred_rating
211,f4d705aa95ccca448e5b0deb6e5290ba,bed_bath_table,4.631555
212,bbaef2eadf31fe3ea6702077398be06c,perfumery,4.631555
289,c6336fa91fbd87c359e44f5dca5a90ed,sports_leisure,4.631555
387,0bb7cb61f1957f79dac582ab66ccdc1f,housewares,4.631555
523,6a23ae3187f1dac41b7210a8c7739d6b,furniture_decor,4.631555
560,593236d0ff46b4299b4787fb8d43f7f0,industry_commerce_and_business,4.631555
578,386486367c1f9d4f587a8864ccb6902b,bed_bath_table,4.631555
793,53b36df67ebb7c41585e8d54d6772e08,watches_gifts,4.631555
890,ace5d86cf1ac63cdb76f49e5cd23d2f8,furniture_decor,4.631555
1009,c857b96593773e940454e76efa8eabb3,furniture_decor,4.631555


In [91]:
# User-item matrix: one row per customer, one column per product, review scores as values
df = data.pivot_table(values="review_score", index="customer_unique_id", columns="product_id")

# Dense copy of the matrix where each NaN is replaced with the product's average score
df_imputed = df.fillna(value=df.mean(axis=0))

# User-based similarity: cosine similarity between every pair of customer rows
similarity_matrix = cosine_similarity(df_imputed.to_numpy())
# similarity_matrix_normalized = normalize_matrix(similarity_matrix)

In [140]:
# customer_idx = 30
# customer_id = df.iloc[customer_idx].name

# # similarity_scores = list(enumerate(similarity_matrix_normalized[customer_idx]))
# similarity_scores = list(enumerate(similarity_matrix[customer_idx]))
# unrated_products = df.iloc[customer_idx][df.iloc[customer_idx].isna()].index

# weights = [x[1] for x in similarity_scores]
# product_ratings = (df[unrated_products].T * weights).T
# product_ratings = product_ratings.iloc[[x[0] for x in similarity_scores]].mean()

# nr_of_recommendations = 2
# recommendations = product_ratings.sort_values(ascending=False)[:nr_of_recommendations]

# recommendations_test = (
#     products_df[
#         products_df["product_id"].isin(
#             recommendations.reset_index().sort_values(0, ascending=False)['product_id']
#             )
#     ][["product_category_name", "product_id"]])

# recommendations = pd.DataFrame({
#     'product_id': recommendations.index, 'score': recommendations.values})
# recommendations_test['product_category_name'] = recommendations_test['product_category_name'].map(translate_dict)
# recommendations_final = pd.merge(recommendations_test, recommendations, on='product_id', how='inner')
# bought_items = get_bought_items(customer_idx, df, products_df, translate_dict, order_items_df, top_n=10)

# display(bought_items)
# display(recommendations_final)

Unnamed: 0,product_id,rating,product_category_name,price
0,d678178aa4291cd25a755a90188375c8,2.0,furniture_decor,35.0


Unnamed: 0,product_category_name,product_id,score
0,home_construction,679a3e5e1d2bb68982be5734c6e55e63,4.997368
1,cool_stuff,fe6a9515d655fa7936b8a7c841039f34,4.997368


In [134]:
# customer_idx = 10
# customer_id = df.iloc[customer_idx].name

# # similarity_scores = list(enumerate(similarity_matrix_normalized[customer_idx]))
# similarity_scores = list(enumerate(similarity_matrix[customer_idx]))
# unrated_products = df.iloc[customer_idx][df.iloc[customer_idx].isna()].index

# weights = [x[1] for x in similarity_scores]
# product_ratings = (df[unrated_products].T * weights).T
# product_ratings = product_ratings.iloc[[x[0] for x in similarity_scores]].mean()

# nr_of_recommendations = 1
# recommendations = product_ratings.sort_values(ascending=False)[:nr_of_recommendations]

# recommendations_test = (
#     products_df[
#         products_df["product_id"].isin(
#             recommendations.reset_index().sort_values(0, ascending=False)['product_id']
#             )
#     ][["product_category_name", "product_id"]])

# recommendations = pd.DataFrame({
#     'product_id': recommendations.index, 'score': recommendations.values})
# recommendations_test['product_category_name'] = recommendations_test['product_category_name'].map(translate_dict)
# recommendations_final = pd.merge(recommendations_test, recommendations, on='product_id', how='inner')
# display(recommendations_final)

# bought_items = get_bought_items(customer_idx, df, products_df, translate_dict, order_items_df, top_n=10)
# display(bought_items)

Unnamed: 0,product_category_name,product_id,score
0,cool_stuff,fe6a9515d655fa7936b8a7c841039f34,4.999903


Unnamed: 0,product_id,rating,product_category_name,price
0,aca2eb7d00ea1a7b8ebd4e68314663af,4.0,furniture_decor,69.9
