# Algorithm Set Up

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from surprise import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset
from surprise import dump
from surprise import Reader

import io
import os
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

#Load Data
orders = pd.read_csv('D:\Data Science\olist_orders_dataset.csv')
order_payments = pd.read_csv('D:\Data Science\olist_order_payments_dataset.csv')
customers = pd.read_csv('D:\Data Science\olist_customers_dataset.csv')
order_items = pd.read_csv('D:\Data Science\olist_order_items_dataset.csv')
order_reviews = pd.read_csv('D:\Data Science\olist_order_reviews_dataset.csv')
order_products = pd.read_csv('D:\Data Science\olist_products_dataset.csv')

#Filter "One Item Order"
order_items = order_items[(order_items.order_item_id == 1)]

#merge other tables
order_items = pd.merge(order_items, order_reviews, on='order_id')
order_items = pd.merge(order_items, orders, on='order_id')
order_items = pd.merge(order_items, customers, on='customer_id')

#Select the 3 mandatory columns
cols = ['customer_unique_id', 'product_id', 'review_score']
dataset = order_items[cols]

#count the reviews given by unique customer
reviews_count = pd.DataFrame(dataset.groupby('customer_unique_id')['review_score'].count())

#Filter users with more than 1 review
reviews_count = reviews_count[reviews_count['review_score'] > 1]

#merge the table
dataset = pd.merge(reviews_count, dataset, on='customer_unique_id')

#Select the 3 mandatory colums
cols = ['customer_unique_id', 'product_id', 'review_score_y']
dataset = dataset[cols]

#display Information
dataset.info()
dataset

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6799 entries, 0 to 6798
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_unique_id  6799 non-null   object
 1   product_id          6799 non-null   object
 2   review_score_y      6799 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 212.5+ KB


Unnamed: 0,customer_unique_id,product_id,review_score_y
0,000bfa1d2f1a41876493be685390d6d3,bb15f9ba2ec6e36ab6c9e88d17430d64,5
1,000bfa1d2f1a41876493be685390d6d3,bb15f9ba2ec6e36ab6c9e88d17430d64,4
2,004288347e5e88a27ded2bb23747066c,a2bd2eae20998a24c22b110334928b02,5
3,004288347e5e88a27ded2bb23747066c,6e1b14d3cbb5fb3a2c00351007127dfd,5
4,004b45ec5c64187465168251cd1c9c2f,b0961721fd839e9982420e807758a2a6,1
...,...,...,...
6794,ff922bdd6bafcdf99cb90d7f39cea5b3,e54cb69cc7bf5d21921991aae48501fb,3
6795,ff922bdd6bafcdf99cb90d7f39cea5b3,b8762d562d810c2f8d7ad7e9c12957be,5
6796,ff922bdd6bafcdf99cb90d7f39cea5b3,fd3a4121e687a6b84e7af92606d5d718,5
6797,ffe254cc039740e17dd15a5305035928,18fc07433ae4bcd5a8bdf658b2c6e40d,1


# Load Data

In [2]:
#set Review Scale
reader = Reader(rating_scale=(1,5))

#load dataset (E-Commerce) into right format
dataset_ecommerce = Dataset.load_from_df(dataset[['customer_unique_id', 'product_id', 'review_score_y']], reader)

#create Trainset, Testset (test_size = 25%)
trainset, testset = train_test_split(dataset_ecommerce, test_size=.25)

# GridSearchCV - Identify best parameters to minimize RSME and fit Algorithm

## KNNBasic

In [3]:
#try differenet similarity options
sim_options = {"name": ["msd", "cosine"], "min_support": [1,2,3,4,5,6,7], "user_based": [False, True]}
param_grid = {"sim_options": sim_options}

#start GridSearchCV with KNNBasic
gs = GridSearchCV(KNNBasic, param_grid, measures=["rmse"], cv=3)
gs.fit(dataset_ecommerce)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [4]:
#use the best sim_options for KNNBasic
algo_KNNBasic = gs.best_estimator['rmse']

#fit the algorithm for the trainset
algo_KNNBasic.fit(trainset)

#predict the 25% testset
predictions_KNNBasic = algo_KNNBasic.test(testset, verbose = False)

Computing the cosine similarity matrix...
Done computing similarity matrix.


## KNNWithMeans

In [5]:
#try differenet similarity options
sim_options = {"name": ["msd", "cosine"], "min_support": [1,2,3,4,5,6,7], "user_based": [False, True]}
param_grid = {"sim_options": sim_options}

#start GridSearchCV with KNNBasic
gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse"], cv=3)
gs.fit(dataset_ecommerce)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [6]:
#use the best sim_options for KNNBasic
algo_KNNWithMeans = gs.best_estimator['rmse']

#fit the algorithm for the trainset
algo_KNNWithMeans.fit(trainset)

#predict the 25% testset
predictions_KNNWithMeans = algo_KNNWithMeans.test(testset, verbose = False)

Computing the cosine similarity matrix...
Done computing similarity matrix.


## KNNBaseline

In [7]:
#try differenet similarity options
sim_options = {"name": ["msd", "cosine"], "min_support": [1,2,3,4,5,6,7], "user_based": [False, True]}
param_grid = {"sim_options": sim_options}

#start GridSearchCV with KNNBasic
gs = GridSearchCV(KNNBaseline, param_grid, measures=["rmse"], cv=3)
gs.fit(dataset_ecommerce)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

NameError: name 'KNNBaseline' is not defined

In [None]:
#use the best sim_options for KNNBasic
algo_KNNBaseline = gs.best_estimator['rmse']

#fit the algorithm for the trainset
algo_KNNBaseline.fit(trainset)

#predict the 25% testset
predictions_KNNBaseline = algo_KNNBaseline.test(testset, verbose = False)

In [None]:
#get Number of Items reviewed by given User
def get_Iu(uid):
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError:
        return 0
    
#number of users that have reviewed given item   
def get_Ui(iid):
     try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
     except ValueError:
        return 0

#create new dataFrame
results_KNNBasic = pd.DataFrame(predictions_KNNBasic, columns=['uid','iid','rui', 'est', 'details'])
results_KNNBasic['Iu'] = results_KNNBasic.uid.apply(get_Iu) #number of items reviewed by given user
results_KNNBasic['Ui'] = results_KNNBasic.iid.apply(get_Ui) #number of users that have reviewed given item
results_KNNBasic['err'] = abs(results_KNNBasic.est - results_KNNBasic.rui)