In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r'data\Electronics_Dataset.csv', header = None)

In [3]:
df.columns = ['user_id', 'prod_id', 'rating', 'timestamp']

In [4]:
df.shape

(7824482, 4)

In [5]:
df.columns

Index(['user_id', 'prod_id', 'rating', 'timestamp'], dtype='object')

In [6]:
df.head()

Unnamed: 0,user_id,prod_id,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


# packges

In [7]:
import matplotlib.pyplot as plt
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise import SVD

In [8]:
df.user_id.nunique()

4201696

In [9]:
df.prod_id.nunique()

476002

In [10]:
df.rating.value_counts()

rating
5.0    4347541
4.0    1485781
1.0     901765
3.0     633073
2.0     456322
Name: count, dtype: int64

In [11]:
df.groupby(by = 'user_id')['rating'].count().mean()

1.8622199226217222

In [12]:
df['prod_id'].value_counts().nlargest(10)

prod_id
B0074BW614    18244
B00DR0PDNE    16454
B007WTAJTO    14172
B0019EHU8G    12285
B006GWO5WK    12226
B003ELYQGG    11617
B003ES5ZUU    10276
B007R5YDYA     9907
B00622AG6S     9823
B0002L5R78     9487
Name: count, dtype: int64

# remove the timestamp

In [13]:
data = df.drop(columns = ['timestamp'])

In [14]:
data.head()

Unnamed: 0,user_id,prod_id,rating
0,AKM1MP6P0OYPR,132793040,5.0
1,A2CX7LUOHB2NDG,321732944,5.0
2,A2NWSAGRHCP8N5,439886341,1.0
3,A2WNBOD3WNDNKT,439886341,3.0
4,A1GI0U4ZRJA8WN,439886341,1.0


# colabrative filtering

In [15]:
rating_count = data.groupby(by = 'prod_id')['rating'].count()

In [16]:
popular_products = rating_count[rating_count >= 1000].index

In [17]:
rec_data = data[data['prod_id'].isin(popular_products)]

In [18]:
reader = Reader(rating_scale = (1,5))

In [19]:
reader

<surprise.reader.Reader at 0x1c998aac2b0>

In [20]:
reader_data = Dataset.load_from_df(rec_data, reader)

In [21]:
trainset, testset = train_test_split(reader_data, test_size = 0.25, random_state = 19)

In [22]:
# normal predictor

In [23]:
from surprise import NormalPredictor

In [24]:
algo = NormalPredictor()
algo.fit(trainset)

<surprise.prediction_algorithms.random_pred.NormalPredictor at 0x1c998aad3f0>

In [25]:
test_pred = algo.test(testset)

In [26]:
accuracy.rmse(test_pred)

RMSE: 1.5875


1.58752560508856

# KNN Baseline

In [27]:
from surprise import KNNBaseline

In [28]:
sim_option = {'name' : "cosine", 'user_based' : False}

In [29]:
algo_knn = KNNBaseline(sim_options=sim_option)

In [30]:
algo_knn.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1c998a5ee30>

In [31]:
train_algo = algo_knn.test(trainset.build_testset())

In [32]:
test_algo = algo.test(testset)

In [33]:
Rmse_Knn_train = accuracy.rmse(train_algo)
Rmse_knn_test = accuracy.rmse(test_algo)

RMSE: 0.3908
RMSE: 1.5852


# define the param grid

In [37]:
param_grid = { 'k' :[10, 50, 100], 
              'sim_options': {'name':['msd', 'cosine'],
                             'user_based': [False]
                             }}

In [38]:
gs = GridSearchCV(KNNBaseline, param_grid = param_grid, measures=['rmse'], cv = 3)

In [39]:
gs.fit(reader_data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine sim

In [42]:
print(f"The best Score is {round(gs.best_score['rmse'], 2)}, and the best parameter is {gs.best_params['rmse']}.")

The best Score is 1.23, and the best parameter is {'k': 50, 'sim_options': {'name': 'cosine', 'user_based': False}}.


# SVD

In [43]:
svd_algo = SVD()

In [44]:
svd_algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ca59d06410>

In [45]:
train_pred = svd_algo.test(trainset.build_testset())
test_pred = svd_algo.test(testset)

In [46]:
RMSE_train = accuracy.rmse(train_pred)
RMSE_test = accuracy.rmse(test_pred)

RMSE: 0.5395
RMSE: 1.2029


In [47]:
# Param Grid
para_grid = {"n_epochs": [5,10,15],
            "lr_all": [0.002, 0.005, 0.007],
            "reg_all" : [0.4, 0.6]}

gs = GridSearchCV(SVD, para_grid, measures = ['rmse'], cv = 3)

In [48]:
gs.fit(reader_data)

In [49]:
print(f"The best Score is {round(gs.best_score['rmse'], 2)}, and the best parameter is {gs.best_params['rmse']}.")

The best Score is 1.2, and the best parameter is {'n_epochs': 15, 'lr_all': 0.007, 'reg_all': 0.4}.


In [50]:
import random 

random.seed(0)

In [51]:
random_test_samples = random.sample(testset, 10)

In [52]:
prediction = [svd_algo.predict(uid, iid, r_ui) for (uid, iid, r_ui) in random_test_samples]

In [57]:
lst = []
for pred in prediction:
    print(f"User: {pred.uid}, Item:{pred.iid}, Acutual Rating: {pred.r_ui}, Predction Rating : {pred.est:.2f} and the difference is {float(pred.r_ui) - float(pred.est)}")
    lst.append({"User": pred.uid, "Item": pred.iid, "Acutual Rating": pred.r_ui, "Predction Rating": pred.est ,"difference" : float(pred.r_ui) - float(pred.est)})

User: A3G4LC66GBUPRS, Item:B003ZSP0WW, Acutual Rating: 4.0, Predction Rating : 4.53 and the difference is -0.5250296931904561
User: AXCQ917STWHIX, Item:B00622AG6S, Acutual Rating: 5.0, Predction Rating : 4.35 and the difference is 0.6507419059433301
User: AXK9SEMV8OHJO, Item:B007OY5V68, Acutual Rating: 5.0, Predction Rating : 4.42 and the difference is 0.5796695535498015
User: A36QCY2E4SW11S, Item:B0088LYCZC, Acutual Rating: 4.0, Predction Rating : 3.93 and the difference is 0.06840122157174111
User: A127O6CWXTMSTU, Item:B000IJY8DS, Acutual Rating: 3.0, Predction Rating : 4.34 and the difference is -1.3437955077520494
User: A200BPGA53VKZ0, Item:B003SX0P1A, Acutual Rating: 5.0, Predction Rating : 4.50 and the difference is 0.4988903724281073
User: A3A1V7N6QNHA2D, Item:B004PEIG12, Acutual Rating: 2.0, Predction Rating : 4.10 and the difference is -2.095798949066654
User: A2V8I8BXL8OZ2M, Item:B005I7KIUW, Acutual Rating: 5.0, Predction Rating : 3.98 and the difference is 1.0206750531699322

In [58]:
import pandas as pd

df = pd.DataFrame(lst)


In [59]:
df

Unnamed: 0,User,Item,Acutual Rating,Predction Rating,difference
0,A3G4LC66GBUPRS,B003ZSP0WW,4.0,4.52503,-0.52503
1,AXCQ917STWHIX,B00622AG6S,5.0,4.349258,0.650742
2,AXK9SEMV8OHJO,B007OY5V68,5.0,4.42033,0.57967
3,A36QCY2E4SW11S,B0088LYCZC,4.0,3.931599,0.068401
4,A127O6CWXTMSTU,B000IJY8DS,3.0,4.343796,-1.343796
5,A200BPGA53VKZ0,B003SX0P1A,5.0,4.50111,0.49889
6,A3A1V7N6QNHA2D,B004PEIG12,2.0,4.095799,-2.095799
7,A2V8I8BXL8OZ2M,B005I7KIUW,5.0,3.979325,1.020675
8,ATIY8WPSZBXCE,B008R7EVE4,5.0,4.40214,0.59786
9,A1K9LHOWOCYVBZ,B003ZX8B3W,4.0,3.792315,0.207685
