In [1]:
import os
import pandas as pd
import surprise

In [2]:
# Option 1: reading data from a dataframe

In [3]:
df = pd.read_csv('sample_data.csv')
df.head()

Unnamed: 0,user,rating,item
0,1,2,1
1,2,2,1
2,3,3,2
3,4,3,2
4,5,1,1


In [4]:
reader = surprise.dataset.Reader(line_format='user rating item', rating_scale=(1,5))

In [5]:
data = surprise.dataset.Dataset.load_from_df(df, reader)

In [6]:
data.raw_ratings

[(1, 2, 1.0, None),
 (2, 2, 1.0, None),
 (3, 3, 2.0, None),
 (4, 3, 2.0, None),
 (5, 1, 1.0, None)]

In [10]:
# Option 2: load directly from a text file

In [7]:
reader = surprise.dataset.Reader(line_format='user rating item', rating_scale=(1,5), sep=',', skip_lines=1)

In [8]:
data1 = surprise.dataset.Dataset.load_from_file('sample_data.csv', reader=reader)

In [9]:
data1.raw_ratings

[('1', '1', 2.0, None),
 ('2', '1', 2.0, None),
 ('3', '2', 3.0, None),
 ('4', '2', 3.0, None),
 ('5', '1', 1.0, None)]

In [12]:
os.getcwd()

'C:\\Users\\Swastik\\Desktop\\Dono Consulting\\Deloitte_Training\\recommendation-systems'

In [13]:
os.chdir('C:\\Users\\Swastik\\Desktop\\Dono Consulting\\Deloitte_Training\\recommendation-systems\\ml-latest-small')

In [14]:
mr = pd.read_csv('ratings.csv')

In [15]:
mr.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [16]:
mr.shape

(100004, 4)

In [17]:
mr.drop('timestamp', axis=1, inplace=True)

In [18]:
mr.columns

Index(['userId', 'movieId', 'rating'], dtype='object')

In [19]:
mr.rename(columns={'userId':'user','movieId':'item'}, inplace=True)

In [20]:
reader = surprise.dataset.Reader(line_format='user item rating', rating_scale=(1,5))

In [21]:
mr_train = surprise.dataset.Dataset.load_from_df(mr, reader=reader)

In [22]:
mr_train

<surprise.dataset.DatasetAutoFolds at 0x1727c7e6f40>

In [23]:
mr_trainset = mr_train.build_full_trainset()

In [24]:
mr_trainset

<surprise.trainset.Trainset at 0x1725a05b7c0>

## Memory Based Recommendation Systems

In [70]:
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans

In [None]:
#user-based

In [46]:
knnbasic_user = KNNBasic(k=40, min_k=1, sim_options={'name':'cosine', 'user_based':True})

In [47]:
knnbasic_user.fit(mr_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1727c7e6850>

In [38]:
mr.head()

Unnamed: 0,user,item,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [64]:
mr.query('user == 500')['item'].unique()

array([    1,     2,    19,    34,    39,    48,    62,   110,   158,
         231,   260,   317,   318,   329,   337,   344,   356,   362,
         364,   367,   480,   497,   500,   520,   551,   586,   588,
         593,   595,   596,   597,   616,   700,   708,   736,   783,
         784,   919,  1013,  1025,  1035,  1073,  1088,  1089,  1097,
        1193,  1197,  1207,  1210,  1219,  1225,  1246,  1265,  1282,
        1367,  1380,  1407,  1441,  1485,  1517,  1569,  1580,  1704,
        1721,  1739,  1777,  1923,  1947,  1959,  1968,  2005,  2053,
        2054,  2078,  2081,  2139,  2141,  2144,  2145,  2150,  2273,
        2321,  2324,  2355,  2420,  2470,  2502,  2541,  2571,  2572,
        2599,  2657,  2671,  2683,  2694,  2706,  2710,  2724,  2762,
        2797,  2915,  2918,  2959,  3114,  3174,  3247,  3253,  3255,
        3397,  3408,  3418,  3421,  3448,  3481,  3624,  3717,  3751,
        3752,  3755,  3793,  3863,  3897,  3948,  3949,  3977,  4014,
        4018,  4022,

In [50]:
knnbasic_user.predict(uid=1, iid=31, r_ui=2.5)

Prediction(uid=1, iid=31, r_ui=2.5, est=3.1834796860227086, details={'actual_k': 40, 'was_impossible': False})

In [65]:
knnbasic_user.predict(uid=500, iid=3)

Prediction(uid=500, iid=3, r_ui=None, est=3.2849287719972278, details={'actual_k': 40, 'was_impossible': False})

In [None]:
#item-based

In [51]:
knnbasic_item = KNNBasic(k=40, min_k=1, sim_options={'name':'cosine', 'user_based':False})

In [52]:
knnbasic_item.fit(mr_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1727ea6fd30>

In [53]:
knnbasic_item.predict(uid=1, iid=31)

Prediction(uid=1, iid=31, r_ui=None, est=2.547471538910294, details={'actual_k': 20, 'was_impossible': False})

In [62]:
knnbasic_item.predict(uid=500, iid=70183)

Prediction(uid=500, iid=70183, r_ui=None, est=3.0125, details={'actual_k': 40, 'was_impossible': False})

In [None]:
# K-Fold CV

In [66]:
from surprise.model_selection import KFold
from surprise import accuracy

In [69]:
kf = KFold(n_splits=3)

for trainset, testset in kf.split(mr_train):
    knnbasic_item.fit(trainset)
    predictions = knnbasic_item.test(testset)
    
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9995
MAE:  0.7772
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9912
MAE:  0.7726
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9944
MAE:  0.7724


In [71]:
knnwithmeans_item = KNNWithMeans(k=40, min_k=1, sim_options={'name':'cosine', 'user_based':False})

In [72]:
kf = KFold(n_splits=3)

for trainset, testset in kf.split(mr_train):
    knnwithmeans_item.fit(trainset)
    predictions = knnwithmeans_item.test(testset)
    
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9387
MAE:  0.7200
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9355
MAE:  0.7163
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9309
MAE:  0.7137


In [None]:
#Grid Search

In [75]:
param_grid = {'k':[10,20],
             'sim_options':{'name':['pearson','cosine'], 'user_based':[False]}}

In [76]:
algo = KNNWithMeans

In [77]:
from surprise.model_selection import GridSearchCV

In [78]:
grid_search = GridSearchCV(algo, param_grid=param_grid, measures=['rmse','mae'])

In [79]:
grid_search.fit(mr_train)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Comp

In [80]:
print(grid_search.best_params['rmse'])
print(grid_search.best_score['rmse'])

{'k': 20, 'sim_options': {'name': 'cosine', 'user_based': False}}
0.935475896745986


In [81]:
print(grid_search.best_params['mae'])
print(grid_search.best_score['mae'])

{'k': 20, 'sim_options': {'name': 'pearson', 'user_based': False}}
0.7134133809337223


In [None]:
#Top 5 recommendations for an item

In [109]:
model = KNNWithMeans(k=20, sim_options={'name': 'cosine', 'user_based': False})
model.fit(mr_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1727ff01c10>

In [88]:
mr.tail()

Unnamed: 0,user,item,rating
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5
100003,671,6565,3.5


In [91]:
mr_trainset.to_inner_iid(6268)

7005

In [92]:
model.get_neighbors(mr_trainset.to_inner_iid(6268),5)

[9, 19, 25, 28, 29]

In [93]:
for i in model.get_neighbors(mr_trainset.to_inner_iid(6268),5):
    print(mr_trainset.to_raw_iid(i))

1343
3671
52
144
150


In [None]:
#Top 5 recommendations for an user

In [94]:
model = KNNWithMeans(k=20, sim_options={'name': 'cosine', 'user_based': True})
model.fit(mr_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1727ec40490>

In [106]:
mr.tail()

Unnamed: 0,user,item,rating
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5
100003,671,6565,3.5


In [101]:
for i in model.get_neighbors(mr_trainset.to_inner_uid(671),5):
    print(mr_trainset.to_raw_uid(i))

1
35
46
76
113


UDF:

Provide raw user id and item id

Finds 5 most similar not yet rated by the user

In [113]:
def NNbyUserItem(myuser,myitem):
    counter = 0;
    for i in model.get_neighbors(mr_trainset.to_inner_iid(myitem),len(mr)):
        if(len(mr.loc[(mr['item']==mr_trainset.to_raw_iid(i)) & (mr['user'] ==  myuser)])):
            if(counter < 5):
                print(mr_trainset.to_raw_iid(i))
                counter = counter+1 
            else:
                break

In [114]:
NNbyUserItem(671,6565)

1223
6268
6212
529
6269


## Model Based Recommendation Systems

In [115]:
from surprise import SVD

In [117]:
model = SVD(n_factors=20)

In [118]:
model.fit(mr_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1727ff01670>

In [119]:
model.predict(uid=1, iid=31)

Prediction(uid=1, iid=31, r_ui=None, est=2.3403933662796854, details={'was_impossible': False})

In [120]:
from surprise import NMF

In [127]:
model1 = NMF(n_factors=20, biased=True)

In [128]:
model1.fit(mr_trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1727ff01520>

In [129]:
model1.predict(uid=1, iid=31)

Prediction(uid=1, iid=31, r_ui=None, est=2.4482172508755493, details={'was_impossible': False})

In [130]:
kf = KFold(n_splits=3)

for trainset, testset in kf.split(mr_train):
    model.fit(trainset)
    predictions = model.test(testset)
    
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

RMSE: 0.8993
MAE:  0.6917
RMSE: 0.8972
MAE:  0.6910
RMSE: 0.8935
MAE:  0.6891


In [131]:
kf = KFold(n_splits=3)

for trainset, testset in kf.split(mr_train):
    model1.fit(trainset)
    predictions = model1.test(testset)
    
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

RMSE: 1.4008
MAE:  1.0774
RMSE: 1.2290
MAE:  0.9261
RMSE: 1.3021
MAE:  0.9907


In [132]:
algo = SVD

In [134]:
param_grid = {'n_factors':[15,20,30,40],
             'n_epochs':[20,25],
             'lr_all':[0.005,0.010]}

In [135]:
grid_search = GridSearchCV(algo, param_grid=param_grid, measures=['rmse','mae'])

In [136]:
grid_search.fit(mr_train)

In [137]:
print(grid_search.best_params['rmse'])
print(grid_search.best_score['rmse'])

{'n_factors': 15, 'n_epochs': 25, 'lr_all': 0.005}
0.8917043350075217


In [138]:
print(grid_search.best_params['mae'])
print(grid_search.best_score['mae'])

{'n_factors': 15, 'n_epochs': 25, 'lr_all': 0.005}
0.6858481579330711


In [139]:
final_model = SVD(n_factors=15, n_epochs=25, lr_all=0.005)

In [140]:
final_model.fit(mr_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1727ec40700>

In [141]:
final_model.predict(uid=1, iid=31)

Prediction(uid=1, iid=31, r_ui=None, est=2.3187145247993017, details={'was_impossible': False})