In [None]:
conda install -c conda forge scikit-surprise

In [1]:
import pandas as pd
import os
import surprise

In [2]:
# To read a file using surprise, one needs to ensure that data is in specific format
# 1. Reading from df
# 2. Reading from a text file

# Reading data from a dataframe
df = pd.read_csv("sample_data.csv")
df

Unnamed: 0,user,rating,item
0,1,2,1
1,2,2,1
2,3,3,2
3,4,3,2
4,5,1,1


In [3]:
# Surprise expects data to have 3 cols - user, rating, item. The spelling of these col names should be as given.
# if col names are different, remember to change them before trying to read them using surprise.

# create reader object
reader = surprise.dataset.Reader(line_format='user rating item', rating_scale=(1,5))

In [4]:
# converting dataframe 
data = surprise.dataset.Dataset.load_from_df(df, reader=reader)

In [5]:
data.raw_ratings

[(1, 2, 1.0, None),
 (2, 2, 1.0, None),
 (3, 3, 2.0, None),
 (4, 3, 2.0, None),
 (5, 1, 1.0, None)]

In [7]:
# reading from text file
reader1 = surprise.dataset.Reader(line_format='user rating item', sep=",", rating_scale=(1,5), skip_lines=1)

In [8]:
data1 = surprise.dataset.Dataset.load_from_file("sample_data.csv", reader=reader1)

In [11]:
###################################################################################################################

In [9]:
data_dir = r"C:\Users\fabi\Desktop\DONO\Manipal_Deloitte\Recommendation System\ml-latest-small"
os.chdir(data_dir)

In [10]:
mr = pd.read_csv("ratings.csv")

In [12]:
mr.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [13]:
mr.drop('timestamp', axis=1, inplace=True)

In [14]:
mr.rename(columns={'userId':'user','movieId':'item','rating':'rating'}, inplace=True)

In [15]:
mr.head()

Unnamed: 0,user,item,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [16]:
reader = surprise.dataset.Reader(line_format='user item rating', rating_scale=(0,5))

In [17]:
mr_train = surprise.dataset.Dataset.load_from_df(mr, reader=reader)

In [19]:
mr_trainset = mr_train.build_full_trainset()

In [20]:
import surprise.prediction_algorithms.knns as knns

In [21]:
# user-based collaborative filtering model using cosine sim
knnbasic = knns.KNNBasic(k=40, min_k=1, sim_options={'name':'cosine', 'user_based':True})

In [22]:
knnbasic.fit(mr_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x214c5aaa100>

In [23]:
mr.head()

Unnamed: 0,user,item,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [24]:
knnbasic.predict(uid=1, iid=31, r_ui=2.5)

Prediction(uid=1, iid=31, r_ui=2.5, est=3.1834796860227086, details={'actual_k': 40, 'was_impossible': False})

In [25]:
# item based model using cosine sim
knn_item = knns.KNNBasic(k=40, min_k=1, sim_options={'name':'cosine', 'user_based':False})

In [26]:
knn_item.fit(mr_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x214c560e0a0>

In [27]:
knn_item.predict(uid=1, iid=31)

Prediction(uid=1, iid=31, r_ui=None, est=2.547471538910294, details={'actual_k': 20, 'was_impossible': False})

In [28]:
# item based model using correlation metric with avg effects
knn_means = knns.KNNWithMeans(k=40, min_k=1, sim_options={'name':'pearson','user_based':False})

In [29]:
knn_means.fit(mr_trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x214c7062460>

In [30]:
knn_means.predict(uid=1, iid=31, r_ui=2.5)

Prediction(uid=1, iid=31, r_ui=2.5, est=2.18133813941489, details={'actual_k': 17, 'was_impossible': False})

In [31]:
from surprise.model_selection import KFold
from surprise import accuracy

In [32]:
# splitting data into 3 folds and evaluating model performance out of sample
kf = KFold(n_splits=3)

In [33]:
for trainset, testset in kf.split(mr_train):
    knn_item.fit(trainset)
    predictions = knn_item.test(testset)
    
    #evaluation
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9966
MAE:  0.7741
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9961
MAE:  0.7734
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9922
MAE:  0.7730


In [34]:
knnwithmeans = knns.KNNWithMeans(k=40, sim_options={'name':'cosine','user_based':False})

In [35]:
for trainset, testset in kf.split(mr_train):
    knnwithmeans.fit(trainset)
    predictions = knnwithmeans.test(testset)
    
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9307
MAE:  0.7125
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9378
MAE:  0.7173
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9353
MAE:  0.7183


In [36]:
# creating search grid
param_grid = {'k':[10,20], 'sim_options':{'name':['msd','cosine'], 'user_based':[False]}}

In [37]:
algo = knns.KNNWithMeans

In [38]:
from surprise.model_selection import GridSearchCV

In [39]:
grid_search = GridSearchCV(algo, param_grid=param_grid, measures=['RMSE','MAE'])

In [40]:
grid_search.fit(mr_train)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix.

In [41]:
print(grid_search.best_params['rmse'])

{'k': 20, 'sim_options': {'name': 'msd', 'user_based': False}}


In [42]:
print(grid_search.best_params['mae'])

{'k': 20, 'sim_options': {'name': 'msd', 'user_based': False}}


In [43]:
print(grid_search.best_score['rmse'])

0.9238265125824545


In [44]:
print(grid_search.best_score['mae'])

0.7083711411956226


In [46]:
model = knns.KNNWithMeans(k=20, sim_options={'name':'msd','user_based':False})
model.fit(mr_trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x214c64122b0>

In [47]:
mr.head()

Unnamed: 0,user,item,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [48]:
mr_trainset.to_inner_iid(1061)

2

In [49]:
model.get_neighbors(mr_trainset.to_inner_iid(1061), 5)

[51, 80, 95, 269, 292]

In [50]:
for i in [51, 80, 95, 269, 292]:
    print(mr_trainset.to_raw_iid(i))

314
537
720
2348
2867


In [51]:
model = knns.KNNWithMeans(k=20, sim_options={'name':'msd','user_based':True})
model.fit(mr_trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x214c97680a0>

In [52]:
mr_trainset.to_inner_uid(1)

0

In [53]:
model.get_neighbors(mr_trainset.to_inner_uid(1), 5)

[8, 32, 67, 95, 98]

In [54]:
for i in [8, 32, 67, 95, 98]:
    print(mr_trainset.to_raw_uid(i))

9
33
68
96
99
