In [1]:
import pandas as pd
import numpy as np
import surprise
import os

ratings = pd.read_csv("C:/Users/User/Desktop/cdac advance modules/pml lecture/class  practice/program_dataset/ratings_small.csv")

In [2]:
ratings=ratings.drop('timestamp',axis=1)

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [4]:

lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()

print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))

Ratings range between 0.5 and 5.0


In [5]:
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))
data = surprise.Dataset.load_from_df(ratings,reader)

In [6]:
similarity_options = {'name': 'cosine', 'user_based': True}

In [11]:
# Default k = 40
algo = surprise.SVD()
output = algo.fit(data.build_full_trainset())

In [12]:
pred = algo.predict(uid='100',iid='6')
score = pred.est
print(score)

3.543608255669773


In [13]:
iids = ratings['movieId'].unique()

In [14]:
iids100 = ratings.loc[ratings['userId'] == 100 ,'movieId']
print("List of movieId that uid={0} has rated:".format(100))
print(iids100)

List of movieId that uid=100 has rated:
15273       1
15274       3
15275       6
15276       7
15277      25
15278      32
15279      52
15280      62
15281      86
15282      88
15283      95
15284     135
15285     141
15286     608
15287     648
15288     661
15289     708
15290     733
15291     736
15292     745
15293     780
15294     786
15295     802
15296    1073
15297    1356
Name: movieId, dtype: int64


In [15]:
iids_to_predict = np.setdiff1d(iids,iids100)
print("List of movieId which uid={0} did not rate(in all {1}) :".format(100,len(iids_to_predict)))
print(iids_to_predict)

List of movieId which uid=100 did not rate(in all 9041) :
[     2      4      5 ... 162542 162672 163949]


In [16]:
### ratings arbitrarily set to 0
testset = [[100,iid,0.] for iid in iids_to_predict]
predictions = algo.test(testset)
predictions[5]

Prediction(uid=100, iid=10, r_ui=0.0, est=3.4093366978736537, details={'was_impossible': False})

In [17]:
pred_ratings = np.array([pred.est for pred in predictions])

In [18]:
# Finding the index of maximum predicted rating
i_max = pred_ratings.argmax()

In [19]:
# Recommending the item with maximum predicted rating
iid_recommend_most = iids_to_predict[i_max] 
print("Top item to be recommended for user {0} is {1} with predicted rating as {2}".format(50,iid_recommend_most,pred_ratings[i_max]))

Top item to be recommended for user 50 is 858 with predicted rating as 4.474165418227534


In [20]:
# Getting top 10 items to be recommended for uid = 50
import heapq
i_sorted_10 = heapq.nlargest(10, 
                             range(len(pred_ratings)), 
                             pred_ratings.take)
top_10_items = iids_to_predict[i_sorted_10]
print(top_10_items)

[ 858  318  969   50 1228 4973 2064 1945 1212 6016]


In [21]:
############ Tuning ############

from surprise.model_selection import GridSearchCV
param_grid = {'k': np.arange(10,60,5)}

In [22]:
from surprise.model_selection.split import KFold
kfold = KFold(n_splits=5, random_state=2021, shuffle=True)
gs = GridSearchCV(surprise.SVD, param_grid, 
                  measures=['rmse', 'mae'], cv=kfold)

In [23]:
gs.fit(data)

TypeError: __init__() got an unexpected keyword argument 'k'

In [45]:
# best RMSE score
print(gs.best_score['rmse'])

0.9639033988203384


In [46]:
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

{'k': 15}


In [47]:
# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']

In [48]:
algo.fit(data.build_full_trainset())

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x23a43afb760>

In [49]:
######################################

pred = algo.predict(uid='66',iid='207')
score = pred.est
print(score)

3.543608255669773


In [52]:
iids = ratings['movieId'].unique()
iids50 = ratings.loc[ratings['userId'] == 100 ,'movieId']
print("List of iid that uid={0} has rated:".format(100))
print(iids50)

List of iid that uid=100 has rated:
15273       1
15274       3
15275       6
15276       7
15277      25
15278      32
15279      52
15280      62
15281      86
15282      88
15283      95
15284     135
15285     141
15286     608
15287     648
15288     661
15289     708
15290     733
15291     736
15292     745
15293     780
15294     786
15295     802
15296    1073
15297    1356
Name: movieId, dtype: int64


In [53]:
iids_to_predict = np.setdiff1d(iids,iids50)
print("List of iid which uid={0} did not rate(in all {1}) :".format(66,len(iids_to_predict)))
print(iids_to_predict)

List of iid which uid=66 did not rate(in all 9041) :
[     2      4      5 ... 162542 162672 163949]


In [54]:
### ratings arbitrarily set to 0
testset = [[66,iid,0.] for iid in iids_to_predict]
predictions = algo.test(testset)
predictions[5]

Prediction(uid=66, iid=10, r_ui=0.0, est=3.988561327862884, details={'actual_k': 15, 'was_impossible': False})

In [55]:
pred_ratings = np.array([pred.est for pred in predictions])

In [56]:
# Finding the index of maximum predicted rating
i_max = pred_ratings.argmax()

In [57]:
# Recommending the item with maximum predicted rating
iid_recommend_most = iids_to_predict[i_max] 
print("Top item to be recommended for user {0} is {1} with predicted rating as {2}".format(50,iid_recommend_most,pred_ratings[i_max]))

Top item to be recommended for user 50 is 53 with predicted rating as 5.0


In [58]:
# Getting top 10 items to be recommended for uid = 50
import heapq
i_sorted_10 = heapq.nlargest(10, range(len(pred_ratings)), pred_ratings.take)
top_10_items = iids_to_predict[i_sorted_10]
print(top_10_items)

[ 53 183 301 309 559 702 764 820 845 876]
