In [1]:
!pip install scikit-surprise



In [3]:
import pandas as pd
import numpy as np
import surprise
import os

In [15]:
os.chdir(r"C:\Users\dai\Desktop\S-AI\MachineLearning\Day14\ml-100k")
ratings = pd.read_csv("u.data",sep='\t',names = ['uid','iid','rating','ts'])
ratings.drop('ts',axis=1,inplace=True)
ratings.columns


Index(['uid', 'iid', 'rating'], dtype='object')

In [17]:
ratings.head()

Unnamed: 0,uid,iid,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [19]:
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
print("Ratings range between {0} and {1}".format(lowest_rating,highest_rating))

Ratings range between 1 and 5


In [21]:
reader = surprise.Reader(rating_scale = (lowest_rating,highest_rating))
data = surprise.Dataset.load_from_df(ratings,reader)
type(data)

surprise.dataset.DatasetAutoFolds

Tuning for best parameters

In [24]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import KFold

param_grid = {'lr_all':np.linspace(0.001,1,3), 'reg_all': np.linspace(0.01,0.8,3),
              'n_factors':[40,30]}
kfold = KFold(random_state=23,n_splits=5,shuffle=True)
gs = GridSearchCV(surprise.SVD, param_grid,joblib_verbose =3, measures=['rmse', 'mae'],
                  cv=kfold,n_jobs=-1)
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   13.4s finished


Best Score:

In [26]:
print(gs.best_score['rmse'])

0.9603163429257175


Best Parameter:

In [28]:
print(gs.best_params['rmse'])

{'lr_all': 0.001, 'reg_all': 0.01, 'n_factors': 30}


We can now use the algorithm that yields the best rmse:

In [30]:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1e759ec8800>

The recommendations can be generated for any user with the object **algo**.

Total Items:

In [33]:
iids = ratings['iid'].unique()
print(iids)

[ 242  302  377 ... 1637 1630 1641]


The list of items rated by user 50:

In [35]:
u_iid = ratings[ratings['uid']==50]['iid'].unique()
print(u_iid)

[ 246  823  253  475 1084  286    9  125  123  325  508  288  319  324
  276 1008 1010  268  544   15  327  124  547  100]


List of the items not rated by user 50:

In [37]:
iids_to_predict = np.setdiff1d(iids, u_iid)
print(iids_to_predict)

[   1    2    3 ... 1680 1681 1682]


Extracting the estimated rating from iids_to_predict

In [39]:
testset = [[50,iid,0.] for iid in iids_to_predict]
predictions = algo.test(testset)

In [40]:
predictions[0]

Prediction(uid=50, iid=1, r_ui=0.0, est=3.948418051682834, details={'was_impossible': False})

In [41]:
pred_ratings = np.array([pred.est for pred in predictions])

Getting the item with highest expected rating

In [43]:
i_max = np.argmax(pred_ratings)
print("Item:",iids_to_predict[i_max],"is the item with highest expected rating as",pred_ratings[i_max])

Item: 483 is the item with highest expected rating as 4.553533110034638


In [44]:
exp_ratings = pd.DataFrame({'Item_ID': iids_to_predict, 'Exp_Rating':pred_ratings})
exp_ratings.sort_values(by=['Exp_Rating','Item_ID'], ascending=[False, True], inplace=True)
exp_ratings.head(10)

Unnamed: 0,Item_ID,Exp_Rating
465,483,4.553533
305,318,4.474209
340,357,4.469616
162,169,4.462123
391,408,4.454425
61,64,4.442262
582,603,4.430074
10,12,4.428469
95,98,4.404656
462,480,4.393576


In [64]:
movies = pd.read_csv('movies_list.csv',encoding = 'latin-1')


In [68]:
movies.columns


Index(['movie id ', ' movie title ', ' release date ', ' video release date ',
       'IMDb URL ', ' unknown ', ' Action ', ' Adventure ', ' Animation ',
       'Children's ', ' Comedy ', ' Crime ', ' Documentary ', ' Drama ',
       ' Fantasy ', 'Film-Noir ', ' Horror ', ' Musical ', ' Mystery ',
       ' Romance ', ' Sci-Fi ', 'Thriller ', ' War ', ' Western'],
      dtype='object')

In [72]:
movies = movies[['movie id ', ' movie title ']].rename(columns={'movie id ':'Item_ID'})

In [74]:
m_names = movies.merge(exp_ratings, left_on='Item_ID', right_on='Item_ID').sort_values(by=['Exp_Rating','Item_ID'], ascending=[False, True])
m_names.head(5)

Unnamed: 0,Item_ID,movie title,Exp_Rating
465,483,Casablanca (1942),4.553533
305,318,Schindler's List (1993),4.474209
340,357,One Flew Over the Cuckoo's Nest (1975),4.469616
162,169,"Wrong Trousers, The (1993)",4.462123
391,408,"Close Shave, A (1995)",4.454425
