In [1]:
import os
import pandas as pd
import numpy as np
import surprise

In [2]:
data_dir=r"C:\Users\fabi\Desktop\DONO\Manipal_Deloitte\Recommendation System\ml-latest-small"

In [3]:
os.chdir(data_dir)

In [4]:
mr=pd.read_csv("ratings.csv")
mr.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


I will get rid of the timestamp column. I will rename these columns to the names that the surprise expects.

In [5]:
mr.drop('timestamp',axis=1,inplace=True)
mr.rename(columns={'userId':'user','movieId':'item','rating':'rating'},inplace=True)

Then I’ll create a reader object. I’ll specify the line format, first column is the column of user IDs, second column is the column of item IDs and the third column is the column of ratings and the ratings are on a scale of 1 to 5. 

In [6]:
# user, item, rating on scale of 1 to 5
reader=surprise.dataset.Reader(line_format='user item rating', rating_scale=(1,5))

Now I will create a training object. From that I will create a train set. 

In [7]:
mr_train=surprise.dataset.Dataset.load_from_df(mr,reader=reader)
mr_trainset=mr_train.build_full_trainset()

Now from surprise I will import a class called SVD. This will help me in creating model based recommenders using Singular Value Decomposition. Let me import this.

In [8]:
## Create a neighbourhood based user and item based collaborative filtering model
from surprise import SVD

Now let’s say I want to factorize my user item matrix into 20 factors. So hence I will supply a value of 20 here. And I will use the fit method of this model object that I just created to train my model. 

In [9]:
model=SVD(n_factors=20)

In [10]:
model.fit(mr_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c95e92edd8>

Now let’s take a look at our raw data once again. 

In [11]:
mr.head()

Unnamed: 0,user,item,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


Assume I want to make prediction for user whose raw ID is 1 and item ID is 31. Then based on the SVD model, the prediction comes out to be 2.3, now you can notice that this prediction is quite near the actual one.

In [12]:
model.predict(uid=1,iid=31,r_ui=2.5)

Prediction(uid=1, iid=31, r_ui=2.5, est=2.3679850924710713, details={'was_impossible': False})

Let’s try and build a matrix factorization model based on Non Negative Matrix Factorization. I’ll import the NMF class from surprise module. I am taking 20 factors here and I am training my model. 

In [13]:
from surprise import NMF

In [27]:
model1=NMF(n_factors=20,biased=True) 
# Whether to factor in baseline effect

In [28]:
model1.fit(mr_trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1c968918080>

Let’s make a prediction for user ID 1 with item ID 31 and according to NMF the prediction is around 1. So it seems like at least for this user the SVD’s model is much more accurate.

In [30]:
model1.predict(uid=1,iid=31,r_ui=2.5)

Prediction(uid=1, iid=31, r_ui=2.5, est=1, details={'was_impossible': False})

Now I can compare the performance of both these models with the current value of hyperparameters by splitting my data into 3 folds as we'd done earlier.

In [31]:
from surprise.model_selection import KFold
from surprise import accuracy

In [32]:
## Evaluate Model performance at current values of hyperparameters
# mr_train.split(n_folds=3)
# surprise.evaluate(SVD(n_factors=20),mr_train)

kf = KFold(n_splits=3)


model=SVD(n_factors=20)

for trainset, testset in kf.split(mr_train):
    model.fit(trainset)
    predictions = model.test(testset)
    
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)


RMSE: 0.8983
MAE:  0.6933
RMSE: 0.8946
MAE:  0.6896
RMSE: 0.8978
MAE:  0.6908


In [33]:
# mr_train.split(n_folds=3)
# surprise.evaluate(NMF(n_factors=20,biased=True),mr_train)

kf = KFold(n_splits=3)


model1=NMF(n_factors=20,biased=True)

for trainset, testset in kf.split(mr_train):
    model1.fit(trainset)
    predictions = model1.test(testset)
    
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)


RMSE: 1.2383
MAE:  0.9367
RMSE: 1.2970
MAE:  0.9807
RMSE: 1.4230
MAE:  1.0964


Now clearly if we just consider 20 factors, then SVD seems to be a better model compared to NMF as you can see the Root Mean Squared Error and Mean Absolute Error is comparatively high.

Now let’s see how we can do grid search. Here I am defining the grid only on the parameter number of factors and I will use the SVD algorithm. 

And I will use a GridSearch method, supply the algorithm I am using which is SVD, supply the parameter grid. Let’s say I want to report the GridSearch results based on Root Mean Squared Error and Mean Absolute Error. 

In [34]:
## Doing grid search for SVD model on number of factors
param_grid={"n_factors":[15,20,25,30]}
algo=SVD

In [35]:
from surprise.model_selection import GridSearchCV

In [36]:
# grid_search=surprise.GridSearch(algo,param_grid=param_grid,measures=["RMSE","MAE"])

grid_search = GridSearchCV(algo,param_grid=param_grid, measures=['rmse', 'mae'])

Now let me run the fit method on my training dataset. Now this grid search procedure will take some time to run. 

In [37]:
grid_search.fit(mr_train)

Once this is run let us see which combination of hyperparameter which in this case is the number of factors yields the most accurate results. 

That I can figure out by using the best parameters method. And let’s check what were the best parameters based on Root Mean Squared Error and Mean Absolute Error.


In [38]:
print(grid_search.best_params['rmse'])
print(grid_search.best_params['mae'])

{'n_factors': 15}
{'n_factors': 15}


 So the best parameters turns out to be when we consider 20 factors by rmse and 20 factors by MAE. 
 Let’s see what is a best score for Root Mean Squared Error and Mean Absolute Error ? That turns out to be 0.89 and 0.68.


In [39]:
print(grid_search.best_score['rmse'])
print(grid_search.best_score['mae'])

0.8915639762561962
0.6871454049941618
