In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-movies-dataset/ratings.csv
/kaggle/input/the-movies-dataset/links_small.csv
/kaggle/input/the-movies-dataset/credits.csv
/kaggle/input/the-movies-dataset/keywords.csv
/kaggle/input/the-movies-dataset/movies_metadata.csv
/kaggle/input/the-movies-dataset/ratings_small.csv
/kaggle/input/the-movies-dataset/links.csv


In [5]:
from surprise import Dataset 
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV,RandomizedSearchCV
from surprise.model_selection import cross_validate

In [7]:
data = pd.read_csv("/kaggle/input/the-movies-dataset/ratings_small.csv")

In [8]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Loading the dataset in surprise format

In [9]:
reader = Reader()
ratings = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

In [10]:
train_ratings, test_ratings = train_test_split(ratings, test_size=.20, random_state = 42)
print("Size of trainset: ", train_ratings.n_ratings)
print("Size of testset: ", len(test_ratings))

Size of trainset:  80003
Size of testset:  20001


### (C).1 PMF MAE and RMSE

In [8]:
from surprise import SVD

In [11]:
svd_model = SVD(biased=False,random_state = 42) # Biased=False means PMF
svd_model.fit(train_ratings)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7cbe4d605750>

In [12]:
train_predictions = svd_model.test(train_ratings.build_testset())
test_predictions = svd_model.test(test_ratings)
print("RMSE on training data : ", accuracy.rmse(train_predictions, verbose = False))
print("RMSE on test data: ", accuracy.rmse(test_predictions, verbose = False))

RMSE on training data :  0.575208261615619
RMSE on test data:  1.0096286778951746


In [13]:
print("MAE on training data : ", accuracy.mae(train_predictions, verbose = False))
print("MAE on test data: ", accuracy.mae(test_predictions, verbose = False))

MAE on training data :  0.4450931710223031
MAE on test data:  0.7787860957821384


In [16]:
cross_validate(svd_model, ratings, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0022  1.0100  1.0095  1.0067  1.0138  1.0084  0.0038  
MAE (testset)     0.7739  0.7795  0.7771  0.7774  0.7814  0.7778  0.0025  
Fit time          1.74    1.54    1.72    1.77    1.76    1.71    0.08    
Test time         0.65    0.19    0.19    0.19    0.18    0.28    0.18    


{'test_rmse': array([1.00221783, 1.00997377, 1.00954269, 1.00665107, 1.01377914]),
 'test_mae': array([0.77386821, 0.77952794, 0.77707411, 0.77735549, 0.78136993]),
 'fit_time': (1.7430894374847412,
  1.5424432754516602,
  1.719308614730835,
  1.7742862701416016,
  1.7565340995788574),
 'test_time': (0.6452898979187012,
  0.18867135047912598,
  0.19130635261535645,
  0.18657922744750977,
  0.18463873863220215)}

### (C)2. User-based Collaborative Filtering MAE and RMSE

In [11]:
from surprise import KNNBasic

In [18]:
knn_model = KNNBasic(random_state = 42,verbose = False)

In [19]:
cross_validate(knn_model, ratings, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9720  0.9657  0.9684  0.9679  0.9712  0.9690  0.0023  
MAE (testset)     0.7498  0.7394  0.7452  0.7434  0.7466  0.7449  0.0034  
Fit time          0.15    0.20    0.19    0.21    0.19    0.19    0.02    
Test time         3.06    2.56    2.61    2.54    2.52    2.66    0.20    


{'test_rmse': array([0.97195403, 0.96569437, 0.9683536 , 0.96791959, 0.97115785]),
 'test_mae': array([0.74984351, 0.73944658, 0.74520255, 0.74343044, 0.74656799]),
 'fit_time': (0.14653420448303223,
  0.1995856761932373,
  0.19376587867736816,
  0.21204590797424316,
  0.18881797790527344),
 'test_time': (3.0585620403289795,
  2.5612099170684814,
  2.6099166870117188,
  2.5424280166625977,
  2.523038387298584)}

### (C).3 Item Based Collaborative Filtering RMSE and MAE

In [20]:
knn_model_item_based = KNNBasic(user_based = False, random_state = 42)

In [21]:
cross_validate(knn_model, ratings, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9734  0.9640  0.9692  0.9655  0.9652  0.9675  0.0034  
MAE (testset)     0.7476  0.7406  0.7455  0.7387  0.7441  0.7433  0.0032  
Fit time          0.17    0.20    0.20    0.18    0.19    0.19    0.01    
Test time         2.56    3.03    2.61    2.48    2.57    2.65    0.19    


{'test_rmse': array([0.97339892, 0.96395119, 0.96921207, 0.9655386 , 0.96518681]),
 'test_mae': array([0.74757439, 0.7405524 , 0.74545405, 0.73873255, 0.74405933]),
 'fit_time': (0.1657402515411377,
  0.196458101272583,
  0.2014293670654297,
  0.18140912055969238,
  0.18596553802490234),
 'test_time': (2.5579440593719482,
  3.025379180908203,
  2.6063246726989746,
  2.4840188026428223,
  2.5688834190368652)}

### (e)1 Cosine User Based and Item based

In [22]:
sim_options = {
    "name": "cosine",
    "user_based": True
}

In [25]:
knn_model_cosine_user = KNNBasic(sim_options=sim_options,random_state = 42,verbose = False)

In [26]:
cross_validate(knn_model_cosine_user, ratings, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9968  0.9899  0.9809  0.9986  1.0023  0.9937  0.0076  
MAE (testset)     0.7690  0.7653  0.7582  0.7719  0.7756  0.7680  0.0060  
Fit time          0.24    0.27    0.24    0.28    0.25    0.26    0.02    
Test time         2.99    2.57    2.48    2.39    2.59    2.60    0.20    


{'test_rmse': array([0.99683178, 0.98988385, 0.9808627 , 0.99856261, 1.00231748]),
 'test_mae': array([0.76899356, 0.76527327, 0.75819417, 0.77185567, 0.77559942]),
 'fit_time': (0.23797011375427246,
  0.27268338203430176,
  0.24173927307128906,
  0.27557849884033203,
  0.2548999786376953),
 'test_time': (2.9854249954223633,
  2.5654666423797607,
  2.4807028770446777,
  2.388105869293213,
  2.591151475906372)}

In [27]:
sim_options = {
    "name": "cosine",
    "user_based": False #Item based
}

In [28]:
knn_model_cosine_item = KNNBasic(sim_options=sim_options,random_state = 42,verbose = False)

In [31]:
cross_validate(knn_model_cosine_item, ratings, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9905  0.9878  1.0000  0.9951  1.0012  0.9949  0.0052  
MAE (testset)     0.7712  0.7710  0.7783  0.7736  0.7798  0.7748  0.0036  
Fit time          8.15    7.64    7.49    7.11    7.49    7.58    0.34    
Test time         11.28   11.55   12.11   11.43   11.63   11.60   0.28    


{'test_rmse': array([0.99048372, 0.987793  , 0.99996064, 0.99510812, 1.00117253]),
 'test_mae': array([0.77124038, 0.77104059, 0.77834485, 0.77364099, 0.77983882]),
 'fit_time': (8.151725769042969,
  7.644961595535278,
  7.487639427185059,
  7.11131477355957,
  7.485038995742798),
 'test_time': (11.278547525405884,
  11.546887636184692,
  12.108307123184204,
  11.433055877685547,
  11.629339933395386)}

### (e)2 MSD User based and Item based
Default is MSD (Refer to the first rmse and mae calculated)

### (e)3 Pearson User based and Item based

In [30]:
sim_options = {
    "name": "pearson",
    "user_based": True
}

In [32]:
knn_model_pearson_user = KNNBasic(sim_options=sim_options,random_state = 42,verbose = False)

In [33]:
cross_validate(knn_model_pearson_user, ratings, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9893  0.9993  1.0117  0.9993  0.9920  0.9983  0.0077  
MAE (testset)     0.7662  0.7740  0.7812  0.7745  0.7687  0.7729  0.0052  
Fit time          0.26    0.33    0.35    0.33    0.36    0.33    0.03    
Test time         2.46    2.50    2.47    3.03    2.51    2.59    0.22    


{'test_rmse': array([0.989343  , 0.9993338 , 1.01166687, 0.99925396, 0.9920366 ]),
 'test_mae': array([0.76617186, 0.77402426, 0.78121801, 0.77451258, 0.76872336]),
 'fit_time': (0.2631053924560547,
  0.32567858695983887,
  0.34678077697753906,
  0.3256552219390869,
  0.36463308334350586),
 'test_time': (2.4644219875335693,
  2.4976541996002197,
  2.4699761867523193,
  3.0347540378570557,
  2.505429983139038)}

In [34]:
sim_options = {
    "name": "pearson",
    "user_based": False
}

In [35]:
knn_model_pearson_item = KNNBasic(sim_options=sim_options,random_state = 42,verbose = False)

In [36]:
cross_validate(knn_model_pearson_item, ratings, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9880  0.9868  0.9890  0.9877  0.9955  0.9894  0.0031  
MAE (testset)     0.7639  0.7681  0.7669  0.7678  0.7708  0.7675  0.0022  
Fit time          10.01   10.09   9.25    9.99    10.22   9.91    0.34    
Test time         11.25   11.50   11.30   11.79   12.30   11.63   0.39    


{'test_rmse': array([0.98802157, 0.98679636, 0.98903777, 0.98771929, 0.99552643]),
 'test_mae': array([0.76389313, 0.76812217, 0.76690409, 0.76783853, 0.7707514 ]),
 'fit_time': (10.011876344680786,
  10.092943906784058,
  9.24608325958252,
  9.98780632019043,
  10.218714475631714),
 'test_time': (11.253539800643921,
  11.498635530471802,
  11.298888683319092,
  11.789819955825806,
  12.301005363464355)}

### (f)1 Neighbors impact User based

In [13]:
param_grid = {'k': list(range(10,45,5))}
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv = 5)
gs.fit(ratings)
gs.best_params['rmse']

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

{'k': 15}

In [14]:
gs.best_score

{'rmse': 0.9626090724468354}

In [18]:
results_df_gs_user_based = pd.DataFrame.from_dict(gs.cv_results)

In [19]:
results_df_gs_user_based

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k
0,0.969546,0.967448,0.978888,0.956724,0.955182,0.965557,0.00875,4,0.171503,0.003951,2.083663,0.147162,{'k': 10},10
1,0.966788,0.965254,0.97438,0.954305,0.952319,0.962609,0.00822,1,0.176233,0.015565,2.236153,0.197594,{'k': 15},15
2,0.967181,0.964941,0.974046,0.95514,0.953041,0.96287,0.007799,2,0.173187,0.006141,2.312648,0.183911,{'k': 20},20
3,0.968437,0.966528,0.974365,0.95644,0.954599,0.964074,0.00747,3,0.17088,0.008208,2.404888,0.162342,{'k': 25},25
4,0.970506,0.968029,0.975667,0.958354,0.956129,0.965737,0.007395,5,0.172327,0.001714,2.473793,0.146396,{'k': 30},30
5,0.971887,0.969313,0.976893,0.960178,0.957468,0.967148,0.007271,6,0.174561,0.010844,2.549596,0.156783,{'k': 35},35
6,0.973053,0.970589,0.977785,0.961168,0.959136,0.968346,0.007108,7,0.170229,0.006363,2.589124,0.140103,{'k': 40},40


### (f)2 Neighbors impact Item based

In [21]:
param_grid = {'k': list(range(10,45,5)),
'sim_options': {
        'user_based': [False]  #item based
    }}
gs_item = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv = 5)
gs_item.fit(ratings)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [22]:
gs_item.best_params['rmse']

{'k': 40, 'sim_options': {'user_based': False}}

In [24]:
results_df_gs_item_based = pd.DataFrame.from_dict(gs_item.cv_results)

In [25]:
results_df_gs_item_based

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,0.973628,0.968106,0.978091,0.974159,0.977059,0.974209,0.003486,7,4.716217,0.145512,10.149672,0.446275,"{'k': 10, 'sim_options': {'user_based': False}}",10,{'user_based': False}
1,0.957202,0.946576,0.960946,0.957193,0.959489,0.956281,0.005058,6,4.682695,0.024247,10.498348,0.144783,"{'k': 15, 'sim_options': {'user_based': False}}",15,{'user_based': False}
2,0.947599,0.93763,0.95205,0.949083,0.950437,0.94736,0.005082,5,4.717436,0.046291,10.942802,0.28831,"{'k': 20, 'sim_options': {'user_based': False}}",20,{'user_based': False}
3,0.941651,0.931617,0.946807,0.943402,0.944679,0.941631,0.005282,4,4.695482,0.040152,11.209648,0.05127,"{'k': 25, 'sim_options': {'user_based': False}}",25,{'user_based': False}
4,0.938916,0.92786,0.942054,0.940671,0.940816,0.938063,0.005199,3,4.694311,0.025878,11.721492,0.224753,"{'k': 30, 'sim_options': {'user_based': False}}",30,{'user_based': False}
5,0.936778,0.926255,0.939771,0.93922,0.939201,0.936245,0.005101,2,4.727239,0.030481,12.115991,0.287659,"{'k': 35, 'sim_options': {'user_based': False}}",35,{'user_based': False}
6,0.93544,0.924845,0.938501,0.937953,0.937974,0.934943,0.00516,1,4.730197,0.046266,12.200934,0.036658,"{'k': 40, 'sim_options': {'user_based': False}}",40,{'user_based': False}
