In [1]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 280kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617557 sha256=8862cb5cb9b67bcaeca209f92ab1a13e26a627b1c38fd8fdd877764f5f162d5f
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [6]:
import pandas as pd

In [7]:
rating_data = pd.read_csv("drive/My Drive/colab notebooks/ratings.csv")

In [8]:
rating_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [9]:
rating_data = rating_data.drop("timestamp", axis=1)
rating_data

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
from surprise import Dataset, Reader, KNNBasic,accuracy
from surprise.model_selection import cross_validate

In [10]:
reader = Reader(rating_scale= (1,5))
data = Dataset.load_from_df(rating_data[['userId', 'movieId', 'rating']], reader = reader)

## User based similarity algorithm

In [13]:
item_based_cosine_similarity = {
    'name' : 'pearson',
    'user_based' : True # True for user-based and False for item-based
}

knn = KNNBasic(k = 15,
               min_k = 5,
               sim_options = item_based_cosine_similarity)

In [14]:
from surprise.model_selection import cross_validate

cv_results = cross_validate(knn,
                            data,
                            measures = ['rmse', 'mae'],
                            cv = 5,
                            verbose = False)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [16]:
import numpy as np

In [17]:
np.mean(cv_results.get('test_rmse'))

0.9736660684287495

## Find the best model

In [18]:
from surprise.model_selection.search import GridSearchCV

In [19]:
param_grids = {'k': [10,20],
               'sim_options': {'name': ['cosine', 'pearson'], 'user_based': [True, False]}}

grid_cv = GridSearchCV(KNNBasic,
                       param_grids,
                       measures = ['rmse'],
                       cv = 5,
                       refit = True)

grid_cv.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing

In [20]:
print(grid_cv.best_score['rmse'])

0.9746398506444216


In [21]:
results_data = pd.DataFrame.from_dict(grid_cv.cv_results)
results_data[['param_k', 'param_sim_options', 'mean_test_rmse', 'rank_test_rmse']]

Unnamed: 0,param_k,param_sim_options,mean_test_rmse,rank_test_rmse
0,10,"{'name': 'cosine', 'user_based': True}",0.987151,5
1,10,"{'name': 'cosine', 'user_based': False}",1.021622,8
2,10,"{'name': 'pearson', 'user_based': True}",0.986742,4
3,10,"{'name': 'pearson', 'user_based': False}",1.011665,7
4,20,"{'name': 'cosine', 'user_based': True}",0.97464,1
5,20,"{'name': 'cosine', 'user_based': False}",0.993847,6
6,20,"{'name': 'pearson', 'user_based': True}",0.976096,2
7,20,"{'name': 'pearson', 'user_based': False}",0.985559,3


In [22]:
grid_cv.predict(1,2)

Prediction(uid=1, iid=2, r_ui=None, est=3.7233653693643416, details={'actual_k': 20, 'was_impossible': False})