In [1]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly

import random
import numpy as np

from pathlib import Path

from surprise.model_selection import train_test_split

from surprise import KNNWithMeans  #Memory based, Collaborative Based Filtering
from surprise import SVD  #Model Based, Matrix Factorization

from surprise.model_selection import GridSearchCV

from surprise import accuracy

In [2]:
# Loads the builtin Movielens-100k data
data = Dataset.load_builtin("ml-100k")

raw_ratings = data.raw_ratings
len(raw_ratings)

100000

In [3]:

# A = 75% of the data, B = 25% of the data
threshold = int(.75 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

data.raw_ratings = A_raw_ratings  # data is now the set A

In [4]:
#Memory based, Collaborative-based filtering
sim_options = {
    "name": ["cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

gs_knn = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=5)


In [5]:
gs_knn.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [6]:
print(gs_knn.best_score["rmse"])
print(gs_knn.best_params["rmse"])


0.9559091173903586
{'sim_options': {'name': 'cosine', 'min_support': 3, 'user_based': False}}


Centered-KNN algorithm works best if you go with item-based approach and use cosine similarity as the similarity metric with minimum support 3.
Cosine considers common and absolute ratings.
However some people like to rate high, even they do not like the item very much. However, some people tend to rate low, even they like the items very much. The traditional cosine similarity does not account for the preference of the user’s rating.)

In [7]:
# We can now use the algorithm that yields the best rmse:
algo_knn = gs_knn.best_estimator['rmse']

trainset = data.build_full_trainset()

testset1 = data.construct_testset(B_raw_ratings)  # testset is now the set B


In [8]:

algo_knn.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x287792e4278>

In [9]:

predictions = algo_knn.test(testset1)


In [10]:
print('Unbiased accuracy on B,', end=' ')
# Compute RMSE
accuracy.rmse(predictions)
# Compute MSE
accuracy.mse(predictions)
# Compute MAE
accuracy.mae(predictions)

Unbiased accuracy on B, RMSE: 0.9402
MSE: 0.8840
MAE:  0.7419


0.7418984725084533

In [11]:
#Model based, Matrix Factorization, SVD

param_grid = {
    "n_epochs": [5, 10, 15],
    "lr_all": [0.002, 0.005, 0.008],
    "reg_all": [0.2, 0.4, 0.6]
}
gs_svd = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)


In [12]:
gs_svd.fit(data) #data is A raw ratings


In [13]:
print(gs_svd.best_score["rmse"])
print(gs_svd.best_params["rmse"])


0.9525474875952877
{'n_epochs': 15, 'lr_all': 0.008, 'reg_all': 0.2}


In [14]:
# We can now use the algorithm that yields the best rmse:
algo_svd = gs_svd.best_estimator['rmse']

trainset = data.build_full_trainset()
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B


In [15]:

algo_svd.fit(trainset)  # retrain on the whole set A


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2877b62cf98>

In [22]:
predictions_B = algo_svd.test(testset)


In [23]:
# Compute unbiased accuracy on B
print('Unbiased accuracy on B,', end=' ')
# Compute RMSE
accuracy.rmse(predictions_B)
# Compute MSE
accuracy.mse(predictions_B)
# Compute MAE
accuracy.mae(predictions_B)

Unbiased accuracy on B, RMSE: 0.9428
MSE: 0.8889
MAE:  0.7535


0.7534864095281624