## Surprise

### Loading Data

In [1]:
PATH = "../data/recsys_data/all_tracks_ratings.csv"

In [2]:
import os

from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

# path to dataset file
file_path = os.path.expanduser(PATH)

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format="user item rating timestamp", sep="\t", rating_scale=(0, 10), skip_lines=1)

data = Dataset.load_from_file(file_path, reader=reader)


In [3]:
# Ref: https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b

import pandas as pd
from surprise import SVD, SVDpp, NMF, CoClustering, BaselineOnly

benchmark = []
names = ["SVD", "SVDpp", "NMF", "CoClustering", "BaselineOnly (ASL)", "BaselineOnly (SGD)"]
# Iterate over all algorithms
for algorithm, name in zip([SVD(), SVDpp(), NMF(), CoClustering(), BaselineOnly(verbose=False), BaselineOnly(verbose=False, bsl_options={"method": "sgd"})], names):
    # Perform cross validation
    results = cross_validate(algorithm, data, verbose=False, n_jobs=-1)

    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([name], index=['Algorithm'])])
    benchmark.append(tmp)

results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 
results

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVD,1.750855,1.539501,56.316595,5.760737
SVDpp,1.759045,1.508094,108.731924,15.542759
BaselineOnly (ASL),1.785014,1.621151,2.668497,3.54493
BaselineOnly (SGD),1.78668,1.606699,5.614651,3.611229
CoClustering,1.939653,1.632344,47.381435,3.384535
NMF,2.578905,2.203351,114.744027,4.356955


Best algorithms seem to be __SVD__, __SVDpp__ & __BaselineOnly__. We'll skip SVDpp because it takes too long.

Tuning hyperparameters for Singular Value Decomposition (SVD)

In [None]:
from surprise.model_selection.search import GridSearchCV, RandomizedSearchCV

param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.2, 0.4, 0.6]
}

search = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
search.fit(data)

print(f"Best score (RMSE): {search.best_score['rmse']}")
print(f"Best parameters:")
search.best_params


Tuning hyperparameters for Alternating Least Squares (ALS)

In [4]:
param_grid = {
    'method': 'als',
    'n_epochs': [10, 20, 30],
    'reg_u': list(range(10, 41, 5)),
    'reg_i': list(range(10, 41, 5)),
}

search = GridSearchCV(BaselineOnly, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
search.fit(data)

print(f"Best score (RMSE): {search.best_score['rmse']}")
print(f"Best parameters:")
search.best_params

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


(1.8253821459566428, 0.000502012168072076)

Tuning hyperparameters for Stochastic Gradient Descent (SGD)

In [55]:
bsl_options = {
    'method': 'sgd',
    'n_epochs': 100,
    'reg': 0.02,
    'learning_rate': 0.00001
}

algo = BaselineOnly(bsl_options=bsl_options)
cv_scores = cross_validate(algo, data, measures=[
                           'RMSE'], cv=5, n_jobs=-1, verbose=False)['test_rmse']

(cv_scores.mean(), cv_scores.std())


Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...


(0.08651354430561885, 8.363508998821686e-05)