## Surprise

### Loading Data

In [1]:
# Relative path to the ratings file consumed by the Surprise loader.
# NOTE(review): the extension is .csv but the reader in the next cell parses it
# as tab-separated — confirm the file really is tab-delimited.
PATH = "../data/recsys_data/all_tracks_ratings.csv"

In [2]:
import os
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import SVD, SVDpp, NMF, CoClustering, BaselineOnly

# Describe the layout of the custom ratings file: every line carries the
# fields "user item rating timestamp" separated by tab characters, ratings
# are declared to lie on a 1-5 scale, and the first line is skipped
# (presumably a header row — confirm against the file).
reader = Reader(
    line_format="user item rating timestamp",
    sep="\t",
    rating_scale=(1, 5),
    skip_lines=1,
)

# Expand a leading "~" in the configured dataset path, if there is one.
file_path = os.path.expanduser(PATH)

data = Dataset.load_from_file(file_path, reader=reader)


In [22]:
# Ref: https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b

# Candidate algorithms, all with default hyperparameters. Label and instance
# are kept together in one mapping so they cannot drift out of sync.
algorithms = {
    "SVD": SVD(),
    "SVDpp": SVDpp(),
    "NMF": NMF(),
    "CoClustering": CoClustering(),
    "BaselineOnly (ALS)": BaselineOnly(verbose=False),
    "BaselineOnly (SGD)": BaselineOnly(verbose=False, bsl_options={"method": "sgd"}),
}
names = list(algorithms)

benchmark = []
for name, algorithm in algorithms.items():
    # Cross-validate with the library's default fold count, using every core.
    cv_results = cross_validate(algorithm, data, verbose=False, n_jobs=-1)

    # Average each metric/timing over the folds and tag the row with its label.
    fold_means = pd.DataFrame.from_dict(cv_results).mean(axis=0)
    benchmark.append({"Algorithm": name, **fold_means.to_dict()})

# One row per algorithm, lowest test RMSE first.
results = pd.DataFrame(benchmark).set_index("Algorithm").sort_values("test_rmse")
results

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVDpp,1.50375,1.288133,50.628867,9.211165
SVD,1.5095,1.316164,26.605718,4.056807
BaselineOnly (SGD),1.516556,1.324447,2.459481,2.841568
BaselineOnly (ASL),1.516728,1.336572,1.001729,2.02112
CoClustering,1.667045,1.376692,20.455109,2.043114
NMF,1.696881,1.361316,51.674445,2.350688


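Ranked by test RMSE, the best algorithms are __SVDpp__, __SVD__ and __BaselineOnly__ (both the SGD and ALS variants); their scores are close, while __CoClustering__ and __NMF__ are clearly worse.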

### Tuning hyperparameters for Singular Value Decomposition (SVD)

In [3]:
from surprise.model_selection.search import GridSearchCV, RandomizedSearchCV

In [5]:
# Search space for SVD; combinations are sampled at random rather than
# enumerated exhaustively.
param_grid = {
    "n_factors": [100, 200],
    "n_epochs": list(range(20, 41, 5)),  # 20, 25, 30, 35, 40
    "lr_all": [0.005, 0.05, 0.1, 0.2],
    "reg_all": [0.02, 0.2, 0.4],
}

# Evaluate 10 sampled combinations with 3-fold cross-validation on 3 parallel
# jobs. random_state pins which combinations are drawn, so re-running the cell
# compares the same candidates.
# NOTE(review): fold shuffling and SVD initialisation are presumably still
# unseeded, so the scores themselves can vary slightly between runs — confirm.
search = RandomizedSearchCV(SVD, param_grid, cv=3, n_iter=10, n_jobs=3, random_state=0)
search.fit(data)

# best_score / best_params are dicts keyed by accuracy measure.
print(f"Results: {search.best_score}")
print("Best parameters:")
search.best_params

Results: {'rmse': 1.4659266724913167, 'mae': 1.2531741049182772}
Best parameters:


{'rmse': {'n_factors': 200, 'n_epochs': 30, 'lr_all': 0.1, 'reg_all': 0.2},
 'mae': {'n_factors': 200, 'n_epochs': 30, 'lr_all': 0.1, 'reg_all': 0.2}}

In [8]:
# SVD settings copied from the best parameters reported by the randomized
# search; hard-coded so this cell does not require re-running the search.
best_svd_options = dict(n_factors=200, n_epochs=30, lr_all=0.1, reg_all=0.2)

# Re-evaluate the tuned model with 3-fold cross-validation on every core.
cv = cross_validate(SVD(**best_svd_options), data, cv=3, n_jobs=-1)

# Mean of each metric and timing across the folds.
pd.DataFrame(cv).mean()

test_rmse     1.466108
test_mae      1.253331
fit_time     49.248589
test_time     5.613638
dtype: float64

### Tuning hyperparameters for BaselineOnly with Alternating Least Squares (ALS)

In [10]:
# Exhaustive grid for the BaselineOnly estimator fitted with ALS.
# Options nested under bsl_options are given as lists of candidate values;
# this grid has 4 * 7 * 7 = 196 combinations.
# NOTE(review): the recorded best reg_i / reg_u (10) sit on the lower edge of
# the grid — consider extending the range below 10.
param_grid = {
    "bsl_options": {
        "method": ["als"],
        "n_epochs": [20, 30, 40, 50],
        "reg_i": list(range(10, 41, 5)),  # 10, 15, ..., 40
        "reg_u": list(range(10, 41, 5)),  # 10, 15, ..., 40
    },
    "verbose": [False],
}

# 3-fold cross-validation for every combination, on 3 parallel jobs.
search = GridSearchCV(BaselineOnly, param_grid, cv=3, n_jobs=3)
search.fit(data)

# best_score / best_params are dicts keyed by accuracy measure.
print(f"Results: {search.best_score}")
print("Best parameters:")
search.best_params

Results: {'rmse': 1.520624223129533, 'mae': 1.339626584083473}
Best parameters:


{'rmse': {'bsl_options': {'method': 'als',
   'n_epochs': 30,
   'reg_i': 10,
   'reg_u': 10},
  'verbose': False},
 'mae': {'bsl_options': {'method': 'als',
   'n_epochs': 30,
   'reg_i': 10,
   'reg_u': 10},
  'verbose': False}}

### Tuning hyperparameters for BaselineOnly with Stochastic Gradient Descent (SGD)

In [12]:
# Exhaustive grid for the BaselineOnly estimator fitted with SGD.
# Options nested under bsl_options are given as lists of candidate values;
# this grid has 3 * 4 * 3 = 36 combinations.
# NOTE(review): the recorded best learning_rate (0.001) sits on the lower edge
# of the grid — consider adding smaller values.
param_grid = {
    "bsl_options": {
        "method": ["sgd"],
        "n_epochs": [100, 150, 200],
        "reg": [0.02, 0.05, 0.1, 0.2],
        "learning_rate": [0.001, 0.01, 0.1],
    },
    "verbose": [False],
}

# 3-fold cross-validation for every combination, on 3 parallel jobs.
search = GridSearchCV(BaselineOnly, param_grid, cv=3, n_jobs=3)
search.fit(data)

# best_score / best_params are dicts keyed by accuracy measure; the best
# combination may differ between RMSE and MAE.
print(f"Results: {search.best_score}")
print("Best parameters:")
search.best_params


Results: {'rmse': 1.5204474491753495, 'mae': 1.3214845939520232}
Best parameters:


{'rmse': {'bsl_options': {'method': 'sgd',
   'n_epochs': 100,
   'reg': 0.2,
   'learning_rate': 0.001},
  'verbose': False},
 'mae': {'bsl_options': {'method': 'sgd',
   'n_epochs': 200,
   'reg': 0.02,
   'learning_rate': 0.001},
  'verbose': False}}