In [1]:
import numpy as np
import surprise
import pandas as pd

In [2]:
from surprise import Dataset, SVD, SVDpp
from surprise.model_selection import cross_validate, GridSearchCV

# Simple baseline model

In [3]:
# Seed NumPy's global RNG before anything stochastic runs: Surprise draws the
# cross-validation folds and the initial SVD factors from it, so without a
# fixed seed the RMSE values below change on every Restart & Run All.
SEED = 0
np.random.seed(SEED)

# Built-in MovieLens-1M ratings; Surprise asks for confirmation before
# downloading the data if it is not already cached on disk.
ds = Dataset.load_builtin('ml-1m')

# Baseline model: SVD matrix factorisation with the library's default
# hyperparameters.
algo = SVD()

In [4]:
# 5-fold cross-validation of the baseline SVD, scored on RMSE only.
cv = cross_validate(algo=algo, data=ds, measures=['rmse'], cv=5)
# Headline figure: test RMSE averaged over the five folds.
print(np.mean(cv['test_rmse']))
# Bare last expression so the full result dict (per-fold RMSE, fit and test
# times) is rendered as the cell output.
cv

0.8740934827056093


{'test_rmse': array([0.87250628, 0.87538485, 0.87280855, 0.87385462, 0.87591312]),
 'fit_time': (46.35530424118042,
  47.255136251449585,
  47.09720182418823,
  46.90306091308594,
  46.79565119743347),
 'test_time': (2.5323126316070557,
  2.383711814880371,
  2.1933224201202393,
  2.357881546020508,
  2.375429391860962)}

In [5]:
# Mean test RMSE across the folds of the most recent cross-validation run.
np.mean(cv['test_rmse'])

0.8740934827056093

# Hyperparameter tuning with grid search

In [6]:
# Search space for the SVD hyperparameters: 4 * 3 * 3 * 4 = 144 combinations.
param_grid = dict(
    n_factors=[50, 100, 150, 200],   # number of latent factors
    n_epochs=[5, 10, 20],            # number of SGD passes over the data
    lr_all=[0.002, 0.005, 0.01],     # learning rate applied to all parameters
    reg_all=[0.01, 0.02, 0.1, 0.5],  # regularisation applied to all parameters
)

In [7]:
# Exhaustive search over param_grid with 5-fold CV per combination, scored on
# RMSE. This is expensive (every combination is fitted once per fold), so the
# work is spread over all available CPU cores with n_jobs=-1.
gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse'],
    cv=5,
    n_jobs=-1,
)
gs.fit(data=ds)

In [8]:
# Report the best RMSE score found by the grid search, followed on the next
# line by the parameter combination that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

0.8707587405920876
{'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


# SVD++

In [9]:
# SVD++ model with the library's default hyperparameters, to be compared
# against the plain SVD baseline above.
algo_pp = SVDpp()

In [10]:
# 5-fold cross-validation of SVD++, scored on RMSE only.
# NOTE: this rebinds `cv`, so the baseline SVD results are no longer reachable
# under that name after this cell runs.
# This is slow: the recorded run took roughly 3100-3400 s to fit each fold.
cv = cross_validate(algo=algo_pp, data=ds, measures=['rmse'], cv=5)
# Headline figure: test RMSE averaged over the five folds.
print(np.mean(cv['test_rmse']))
# Bare last expression so the full result dict is rendered as the cell output.
cv

0.8616147163592475


{'test_rmse': array([0.86176536, 0.86230745, 0.86162892, 0.86073603, 0.86163583]),
 'fit_time': (3415.5415620803833,
  3230.333493947983,
  3105.4247336387634,
  3110.5035548210144,
  3183.7637417316437),
 'test_time': (57.29182839393616,
  57.341612577438354,
  63.113587379455566,
  63.366161584854126,
  69.78458285331726)}

In [11]:
# Mean test RMSE across the folds of the most recent cross-validation run.
np.mean(cv['test_rmse'])

0.8616147163592475