In [1]:
import os

import pandas as pd
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVDpp

# Folder holding the CSV inputs, resolved relative to the working directory.
DATA_DIR = "data"
train_csv = os.path.join(DATA_DIR, "train.csv")
test_csv = os.path.join(DATA_DIR, "test.csv")

# Raw frames: the training ratings feed Surprise below, the test rows are kept as X.
train_data = pd.read_csv(train_csv)
X = pd.read_csv(test_csv)

# The reader declares ratings on a 0-10 scale.
# NOTE(review): the whole frame is handed to load_from_df, so train.csv presumably
# holds exactly the user, item and rating columns in that order — confirm.
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(train_data, reader)

# Trainset built from all of `data`; the cross-validated searches use `data` itself.
full_train_data = data.build_full_trainset()

In [9]:
import numpy as np
import scipy.stats as st
from surprise import SVDpp
from surprise.model_selection import GridSearchCV

# Number of cross-validation folds.
FOLD = 4

# Hyper-parameters pinned to a single value for this search.
# NOTE(review): reg_bi is negative, which is unusual for a regularisation term —
# confirm this is intended.
fixed_params = {
    "n_epochs": 531,
    "n_factors": 1,
    "cache_ratings": True,
    "lr_bi": 0.022103729193167332,
    "lr_bu": 5.964210634776729e-05,
    "lr_pu": 0.0024279870152229197,
    "lr_qi": 0.0015257756139312275,
    "lr_yj": 0.0025538838921406533,
    "reg_bi": -0.000271436779456606252,
    "reg_bu": 0.0015210497941112663,
    "reg_pu": 1.7668113905419636e-05,
    "reg_qi": 0.0030026717984680497,
    "reg_yj": 0.0014833009888479895,
}

# Wrap every pinned value in a one-element list, then append the two swept axes:
# 20 values of init_mean x 10 values of init_std_dev = 200 candidates.
param_grid = {name: [value] for name, value in fixed_params.items()}
param_grid["init_mean"] = np.linspace(-1, 1, 20)
param_grid["init_std_dev"] = np.linspace(0.1, 1, 10)

rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)

# Best RMSE, then the best parameters (a dict keyed by measure name).
print(rs.best_score["rmse"])
print(rs.best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   53.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 10.7min


0.666664041934693
{'rmse': {'n_epochs': 531, 'n_factors': 1, 'cache_ratings': True, 'lr_bi': 0.022103729193167332, 'lr_bu': 5.964210634776729e-05, 'lr_pu': 0.0024279870152229197, 'lr_qi': 0.0015257756139312275, 'lr_yj': 0.0025538838921406533, 'reg_bi': -0.00027143677945660625, 'reg_bu': 0.0015210497941112663, 'reg_pu': 1.7668113905419636e-05, 'reg_qi': 0.0030026717984680497, 'reg_yj': 0.0014833009888479895, 'init_mean': -0.1578947368421053, 'init_std_dev': 0.1}}


[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 13.6min finished


In [12]:
import numpy as np
import scipy.stats as st
from surprise import SVDpp

# RandomizedSearchCV is used below, so it must be imported in this cell; relying on
# a name left over from another cell raises NameError after a kernel restart.
from surprise.model_selection import GridSearchCV, RandomizedSearchCV

# Number of cross-validation folds.
FOLD = 6

# Random search over the learning rates (lr_*) and regularisation terms (reg_*).
# cache_ratings, n_epochs and n_factors are pinned; every other entry is a normal
# distribution st.norm(mean, std) from which candidates are drawn.
# NOTE(review): a normal distribution is unbounded, so a draw can come out negative
# (and reg_bi is centred on a negative mean) — confirm this is acceptable.
param_grid = {
    "cache_ratings": [True],
    "n_epochs": [200],
    "n_factors": [1],
    "lr_bi": st.norm(0.01954, 0.006),
    "lr_bu": st.norm(0.00006, 0.00002),
    "lr_pu": st.norm(0.00309, 0.001),
    "lr_qi": st.norm(0.00092, 0.0003),
    "lr_yj": st.norm(0.00281791, 0.001),
    "reg_bi": st.norm(-0.00027, 0.00009),
    "reg_bu": st.norm(0.00138, 0.0004),
    "reg_pu": st.norm(0.00002, 0.00000666666),
    "reg_qi": st.norm(0.00293, 0.001),
    "reg_yj": st.norm(0.00016986, 0.00005),
}

# 200 sampled candidates, each scored with FOLD-fold cross-validation on RMSE.
# NOTE(review): no random_state is passed, so the sampled candidates are presumably
# different on every run — consider seeding if reproducibility matters.
rs = RandomizedSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
    n_iter=200,
)
rs.fit(data)

# Best RMSE, then the best parameters (a dict keyed by measure name).
print(rs.best_score["rmse"])
print(rs.best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   59.1s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  8.7min


0.6574157862154492
{'rmse': {'cache_ratings': True, 'lr_bi': 0.014477713141555899, 'lr_bu': 4.274998978357359e-05, 'lr_pu': 0.0046089389950346085, 'lr_qi': 0.0015801266391564078, 'lr_yj': 0.002616185296350129, 'n_epochs': 200, 'n_factors': 1, 'reg_bi': -0.00025130079220361833, 'reg_bu': 0.0014858307111292178, 'reg_pu': 1.846631322934097e-05, 'reg_qi': 0.0017449972506546265, 'reg_yj': 0.00013693438506739134}}


[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  9.2min finished


In [14]:
import numpy as np
import scipy.stats as st
from surprise import SVDpp
from surprise.model_selection import GridSearchCV

# Number of cross-validation folds.
FOLD = 6

# Learning rates and regularisation terms held constant while n_epochs is swept.
# NOTE(review): reg_bi is negative — confirm this is intended.
fixed_params = {
    "n_factors": 1,
    "lr_bi": 0.014477713141555899,
    "lr_bu": 4.274998978357359e-05,
    "lr_pu": 0.0046089389950346085,
    "lr_qi": 0.0015801266391564078,
    "lr_yj": 0.002616185296350129,
    "reg_bi": -0.00025130079220361833,
    "reg_bu": 0.0014858307111292178,
    "reg_pu": 1.846631322934097e-05,
    "reg_qi": 0.0017449972506546265,
    "reg_yj": 0.00013693438506739134,
}

# Sweep n_epochs over 1..500 (500 candidates); the pinned values follow as
# one-element lists so the grid has a single axis.
param_grid = {
    "cache_ratings": [True],
    "n_epochs": np.arange(1, 501),
}
param_grid.update({name: [value] for name, value in fixed_params.items()})

rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)

# Best RMSE, then the best parameters (a dict keyed by measure name).
print(rs.best_score["rmse"])
print(rs.best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 21.0min


0.658981590365794
{'rmse': {'cache_ratings': True, 'n_epochs': 402, 'n_factors': 1, 'lr_bi': 0.014477713141555899, 'lr_bu': 4.274998978357359e-05, 'lr_pu': 0.0046089389950346085, 'lr_qi': 0.0015801266391564078, 'lr_yj': 0.002616185296350129, 'reg_bi': -0.00025130079220361833, 'reg_bu': 0.0014858307111292178, 'reg_pu': 1.846631322934097e-05, 'reg_qi': 0.0017449972506546265, 'reg_yj': 0.00013693438506739134}}


[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed: 27.8min finished


In [17]:
import numpy as np
from surprise.model_selection import RandomizedSearchCV
from surprise.prediction_algorithms.matrix_factorization import SVDpp

# Project-local helper; imported here but not referenced in this cell.
from cv_logs import ParameterSearch

# Number of sampled candidates and number of cross-validation folds.
ITERATIONS = 200
FOLD = 5

# Exponents -5.0, -4.6, ... (step 0.4, stopping before 0) for the log-spaced
# learning-rate and regularisation candidates.
log_steps = np.arange(-5, 0, 0.4)

param_grid = {
    # Odd factor counts 1, 3, ..., 249.
    "n_factors": np.arange(1, 250, 2),
    "cache_ratings": [True],
    "n_epochs": [100],
    "lr_all": 10 ** log_steps,
    "reg_all": 10 ** log_steps,
    # Disabled overrides:
    # "reg_bu": [0],
    # "reg_bi": [0],
}

rs = RandomizedSearchCV(
    SVDpp,
    param_grid,
    n_iter=ITERATIONS,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)

# Best RMSE, then the parameter set that achieved it.
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 39.0min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 56.2min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed: 82.1min


0.720059558260935
{'n_factors': 51, 'cache_ratings': True, 'n_epochs': 100, 'lr_all': 0.01584893192461124, 'reg_all': 0.10000000000000082}


[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 94.0min finished
