In [2]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVDpp

import os

DATA_DIR = "data"
train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
X = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(train_data, reader)
full_train_data = data.build_full_trainset()

In [3]:
from cv_logs import ParameterSearch, save_model
from predict import predict

import numpy as np
from surprise.prediction_algorithms.matrix_factorization import SVDpp
from surprise.model_selection import RandomizedSearchCV

ITERATIONS = 1000
FOLD = 4

param_grid = {
    "n_factors": np.arange(1, 151, 1),
    "cache_ratings": ["True"],
    "n_epochs": np.arange(50, 1001, 50),
    "lr_all": 10 ** (np.arange(-5, 0, 0.25)),
    "reg_all": 10 ** (np.arange(-5, 0, 0.25)),
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = RandomizedSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    n_iter=ITERATIONS,
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
ps.write(f"SVDpp-cv{FOLD}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score:?
predict(best_n, f"SVDpp-cv{FOLD}.csv")
save_model(best_n, f"SVDpp-cv{FOLD}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed: 22.5min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 28.1min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 36.1min
[Parallel(n_jobs=-1)]: Done 2154 tasks      | elapsed: 44.6min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed: 52.9min
[Parallel(n_jobs=-1)]: Done 3018 tasks      | elapsed: 62.2min
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed: 71.8min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed: 82.1min finished


0.6765337174501169
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 350, 'lr_all': 0.0017782794100389228, 'reg_all': 0.001, 'reg_bu': 0, 'reg_bi': 0}


In [4]:
from cv_logs import ParameterSearch, save_model
from predict import predict

import numpy as np
from surprise.prediction_algorithms.matrix_factorization import SVDpp
from surprise.model_selection import RandomizedSearchCV

ITERATIONS = 1000
FOLD = 2

param_grid = {
    "n_factors": np.arange(1, 151, 1),
    "cache_ratings": ["True"],
    "n_epochs": np.arange(50, 1001, 50),
    "lr_all": 10 ** (np.arange(-5, 0, 0.25)),
    "reg_all": 10 ** (np.arange(-5, 0, 0.25)),
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = RandomizedSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    n_iter=ITERATIONS,
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
ps.write(f"SVDpp-cv{FOLD}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score:?
predict(best_n, f"SVDpp-cv{FOLD}.csv")
save_model(best_n, f"SVDpp-cv{FOLD}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 19.9min finished


0.7459400348764844
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 450, 'lr_all': 0.0031622776601683794, 'reg_all': 0.0031622776601683794, 'reg_bu': 0, 'reg_bi': 0}
