In [3]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVDpp

import os

DATA_DIR = "data"
train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
X = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(train_data, reader)
full_train_data = data.build_full_trainset()

In [4]:
from cv_logs import ParameterSearch, save_model
from predict import predict

import numpy as np
from surprise.model_selection import GridSearchCV

FOLD = 3
# Find best n_factor
# best: 1
ROUND = 1

param_grid = {
    "n_factors": np.arange(1, 200),
    "cache_ratings": ["True"],
    "n_epochs": [1500],
    "lr_all": [0.0063095734448019684],
    "reg_all": [0.10000000000000082],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
ps.write(f"SVDpp-bcd{ROUND}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score: 0.66082 -> Best so far
predict(best_n, f"SVDpp_bcd{ROUND}.csv")
save_model(best_n, f"SVDpp-bcd{ROUND}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 18.2min


0.6862497170793826
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 1500, 'lr_all': 0.0063095734448019684, 'reg_all': 0.10000000000000082, 'reg_bu': 0, 'reg_bi': 0}


[Parallel(n_jobs=-1)]: Done 597 out of 597 | elapsed: 34.4min finished


In [9]:
# find best n_epochs
# best: 641
ROUND = 2

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": np.arange(1, 2000),
    "lr_all": [0.0063095734448019684],
    "reg_all": [0.10000000000000082],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# Public score: 0.66031 -> Higher test RMSE but lower public score ??
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2154 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 3018 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 4584 tasks      | e

0.6913403248287664
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 641, 'lr_all': 0.0063095734448019684, 'reg_all': 0.10000000000000082, 'reg_bu': 0, 'reg_bi': 0}


In [10]:
# find best lr_all
# best: 0.009000000000000001
ROUND = 3

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [641],
    "lr_all": np.arange(0.001, 0.101, 0.001),
    "reg_all": [0.10000000000000082],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# Public score: 0.66096 -> Higher
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   43.8s finished


0.685283643383678
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 641, 'lr_all': 0.009000000000000001, 'reg_all': 0.10000000000000082, 'reg_bu': 0, 'reg_bi': 0}


In [11]:
# find best reg_all
# best: 0.06
ROUND = 4

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [641],
    "lr_all": [0.009000000000000001],
    "reg_all": np.arange(0.001, 0.991, 0.001),
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# Public score: 0.65893 -> Improvement, finally lower than 0.66
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 2154 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 2970 out of 2970 | elapsed:  6.6min finished


0.689529886902589
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 641, 'lr_all': 0.009000000000000001, 'reg_all': 0.06, 'reg_bu': 0, 'reg_bi': 0}


In [12]:
# from top 5 of all previous step -> GridSearch
# 5^4 rounds = 625
ROUND = 5

param_grid = {
    "n_factors": [1, 2, 42, 56, 55],
    "cache_ratings": ["True"],
    "n_epochs": [641, 560, 461, 499, 612],
    "lr_all": [0.009, 0.007, 0.008, 0.004, 0.011],
    "reg_all": [0.06, 0.059, 0.057, 0.066, 0.053],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# Public score:
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   56.4s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 1875 out of 1875 | elapsed: 14.5min finished


0.6810640884676071
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 461, 'lr_all': 0.004, 'reg_all': 0.057, 'reg_bu': 0, 'reg_bi': 0}


In [None]:
# n_factors -> Consistently with 1
# conclude best n_factors = 1
ROUND = 6
pass

In [13]:
# find best n_epochs
# best: 584
ROUND = 7

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": np.arange(1, 3000),
    "lr_all": [0.004],
    "reg_all": [0.057],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# Public score:
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed:   58.0s
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2154 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 3018 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 4584 tasks      | e

0.6885794095967649
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 584, 'lr_all': 0.004, 'reg_all': 0.057, 'reg_bu': 0, 'reg_bi': 0}


In [16]:
# find best lr_all
# best: 0.009 -> same as last time
ROUND = 8

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [584],
    "lr_all": np.arange(0.001, 0.101, 0.00025),
    "reg_all": [0.057],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# Public score:
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  2.3min finished


0.6900425508617829
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 584, 'lr_all': 0.009000000000000001, 'reg_all': 0.057, 'reg_bu': 0, 'reg_bi': 0}


In [18]:
# find best reg_all
# best: 0.0655
ROUND = 9

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [584],
    "lr_all": [0.009],
    "reg_all": np.arange(0.001, 1.5, 0.00025),
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# Public score:
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 2154 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 3018 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 4584 tasks      | e

0.6892550074157593
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 584, 'lr_all': 0.009, 'reg_all': 0.0655, 'reg_bu': 0, 'reg_bi': 0}


In [19]:
# find from top 5 of all params
ROUND = 10

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [584, 752, 675, 623, 591],
    "lr_all": [0.009, 0.00875, 0.01125, 0.005, 0.00825],
    "reg_all": [0.0655, 0.064, 0.06225, 0.06375, 0.074],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# Public score:
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:   48.2s finished


0.6900472311890717
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 584, 'lr_all': 0.005, 'reg_all': 0.064, 'reg_bu': 0, 'reg_bi': 0}
