In [1]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVDpp

import os

DATA_DIR = "data"
train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
X = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(train_data, reader)
full_train_data = data.build_full_trainset()

In [7]:
from cv_logs import ParameterSearch, save_model
from predict import predict

import numpy as np
from surprise.model_selection import GridSearchCV

# 0.6765337174501169
# {'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 350, 'lr_all': 0.0017782794100389228, 'reg_all': 0.001, 'reg_bu': 0, 'reg_bi': 0}
FOLD = 4
# Find best n_epochs
# best:
ROUND = 1

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": np.arange(1, 2000),
    "lr_all": [0.0017782794100389228],
    "reg_all": [0.001],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}-cv{FOLD}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 252 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 501 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 666 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 864 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 1098 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done 1368 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1674 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2016 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2394 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 2808 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 3258 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 3744 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 4266 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 4824 tasks      | 

0.6765267550457383
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 567, 'lr_all': 0.0017782794100389228, 'reg_all': 0.001, 'reg_bu': 0, 'reg_bi': 0}


In [8]:
# Find best lr_all
# best: 0.0022500000000000003
ROUND = 2

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [567],
    "lr_all": np.arange(0.0001, 0.01, 0.00005),
    "reg_all": [0.001],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}-cv{FOLD}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 792 out of 792 | elapsed:  2.0min finished


0.6765042378988935
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 567, 'lr_all': 0.0022500000000000003, 'reg_all': 0.001, 'reg_bu': 0, 'reg_bi': 0}


In [9]:
# Find best reg_all
# best: 0.008350000000000002
ROUND = 3

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [567],
    "lr_all": [0.0022500000000000003],
    "reg_all": np.arange(0.0001, 0.01, 0.00005),
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}-cv{FOLD}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 792 out of 792 | elapsed:  2.0min finished


0.6733492230734728
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 567, 'lr_all': 0.0022500000000000003, 'reg_all': 0.008350000000000002, 'reg_bu': 0, 'reg_bi': 0}


In [10]:
# Find best from top 5
ROUND = 4

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [567, 519, 548, 701, 764],
    "lr_all": [0.00225, 0.0019, 0.00235, 0.00195, 0.0026],
    "reg_all": [0.00835, 0.00945, 0.00705, 0.0051, 0.00695],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}-cv{FOLD}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.4min finished


0.6785746313067377
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 567, 'lr_all': 0.0026, 'reg_all': 0.00945, 'reg_bu': 0, 'reg_bi': 0}


In [11]:
# Find best n_epochs
# best:
ROUND = 5

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": np.arange(1, 1000),
    "lr_all": [0.0026],
    "reg_all": [0.00945],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}-cv{FOLD}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 252 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 738 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done 936 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 1170 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-1)]: Done 1440 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1746 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2088 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2466 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 2880 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 3330 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 3816 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 3996 out of 3996 | elapsed:  8.6min finished


0.6716045856187244
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 482, 'lr_all': 0.0026, 'reg_all': 0.00945, 'reg_bu': 0, 'reg_bi': 0}


In [12]:
# Find best lr_all
# best: 0.0025
ROUND = 6

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [482],
    "lr_all": np.arange(0.00001, 0.01, 0.00001),
    "reg_all": [0.00945],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}-cv{FOLD}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 2154 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 3018 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 3996 out of 3996 | elapsed:  8.4min finished


0.6748920417113103
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 482, 'lr_all': 0.0025, 'reg_all': 0.00945, 'reg_bu': 0, 'reg_bi': 0}


In [13]:
# Find best reg_all
# best: 0.008150000000000001
ROUND = 7

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [482],
    "lr_all": [0.0025],
    "reg_all": np.arange(0.00001, 0.01, 0.00001),
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}-cv{FOLD}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   34.4s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 858 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 2154 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 3018 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 3504 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 3996 out of 3996 | elapsed:  8.4min finished


0.6734784160695896
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 482, 'lr_all': 0.0025, 'reg_all': 0.008150000000000001, 'reg_bu': 0, 'reg_bi': 0}


In [14]:
# Find best from top 5 of all params
ROUND = 8

param_grid = {
    "n_factors": [1],
    "cache_ratings": ["True"],
    "n_epochs": [482, 575, 452, 424, 577],
    "lr_all": [0.0025, 0.00251, 0.00263, 0.00285, 0.00298],
    "reg_all": [0.00815, 0.00998, 0.00533, 0.00731, 0.00942],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
model_name = f"SVDpp-bcd{ROUND}-cv{FOLD}"
ps.write(f"{model_name}.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVDpp(**rs.best_params["rmse"]).fit(full_train_data)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   56.3s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.1min finished


0.6767424866505474
{'n_factors': 1, 'cache_ratings': 'True', 'n_epochs': 577, 'lr_all': 0.00263, 'reg_all': 0.00942, 'reg_bu': 0, 'reg_bi': 0}
