In [2]:
import pandas as pd
from surprise import Dataset, Reader

import os

DATA_DIR = "data"
train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
X = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(train_data, reader)
all_train = data.build_full_trainset()

In [3]:
from cv_logs import ParameterSearch, save_model
from predict import predict


import numpy as np
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.model_selection import GridSearchCV

FOLD = 3
# find best n_factor
# best: 1
ROUND = 1

param_grid = {
    "n_factors": np.arange(1, 200),
    "n_epochs": [200],
    "lr_all": [0.03981071705535002],
    "reg_all": [0.10000000000000082],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
ps = ParameterSearch(rs.cv_results)
ps.write("svd-bcd1.csv")
print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVD(**rs.best_params["rmse"]).fit(all_train)
# public score: 0.66533 -> best so far for SVD
predict(best_n, "svd-bcd1.csv")
save_model(best_n, "svd-bcd1")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.8min


0.6924387356256961
{'n_factors': 1, 'n_epochs': 200, 'lr_all': 0.03981071705535002, 'reg_all': 0.10000000000000082, 'reg_bu': 0, 'reg_bi': 0}


[Parallel(n_jobs=-1)]: Done 597 out of 597 | elapsed:  4.7min finished


In [6]:
# find best n_epochs
# best: 72
ROUND = 2

param_grid = {
    "n_factors": [1],
    "n_epochs": np.arange(1, 2500),
    "lr_all": [0.03981071705535002],
    "reg_all": [0.10000000000000082],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
model_name = f"SVD-bcd{ROUND}"
ps = ParameterSearch(rs.cv_results)
ps.write(f"{model_name}.csv")

print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVD(**rs.best_params["rmse"]).fit(all_train)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 3026 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 3512 tasks      | elapsed: 28.1min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | ela

0.6907647745692946
{'n_factors': 1, 'n_epochs': 72, 'lr_all': 0.03981071705535002, 'reg_all': 0.10000000000000082, 'reg_bu': 0, 'reg_bi': 0}


In [7]:
# find best lr_all
# best: 0.033
ROUND = 3

param_grid = {
    "n_factors": [1],
    "n_epochs": [72],
    "lr_all": np.arange(0.001, 0.101, 0.001),
    "reg_all": [0.10000000000000082],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
model_name = f"SVD-bcd{ROUND}"
ps = ParameterSearch(rs.cv_results)
ps.write(f"{model_name}.csv")

print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVD(**rs.best_params["rmse"]).fit(all_train)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   23.3s finished


0.6964187158416534
{'n_factors': 1, 'n_epochs': 72, 'lr_all': 0.033, 'reg_all': 0.10000000000000082, 'reg_bu': 0, 'reg_bi': 0}


In [8]:
# find best reg_all
# best: 0.047
ROUND = 4

param_grid = {
    "n_factors": [1],
    "n_epochs": [72],
    "lr_all": [0.033],
    "reg_all": np.arange(0.001, 0.991, 0.001),
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
model_name = f"SVD-bcd{ROUND}"
ps = ParameterSearch(rs.cv_results)
ps.write(f"{model_name}.csv")

print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVD(**rs.best_params["rmse"]).fit(all_train)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   46.9s
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 2970 out of 2970 | elapsed:  4.3min finished


0.6847205602506046
{'n_factors': 1, 'n_epochs': 72, 'lr_all': 0.033, 'reg_all': 0.047, 'reg_bu': 0, 'reg_bi': 0}


In [9]:
# find from top 5 best params
ROUND = 5

param_grid = {
    "n_factors": [1, 2, 69, 47, 45],
    "n_epochs": [72, 101, 100, 103, 78],
    "lr_all": [0.033, 0.038, 0.031, 0.037, 0.027],
    "reg_all": [0.047, 0.033, 0.048, 0.031, 0.032],
    "reg_bu": [0],
    "reg_bi": [0],
}
rs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse"],
    cv=FOLD,
    n_jobs=-1,
    joblib_verbose=5,
)
rs.fit(data)
model_name = f"SVD-bcd{ROUND}"
ps = ParameterSearch(rs.cv_results)
ps.write(f"{model_name}.csv")

print(rs.best_score["rmse"])
print(rs.best_params["rmse"])
best_n = SVD(**rs.best_params["rmse"]).fit(all_train)
# public score:?
predict(best_n, f"{model_name}.csv")
save_model(best_n, model_name)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 1875 out of 1875 | elapsed:  5.4min finished


0.693165548309456
{'n_factors': 1, 'n_epochs': 100, 'lr_all': 0.027, 'reg_all': 0.047, 'reg_bu': 0, 'reg_bi': 0}
