In [1]:
import pandas as pd
from surprise import Dataset, Reader

import os

# Directory holding the competition CSV files, relative to the notebook.
DATA_DIR = "data"
train_path = os.path.join(DATA_DIR, "train.csv")
test_path = os.path.join(DATA_DIR, "test.csv")

# Labelled ratings used for fitting, and the rows we have to predict for.
train_data = pd.read_csv(train_path)
X = pd.read_csv(test_path)

# Wrap the training frame in a Surprise dataset; ratings are declared to lie in [0, 10].
# NOTE(review): load_from_df presumably reads the columns positionally as
# (user, item, rating) -- confirm train.csv is laid out in that order.
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(train_data, reader)

# Trainset built from every available rating (no hold-out), used for the final fits.
all_train = data.build_full_trainset()

In [5]:
from cv_logs import ParameterSearch

from surprise import SVD
from surprise.model_selection import RandomizedSearchCV
import numpy as np

ITERATION = 1000  # number of random hyper-parameter combinations to evaluate
FOLDS = 3  # cross-validation folds per combination
SEED = 0  # arbitrary fixed seed so the random search can be repeated

# Seed NumPy's global generator in addition to passing random_state to the search
# below, so the combinations drawn (and any unseeded shuffling done in this process)
# repeat from run to run. Without a seed, the log and the reported best parameters
# change on every execution.
# NOTE(review): model initialisation inside the parallel workers may still vary
# between runs -- confirm before treating the scores as bit-for-bit reproducible.
np.random.seed(SEED)

# lr_all and reg_all are drawn from the same 13 log-spaced values, 10**-5 .. 10**-0.2.
log_grid = 10 ** (np.arange(-5, 0, 0.4))

# Candidate values per SVD argument; each list/array is sampled independently.
param_grid = {
    "n_factors": np.arange(5, 500, 25),
    "n_epochs": [200],
    "biased": [True, False],
    "lr_all": log_grid,
    "reg_all": log_grid,
    # Both bias regularisation terms are held at 0 in every sampled combination.
    "reg_bu": [0],
    "reg_bi": [0],
}

# Randomised search scored by RMSE, run on all available cores.
gs = RandomizedSearchCV(
    SVD,
    param_grid,
    measures=["rmse"],
    cv=FOLDS,
    n_jobs=-1,
    n_iter=ITERATION,
    random_state=SEED,
    joblib_verbose=3,
)
gs.fit(data)

# Persist the full cross-validation log, then report the best RMSE and its parameters.
ps = ParameterSearch(gs.cv_results)
ps.write("svd_1.csv")
print(gs.best_score["rmse"])
print(gs.best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed: 23.0min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed: 29.5min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 37.2min


0.716466446380316
{'rmse': {'n_factors': 80, 'n_epochs': 200, 'biased': True, 'lr_all': 0.03981071705535002, 'reg_all': 0.10000000000000082, 'reg_bu': 0, 'reg_bi': 0}}


[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed: 43.7min finished


In [6]:
from predict import predict
from cv_logs import save_model

# Refit on the complete training set using the parameters that gave the best RMSE
# in the random search.
best_rmse_params = gs.best_params["rmse"]
svd_1 = SVD(**best_rmse_params)
svd_1.fit(all_train)

# Public score: 0.66628
# NOTE(review): "svd_1.csv" is also the file name passed to ps.write for the search
# log -- confirm the two helpers write to different locations.
predict(svd_1, "svd_1.csv")
save_model(svd_1, "svd_1")

In [6]:
from surprise import SVD

# Hand-picked configuration (not taken from the random search): unbiased SVD with
# 200 factors, trained for 500 epochs on the full training set.
manual_params = {
    "n_factors": 200,
    "n_epochs": 500,
    "lr_all": 0.001,
    "reg_all": 0.185,
    "biased": False,
}
svd = SVD(**manual_params)
svd.fit(all_train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11a01be50>

In [15]:
import pandas as pd

# Score every (user, item) row of the test frame with the manually configured model.
# The estimates are collected in a single pass and the frame is built once: assigning
# one row at a time to a DataFrame and shifting its whole index on every iteration
# costs time quadratic in the number of test rows.
ratings = [svd.predict(user_id, item_id).est for user_id, item_id in X.values]

# "Id" is the 0-based position of the row in X, in ascending order, which is the
# layout the submission file uses (header "Id,rating").
predictions = pd.DataFrame(
    {"rating": ratings},
    index=pd.RangeIndex(len(ratings), name="Id"),
)
predictions.to_csv("svd.csv")

In [1]:
# Display the prediction table. `predictions` must already exist in the running
# kernel: the NameError recorded below shows this cell was executed in a session
# where the cell that builds the frame had not been run.
predictions

NameError: name 'predictions' is not defined