In [146]:
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV, KFold
from surprise.prediction_algorithms.matrix_factorization import SVD

In [147]:
def evaluate(clf, test):
    """Return half of the mean squared prediction error of ``clf`` on ``test``.

    Args:
        clf: Object whose ``predict(user, item)`` result exposes an ``est``
            attribute holding the predicted rating.
        test: Table offering ``.values`` and ``.shape`` (e.g. a pandas
            DataFrame).  Each row must unpack into exactly three entries,
            read positionally as user id, item id and true rating.

    Returns:
        The sum of squared errors divided by the number of rows, then
        divided by 2.

    Raises:
        ZeroDivisionError: If ``test`` has no rows.
    """
    squared_errors = (
        # Ids are cast to int before being handed to the model.
        (clf.predict(int(user), int(item)).est - rating) ** 2
        for user, item, rating in test.values
    )
    return sum(squared_errors) / test.shape[0] / 2

In [148]:
# Load the rating tables; both CSV paths are relative to the notebook's
# working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Ratings live on a 1-5 scale.  Surprise reads the three selected columns
# positionally as (user id, item id, rating).
reader = Reader(rating_scale=(1, 5))
train = Dataset.load_from_df(train_df[["User ID", "Movie ID", "Rating"]], reader)

# One seed for both the fold shuffling and the SVD factor initialisation, so
# that re-running this cell selects the same hyper-parameters.  The seed is
# passed explicitly (instead of seeding a global RNG) because n_jobs=-1 runs
# the fits in worker processes.
CV_SEED = 0

param_grid = {
    "n_epochs": [100, 150, 200],
    "lr_all": [0.001, 0.003, 0.005],
    "reg_all": [0.05, 0.1, 0.2],
    "n_factors": [20],
    "random_state": [CV_SEED],
}
clf = GridSearchCV(
    SVD,
    param_grid,
    measures=["mse"],
    # Same 5-fold shuffled split as cv=5, but with a fixed shuffle seed.
    cv=KFold(n_splits=5, random_state=CV_SEED),
    refit=True,
    n_jobs=-1,
)

clf.fit(train)

In [149]:
# Show the best hyper-parameter combination found by the grid search; the
# result is a dict keyed by measure name ("mse" here).
print(clf.best_params)

{'mse': {'n_epochs': 150, 'lr_all': 0.003, 'reg_all': 0.1, 'n_factors': 20}}


In [150]:
# Rebuild the model with the hyper-parameters the grid search reported.
# NOTE: this rebinds `clf`, which held the GridSearchCV object until now.
# random_state pins SVD's random factor initialisation so that the fitted
# model (and the factors exported from it) are the same on every run.
clf = SVD(n_epochs=150, lr_all=0.003, reg_all=0.1, n_factors=20, random_state=0)

In [156]:
# Build a trainset from the whole of `train` (no hold-out folds) and fit the
# final model on it.  `trainset` is kept because later cells use it to map
# raw movie ids to the model's inner ids.
trainset = train.build_full_trainset()
clf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12c2a3bf0>

In [157]:
# Score the fitted model on the test table.  Note that `evaluate` returns the
# squared error averaged over rows and then halved, not the plain MSE.
evaluate(clf, test_df)

0.4093883124893598

In [159]:
# Collect one latent-factor row per raw movie id, in id order, so that row
# k - 1 of the final matrix belongs to raw movie id k.
N_MOVIES = 1682  # highest raw movie id exported; presumably the catalogue size -- confirm
qi = clf.qi
n_factors = qi.shape[1]  # width of a factor row, taken from the fitted model
latent_factors = []

for raw_movie_id in range(1, N_MOVIES + 1):
    try:
        # Surprise stores factors under its own inner ids, not the raw ids.
        inner_id = trainset.to_inner_iid(raw_movie_id)
    except ValueError:
        # to_inner_iid raises ValueError for an id that is not part of the
        # trainset; such movies get an all-zero row as a placeholder.
        latent_factors.append(np.zeros(n_factors))
    else:
        latent_factors.append(qi[inner_id])

latent_factors = np.vstack(latent_factors)

In [160]:
# Persist the factor matrix.  np.save appends ".npy" when the name lacks it,
# so this writes "surprise.npy" in the working directory.
np.save("surprise", latent_factors)