# Test for Recommender System using SciKit-Surprise

### import libraries

In [4]:
# Useful starting lines
%matplotlib inline
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt
import time
%load_ext autoreload
%autoreload 2

In [None]:
# import scikit-surprise stuff
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate, GridSearchCV

In [22]:
import pandas as pd

In [39]:
# import custom helper functions
from utility import *

### Import train and test set

In [8]:
# Convert the training CSV into the ".dat" file that is loaded with
# Dataset.load_from_file further down. Both path names are kept because
# later cells read `filepath`.
filepathcsv, filepath = (
    "../data/data_train.csv",
    "../data/data_train.dat",
)
convertCSV2Surprise(filepathcsv, filepath)

In [9]:
# Describe the converted file layout: one "user;item;rating" record per line,
# ratings on a 1-5 scale, first line skipped.
csvreader = Reader(
    line_format=u"user item rating",
    sep=";",
    rating_scale=(1, 5),
    skip_lines=1,
)

# Load the training ratings from the converted file.
data = Dataset.load_from_file(file_path=filepath, reader=csvreader)

In [10]:
# Convert the sample submission CSV the same way as the training file.
# NOTE: this rebinds `filepathcsv` and `filepath`; from here on they refer to
# the sample-submission files, not the training files.
filepathcsv, filepath = (
    "../data/sample_submission.csv",
    "../data/sample_submission.dat",
)
convertCSV2Surprise(filepathcsv, filepath)

In [11]:
# Load the converted sample submission through the same Reader as the train set.
# `filepath` must currently point at "../data/sample_submission.dat", i.e. the
# preceding conversion cell has to be the last one that assigned it.
testdata = Dataset.load_from_file(file_path=filepath, reader=csvreader)

In [12]:
# Choose the algorithm: an SVD instance created with no arguments, i.e. with
# the library's default hyperparameters. It is the estimator passed to the
# cross_validate timing cells below.
algo = SVD()

In [None]:
# Time a sequential 2-fold cross-validation of `algo` on the training data.
#
# time.perf_counter() is used instead of time.clock(): time.clock() was removed
# in Python 3.8 and, on Unix, reported CPU time of the current process rather
# than elapsed wall-clock time.
t_start = time.perf_counter()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=2, verbose=True)
t = time.perf_counter() - t_start  # elapsed wall-clock seconds
print("Sequential execution time: ", t)

In [None]:
# Time the same 2-fold cross-validation with n_jobs=-1 (parallel folds).
#
# Wall-clock time via time.perf_counter() is required here: time.clock() was
# removed in Python 3.8, and on Unix it measured CPU time of this process only,
# which would not account for work done in separate worker processes and would
# make the comparison with the sequential run meaningless.
t_start = time.perf_counter()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=2, verbose=True, n_jobs=-1)
t = time.perf_counter() - t_start  # elapsed wall-clock seconds
print("Parallel execution time: ", t)

In [19]:
# Hyperparameter grid for the SVD search: 10 latent-factor counts x 5
# regularisation strengths = 50 parameter combinations.
# The commented-out entries are further candidate dimensions that are
# currently excluded from the search.
grid = {
    "n_factors": list(range(10, 101, 10)),
    # "lr_all": [0.01, 0.02, 0.005, 0.01, 0.015, 0.02, 0.05, 0.1],
    "reg_all": [0.01, 0.02, 0.03, 0.04, 0.05],
    # "n_epochs": [5, 10, 20, 30, 40, 50],
    # "biased": [True, False]
}

# Configure a 5-fold grid search over `grid` for the SVD class, scored on RMSE
# and MAE, with n_jobs=-1 for parallel evaluation and refit enabled.
gridsearch = GridSearchCV(
    algo_class=SVD,
    param_grid=grid,
    measures=['RMSE', 'MAE'],
    cv=5,
    n_jobs=-1,
    refit=True,
    joblib_verbose=2,
)

In [20]:
# Run the grid search on the training data (long-running: the recorded run
# took about an hour on 4 workers).
gridsearch.fit(data)

# Print the best RMSE first, then the parameter combination that achieved it.
for report in (gridsearch.best_score, gridsearch.best_params):
    print(report['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 61.0min finished


1.0062433265372734
{'n_factors': 10, 'reg_all': 0.05}


In [23]:
# Tabulate the grid-search results: each key of cv_results becomes a column.
results_cv = pd.DataFrame.from_dict(gridsearch.cv_results, orient="columns")

In [25]:
# Persist the cross-validation table so the search does not have to be re-run.
results_cv.to_csv(path_or_buf=r"../data/SVD-cv-nfact-regall.csv")

In [31]:
# Pick the estimator the grid search stored under the "rmse" key and train it
# on every rating in the training data (no held-out fold).
best_algo = gridsearch.best_estimator["rmse"]
full_train = data.build_full_trainset()

# Kept as the cell's last expression so the fitted estimator's repr is shown.
best_algo.fit(full_train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe9c1880390>

In [97]:
# Produce the submission file with the tuned model.
# NOTE(review): generatePredictions returns nothing that is used here, so it
# presumably fills `predictions` in place -- confirm against utility.py.
submission_template_path = "../data/sample_submission.csv"
submission_output_path = "../data/submission_bestOfSVDC-V1.csv"

predictions = loadData2df(submission_template_path)
generatePredictions(best_algo, predictions)
exportPredictions(submission_output_path, predictions)

TODO:

* ~~GridSearchCV~~

* build_full_trainset + algo.predict() -> kaggle

* test other algorithms

* ~~code export to .csv~~

* write intro (Netflix prize, Simon Fuchs, other applications, scikit-surprise (why reinvent the wheel?), short summary of content)

* important: in the report, show that we understood the different methods!

* compare speed sgd, 