# Test for Recommender System using SciKit-Surprise

### Import libraries

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt
import time
import datetime
%load_ext autoreload
%autoreload 2

In [2]:
# import scikit-surprise stuff
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate, GridSearchCV

In [3]:
# import pandas for DataFrame
import pandas as pd

In [4]:
# import custom stuff
from utility import *

### Import train and test set

In [5]:
# Load the training ratings into a DataFrame; a later cell converts it to the
# scikit-surprise format.
# loadData2df has to be *called* here: binding the bare function to data_df
# would hand a function object (not ratings) to every downstream cell.
datafilepath = "../data/data_train.csv"
data_df = loadData2df(datafilepath)

In [7]:
# Wrap the ratings DataFrame with the project helper for scikit-surprise.
# Presumably this returns a surprise Dataset: build_full_trainset() is called
# on the result later and it is passed to GridSearchCV.fit() — TODO confirm.
data_ds = loadData2ds(data_df)

In [8]:
# The sample submission is read with the same helper as the training data.
# Presumably it lists the (user, item) pairs that need a predicted rating —
# TODO confirm against generatePredictions in utility.
predinfilepath = "../data/sample_submission.csv"
pred_df = loadData2df(predinfilepath)

In [9]:
# Trainset built from data_ds via build_full_trainset(); it is the input of
# the final algo.fit() call further down.
full_train_ds = data_ds.build_full_trainset()

### Select and train algorithm

#### Preselected algorithm

In [10]:
#algo = SVD(n_factors=20, n_epochs=10, init_mean=0, init_std_dev=0.1, lr_all=0.007, reg_all=0.02, verbose = True)

In [11]:
#algo.fit(full_train_ds)

#### Algorithm from gridsearch

In [12]:
# Hyper-parameter search space for SVD.
# The four regularisation terms are each swept over the same five values
# (5**4 = 625 combinations); every other parameter is pinned to one value.
# Alternatives that were previously commented out, kept here for reference:
#   n_factors in range(20, 100, 5); reg_all = 0.02;
#   reg_bu = reg_bi = 0.02; reg_pu = 0.04; reg_qi = 0.03
grid = {
    "n_factors": [25],
    "lr_all": [0.007],
}
# Each key gets its own list object; insertion order matches the original dict.
grid.update(
    {
        reg_name: [0.01, 0.02, 0.03, 0.04, 0.05]
        for reg_name in ("reg_bu", "reg_bi", "reg_pu", "reg_qi")
    }
)
grid.update(n_epochs=[20], biased=[True])

# Cross-validated grid search over `grid` for SVD, scored by RMSE with cv=5.
# n_jobs=-1 hands the degree of parallelism to joblib; joblib_verbose=2 turns
# on its progress log. refit=True is requested so that best_estimator can be
# used afterwards (see the surprise docs for the exact refit semantics).
gridsearch = GridSearchCV(
    algo_class=SVD,
    param_grid=grid,
    measures=['RMSE'],
    cv=5,
    n_jobs=-1,
    refit=True,
    joblib_verbose=2,
)

In [13]:
# Runs the full grid search on data_ds.
# WARNING: very expensive — the captured log below shows 3125 tasks taking
# about 1603 minutes (~27 h) on 4 concurrent workers. Avoid re-running this
# cell casually.
gridsearch.fit(data_ds)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 14.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 67.9min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 157.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 280.9min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 440.6min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 645.6min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 901.2min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 1345.8min
[Parallel(n_jobs=-1)]: Done 3125 out of 3125 | elapsed: 1603.2min finished


In [14]:
# Best cross-validated RMSE on the first line, the parameter set that
# achieved it on the second.
print(gridsearch.best_score['rmse'], gridsearch.best_params['rmse'], sep="\n")

0.9970629058989271
{'n_factors': 25, 'lr_all': 0.007, 'reg_bu': 0.05, 'reg_bi': 0.02, 'reg_pu': 0.05, 'reg_qi': 0.05, 'n_epochs': 20, 'biased': True}


In [16]:
# Tabulate the grid-search results (gridsearch.cv_results) as a DataFrame.
results_cv = pd.DataFrame.from_dict(gridsearch.cv_results)

# Best parameter set as a one-row DataFrame, needed by the export cell below.
# best_params["rmse"] maps parameter names to scalars, so
# pd.DataFrame.from_dict() on it raises "If using all scalar values, you must
# pass an index"; wrapping the dict in a list gives one row with one column
# per parameter.
bestparam_cv = pd.DataFrame([gridsearch.best_params["rmse"]])

In [17]:
# Destination for the CV results. addDateAndTime (project helper) returns the
# path with a date/time stamp inserted before the extension, as the printed
# value shows (e.g. cv_2018-12-18_0859.csv).
results_cv_path = addDateAndTime("../data/cvresults/cv.csv")
print(results_cv_path)

../data/cvresults/cv_2018-12-18_0859.csv


In [18]:
# Persist the cross-validation table; mode="w" overwrites an existing file
# at that path.
results_cv.to_csv(results_cv_path, mode="w")

In [None]:
# Destination for the best-parameter export, time-stamped by the same helper.
# NOTE: this rebinds results_cv_path, so the path of the full CV-results file
# is no longer held in that variable after this cell.
results_cv_path = addDateAndTime("../data/cvresults/cv_bestparams.csv")
print(results_cv_path)

In [None]:
# Write the best-parameter table to the path computed in the previous cell.
# bestparam_cv must already exist in the kernel when this cell runs.
bestparam_cv.to_csv(results_cv_path, mode="w")

In [19]:
# Take the estimator configured with the best RMSE parameters and fit it on
# the full training set.
# NOTE(review): GridSearchCV was created with refit=True, so best_estimator
# may already be fitted on the whole dataset — confirm whether this explicit
# fit is still needed.
algo = gridsearch.best_estimator["rmse"]
algo.fit(full_train_ds)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff7355a44a8>

### Make predictions

In [20]:
# Project helper; the return value is not captured, so it presumably writes
# the predicted ratings into pred_df in place (pred_df is exported in the
# next section) — TODO confirm in utility.
generatePredictions(algo,pred_df)

### Export predictions and algo parameters

In [21]:
# Export the fitted algorithm's parameters and the predictions via the project
# helpers. add_date/add_time presumably time-stamp the output file names, as
# addDateAndTime does for the CV results — TODO confirm in utility.
exportAlgoParameters(algo,add_date=True,add_time=True)
exportPredictions(pred_df,add_date=True,add_time=True)

TODO:

* vary different reg parameters

* ~~GridSearchCV~~

* build_full_trainset + algo.predict() -> kaggle

* test other algorithms

* ~~code export to .csv~~

* write intro (Netflix prize, Simon Funk, other applications, scikit-surprise (why reinvent the wheel?), short summary of content)

* important: in report show that we did understand the different methods!

* compare speed sgd, 

In [28]:
# Fallback estimate of the fitted algorithm (presumably the global mean
# rating). The same value shows up below as `est` when both the user id and
# the item id lie outside the training data.
algo.default_prediction()

3.8572805008190647

In [30]:
# Column-wise maxima of the training data: the largest user and item ids,
# used to pick out-of-range ids for the probes below.
data_df.max()

userId    10000.0
itemId     1000.0
rating        5.0
dtype: float64

In [35]:
# Probe: user id 10001 is above the largest userId in data_df (10000), i.e. a
# user not in the training data, combined with an in-range item id.
algo.predict(10001,1)

Prediction(uid=10001, iid=1, r_ui=None, est=3.257981945109676, details={'was_impossible': False})

In [36]:
# Probe: a second out-of-range user with the same item. The estimate equals
# the one for user 10001, so it does not depend on the unknown user id.
algo.predict(10002,1)

Prediction(uid=10002, iid=1, r_ui=None, est=3.257981945109676, details={'was_impossible': False})

In [37]:
# Probe: user id and item id both out of range (largest itemId is 1000). The
# estimate equals algo.default_prediction() shown above.
algo.predict(10002,1001)

Prediction(uid=10002, iid=1001, r_ui=None, est=3.8572805008190647, details={'was_impossible': False})

In [38]:
# Probe: in-range user id with an out-of-range item id (1001 > 1000).
algo.predict(1231,1001)

Prediction(uid=1231, iid=1001, r_ui=None, est=3.869582683860822, details={'was_impossible': False})

In [39]:
# Probe: same user with another out-of-range item. The estimate equals the one
# for item 1001, so it does not depend on the unknown item id.
algo.predict(1231,1002)

Prediction(uid=1231, iid=1002, r_ui=None, est=3.869582683860822, details={'was_impossible': False})