# Test for Recommender System using SciKit-Surprise

### import libraries

In [1]:
# Useful starting lines
%matplotlib inline
import random as rd
import numpy as np
import time
import datetime
import matplotlib.pyplot as plt  # `plt` is the conventional alias for the pyplot module, not the top-level package
%load_ext autoreload
%autoreload 2

In [76]:
# import scikit-surprise stuff
from surprise import SVD
from surprise import BaselineOnly
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate, GridSearchCV

In [3]:
# import pandas for DataFrame
import pandas as pd

In [4]:
# import custom helper functions
from utility import *

In [5]:
# Seed both random streams: the stdlib `random` module and NumPy's global
# generator are independent, so rd.seed() alone leaves NumPy unseeded.
# NOTE(review): Surprise's FAQ recommends seeding both for reproducible
# experiments; worker processes started by n_jobs=-1 may still use their own
# unseeded state — confirm if bit-exact reproducibility is required.
rd.seed(10)
np.random.seed(10)

### Import train and test set

In [6]:
# Load the training ratings CSV so it can be converted to the format
# scikit-surprise expects. loadData2df is a project helper (presumably from
# `utility`, returning a DataFrame — confirm).
datafilepath = "../data/data_train.csv"
data_full_df = loadData2df(datafilepath)

In [7]:
#data_df = filterDf(data_full_df,20,50)

In [8]:
# Use the unfiltered ratings; the optional filterDf(...) step in the cell above is disabled.
data_df = data_full_df

In [9]:
# Convert the ratings frame into the dataset object that is later passed to
# GridSearchCV.fit and build_full_trainset (presumably a surprise Dataset — confirm).
data_ds = loadData2ds(data_df)

In [10]:
# Load the sample submission file; pred_df is later handed to
# generatePredictions and exportPredictions.
predinfilepath = "../data/sample_submission.csv"
pred_df = loadData2df(predinfilepath)

In [11]:
# Trainset built from the whole dataset; used for the final fits before predicting.
full_train_ds = data_ds.build_full_trainset()

### Select and train algorithm

#### Preselected algorithm

In [12]:
#algo = SVD(n_factors=20, n_epochs=10, init_mean=0, init_std_dev=0.1, lr_all=0.007, reg_all=0.02, verbose = True)

In [13]:
#algo.fit(full_train_ds)

#### Algorithm from gridsearch

In [14]:
# Candidate SVD hyper-parameters for a regularisation sweep: the four
# per-term regularisation strengths each range over the same five values,
# all other settings are held at a single value.
# Alternatives left disabled in the original cell: n_factors over
# range(20, 100, 5); reg_all=[0.02]; reg_bu=[0.02], reg_bi=[0.02];
# reg_pu=[0.04], reg_qi=[0.03].
gridSVD = dict(
    n_factors=[25],
    lr_all=[0.007],
    reg_bu=[0.01, 0.02, 0.03, 0.04, 0.05],
    reg_bi=[0.01, 0.02, 0.03, 0.04, 0.05],
    reg_pu=[0.01, 0.02, 0.03, 0.04, 0.05],
    reg_qi=[0.01, 0.02, 0.03, 0.04, 0.05],
    n_epochs=[20],
    biased=[True],
)

In [15]:
# A single SVD configuration with scalar values (not candidate lists);
# presumably the winner of an earlier search, kept here for reference.
best_params = dict(
    n_factors=25,
    lr_all=0.007,
    reg_bu=0.05,
    reg_bi=0.02,
    reg_pu=0.05,
    reg_qi=0.05,
    n_epochs=20,
    biased=True,
)

In [16]:
# Rebinds gridSVD to a single-candidate grid (one value per parameter), so the
# search below evaluates exactly one configuration, here with 50 epochs.
gridSVD = dict(
    n_factors=[25],
    lr_all=[0.007],
    reg_bu=[0.05],
    reg_bi=[0.02],
    reg_pu=[0.05],
    reg_qi=[0.05],
    n_epochs=[50],
    biased=[True],
)

In [17]:
# 5-fold cross-validated grid search over gridSVD, scored by RMSE, with
# n_jobs=-1 for parallel evaluation.
gridsearch = GridSearchCV(
    algo_class=SVD,
    param_grid=gridSVD,
    measures=['RMSE'],
    cv=5,
    n_jobs=-1,
    refit=True,
    joblib_verbose=2,
)

In [18]:
# Run the search. With the single-candidate grid this is 5 fits (one per fold);
# the recorded run took about 4.6 minutes.
gridsearch.fit(data_ds)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min finished


In [19]:
# Report the best cross-validated RMSE and the parameter set that produced it.
print(gridsearch.best_score['rmse'])
print(gridsearch.best_params['rmse'])

1.009855091361983
{'n_factors': 25, 'lr_all': 0.007, 'reg_bu': 0.05, 'reg_bi': 0.02, 'reg_pu': 0.05, 'reg_qi': 0.05, 'n_epochs': 50, 'biased': True}


In [23]:
# Scratch check: the best n_epochs capped at 200. The value is only displayed,
# not stored in a variable.
min(gridsearch.best_params["rmse"]["n_epochs"],200)

50

In [None]:
# Tabulate the per-candidate cross-validation results of `gridsearch`.
results_cv = pd.DataFrame.from_dict(gridsearch.cv_results)
#bestparam_cv = pd.DataFrame.from_dict(gridsearch.best_params["rmse"])

In [None]:
# Save the CV table; addDateAndTime derives the final file name from the base path.
results_cv_path = addDateAndTime("../data/cvresults/cv.csv")
results_cv.to_csv(results_cv_path, mode="w")

In [None]:
#results_cv_path = "../data/cvresults/cv_bestparams.csv"
#results_cv_path = addDateAndTime(results_cv_path)
#bestparam_cv.to_csv(path_or_buf=results_cv_path,mode="w")

### Make predictions

In [None]:
# Take the estimator configured with the best RMSE parameters and train it on
# every available rating.
# NOTE(review): GridSearchCV was created with refit=True, which Surprise
# documents as already refitting the best estimator on the whole dataset, so
# this explicit fit may be redundant — confirm.
algo = gridsearch.best_estimator["rmse"]
algo.fit(full_train_ds)

In [None]:
# Produce predictions for the submission entries with the fitted model.
# The return value is discarded, so pred_df is presumably updated in place —
# confirm in utility.
generatePredictions(algo,pred_df)

### Export predictions and algo parameters

In [None]:
# Export the model's parameters and the predictions; both calls ask the
# helpers to add the date and time (add_date/add_time).
exportAlgoParameters(algo,add_date=True,add_time=True)
exportPredictions(pred_df,add_date=True,add_time=True)

TODO:

* vary different reg parameters

* ~~GridSearchCV~~

* build_full_trainset + algo.predict() -> kaggle

* test other algorithms

* ~~code export to .csv~~

* write intro (Netflix prize, Simon Fuchs, other applications, scikit-surprise (why reinvent the wheel?), short summary of content)

* important: in report show that we did understand the different methods!

* compare speed sgd, 

##### n_factor search

In [24]:
# Grid for the n_factors search: only the number of latent factors varies,
# every other SVD setting is pinned to a single value.
g_nfact = dict(
    n_factors=[1, 2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    lr_all=[0.007],
    reg_bu=[0.05],
    reg_bi=[0.02],
    reg_pu=[0.05],
    reg_qi=[0.05],
    n_epochs=[20],
    biased=[True],
)

In [25]:
# 5-fold cross-validated search over the n_factors grid, scored by RMSE.
gs_nfact = GridSearchCV(
    algo_class=SVD,
    param_grid=g_nfact,
    measures=['RMSE'],
    cv=5,
    n_jobs=-1,
    refit=True,
    joblib_verbose=2,
)

In [26]:
# Run the n_factors search: 12 candidates x 5 folds = 60 fits
# (about 26 minutes in the recorded run).
gs_nfact.fit(data_ds)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 26.4min finished


In [27]:
# Report the best cross-validated RMSE of the n_factors search and its parameters.
print(gs_nfact.best_score['rmse'])
print(gs_nfact.best_params['rmse'])

0.9972140858254267
{'n_factors': 50, 'lr_all': 0.007, 'reg_bu': 0.05, 'reg_bi': 0.02, 'reg_pu': 0.05, 'reg_qi': 0.05, 'n_epochs': 20, 'biased': True}


In [28]:
# Tabulate the per-candidate results of the n_factors search (rebinds results_cv).
results_cv = pd.DataFrame.from_dict(gs_nfact.cv_results)
#bestparam_cv = pd.DataFrame.from_dict(gridsearch.best_params["rmse"])

In [29]:
# Save the n_factors CV table; addDateAndTime derives the final file name.
results_cv_path = addDateAndTime("../data/cvresults/cv-n_factors.csv")
results_cv.to_csv(results_cv_path, mode="w")

##### n_epochs search

In [30]:
# Grid for the n_epochs search: reuse the best n_factors found by gs_nfact and
# vary only the number of training epochs.
g_nep = dict(
    n_factors=[gs_nfact.best_params["rmse"]["n_factors"]],
    lr_all=[0.007],
    reg_bu=[0.05],
    reg_bi=[0.02],
    reg_pu=[0.05],
    reg_qi=[0.05],
    n_epochs=[10, 20, 50, 100, 200, 500, 1000],
    biased=[True],
)

In [31]:
# 5-fold cross-validated search over the n_epochs grid, scored by RMSE.
gs_nep = GridSearchCV(
    algo_class=SVD,
    param_grid=g_nep,
    measures=['RMSE'],
    cv=5,
    n_jobs=-1,
    refit=True,
    joblib_verbose=2,
)

In [32]:
# Run the n_epochs search: 7 candidates x 5 folds = 35 fits. This is the
# slowest search in the notebook (about 169 minutes in the recorded run).
gs_nep.fit(data_ds)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed: 169.4min finished


In [33]:
# Report the best cross-validated RMSE of the n_epochs search and its parameters.
print(gs_nep.best_score['rmse'])
print(gs_nep.best_params['rmse'])

0.9975508219697622
{'n_factors': 50, 'lr_all': 0.007, 'reg_bu': 0.05, 'reg_bi': 0.02, 'reg_pu': 0.05, 'reg_qi': 0.05, 'n_epochs': 20, 'biased': True}


In [34]:
# Tabulate the per-candidate results of the n_epochs search (rebinds results_cv).
results_cv = pd.DataFrame.from_dict(gs_nep.cv_results)
#bestparam_cv = pd.DataFrame.from_dict(gridsearch.best_params["rmse"])

In [35]:
# Save the n_epochs CV table; addDateAndTime derives the final file name.
results_cv_path = addDateAndTime("../data/cvresults/cv-n_epochs.csv")
results_cv.to_csv(results_cv_path, mode="w")

##### reg_pu and reg_qi search

In [36]:
# Grid for the reg_pu / reg_qi search: the factor regularisation terms span
# 0.01 to 5, n_factors comes from gs_nfact, and n_epochs is the best value
# from gs_nep capped at 100.
g_reg = dict(
    n_factors=[gs_nfact.best_params["rmse"]["n_factors"]],
    lr_all=[0.007],
    reg_bu=[0.05],
    reg_bi=[0.02],
    reg_pu=[0.01, 0.05, 0.1, 0.5, 1, 5],
    reg_qi=[0.01, 0.05, 0.1, 0.5, 1, 5],
    n_epochs=[min(gs_nep.best_params["rmse"]["n_epochs"], 100)],
    biased=[True],
)

In [37]:
# 5-fold cross-validated search over the reg_pu / reg_qi grid, scored by RMSE.
gs_reg = GridSearchCV(
    algo_class=SVD,
    param_grid=g_reg,
    measures=['RMSE'],
    cv=5,
    n_jobs=-1,
    refit=True,
    joblib_verbose=2,
)

In [38]:
# Run the regularisation search: 6 x 6 candidates x 5 folds = 180 fits
# (about 98 minutes in the recorded run).
gs_reg.fit(data_ds)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 84.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 97.9min finished


In [39]:
# Report the best cross-validated RMSE of the regularisation search and its parameters.
print(gs_reg.best_score['rmse'])
print(gs_reg.best_params['rmse'])

0.9948947915686087
{'n_factors': 50, 'lr_all': 0.007, 'reg_bu': 0.05, 'reg_bi': 0.02, 'reg_pu': 0.5, 'reg_qi': 0.01, 'n_epochs': 20, 'biased': True}


In [40]:
# Tabulate the per-candidate results of the regularisation search (rebinds results_cv).
results_cv = pd.DataFrame.from_dict(gs_reg.cv_results)
#bestparam_cv = pd.DataFrame.from_dict(gridsearch.best_params["rmse"])

In [41]:
# Save the reg_pu / reg_qi CV table; addDateAndTime derives the final file name.
results_cv_path = addDateAndTime("../data/cvresults/cv-reg-pu-qi.csv")
results_cv.to_csv(results_cv_path, mode="w")

In [49]:
# Rebind `algo` to the estimator configured with the best parameters of the
# regularisation search.
algo = gs_reg.best_estimator["rmse"]

In [50]:
# Train the selected estimator on every available rating before predicting.
algo.fit(full_train_ds)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f219fd8b8d0>

In [52]:
# Produce predictions for the submission entries with the gs_reg model.
# The return value is discarded, so pred_df is presumably updated in place —
# confirm in utility.
generatePredictions(algo,pred_df)

In [53]:
# Export the gs_reg model's parameters and its predictions; both calls ask the
# helpers to add the date and time (add_date/add_time).
exportAlgoParameters(algo,add_date=True,add_time=True)
exportPredictions(pred_df,add_date=True,add_time=True)

##### n_epochs search (lr_all=0.01, reg_all=0.1)

In [56]:
# Second n_epochs grid: 10 factors, a higher learning rate (0.01) and a single
# shared regularisation strength (reg_all=0.1) instead of per-term values.
# Alternatives left disabled in the original cell: reg_bu=[0.05], reg_bi=[0.02],
# and reg_pu / reg_qi over [0.01, 0.05, 0.1, 0.5, 1, 5].
g_nepch2 = dict(
    n_factors=[10],
    lr_all=[0.01],
    reg_all=[0.1],
    n_epochs=[5, 10, 15, 20, 50, 100, 150, 200],
    biased=[True],
)

In [57]:
# 5-fold cross-validated search over the second n_epochs grid, scored by RMSE.
gs_nepch2 = GridSearchCV(
    algo_class=SVD,
    param_grid=g_nepch2,
    measures=['RMSE'],
    cv=5,
    n_jobs=-1,
    refit=True,
    joblib_verbose=2,
)

In [58]:
# Run the second n_epochs search: 8 candidates x 5 folds = 40 fits
# (about 37 minutes in the recorded run).
gs_nepch2.fit(data_ds)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 24.4min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 37.1min finished


In [64]:
# Report the best cross-validated RMSE of the second n_epochs search and its parameters.
print(gs_nepch2.best_score['rmse'])
print(gs_nepch2.best_params['rmse'])

0.9942575327315921
{'n_factors': 10, 'lr_all': 0.01, 'reg_all': 0.1, 'n_epochs': 150, 'biased': True}


In [65]:
# Tabulate the per-candidate results of the second n_epochs search (rebinds results_cv).
results_cv = pd.DataFrame.from_dict(gs_nepch2.cv_results)
#bestparam_cv = pd.DataFrame.from_dict(gridsearch.best_params["rmse"])

In [66]:
# Save the second n_epochs CV table; addDateAndTime derives the final file name.
results_cv_path = addDateAndTime("../data/cvresults/cv-gs_nepch2.csv")
results_cv.to_csv(results_cv_path, mode="w")

In [67]:
# Display the path the CV results were just written to.
results_cv_path

'../data/cvresults/cv-gs_nepch2_2018-12-19_1753.csv'

In [68]:
# Estimator configured with the best parameters of the second n_epochs search.
best_algo = gs_nepch2.best_estimator["rmse"]

In [69]:
# Train the selected estimator on every available rating before predicting.
best_algo.fit(full_train_ds)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f21b1c95eb8>

In [70]:
# Predict with the model that was just fitted (best_algo, from gs_nepch2).
# The original passed `algo`, which still refers to the earlier gs_reg
# estimator, so the exported submission would not reflect this search.
generatePredictions(best_algo,pred_df)

In [72]:
# Export the predictions held in pred_df to an explicitly named submission
# file, asking the helper to add the date and time.
exportPredictions(pred_df,outfilepath="../data/submissions/SVD-jules.csv",add_date=True,add_time=True)

#### Baseline

In [83]:
# Grid for the BaselineOnly search: ALS with 10 epochs, varying the two
# regularisation strengths reg_u and reg_i over the same six values.
g_bl = dict(
    bsl_options=dict(
        method=['als'],
        reg_u=[1, 2, 5, 10, 15, 20],
        reg_i=[1, 2, 5, 10, 15, 20],
        n_epochs=[10],
    )
)

In [84]:
# 5-fold cross-validated search over the baseline grid, scored by RMSE.
gs_bl = GridSearchCV(
    algo_class=BaselineOnly,
    param_grid=g_bl,
    measures=['RMSE'],
    cv=5,
    n_jobs=-1,
    refit=True,
    joblib_verbose=2,
)

In [85]:
# Run the baseline search: 6 x 6 candidates x 5 folds = 180 fits
# (about 62 minutes in the recorded run).
gs_bl.fit(data_ds)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 51.7min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 61.6min finished


Estimating biases using als...


In [86]:
# Report the best cross-validated RMSE of the baseline search and its parameters.
print(gs_bl.best_score['rmse'])
print(gs_bl.best_params['rmse'])

0.999267991510688
{'bsl_options': {'method': 'als', 'reg_u': 15, 'reg_i': 1, 'n_epochs': 10}}


In [87]:
# Tabulate the per-candidate results of the baseline search (rebinds results_cv).
results_cv = pd.DataFrame.from_dict(gs_bl.cv_results)
#bestparam_cv = pd.DataFrame.from_dict(gridsearch.best_params["rmse"])

In [88]:
# Save the baseline CV table; addDateAndTime derives the final file name.
results_cv_path = addDateAndTime("../data/cvresults/cv-bl.csv")
results_cv.to_csv(results_cv_path, mode="w")

In [89]:
# Display the path the CV results were just written to.
results_cv_path

'../data/cvresults/cv-bl_2018-12-20_0116.csv'

In [90]:
# Rebind best_algo to the BaselineOnly estimator configured with the best
# parameters of the baseline search.
best_algo = gs_bl.best_estimator["rmse"]

In [91]:
# Train the selected baseline estimator on every available rating before predicting.
best_algo.fit(full_train_ds)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f21b1c95eb8>

In [92]:
# Predict with the BaselineOnly model that was just fitted (best_algo, from gs_bl).
# The original passed `algo`, the SVD estimator taken from gs_reg, so the
# baseline submission would have contained SVD predictions instead.
generatePredictions(best_algo,pred_df)

In [93]:
# Export the predictions held in pred_df to the baseline submission file,
# asking the helper to add the date and time.
exportPredictions(pred_df,outfilepath="../data/submissions/bl.csv",add_date=True,add_time=True)

In [95]:
# Display the most recent cross-validation results table.
results_cv

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_bsl_options
0,0.998224,1.0006,1.001413,1.000364,0.999318,0.999983,0.001105,30,0.986609,0.066055,3.471516,0.058226,"{'bsl_options': {'method': 'als', 'reg_u': 1, ...","{'method': 'als', 'reg_u': 1, 'reg_i': 1, 'n_e..."
1,0.998224,1.000597,1.001417,1.000367,0.999314,0.999984,0.001107,31,1.033023,0.010019,3.389617,0.069689,"{'bsl_options': {'method': 'als', 'reg_u': 1, ...","{'method': 'als', 'reg_u': 1, 'reg_i': 2, 'n_e..."
2,0.998232,1.000598,1.001435,1.000387,0.999314,0.999993,0.00111,32,0.950849,0.123102,3.453298,0.086903,"{'bsl_options': {'method': 'als', 'reg_u': 1, ...","{'method': 'als', 'reg_u': 1, 'reg_i': 5, 'n_e..."
3,0.998266,1.00062,1.001481,1.000438,0.999334,1.000028,0.001115,34,0.794171,0.232949,2.51353,0.788055,"{'bsl_options': {'method': 'als', 'reg_u': 1, ...","{'method': 'als', 'reg_u': 1, 'reg_i': 10, 'n_..."
4,0.998318,1.00066,1.001542,1.000504,0.999373,1.000079,0.001119,35,0.554456,0.004899,1.782659,0.057791,"{'bsl_options': {'method': 'als', 'reg_u': 1, ...","{'method': 'als', 'reg_u': 1, 'reg_i': 15, 'n_..."
5,0.998381,1.000714,1.001613,1.00058,0.999426,1.000143,0.001122,36,0.570167,0.02261,1.898418,0.11706,"{'bsl_options': {'method': 'als', 'reg_u': 1, ...","{'method': 'als', 'reg_u': 1, 'reg_i': 20, 'n_..."
6,0.998082,1.000462,1.001295,1.000249,0.99918,0.999854,0.001113,25,0.585266,0.022362,1.937198,0.211826,"{'bsl_options': {'method': 'als', 'reg_u': 2, ...","{'method': 'als', 'reg_u': 2, 'reg_i': 1, 'n_e..."
7,0.998082,1.00046,1.0013,1.000253,0.999177,0.999854,0.001115,26,0.580255,0.036663,1.762989,0.047871,"{'bsl_options': {'method': 'als', 'reg_u': 2, ...","{'method': 'als', 'reg_u': 2, 'reg_i': 2, 'n_e..."
8,0.998091,1.000461,1.001319,1.000274,0.999177,0.999865,0.001119,27,0.577056,0.035606,1.997072,0.269277,"{'bsl_options': {'method': 'als', 'reg_u': 2, ...","{'method': 'als', 'reg_u': 2, 'reg_i': 5, 'n_e..."
9,0.998126,1.000484,1.001367,1.000327,0.999199,0.999901,0.001124,28,0.561497,0.014636,1.744278,0.054005,"{'bsl_options': {'method': 'als', 'reg_u': 2, ...","{'method': 'als', 'reg_u': 2, 'reg_i': 10, 'n_..."
