In [1]:
#Imports
import numpy as np
import pandas as pd
import random
import plots
from matplotlib import pyplot as plt
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from surprise import BaselineOnly
from surprise import KNNBaseline
from surprise import SlopeOne
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

# One seed shared by NumPy's and the standard library's global generators.
my_seed = 200
np.random.seed(my_seed)
random.seed(my_seed)

In [2]:
#Helpers functions to go from the crowdai format to the surprise one

def get_users(line):
    """Return the user number from an id of the form ``"r<user>_c<item>"``.

    The id is split on ``"_"`` into exactly two parts; every ``"r"`` is removed
    from the first part, which is then converted with ``int``.

    Raises:
        ValueError: if the id does not split into exactly two parts, or the
            first part is not an integer once ``"r"`` is stripped.
    """
    user_part, _item_part = line.split("_")
    return int(user_part.replace("r", ""))

def get_items(line):
    """Return the item number from an id of the form ``"r<user>_c<item>"``.

    The id is split on ``"_"`` into exactly two parts; every ``"c"`` is removed
    from the second part, which is then converted with ``int``.

    Raises:
        ValueError: if the id does not split into exactly two parts, or the
            second part is not an integer once ``"c"`` is stripped.
    """
    _user_part, item_part = line.split("_")
    return int(item_part.replace("c", ""))

def to_surprise(data):
    """Convert a crowdAI-format frame into the (userID, itemID, rating) layout.

    Args:
        data: DataFrame with an ``'Id'`` column holding strings of the form
            ``"r<user>_c<item>"`` and a ``'Prediction'`` column with the rating.

    Returns:
        A new DataFrame with columns ``['userID', 'itemID', 'rating']`` (in that
        order) sharing the index of ``data``. The input frame is left untouched,
        so calling this twice on the same frame gives the same result.

    Raises:
        ValueError: if any ``'Id'`` value is not of the form ``"r<int>_c<int>"``.
    """
    # Vectorised parse of both numbers at once; column 0 is the user, column 1 the item.
    parsed = data['Id'].str.extract(r'^r(\d+)_c(\d+)$', expand=True)
    if parsed.isna().any().any():
        raise ValueError("every 'Id' must look like 'r<user>_c<item>'")
    # Build a fresh frame instead of adding columns to the caller's object.
    return pd.DataFrame({
        'userID': parsed[0].astype('int64'),
        'itemID': parsed[1].astype('int64'),
        'rating': data['Prediction'],
    })

In [3]:
#Load data train file
# Read the raw crowdAI csv and convert it straight to (userID, itemID, rating).
train = to_surprise(pd.read_csv('data_train.csv'))

In [4]:
#Split data into models training part and blender training part
# 80% of the rows (seeded sample) train the individual models ...
models = train.sample(frac=0.8, random_state=200)
# ... and every row that was not sampled goes to the blender.
blend = train.loc[~train.index.isin(models.index)]

In [5]:
#Load both datasets into surprise as datasets and trainsets objects
reader = Reader(rating_scale=(1, 5))
# Wrap each frame in a surprise Dataset, then build a trainset over all of its ratings.
models_surp, blend_surp = (
    Dataset.load_from_df(frame, reader) for frame in (models, blend)
)
models_surp_train, blend_surp_train = (
    dataset.build_full_trainset() for dataset in (models_surp, blend_surp)
)
#Load blend train set as a testset for models performance evaluation and blending process
blend_surp_test = blend_surp_train.build_testset()

We grid search the best hyperparameters for each model individually on the models training set. Each combination is evaluated with a 3-fold cross-validation procedure (the folds are the same every time, for reproducibility). We then pick the combination yielding the smallest average RMSE over the folds and refit the model on the whole models training data. Note that some models do not require tuning (global mean, user/item mean, Slope One) and are therefore fitted directly on the whole models training dataset.

Procedure for each algorithm to tune:
- Define the hyperparameter grid for the algorithm
- Run the grid search on the algorithm
- Extract the best combination of parameters
- Refit on the whole models training set

In [7]:
#Baseline
# Candidate settings: SGD-estimated baselines with reg = 10^-1, 10^-2, 10^-3.
grid_baseline = dict(
    bsl_options=dict(
        method=['sgd'],
        reg=[10 ** -power for power in range(1, 4)],
    ),
)
gs_baseline = GridSearchCV(
    BaselineOnly,
    grid_baseline,
    measures=['rmse'],
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_baseline.fit(models_surp)
print('Best Hyperparameters: ', gs_baseline.best_params['rmse'])
# Refit the best-RMSE configuration on the full models trainset.
algo_baseline = gs_baseline.best_estimator['rmse']
algo_baseline.fit(models_surp_train)

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Best Hyperparameters:  {'bsl_options': {'method': 'sgd', 'reg': 0.01}}
Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x11be3d9e8>

In [None]:
#SVD with baseline
# Grid over regularisation (10^-1..10^-3) and number of factors, with biased=True
# and a fixed random_state so the search is repeatable.
grid_SVDb = {'reg_all': [10**-i for i in range(1,4)], 'biased':[True], 'n_factors':[20,50,70,100,150,200], 'random_state':[200]}
gs_SVDb = GridSearchCV(SVD, grid_SVDb, measures=['rmse'], 
                       cv=KFold(n_splits=3, random_state=200, shuffle=False))
gs_SVDb.fit(models_surp)
print('Best Hyperparameters: ', gs_SVDb.best_params['rmse'])
# Use this cell's own search object. The original read `gs_SVD`, which is only
# created by the "SVD without baseline" cell further down: NameError on a fresh
# kernel, or silently the wrong (unbiased) estimator on a stale one.
algo_SVDb = gs_SVDb.best_estimator['rmse']
# Refit the selected configuration on the full models trainset.
algo_SVDb.fit(models_surp_train)

In [None]:
#SVD without baseline
# Same grid as the biased SVD search, but with biased=False.
grid_SVD = dict(
    reg_all=[10 ** -power for power in range(1, 4)],
    biased=[False],
    n_factors=[20, 50, 70, 100, 150, 200],
    random_state=[200],
)
gs_SVD = GridSearchCV(
    SVD,
    grid_SVD,
    measures=['rmse'],
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_SVD.fit(models_surp)
print('Best Hyperparameters: ', gs_SVD.best_params['rmse'])
# Refit the best-RMSE configuration on the full models trainset.
algo_SVD = gs_SVD.best_estimator['rmse']
algo_SVD.fit(models_surp_train)

In [None]:
#SVD++
# Single-point grid: only one combination is cross-validated for SVD++.
grid_SVDpp = dict(
    reg_all=[0.01],
    n_factors=[20],
    random_state=[200],
)
gs_SVDpp = GridSearchCV(
    SVDpp,
    grid_SVDpp,
    measures=['rmse'],
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_SVDpp.fit(models_surp)
print('Best Hyperparameters: ', gs_SVDpp.best_params['rmse'])
# Refit the selected configuration on the full models trainset.
algo_SVDpp = gs_SVDpp.best_estimator['rmse']
algo_SVDpp.fit(models_surp_train)

In [10]:
#Slope One
# No grid search here: fit directly on the full models trainset.
# fit() hands back the fitted algorithm object, so it can be bound in one step.
algo_slope_one = SlopeOne().fit(models_surp_train)
algo_slope_one

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x1180126d8>

In [11]:
#KNN user
# Three top-level entries: baseline options, neighbourhood size k, similarity options.
grid_knn_user = dict(
    bsl_options=dict(
        method=['sgd'],
        reg=[10 ** -power for power in range(1, 4)],
    ),
    k=[40, 80, 120],
    sim_options=dict(
        name=['pearson_baseline'],
        min_support=[1],
        user_based=[True],
    ),
)
gs_knn_user = GridSearchCV(
    KNNBaseline,
    grid_knn_user,
    measures=['rmse'],
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_knn_user.fit(models_surp)
print('Best Hyperparameters: ', gs_knn_user.best_params['rmse'])
# Refit the best-RMSE configuration on the full models trainset.
algo_knn_user = gs_knn_user.best_estimator['rmse']
algo_knn_user.fit(models_surp_train)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


KeyboardInterrupt: 

In [19]:
#KNN movie
# Same grid as the user-based KNN search, but with user_based=False.
grid_knn_movie = dict(
    bsl_options=dict(
        method=['sgd'],
        reg=[10 ** -power for power in range(1, 4)],
    ),
    k=[40, 80, 120],
    sim_options=dict(
        name=['pearson_baseline'],
        min_support=[1],
        user_based=[False],
    ),
)
gs_knn_movie = GridSearchCV(
    KNNBaseline,
    grid_knn_movie,
    measures=['rmse'],
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_knn_movie.fit(models_surp)
print('Best Hyperparameters: ', gs_knn_movie.best_params['rmse'])
# Refit the best-RMSE configuration on the full models trainset.
algo_knn_movie = gs_knn_movie.best_estimator['rmse']
algo_knn_movie.fit(models_surp_train)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Best Hyperparameters:  {'bsl_options': {'method': 'sgd', 'reg': 0.01}, 'k': 40, 'sim_options': {'name': 'pearson_baseline', 'min_support': 1, 'user_based': False}}
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x13dd09278>

Now that every algorithm has been fitted on the whole models train dataset we will evaluate their performance (RMSE) on the blend train dataset. This set is therefore also used as a validation set for individual models.

In [8]:
#Baseline
# Predict every rating of the blend testset, then report the RMSE.
predictions_baseline = algo_baseline.test(blend_surp_test)
val_rmse_baseline = accuracy.rmse(predictions_baseline, verbose=False)
print('RMSE on validation set: ', val_rmse_baseline)

RMSE on validation set:  1.0034616447261646


In [None]:
#SVD with baseline
# Predict every rating of the blend testset, then report the RMSE.
predictions_SVDb = algo_SVDb.test(blend_surp_test)
val_rmse_SVDb = accuracy.rmse(predictions_SVDb, verbose=False)
print('RMSE on validation set: ', val_rmse_SVDb)

In [None]:
#SVD without baseline
# Predict every rating of the blend testset, then report the RMSE.
predictions_SVD = algo_SVD.test(blend_surp_test)
val_rmse_SVD = accuracy.rmse(predictions_SVD, verbose=False)
print('RMSE on validation set: ', val_rmse_SVD)

In [None]:
#SVD++
# Predict every rating of the blend testset, then report the RMSE.
predictions_SVDpp = algo_SVDpp.test(blend_surp_test)
val_rmse_SVDpp = accuracy.rmse(predictions_SVDpp, verbose=False)
print('RMSE on validation set: ', val_rmse_SVDpp)

In [None]:
#Slope One
# Predict every rating of the blend testset, then report the RMSE.
predictions_slope_one = algo_slope_one.test(blend_surp_test)
val_rmse_slope_one = accuracy.rmse(predictions_slope_one, verbose=False)
print('RMSE on validation set: ', val_rmse_slope_one)

In [None]:
#KNN user
# Predict every rating of the blend testset, then report the RMSE.
predictions_knn_user = algo_knn_user.test(blend_surp_test)
val_rmse_knn_user = accuracy.rmse(predictions_knn_user, verbose=False)
print('RMSE on validation set: ', val_rmse_knn_user)

In [20]:
#KNN movie
# Predict every rating of the blend testset, then report the RMSE.
predictions_knn_movie = algo_knn_movie.test(blend_surp_test)
val_rmse_knn_movie = accuracy.rmse(predictions_knn_movie, verbose=False)
print('RMSE on validation set: ', val_rmse_knn_movie)

RMSE on validation set:  0.9916991795856803


In [None]:
#Recover ids and estimations for each algorithm
# User and item ids are read once, from the baseline predictions.
uids = [p.uid for p in predictions_baseline]
mids = [p.iid for p in predictions_baseline]

# One list of estimated ratings per model, in the order of that model's prediction list.
est_baseline = [p.est for p in predictions_baseline]
est_SVDb = [p.est for p in predictions_SVDb]
est_SVD = [p.est for p in predictions_SVD]
est_SVDpp = [p.est for p in predictions_SVDpp]
est_slope_one = [p.est for p in predictions_slope_one]
est_knn_user = [p.est for p in predictions_knn_user]
est_knn_movie = [p.est for p in predictions_knn_movie]

In [None]:
#Build dataframe containing ids and estimations for each algorithm

We will now use the blend train set to train our model-blending algorithm. We model the estimated rating as a linear combination of the ratings estimated by each model, and use ridge regression to compute the weights of this combination. The best ridge hyperparameter is picked with a 3-fold CV procedure (objective function = average RMSE) run on the blend train set.