# Baselines

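This notebook tunes and compares the baseline rating-prediction models from the Surprise library (SVD, SVD++, NMF, and the KNN variants). Each model is tuned with a cross-validated grid search, and the best RMSE and its hyperparameters are reported.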


In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.linear_model import ElasticNet
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from scipy.sparse import csr_matrix
from tqdm import tqdm

from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise import (
    Reader,
    Dataset,
    SVD,
    SVDpp,
    NMF,
    accuracy,
    KNNBasic,
    KNNWithMeans,
    KNNWithZScore,
    KNNBaseline,
)

## Paths

In [2]:
# Directory holding the input CSV files, relative to this notebook's location.
data_root = Path("..", "data")

## Data

In [3]:
# Read the training ratings from the data directory, then display the frame.
train_df = pd.read_csv(data_root.joinpath("train.csv"))
train_df

Unnamed: 0,book_id,user_id,rating
0,7260,20145,3.5
1,243238,85182,4.0
2,9135,45973,1.0
3,18671,63554,3.0
4,243293,81002,5.0
...,...,...,...
100518,15374,69658,2.0
100519,11063,69658,2.5
100520,18444,29981,1.0
100521,5917,38009,1.0


In [4]:
# Read the test pairs and discard the "id" column in the same expression,
# so the cell yields the same frame however many times it is re-run.
test_df = pd.read_csv(data_root / "test.csv").drop(columns=["id"])
test_df

Unnamed: 0,book_id,user_id
0,3786,40484
1,1985,47039
2,2290,60111
3,118657,64447
4,1560,2953
...,...,...
29362,2802,12312
29363,53552,25725
29364,4065,77178
29365,1290,23201


In [5]:
# Build the Surprise dataset from the training ratings.
# The rating scale is taken from the observed min/max of the "rating" column.
rating_scale = (train_df['rating'].min(), train_df['rating'].max())
reader = Reader(rating_scale=rating_scale)

# Columns are passed in (user, item, rating) order, as load_from_df expects.
data = Dataset.load_from_df(train_df[['user_id', 'book_id', 'rating']], reader)

# Hold out 20% of the ratings for validation. The seed is fixed so that the
# split is identical on every Restart & Run All; without it the split (and any
# metric computed on it) changes from run to run.
train_set, val_set = train_test_split(data, test_size=0.2, random_state=42)

## SVD

In [7]:
# Grid search over SVD hyperparameters, scored on RMSE and MAE.
# 6 * 3 * 3 * 3 = 162 combinations; the captured log shows 810 fits in total.
param_grid = {
    "n_factors": [1, 2, 3, 5, 7, 10],
    "n_epochs": [80, 90, 100],
    "lr_all": [0.0025, 0.0050, 0.0075],
    "reg_all": [0.08, 0.10, 0.12],
}

# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, so on a fresh checkout the save below would fail
# only after the whole search had already been run.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

gs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse", "mae"],
    n_jobs=-1,  # run fits in parallel on all available cores
    joblib_verbose=10,
)
gs.fit(data)

# Persist the full per-combination results table for later comparison.
svd_results_df = pd.DataFrame.from_dict(gs.cv_results)
svd_results_df.to_csv(results_dir / "svd_results.csv")

# Best cross-validated RMSE and the hyperparameters that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed:  

0.8901022687174465
{'n_factors': 1, 'n_epochs': 80, 'lr_all': 0.005, 'reg_all': 0.1}


[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:  1.4min finished


## SVD++

In [10]:
# Grid search over SVD++ hyperparameters, scored on RMSE and MAE.
# 6 * 3 * 3 * 3 = 162 combinations; the captured log shows 810 fits in total.
param_grid = {
    "n_factors": [1, 2, 3, 5, 7, 10],
    "n_epochs": [80, 90, 100],
    "lr_all": [0.0025, 0.0050, 0.0075],
    "reg_all": [0.08, 0.10, 0.12],
}

# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, so on a fresh checkout the save below would fail
# only after the (roughly 16 minute) search had already been run.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

gs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=["rmse", "mae"],
    n_jobs=-1,  # run fits in parallel on all available cores
    joblib_verbose=10,
)
gs.fit(data)

# Persist the full per-combination results table for later comparison.
svd_pp_results_df = pd.DataFrame.from_dict(gs.cv_results)
svd_pp_results_df.to_csv(results_dir / "svd++_results.csv")

# Best cross-validated RMSE and the hyperparameters that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   24.2s
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:   53.3s
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed:  

0.89100188625215
{'n_factors': 1, 'n_epochs': 80, 'lr_all': 0.005, 'reg_all': 0.1}


[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed: 15.8min finished


## NMF

In [20]:
# Grid search over NMF hyperparameters (with and without bias terms),
# scored on RMSE and MAE.
# 5 * 3 * 2 = 30 combinations; the captured log shows 150 fits in total.
param_grid = {
    'n_factors': [1, 2, 3, 4, 5],
    'n_epochs': [7, 10, 12],
    'biased': [True, False],
}

# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, so on a fresh checkout the save below would fail
# only after the search had already been run.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

gs = GridSearchCV(
    NMF,
    param_grid,
    measures=["rmse", "mae"],
    n_jobs=-1,  # run fits in parallel on all available cores
    joblib_verbose=10,
)
gs.fit(data)

# Persist the full per-combination results table for later comparison.
nmf_results_df = pd.DataFrame.from_dict(gs.cv_results)
nmf_results_df.to_csv(results_dir / "nmf_results.csv")

# Best cross-validated RMSE and the hyperparameters that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:    7.7s


0.9782056857093199
{'n_factors': 1, 'n_epochs': 12, 'biased': True}


[Parallel(n_jobs=-1)]: Done 147 out of 150 | elapsed:    9.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    9.3s finished


## KNN

In [21]:
# Grid search over KNNBasic: neighbourhood size, similarity measure, minimum
# support, and user-based vs. item-based similarity. Scored on RMSE and MAE.
# 3 * 3 * 5 * 2 = 90 combinations; the captured log shows 450 fits in total.
param_grid = {
    "k": [20, 40, 60],
    "sim_options": {
        "name": ["msd", "cosine", "pearson"],
        "min_support": [1, 2, 3, 4, 5],
        "user_based": [True, False],
    },
    "verbose": [False],  # keep each fit quiet; joblib reports progress
}

# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, so on a fresh checkout the save below would fail
# only after the (roughly 35 minute) search had already been run.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

gs = GridSearchCV(
    KNNBasic,
    param_grid,
    measures=["rmse", "mae"],
    n_jobs=-1,  # run fits in parallel on all available cores
    joblib_verbose=10,
)
gs.fit(data)

# Persist the full per-combination results table for later comparison.
knn_basic_results_df = pd.DataFrame.from_dict(gs.cv_results)
knn_basic_results_df.to_csv(results_dir / "knn_basic_results.csv")

# Best cross-validated RMSE and the hyperparameters that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   40.2s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed: 1

1.0765469243213293
{'k': 60, 'sim_options': {'name': 'cosine', 'min_support': 1, 'user_based': False}, 'verbose': False}


[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 34.7min finished


In [22]:
# Grid search over KNNWithMeans: neighbourhood size, similarity measure,
# minimum support, and user-based vs. item-based similarity.
# Scored on RMSE and MAE.
# 3 * 3 * 5 * 2 = 90 combinations; the captured log shows 450 fits in total.
param_grid = {
    "k": [20, 40, 60],
    "sim_options": {
        "name": ["msd", "cosine", "pearson"],
        "min_support": [1, 2, 3, 4, 5],
        "user_based": [True, False],
    },
    "verbose": [False],  # keep each fit quiet; joblib reports progress
}

# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, so on a fresh checkout the save below would fail
# only after the (roughly 35 minute) search had already been run.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

gs = GridSearchCV(
    KNNWithMeans,
    param_grid,
    measures=["rmse", "mae"],
    n_jobs=-1,  # run fits in parallel on all available cores
    joblib_verbose=10,
)
gs.fit(data)

# Persist the full per-combination results table for later comparison.
knn_with_means_results_df = pd.DataFrame.from_dict(gs.cv_results)
knn_with_means_results_df.to_csv(results_dir / "knn_with_means_results.csv")

# Best cross-validated RMSE and the hyperparameters that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed: 1

1.0205884606077729
{'k': 60, 'sim_options': {'name': 'pearson', 'min_support': 5, 'user_based': True}, 'verbose': False}


[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 34.7min finished


In [23]:
# Grid search over KNNWithZScore: neighbourhood size, similarity measure,
# minimum support, and user-based vs. item-based similarity.
# Scored on RMSE and MAE.
# 3 * 3 * 5 * 2 = 90 combinations; the captured log shows 450 fits in total.
param_grid = {
    "k": [20, 40, 60],
    "sim_options": {
        "name": ["msd", "cosine", "pearson"],
        "min_support": [1, 2, 3, 4, 5],
        "user_based": [True, False],
    },
    "verbose": [False],  # keep each fit quiet; joblib reports progress
}

# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, so on a fresh checkout the save below would fail
# only after the (roughly 33 minute) search had already been run.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

gs = GridSearchCV(
    KNNWithZScore,
    param_grid,
    measures=["rmse", "mae"],
    n_jobs=-1,  # run fits in parallel on all available cores
    joblib_verbose=10,
)
gs.fit(data)

# Persist the full per-combination results table for later comparison.
knn_with_z_score_results_df = pd.DataFrame.from_dict(gs.cv_results)
knn_with_z_score_results_df.to_csv(results_dir / "knn_with_z_score_results.csv")

# Best cross-validated RMSE and the hyperparameters that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   59.1s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed: 1

1.0174763920843606
{'k': 20, 'sim_options': {'name': 'pearson', 'min_support': 5, 'user_based': True}, 'verbose': False}


[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 33.0min finished


In [24]:
# Grid search over KNNBaseline: neighbourhood size, similarity measure,
# minimum support, and user-based vs. item-based similarity.
# Scored on RMSE and MAE.
# 3 * 3 * 5 * 2 = 90 combinations; the captured log shows 450 fits in total.
param_grid = {
    "k": [20, 40, 60],
    "sim_options": {
        "name": ["msd", "cosine", "pearson"],
        "min_support": [1, 2, 3, 4, 5],
        "user_based": [True, False],
    },
    "verbose": [False],  # keep each fit quiet; joblib reports progress
}

# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, so on a fresh checkout the save below would fail
# only after the (roughly 30 minute) search had already been run.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

gs = GridSearchCV(
    KNNBaseline,
    param_grid,
    measures=["rmse", "mae"],
    n_jobs=-1,  # run fits in parallel on all available cores
    joblib_verbose=10,
)
gs.fit(data)

# Persist the full per-combination results table for later comparison.
knn_baseline_results_df = pd.DataFrame.from_dict(gs.cv_results)
knn_baseline_results_df.to_csv(results_dir / "knn_baseline_results.csv")

# Best cross-validated RMSE and the hyperparameters that achieved it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   51.2s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed: 1

0.9389018146610839
{'k': 20, 'sim_options': {'name': 'pearson', 'min_support': 5, 'user_based': False}, 'verbose': False}


[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 29.8min finished
