# Baselines

This notebook trains and compares baseline recommenders for the book-rating data: Surprise SVD and KNN variants, SLIM, and implicit ALS.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.linear_model import ElasticNet
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from scipy.sparse import csr_matrix
from tqdm import tqdm

## Paths

In [2]:
# Directory holding books.csv / train.csv / test.csv, resolved relative to the
# notebook's working directory.
data_root = Path("data")

## Data

In [3]:
# Book catalogue mapping ISBN -> book_id.
# ISBN is parsed as text at read time: casting to str only after parsing cannot
# restore leading zeros on values that pandas has already read as integers.
# NOTE(review): the saved preview shows 8-9 digit ISBNs; if those are still
# short after this change, the zeros are missing in books.csv itself -- TODO confirm.
books_df = pd.read_csv(data_root / "books.csv", dtype={"ISBN": str})
# Kept so that missing ISBNs still end up as the string "nan", as before.
books_df["ISBN"] = books_df["ISBN"].astype(str)
books_df

Unnamed: 0,ISBN,book_id
0,0002005018,1
1,0374157065,3
2,0399135782,5
3,0440234743,18
4,0452264464,19
...,...,...
16594,786914041,248348
16595,62117378,247944
16596,1905294964,248214
16597,1937007588,247154


In [4]:
# Labelled interactions: one (book_id, user_id, rating) row per rating.
train_df = pd.read_csv(data_root.joinpath("train.csv"))
train_df

Unnamed: 0,book_id,user_id,rating
0,7260,20145,3.5
1,243238,85182,4.0
2,9135,45973,1.0
3,18671,63554,3.0
4,243293,81002,5.0
...,...,...,...
100518,15374,69658,2.0
100519,11063,69658,2.5
100520,18444,29981,1.0
100521,5917,38009,1.0


In [5]:
# Unlabelled (book_id, user_id) pairs; the `id` column is dropped right after loading.
test_df = pd.read_csv(data_root.joinpath("test.csv")).drop(columns="id")
test_df

Unnamed: 0,book_id,user_id
0,3786,40484
1,1985,47039
2,2290,60111
3,118657,64447
4,1560,2953
...,...,...
29362,2802,12312
29363,53552,25725
29364,4065,77178
29365,1290,23201


## Methods with No Additional Data

### Methods with Surprise Library

In [None]:
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise import (
    Reader,
    Dataset,
    SVD,
    accuracy,
    KNNBasic,
    KNNWithMeans,
    KNNWithZScore,
    KNNBaseline,
)

In [10]:
# Wrap the training ratings in a Surprise dataset. The rating scale is taken
# from the observed minimum and maximum rating.
rating_scale = (train_df["rating"].min(), train_df["rating"].max())
reader = Reader(rating_scale=rating_scale)
data = Dataset.load_from_df(train_df[["user_id", "book_id", "rating"]], reader)

# 80/20 hold-out split. The seed is fixed (same value as the KFold used for the
# SLIM baseline) so the validation RMSE below can be reproduced.
train_set, val_set = train_test_split(data, test_size=0.2, random_state=42)

#### SVD

In [11]:
# SVD with library-default hyper-parameters, scored on the hold-out split.
# The factors are randomly initialised, so the seed is fixed to make the
# reported RMSE reproducible.
alg = SVD(random_state=42)
alg.fit(train_set)
predictions = alg.test(val_set)
rmse = accuracy.rmse(predictions)

RMSE: 0.9378


In [None]:
# Exhaustive search over SVD hyper-parameters on the full training data
# (`cv` is left at the library default).
# NOTE(review): this grid has 6 * 4 * 3 * 4 = 288 combinations, but the saved
# output reports 12500 tasks -- the output appears to come from a different
# grid; re-run to refresh it.
param_grid = dict(
    n_factors=[5, 7, 10, 12, 15, 20],
    n_epochs=[70, 80, 90, 100],
    lr_all=[0.0025, 0.0050, 0.0075],
    reg_all=[0.08, 0.10, 0.12, 0.14],
)

gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    n_jobs=-1,
    joblib_verbose=10,
)
gs.fit(data)

# Best cross-validated RMSE, then the parameter set that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep="\n")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed:  

0.8902055991575161
{'n_factors': 10, 'n_epochs': 80, 'lr_all': 0.005, 'reg_all': 0.1}


[Parallel(n_jobs=-1)]: Done 12500 out of 12500 | elapsed: 32.5min finished


#### KNN

In [30]:
# Search space shared by all KNN variants below: neighbourhood size, similarity
# measure, minimum support, and user- vs item-based neighbourhoods.
param_grid = dict(
    k=[5, 10, 20, 40],
    sim_options=dict(
        name=['msd', 'cosine', 'pearson'],
        min_support=[1, 3, 5],
        user_based=[True, False],
    ),
    verbose=[False],
)

In [31]:
# Grid search for KNNBasic over the KNN `param_grid` defined above.
gs = GridSearchCV(
    algo_class=KNNBasic,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    n_jobs=-1,
    joblib_verbose=10,
)
gs.fit(data)

# Best cross-validated RMSE, then the parameter set that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep="\n")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   56.3s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 161 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed: 1

1.077874776994718
{'k': 40, 'sim_options': {'name': 'cosine', 'min_support': 1, 'user_based': False}, 'verbose': False}


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 26.8min finished


In [None]:
# Grid search for KNNWithMeans over the KNN `param_grid` defined above.
gs = GridSearchCV(
    algo_class=KNNWithMeans,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    n_jobs=-1,
    joblib_verbose=10,
)
gs.fit(data)

# Best cross-validated RMSE, then the parameter set that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep="\n")

In [None]:
# Grid search for KNNWithZScore over the KNN `param_grid` defined above.
gs = GridSearchCV(
    algo_class=KNNWithZScore,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    n_jobs=-1,
    joblib_verbose=10,
)
gs.fit(data)

# Best cross-validated RMSE, then the parameter set that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep="\n")

In [None]:
# Grid search for KNNBaseline over the KNN `param_grid` defined above.
gs = GridSearchCV(
    algo_class=KNNBaseline,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    n_jobs=-1,
    joblib_verbose=10,
)
gs.fit(data)

# Best cross-validated RMSE, then the parameter set that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep="\n")

## SLIM

In [9]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# SLIM baseline: for every item, learn a sparse non-negative linear combination
# of the other items' rating columns, then score it with 5-fold cross-validation
# over the individual ratings.

# NOTE: these casts and the two *_idx columns are written back into the shared
# `train_df`, so cells that run afterwards see string ids.
train_df["user_id"] = train_df["user_id"].astype(str)
train_df["book_id"] = train_df["book_id"].astype(str)
train_df["rating"] = train_df["rating"].astype(float)

# Contiguous 0-based indices, assigned in order of first appearance.
user_ids = train_df["user_id"].unique()
item_ids = train_df["book_id"].unique()

user_id_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
item_id_to_idx = {item_id: idx for idx, item_id in enumerate(item_ids)}

num_users = len(user_ids)
num_items = len(item_ids)

train_df["user_idx"] = train_df["user_id"].map(user_id_to_idx)
train_df["item_idx"] = train_df["book_id"].map(item_id_to_idx)

# Rows of (user_idx, item_idx, rating). The array is float because of the
# rating column, so the index columns are cast back to int wherever they are
# used as indices.
ratings = train_df[["user_idx", "item_idx", "rating"]].values

# ElasticNet penalty settings.
# NOTE(review): alpha=1.0 is a strong penalty for sparse rating columns; check
# W.nnz after fitting to confirm the learned weights are not all zero.
alpha = 1.0
l1_ratio = 0.5


def build_rating_matrix(triples, shape):
    """Build a sparse user x item matrix from (user_idx, item_idx, rating) rows.

    Args:
        triples: 2-D array whose columns are user index, item index, rating.
        shape: (num_users, num_items) of the resulting matrix.

    Returns:
        A ``csr_matrix`` of the given shape holding the ratings.
    """
    row_idx = triples[:, 0].astype(int)
    col_idx = triples[:, 1].astype(int)
    return csr_matrix((triples[:, 2], (row_idx, col_idx)), shape=shape)


def fit_slim_weights(R, alpha, l1_ratio, max_iter=5000):
    """Learn the item-item weight matrix W with one ElasticNet fit per item.

    Row ``j`` of W holds the coefficients that reconstruct item ``j``'s rating
    column from the other items' columns. The fit is non-negative and has no
    intercept. Items whose rating column is constant are skipped and keep an
    all-zero row.

    Args:
        R: sparse user x item rating matrix used for fitting.
        alpha: ElasticNet ``alpha``.
        l1_ratio: ElasticNet ``l1_ratio``.
        max_iter: ElasticNet ``max_iter``.

    Returns:
        A ``csr_matrix`` of shape (n_items, n_items) with a zero diagonal.
    """
    n_items = R.shape[1]
    # Work on a private CSC copy: column access is cheap and a column can be
    # blanked in place without touching the caller's matrix.
    R_csc = R.tocsc(copy=True)
    W = lil_matrix((n_items, n_items))

    for j in tqdm(range(n_items), desc="Learning item similarities", leave=False):
        r_j = R_csc[:, j].toarray().ravel()
        if not np.std(r_j) > 0:
            continue

        # Exclude item j from its own regression by temporarily zeroing its
        # column. An all-zero feature gets a zero coefficient, so this matches
        # fitting on the matrix with column j removed, without re-slicing the
        # whole matrix for every item.
        start, end = R_csc.indptr[j], R_csc.indptr[j + 1]
        saved = R_csc.data[start:end].copy()
        R_csc.data[start:end] = 0.0
        try:
            model = ElasticNet(
                alpha=alpha,
                l1_ratio=l1_ratio,
                positive=True,
                fit_intercept=False,
                max_iter=max_iter,
                selection="cyclic",
            )
            model.fit(R_csc, r_j)
        finally:
            R_csc.data[start:end] = saved

        # Store only the non-zero coefficients to keep W sparse.
        nonzero = np.flatnonzero(model.coef_)
        if nonzero.size:
            W[j, nonzero] = model.coef_[nonzero]

    return W.tocsr()


kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_list = []

fold_iter = tqdm(kf.split(ratings), total=kf.get_n_splits(), desc="Cross-validation folds")
for fold, (train_indices, test_indices) in enumerate(fold_iter, 1):
    print(f"\nStarting fold {fold}")

    train_data = ratings[train_indices]
    test_data = ratings[test_indices]

    R_train = build_rating_matrix(train_data, (num_users, num_items))
    W = fit_slim_weights(R_train, alpha=alpha, l1_ratio=l1_ratio)

    # Predict from the TRAINING ratings. Multiplying the held-out matrix by W
    # would build each prediction from the fold's own test ratings and ignore
    # the users' training history.
    R_pred = R_train.dot(W.T)

    test_users = test_data[:, 0].astype(int)
    test_items = test_data[:, 1].astype(int)
    y_true = test_data[:, 2]
    y_pred = R_pred[test_users, test_items].A1

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    rmse_list.append(rmse)
    print(f"Fold {fold} RMSE: {rmse}")

average_rmse = np.mean(rmse_list)
print(f"\nAverage RMSE over {kf.get_n_splits()} folds: {average_rmse}")

Cross-validation folds:   0%|          | 0/5 [00:00<?, ?it/s]


Starting fold 1


Cross-validation folds:  20%|██        | 1/5 [00:55<03:42, 55.51s/it]

Fold 1 RMSE: 2.6088990284717815

Starting fold 2


Cross-validation folds:  40%|████      | 2/5 [01:50<02:45, 55.29s/it]

Fold 2 RMSE: 2.589458860409602

Starting fold 3


Cross-validation folds:  60%|██████    | 3/5 [02:46<01:51, 55.76s/it]

Fold 3 RMSE: 2.605679809919655

Starting fold 4


Cross-validation folds:  80%|████████  | 4/5 [03:42<00:55, 55.80s/it]

Fold 4 RMSE: 2.614608360772764

Starting fold 5


Cross-validation folds: 100%|██████████| 5/5 [04:37<00:00, 55.46s/it]

Fold 5 RMSE: 2.622463149892249

Average RMSE over 5 folds: 2.6082218418932106





In [17]:
from scipy.sparse import coo_matrix
import implicit
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.als import AlternatingLeastSquares

In [19]:
# Map user_id and book_id to indices
user_ids = train_df['user_id'].astype('category')
item_ids = train_df['book_id'].astype('category')
train_df['user_idx'] = user_ids.cat.codes
train_df['item_idx'] = item_ids.cat.codes

# Create the user-item interaction matrix (item-user for implicit)
data = train_df['rating'].astype(float)
rows = train_df['item_idx']
cols = train_df['user_idx']
interaction_matrix = coo_matrix((data, (rows, cols)))

model = AlternatingLeastSquares(factors=50, regularization=0.01)
model.fit(interaction_matrix)



  0%|          | 0/15 [00:00<?, ?it/s]