In [39]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Other
import os
import random
import sys

# Reload imported code: autoreload mode 2 is enabled so edits to imported modules are picked up
%load_ext autoreload
%autoreload 2

# Print all output: display the value of every bare expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
    
# Seed NumPy's and Python's global random number generators with the same value
rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [40]:
from os import sep
from pathlib import Path

cols = ['user_id', 'item_id', 'rating', 'category','color']
# movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')
def get_project_root() -> Path:
    """Return the directory that contains the ``GitHub`` checkout folder.

    The data path is built as ``<root>/GitHub/Recommender-System-for-AR-Glasses/...``,
    so the root is located by walking up from the current working directory
    until an ancestor holding that sub-folder is found. This does not depend
    on how the interpreter happened to populate ``sys.path``.

    Returns:
        Path: the closest ancestor of the working directory (or the working
        directory itself) that contains
        ``GitHub/Recommender-System-for-AR-Glasses``. If no ancestor matches,
        the original ``sys.path[5]``-based location is returned.

    Raises:
        IndexError: only on the fallback path, when ``sys.path`` has fewer
        than six entries.
    """
    repo_rel = Path('GitHub') / 'Recommender-System-for-AR-Glasses'
    cwd = Path.cwd().resolve()

    # Nearest match wins: check the working directory first, then each parent.
    for candidate in (cwd, *cwd.parents):
        if (candidate / repo_rel).is_dir():
            return candidate

    # Legacy fallback: the original, environment-dependent lookup.
    return Path(sys.path[5]).parent.parent.parent.parent

# Data location
ROOT_DIR = get_project_root()
# NOTE: despite its name, DATA_DIR is the path of the articles CSV file itself, not a directory.
DATA_DIR = os.path.join(ROOT_DIR, 'GitHub/Recommender-System-for-AR-Glasses/data/articles.csv')

# Read only the five columns of interest (selected by position) and label them with `cols`
articles_data = pd.read_csv(DATA_DIR, names = cols, sep=',', usecols=[0, 1, 2, 6, 15], engine='python')

# Features are the (user, item) pairs; the target is the rating
X = articles_data[['user_id', 'item_id']]
y = articles_data['rating']

# Prepare data
# Pin the split to `rand_seed` explicitly: relying only on the global NumPy seed gives a
# different split whenever this cell is re-run or cells are executed out of order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_seed)

# Prepare data for online learning
# NOTE(review): no seed is passed here; presumably this helper draws from the global RNG,
# so its output may still vary between re-runs of this cell — confirm against its signature.
X_train_initial, y_train_initial, X_train_update, y_train_update, X_test_update, y_test_update = train_update_test_split(articles_data, frac_new_users=0.2)

articles_data.head(10)

Unnamed: 0,user_id,item_id,rating
0,20221918,108775015,5
1,20221967,108775044,2
2,20221954,108775051,3
3,20221958,110065001,1
4,20221950,110065002,4
5,20221942,110065011,3
6,20221933,111565001,2
7,20221905,111565003,5
8,20221908,111586001,4
9,20221929,111593001,1


Unnamed: 0,category,color
0,Vest top,Black
1,Vest top,White
2,Vest top,White
3,Bra,Black
4,Bra,White
5,Bra,Beige
6,Underwear Tights,Black
7,Socks,Beige
8,Leggings/Tights,Black
9,Underwear Tights,Black


# Simple model with global mean

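A constant predictor that always outputs the mean training rating. Its test RMSE is approximately the standard deviation of the ratings, which makes it the reference score that every later model should beat.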

In [26]:
# Constant predictor: every test rating is predicted as the mean of the training ratings
global_mean = y_train.mean()
pred = np.full(y_test.shape[0], global_mean)

# Take the square root of the MSE explicitly; the `squared=False` flag is deprecated and
# later removed in newer scikit-learn releases, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, pred))

# `.4f` (four decimals) matches the other cells; the previous `4f` only set a minimum width
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.415936


## SGD

In [27]:
%%time

baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.4020229690614137
Epoch  2 / 20  -  train_rmse: 1.3876686782701726
Epoch  3 / 20  -  train_rmse: 1.3738152258499443
Epoch  4 / 20  -  train_rmse: 1.3598321743680402
Epoch  5 / 20  -  train_rmse: 1.34683370106335
Epoch  6 / 20  -  train_rmse: 1.332715236671701
Epoch  7 / 20  -  train_rmse: 1.31960957562715
Epoch  8 / 20  -  train_rmse: 1.3057486768660436
Epoch  9 / 20  -  train_rmse: 1.2940027838010086
Epoch  10 / 20  -  train_rmse: 1.2809063363406434
Epoch  11 / 20  -  train_rmse: 1.2679835169201474
Epoch  12 / 20  -  train_rmse: 1.255245096321815
Epoch  13 / 20  -  train_rmse: 1.2432664933687965
Epoch  14 / 20  -  train_rmse: 1.229997961635302
Epoch  15 / 20  -  train_rmse: 1.2179612913549678
Epoch  16 / 20  -  train_rmse: 1.205747307687648
Epoch  17 / 20  -  train_rmse: 1.1933579513027155
Epoch  18 / 20  -  train_rmse: 1.181791530585119
Epoch  19 / 20  -  train_rmse: 1.1700159382694257
Epoch  20 / 20  -  train_rmse: 1.158355969618671

Test RMSE: 1.4193


In [38]:
# Recommendations for one user from whichever model `baseline_model` currently refers to.
# NOTE(review): this cell's execution count (38) is later than the ALS cell's (29), so the
# displayed table may come from the ALS fit rather than the SGD fit — confirm on a fresh run.
baseline_model.recommend(user=20221974)

Unnamed: 0,user_id,item_id,rating_pred
67130,20221974,681111012,4.339756
26112,20221974,622966014,4.339756
68936,20221974,678571012,4.339756
59551,20221974,600229026,4.339756
10539,20221974,666006005,4.339756
22077,20221974,796794002,4.339756
47702,20221974,678260005,4.339756
10594,20221974,821018002,4.339756
10472,20221974,809074002,4.339756
65445,20221974,737777001,4.339756


## ALS

In [29]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.47110718469516555
Epoch  2 / 20  -  train_rmse: 0.47110718491518583
Epoch  3 / 20  -  train_rmse: 0.47110718512281335
Epoch  4 / 20  -  train_rmse: 0.4711071852882583
Epoch  5 / 20  -  train_rmse: 0.4711071854104981
Epoch  6 / 20  -  train_rmse: 0.4711071854972952
Epoch  7 / 20  -  train_rmse: 0.4711071855574756
Epoch  8 / 20  -  train_rmse: 0.47110718559863374
Epoch  9 / 20  -  train_rmse: 0.4711071856265367
Epoch  10 / 20  -  train_rmse: 0.4711071856453203
Epoch  11 / 20  -  train_rmse: 0.4711071856579312
Epoch  12 / 20  -  train_rmse: 0.47110718566638377
Epoch  13 / 20  -  train_rmse: 0.4711071856720197
Epoch  14 / 20  -  train_rmse: 0.4711071856758107
Epoch  15 / 20  -  train_rmse: 0.4711071856783053
Epoch  16 / 20  -  train_rmse: 0.47110718567998444
Epoch  17 / 20  -  train_rmse: 0.4711071856811095
Epoch  18 / 20  -  train_rmse: 0.4711071856818524
Epoch  19 / 20  -  train_rmse: 0.4711071856823407
Epoch  20 / 20  -  train_rmse: 0.4711071856826673

Te

# Matrix Factorization

## Linear Kernel

In [30]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.4123422525795397
Epoch  2 / 20  -  train_rmse: 1.4079901290403019
Epoch  3 / 20  -  train_rmse: 1.4038397838714407
Epoch  4 / 20  -  train_rmse: 1.3997664791766085
Epoch  5 / 20  -  train_rmse: 1.3957175861403615
Epoch  6 / 20  -  train_rmse: 1.39168817817266
Epoch  7 / 20  -  train_rmse: 1.3876544655348892
Epoch  8 / 20  -  train_rmse: 1.3836006792600155
Epoch  9 / 20  -  train_rmse: 1.3795078695737297
Epoch  10 / 20  -  train_rmse: 1.3753694521839754
Epoch  11 / 20  -  train_rmse: 1.371160938221507
Epoch  12 / 20  -  train_rmse: 1.3668790604488068
Epoch  13 / 20  -  train_rmse: 1.3624936158160827
Epoch  14 / 20  -  train_rmse: 1.3580033482390952
Epoch  15 / 20  -  train_rmse: 1.3533937929471138
Epoch  16 / 20  -  train_rmse: 1.3486467833874654
Epoch  17 / 20  -  train_rmse: 1.3437479205282943
Epoch  18 / 20  -  train_rmse: 1.3386826849169062
Epoch  19 / 20  -  train_rmse: 1.3334356784527925
Epoch  20 / 20  -  train_rmse: 1.3280001644902102

Test RMSE: 

## Getting list of recommendations for a user

In [31]:
user = 20221974
# Items this user has in the training split; they are handed to recommend() as already known
items_known = X_train.loc[X_train['user_id'] == user, 'item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
80599,20221974,656677012,3.509661
24988,20221974,567728001,3.500327
48142,20221974,664871001,3.490327
9080,20221974,747696001,3.48506
42604,20221974,401020001,3.483425
40668,20221974,548111010,3.480262
63374,20221974,637515010,3.479864
791,20221974,891050002,3.471648
8258,20221974,902508001,3.458801
3333,20221974,715828005,3.45354


## Sigmoid kernel

In [32]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 2.034817956838736
Epoch  2 / 20  -  train_rmse: 1.5263551071137147
Epoch  3 / 20  -  train_rmse: 1.4032789295442796
Epoch  4 / 20  -  train_rmse: 1.3890578138330538
Epoch  5 / 20  -  train_rmse: 1.378604524755767
Epoch  6 / 20  -  train_rmse: 1.3688567318940805
Epoch  7 / 20  -  train_rmse: 1.3586792672025598
Epoch  8 / 20  -  train_rmse: 1.3484580652286358
Epoch  9 / 20  -  train_rmse: 1.337836468306542
Epoch  10 / 20  -  train_rmse: 1.326897341985218
Epoch  11 / 20  -  train_rmse: 1.315147598137661
Epoch  12 / 20  -  train_rmse: 1.3027353715144516
Epoch  13 / 20  -  train_rmse: 1.2895141597240152
Epoch  14 / 20  -  train_rmse: 1.2754253590874913
Epoch  15 / 20  -  train_rmse: 1.260425641090361
Epoch  16 / 20  -  train_rmse: 1.2438950967386315
Epoch  17 / 20  -  train_rmse: 1.2269820165574754
Epoch  18 / 20  -  train_rmse: 1.2084137423972772
Epoch  19 / 20  -  train_rmse: 1.1886975410947411
Epoch  20 / 20  -  train_rmse: 1.1676851959750638

Test RMSE: 1.4

## RBF Kernel

In [33]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.4413356532514567
Epoch  2 / 20  -  train_rmse: 1.4154050770262954
Epoch  3 / 20  -  train_rmse: 1.392121538131348
Epoch  4 / 20  -  train_rmse: 1.3740825408844348
Epoch  5 / 20  -  train_rmse: 1.351693120215384
Epoch  6 / 20  -  train_rmse: 1.32244456718193
Epoch  7 / 20  -  train_rmse: 1.2979135920417963
Epoch  8 / 20  -  train_rmse: 1.2804791523446284
Epoch  9 / 20  -  train_rmse: 1.258898521265899
Epoch  10 / 20  -  train_rmse: 1.2362757367776251
Epoch  11 / 20  -  train_rmse: 1.2120430136670712
Epoch  12 / 20  -  train_rmse: 1.1947973631923807
Epoch  13 / 20  -  train_rmse: 1.1736099599162642
Epoch  14 / 20  -  train_rmse: 1.157576679020673
Epoch  15 / 20  -  train_rmse: 1.1385495787523745
Epoch  16 / 20  -  train_rmse: 1.124833637549907
Epoch  17 / 20  -  train_rmse: 1.1017896713304083
Epoch  18 / 20  -  train_rmse: 1.088208391594635
Epoch  19 / 20  -  train_rmse: 1.071778958957508
Epoch  20 / 20  -  train_rmse: 1.0535579166256774

Test RMSE: 1.5017

# Scikit-learn compatibility

In [34]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Candidate values to search: 3 kernels x 3 factor counts x 3 epoch counts x 3 reg strengths
param_grid = dict(
    kernel=['linear', 'sigmoid', 'rbf'],
    n_factors=[10, 20, 50],
    n_epochs=[10, 20, 50],
    reg=[0, 0.005, 0.1],
)

# Exhaustive 5-fold cross-validated search, scored by negated RMSE (higher is better)
grid_search = GridSearchCV(
    estimator=KernelMF(verbose=0),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


GridSearchCV(cv=5, estimator=KernelMF(gamma=0.01, verbose=0), n_jobs=-1,
             param_grid={'kernel': ['linear', 'sigmoid', 'rbf'],
                         'n_epochs': [10, 20, 50], 'n_factors': [10, 20, 50],
                         'reg': [0, 0.005, 0.1]},
             scoring='neg_root_mean_squared_error', verbose=1)

In [35]:
# Best mean cross-validated score; it is a negated RMSE because of the scoring chosen for the search
grid_search.best_score_
# Parameter combination that achieved that score
grid_search.best_params_

-1.4151348248236668

{'kernel': 'linear', 'n_epochs': 50, 'n_factors': 50, 'reg': 0.1}