In [70]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Other
import os
import random
import sys

# Reload imported code 
%load_ext autoreload
%autoreload 2

# Print all output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
    
rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [80]:
from os import sep
from pathlib import Path

# Column names assigned to the five CSV columns selected via `usecols` when the file is read below.
cols = ['user_id', 'item_id', 'rating', 'category','color']
# Earlier MovieLens-1M loader, kept for reference only (not executed):
# movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')
def get_project_root(path_index: int = 5, levels_up: int = 4) -> Path:
    """Return a directory derived from one entry of ``sys.path``.

    Takes ``sys.path[path_index]`` and walks ``levels_up`` parent directories
    up from it. The defaults (entry 5, four levels up) reproduce the values
    that were previously hard-coded, so existing calls behave as before.

    NOTE: the contents and ordering of ``sys.path`` depend on the interpreter
    and environment, so the default index is only valid on machines laid out
    like the one this notebook was written on. Pass explicit arguments
    elsewhere.

    Args:
        path_index: Index into ``sys.path`` of the entry to start from.
        levels_up: Number of ``.parent`` hops to apply; values <= 0 return
            the starting entry unchanged.

    Returns:
        The resulting directory as a ``Path``.

    Raises:
        IndexError: If ``sys.path`` has no entry at ``path_index``.
    """
    try:
        start = Path(sys.path[path_index])
    except IndexError:
        # Same exception type as before, but say what actually went wrong.
        raise IndexError(
            f"sys.path has {len(sys.path)} entries; no entry at index {path_index}"
        ) from None

    root = start
    for _ in range(levels_up):
        root = root.parent
    return root

# Data location.
# NOTE: despite its name, DATA_DIR holds the full path to the CSV file, not a directory.
# The path is built from get_project_root(), so it depends on the local checkout layout.
ROOT_DIR = get_project_root()
DATA_DIR = os.path.join(ROOT_DIR, 'GitHub/Recommender-System-for-AR-Glasses/data/articles.csv')

# Read CSV columns 0, 1, 2, 6 and 15 and label them with the names in `cols`.
articles_data = pd.read_csv(DATA_DIR, names = cols, sep=',', usecols=[0, 1, 2, 6, 15], engine='python')

# Model inputs are the (user_id, item_id) pairs; the target is the rating.
X = articles_data[['user_id', 'item_id']]
y = articles_data['rating']

# Prepare data: hold out 20% of the rows as the test set.
# No random_state is passed, so the split presumably draws from NumPy's global RNG;
# reproducibility then relies on the seed cell having run first and on cell order — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Prepare data for online learning.
# Presumably frac_new_users=0.2 sets aside 20% of users as "new" users for the update/test sets —
# TODO confirm against the matrix_factorization documentation.
X_train_initial, y_train_initial, X_train_update, y_train_update, X_test_update, y_test_update = train_update_test_split(articles_data, frac_new_users=0.2)

# Preview the first 10 rows of the loaded data
articles_data.head(10)

Unnamed: 0,user_id,item_id,rating,category,color
0,20221474,108775015,5,Vest top,Black
1,20221756,108775044,4,Vest top,White
2,20221715,108775051,4,Vest top,White
3,20221931,110065001,4,Bra,Black
4,20221219,110065002,1,Bra,White
5,20221489,110065011,3,Bra,Beige
6,20221859,111565001,2,Underwear Tights,Black
7,20221416,111565003,2,Socks,Beige
8,20221705,111586001,4,Leggings/Tights,Black
9,20221813,111593001,4,Underwear Tights,Black


# Simple model with global mean

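Predicting the global mean for every rating gives an RMSE that is approximately the standard deviation of the ratings, so this score serves as a reference point for the models below.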

In [81]:
# Reference baseline: predict the training-set mean rating for every test row
global_mean = y_train.mean()
pred = np.full(y_test.shape[0], global_mean)

# RMSE computed as sqrt(MSE); `mean_squared_error(..., squared=False)` is deprecated
# since scikit-learn 1.4 and removed in 1.6, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, pred))

# `.4f` prints four decimals like the other cells; the previous `4f` was a field
# width (not a precision) and therefore printed six decimals.
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.423924


## SGD

In [82]:
%%time

baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.3917176001731084
Epoch  2 / 20  -  train_rmse: 1.3765741433761645
Epoch  3 / 20  -  train_rmse: 1.3625070841745093
Epoch  4 / 20  -  train_rmse: 1.3488047511707293
Epoch  5 / 20  -  train_rmse: 1.3352652439761206
Epoch  6 / 20  -  train_rmse: 1.3218372715311013
Epoch  7 / 20  -  train_rmse: 1.308584731198995
Epoch  8 / 20  -  train_rmse: 1.2954557704418903
Epoch  9 / 20  -  train_rmse: 1.2824616179594772
Epoch  10 / 20  -  train_rmse: 1.2695845153891694
Epoch  11 / 20  -  train_rmse: 1.2568509508855892
Epoch  12 / 20  -  train_rmse: 1.244235985660842
Epoch  13 / 20  -  train_rmse: 1.2317455603018233
Epoch  14 / 20  -  train_rmse: 1.2193974912510825
Epoch  15 / 20  -  train_rmse: 1.2071664786520357
Epoch  16 / 20  -  train_rmse: 1.195063596671153
Epoch  17 / 20  -  train_rmse: 1.1830676115419185
Epoch  18 / 20  -  train_rmse: 1.1711942559648247
Epoch  19 / 20  -  train_rmse: 1.1594672018304757
Epoch  20 / 20  -  train_rmse: 1.1478448067389277

Test RMSE: 

In [83]:
# Recommendations for user 20221000 from the current `baseline_model`
# (the SGD fit when the cells are run in order).
baseline_model.recommend(user=20221000)

Unnamed: 0,user_id,item_id,rating_pred
81968,20221000,880118006,3.594119
39831,20221000,593829004,3.593001
44568,20221000,751546003,3.592753
13625,20221000,834893001,3.592751
54389,20221000,948997001,3.592577
54228,20221000,880599002,3.592236
56532,20221000,717879017,3.592104
7874,20221000,856551005,3.591765
66171,20221000,902069001,3.591263
82420,20221000,825986001,3.591257


## ALS

In [84]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.4680386271586314
Epoch  2 / 20  -  train_rmse: 0.468038805196324
Epoch  3 / 20  -  train_rmse: 0.46803897197540284
Epoch  4 / 20  -  train_rmse: 0.4680391039145932
Epoch  5 / 20  -  train_rmse: 0.4680392007475082
Epoch  6 / 20  -  train_rmse: 0.4680392690408556
Epoch  7 / 20  -  train_rmse: 0.4680393161063982
Epoch  8 / 20  -  train_rmse: 0.468039348088808
Epoch  9 / 20  -  train_rmse: 0.4680393696303135
Epoch  10 / 20  -  train_rmse: 0.46803938405744383
Epoch  11 / 20  -  train_rmse: 0.4680393936844379
Epoch  12 / 20  -  train_rmse: 0.4680394000929458
Epoch  13 / 20  -  train_rmse: 0.4680394043523275
Epoch  14 / 20  -  train_rmse: 0.4680394071803224
Epoch  15 / 20  -  train_rmse: 0.4680394090567067
Epoch  16 / 20  -  train_rmse: 0.4680394103011187
Epoch  17 / 20  -  train_rmse: 0.46803941112616104
Epoch  18 / 20  -  train_rmse: 0.46803941167306384
Epoch  19 / 20  -  train_rmse: 0.4680394120355338
Epoch  20 / 20  -  train_rmse: 0.4680394122757578

Test R

# Matrix Factorization

## Linear Kernel

In [85]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.4105634751311078
Epoch  2 / 20  -  train_rmse: 1.405234183865558
Epoch  3 / 20  -  train_rmse: 1.400102468083455
Epoch  4 / 20  -  train_rmse: 1.3951402166229465
Epoch  5 / 20  -  train_rmse: 1.3903178300930894
Epoch  6 / 20  -  train_rmse: 1.385615275692687
Epoch  7 / 20  -  train_rmse: 1.3810129462483451
Epoch  8 / 20  -  train_rmse: 1.3764956821182521
Epoch  9 / 20  -  train_rmse: 1.3720477490868785
Epoch  10 / 20  -  train_rmse: 1.3676590924956045
Epoch  11 / 20  -  train_rmse: 1.3633196478931153
Epoch  12 / 20  -  train_rmse: 1.359019937920508
Epoch  13 / 20  -  train_rmse: 1.354752692207398
Epoch  14 / 20  -  train_rmse: 1.3505108398405956
Epoch  15 / 20  -  train_rmse: 1.3462897298570455
Epoch  16 / 20  -  train_rmse: 1.342084156502831
Epoch  17 / 20  -  train_rmse: 1.337889346267785
Epoch  18 / 20  -  train_rmse: 1.3337009584057575
Epoch  19 / 20  -  train_rmse: 1.3295163192270203
Epoch  20 / 20  -  train_rmse: 1.3253316890366194

Test RMSE: 1.42

## Getting list of recommendations for a user

In [86]:
# Pick a user and collect the item ids that appear for that user in the training
# split, then pass them to `recommend` as the already-known items.
user = 20222000
items_known = X_train.loc[X_train['user_id'] == user, 'item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
61706,20222000,613781003,3.276323
50661,20222000,665722001,3.237811
60821,20222000,776781001,3.232709
78588,20222000,877522003,3.229571
3062,20222000,659208003,3.226029
69811,20222000,555622003,3.225628
76398,20222000,698877001,3.225293
51914,20222000,710667001,3.219278
26224,20222000,679687026,3.218313
79374,20222000,803070006,3.211188


## Sigmoid kernel

In [32]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 2.034817956838736
Epoch  2 / 20  -  train_rmse: 1.5263551071137147
Epoch  3 / 20  -  train_rmse: 1.4032789295442796
Epoch  4 / 20  -  train_rmse: 1.3890578138330538
Epoch  5 / 20  -  train_rmse: 1.378604524755767
Epoch  6 / 20  -  train_rmse: 1.3688567318940805
Epoch  7 / 20  -  train_rmse: 1.3586792672025598
Epoch  8 / 20  -  train_rmse: 1.3484580652286358
Epoch  9 / 20  -  train_rmse: 1.337836468306542
Epoch  10 / 20  -  train_rmse: 1.326897341985218
Epoch  11 / 20  -  train_rmse: 1.315147598137661
Epoch  12 / 20  -  train_rmse: 1.3027353715144516
Epoch  13 / 20  -  train_rmse: 1.2895141597240152
Epoch  14 / 20  -  train_rmse: 1.2754253590874913
Epoch  15 / 20  -  train_rmse: 1.260425641090361
Epoch  16 / 20  -  train_rmse: 1.2438950967386315
Epoch  17 / 20  -  train_rmse: 1.2269820165574754
Epoch  18 / 20  -  train_rmse: 1.2084137423972772
Epoch  19 / 20  -  train_rmse: 1.1886975410947411
Epoch  20 / 20  -  train_rmse: 1.1676851959750638

Test RMSE: 1.4

## RBF Kernel

In [33]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.4413356532514567
Epoch  2 / 20  -  train_rmse: 1.4154050770262954
Epoch  3 / 20  -  train_rmse: 1.392121538131348
Epoch  4 / 20  -  train_rmse: 1.3740825408844348
Epoch  5 / 20  -  train_rmse: 1.351693120215384
Epoch  6 / 20  -  train_rmse: 1.32244456718193
Epoch  7 / 20  -  train_rmse: 1.2979135920417963
Epoch  8 / 20  -  train_rmse: 1.2804791523446284
Epoch  9 / 20  -  train_rmse: 1.258898521265899
Epoch  10 / 20  -  train_rmse: 1.2362757367776251
Epoch  11 / 20  -  train_rmse: 1.2120430136670712
Epoch  12 / 20  -  train_rmse: 1.1947973631923807
Epoch  13 / 20  -  train_rmse: 1.1736099599162642
Epoch  14 / 20  -  train_rmse: 1.157576679020673
Epoch  15 / 20  -  train_rmse: 1.1385495787523745
Epoch  16 / 20  -  train_rmse: 1.124833637549907
Epoch  17 / 20  -  train_rmse: 1.1017896713304083
Epoch  18 / 20  -  train_rmse: 1.088208391594635
Epoch  19 / 20  -  train_rmse: 1.071778958957508
Epoch  20 / 20  -  train_rmse: 1.0535579166256774

Test RMSE: 1.5017

# Scikit-learn compatibility

In [34]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Hyper-parameter grid: 3 kernels x 3 `n_factors` x 3 `n_epochs` x 3 `reg` values = 81 candidates
param_grid = dict(
    kernel=['linear', 'sigmoid', 'rbf'],
    n_factors=[10, 20, 50],
    n_epochs=[10, 20, 50],
    reg=[0, 0.005, 0.1],
)

# Exhaustive 5-fold cross-validated search over the grid, scored with the
# 'neg_root_mean_squared_error' scorer and using n_jobs=-1 for parallel fits.
grid_search = GridSearchCV(
    estimator=KernelMF(verbose=0),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


GridSearchCV(cv=5, estimator=KernelMF(gamma=0.01, verbose=0), n_jobs=-1,
             param_grid={'kernel': ['linear', 'sigmoid', 'rbf'],
                         'n_epochs': [10, 20, 50], 'n_factors': [10, 20, 50],
                         'reg': [0, 0.005, 0.1]},
             scoring='neg_root_mean_squared_error', verbose=1)

In [35]:
# Best cross-validated score (negated RMSE, per the 'neg_root_mean_squared_error' scorer)
# and the parameter combination that achieved it. Both expressions are displayed because
# the setup cell sets InteractiveShell.ast_node_interactivity = "all".
grid_search.best_score_
grid_search.best_params_

-1.4151348248236668

{'kernel': 'linear', 'n_epochs': 50, 'n_factors': 50, 'reg': 0.1}