In [7]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Other
import os
import random
import sys

# Reload imported code
# (autoreload mode 2 re-imports modules before each cell executes, so edits
# to local source files are picked up without restarting the kernel)
%load_ext autoreload
%autoreload 2

# Print all output
# (show the value of every expression statement in a cell, not just the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
    
# Seed NumPy's and Python's global random generators with the same value so
# that runs from a fresh kernel draw the same random numbers.
rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [8]:
from os import sep
from pathlib import Path

# Column names given to the three columns read from the ratings file.
cols = ['user_id', 'item_id', 'rating']
# Earlier loader for a '::'-separated ratings.dat file, kept for reference only (not executed):
# movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')
def get_project_root(sys_path_index: int = 5, levels_up: int = 4) -> Path:
    """Return an ancestor directory of one ``sys.path`` entry.

    With the default arguments the result is exactly
    ``Path(sys.path[5]).parent.parent.parent.parent``. Which directory that
    is depends on how the running interpreter populated ``sys.path``, so the
    index and depth are parameters that can be adjusted per environment.

    Args:
        sys_path_index: Index of the ``sys.path`` entry used as the anchor.
        levels_up: How many times to step to the parent directory.

    Returns:
        The anchor path after ``levels_up`` ``.parent`` steps.

    Raises:
        ValueError: If ``levels_up`` is negative.
        IndexError: If ``sys.path`` has no entry at ``sys_path_index``.
    """
    if levels_up < 0:
        raise ValueError(f'levels_up must be non-negative, got {levels_up}')

    try:
        anchor = sys.path[sys_path_index]
    except IndexError:
        # Same exception type as a bare sys.path[...] lookup, but the message
        # says what went wrong instead of "list index out of range".
        raise IndexError(
            f'sys.path has only {len(sys.path)} entries; '
            f'cannot use index {sys_path_index} to locate the project root'
        ) from None

    root = Path(anchor)
    # Repeated .parent (rather than .parents[n]) so that walking past the
    # filesystem root stays at the root instead of raising.
    for _ in range(levels_up):
        root = root.parent
    return root

# Data location
# NOTE: despite its name, DATA_DIR holds the full path of the ratings CSV file,
# not a directory. The name is kept so other cells that use it keep working.
ROOT_DIR = get_project_root()
DATA_DIR = os.path.join(ROOT_DIR, 'GitHub/Recommender-System-for-AR-Glasses/data/articles.csv')

# Read only the first three CSV columns and label them user_id / item_id / rating.
articles_data = pd.read_csv(DATA_DIR, names = cols, sep=',', usecols=[0, 1, 2], engine='python')

X = articles_data[['user_id', 'item_id']]
y = articles_data['rating']

# Prepare data
# random_state pins the 80/20 split to rand_seed, so re-running this cell on its
# own yields the same train/test rows instead of depending on how much of the
# global NumPy random stream has already been consumed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_seed)

# Prepare data for online learning
(
    X_train_initial,
    y_train_initial,
    X_train_update,
    y_train_update,
    X_test_update,
    y_test_update,
) = train_update_test_split(articles_data, frac_new_users=0.2)

articles_data.head(10)

Unnamed: 0,user_id,item_id,rating
0,20221962,108775015,3
1,20221900,108775044,5
2,20221969,108775051,4
3,20222000,110065001,3
4,20221967,110065002,2
5,20221908,110065011,2
6,20221909,111565001,5
7,20221995,111565003,3
8,20221968,111586001,1
9,20221971,111593001,1


# Simple model with global mean

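Predicting the global mean rating for every user-item pair gives an RMSE that is approximately the standard deviation of the ratings. This is the reference score that the models below should improve on.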

In [9]:
# Constant predictor: every test rating is predicted as the mean training rating.
global_mean = y_train.mean()
pred = np.full(y_test.shape[0], global_mean)

# RMSE as sqrt(MSE). The `squared=False` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and scheduled for removal, while sqrt(MSE)
# gives the same number on every version.
rmse = np.sqrt(mean_squared_error(y_test, pred))

# '.4f' prints four decimals. The previous '4f' was a minimum field *width*
# of 4 with the default six decimals, unlike the other cells in this notebook.
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.415903


# Baseline Model with biases

## SGD

In [10]:
%%time

baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.4011274435741392
Epoch  2 / 20  -  train_rmse: 1.386989015580367
Epoch  3 / 20  -  train_rmse: 1.3741066710224317
Epoch  4 / 20  -  train_rmse: 1.3597191351389968
Epoch  5 / 20  -  train_rmse: 1.3463720029514243
Epoch  6 / 20  -  train_rmse: 1.3330942547873512
Epoch  7 / 20  -  train_rmse: 1.3191398207693656
Epoch  8 / 20  -  train_rmse: 1.306012105019405
Epoch  9 / 20  -  train_rmse: 1.2936890081223673
Epoch  10 / 20  -  train_rmse: 1.2801392685183155
Epoch  11 / 20  -  train_rmse: 1.2671343469994643
Epoch  12 / 20  -  train_rmse: 1.2538964767801288
Epoch  13 / 20  -  train_rmse: 1.2422700405382372
Epoch  14 / 20  -  train_rmse: 1.2293134237674488
Epoch  15 / 20  -  train_rmse: 1.2169975162463587
Epoch  16 / 20  -  train_rmse: 1.2050689822192133
Epoch  17 / 20  -  train_rmse: 1.1929785856320319
Epoch  18 / 20  -  train_rmse: 1.18168061405053
Epoch  19 / 20  -  train_rmse: 1.169825190952408
Epoch  20 / 20  -  train_rmse: 1.157406706956025

Test RMSE: 1.4

In [11]:
# Ask the most recently fitted baseline model for recommendations for one user.
# NOTE(review): the sample rows displayed after loading show user_ids such as
# 20221962 — confirm that 200 is an id that actually occurs in the training data.
baseline_model.recommend(user=200)

Unnamed: 0,user_id,item_id,rating_pred
48274,200,708819001,3.405449
65474,200,708929001,3.404338
43901,200,703399001,3.403797
4517,200,689011020,3.403069
28707,200,740376001,3.402414
31250,200,753031018,3.40229
8888,200,688665001,3.401903
58561,200,821029005,3.401846
48134,200,832298002,3.401714
22985,200,679633013,3.401622


## ALS

In [12]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.47094906530058106
Epoch  2 / 20  -  train_rmse: 0.4709490655381012
Epoch  3 / 20  -  train_rmse: 0.4709490657622446
Epoch  4 / 20  -  train_rmse: 0.4709490659407924
Epoch  5 / 20  -  train_rmse: 0.47094906607277515
Epoch  6 / 20  -  train_rmse: 0.47094906616645693
Epoch  7 / 20  -  train_rmse: 0.4709490662314252
Epoch  8 / 20  -  train_rmse: 0.47094906627587346
Epoch  9 / 20  -  train_rmse: 0.4709490663059758
Epoch  10 / 20  -  train_rmse: 0.4709490663262714
Epoch  11 / 20  -  train_rmse: 0.47094906633986494
Epoch  12 / 20  -  train_rmse: 0.4709490663489901
Epoch  13 / 20  -  train_rmse: 0.4709490663550933
Epoch  14 / 20  -  train_rmse: 0.4709490663591601
Epoch  15 / 20  -  train_rmse: 0.4709490663618534
Epoch  16 / 20  -  train_rmse: 0.47094906636366834
Epoch  17 / 20  -  train_rmse: 0.47094906636489386
Epoch  18 / 20  -  train_rmse: 0.4709490663656941
Epoch  19 / 20  -  train_rmse: 0.47094906636622946
Epoch  20 / 20  -  train_rmse: 0.4709490663665786



## Updating with new users

In [13]:
# Fit the SGD baseline on the "initial" portion of the online-learning split only;
# the "update" portion is fed to the model afterwards in a separate step.
baseline_model = BaselineModel(
    method='sgd',
    n_epochs=20,
    lr=0.01,
    reg=0.05,
    verbose=1,
)
baseline_model.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 1.4011495325071892
Epoch  2 / 20  -  train_rmse: 1.3874114856207294
Epoch  3 / 20  -  train_rmse: 1.374461090817333
Epoch  4 / 20  -  train_rmse: 1.3601996355619361
Epoch  5 / 20  -  train_rmse: 1.3459340746260209
Epoch  6 / 20  -  train_rmse: 1.332405583032106
Epoch  7 / 20  -  train_rmse: 1.3197538949287686
Epoch  8 / 20  -  train_rmse: 1.3061436522939511
Epoch  9 / 20  -  train_rmse: 1.293577403770846
Epoch  10 / 20  -  train_rmse: 1.2803975467850504
Epoch  11 / 20  -  train_rmse: 1.2664956530964686
Epoch  12 / 20  -  train_rmse: 1.2552180863441194
Epoch  13 / 20  -  train_rmse: 1.2422349882383388
Epoch  14 / 20  -  train_rmse: 1.22998737947407
Epoch  15 / 20  -  train_rmse: 1.2175040686403602
Epoch  16 / 20  -  train_rmse: 1.2055842537078365
Epoch  17 / 20  -  train_rmse: 1.1935775806853288
Epoch  18 / 20  -  train_rmse: 1.181603724048166
Epoch  19 / 20  -  train_rmse: 1.1704417256587467
Epoch  20 / 20  -  train_rmse: 1.1590808338034393


BaselineModel(n_epochs=20, reg=0.05)

In [14]:
%%time
baseline_model.update_users(X_train_update, y_train_update, n_epochs=20, lr=0.001, verbose=1)
pred = baseline_model.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

ZeroDivisionError: division by zero

# Matrix Factorization

## Linear Kernel

In [15]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.4127113230156965
Epoch  2 / 20  -  train_rmse: 1.4083688748520442
Epoch  3 / 20  -  train_rmse: 1.4042322386006874
Epoch  4 / 20  -  train_rmse: 1.4001763570871428
Epoch  5 / 20  -  train_rmse: 1.3961589314593192
Epoch  6 / 20  -  train_rmse: 1.3921590160261632
Epoch  7 / 20  -  train_rmse: 1.3881638645615362
Epoch  8 / 20  -  train_rmse: 1.384153288404297
Epoch  9 / 20  -  train_rmse: 1.3801095685675124
Epoch  10 / 20  -  train_rmse: 1.3760164483182475
Epoch  11 / 20  -  train_rmse: 1.3718712390684389
Epoch  12 / 20  -  train_rmse: 1.3676460617818222
Epoch  13 / 20  -  train_rmse: 1.3633292934705727
Epoch  14 / 20  -  train_rmse: 1.3589135668778145
Epoch  15 / 20  -  train_rmse: 1.3543730059512582
Epoch  16 / 20  -  train_rmse: 1.3497116251810957
Epoch  17 / 20  -  train_rmse: 1.3449007380814215
Epoch  18 / 20  -  train_rmse: 1.3399343049319732
Epoch  19 / 20  -  train_rmse: 1.3347912374962099
Epoch  20 / 20  -  train_rmse: 1.32946641388386

Test RMSE: 

## Getting list of recommendations for a user

In [16]:
# Recommend items for a single user, passing along the items that user
# already has in the training split.
user = 200

# Training rows belonging to this user, then just their item ids.
user_rows = X_train.query('user_id == @user')
items_known = user_rows['item_id']

matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
22797,200,834315001,3.058648
83370,200,872126001,3.056058
12899,200,655250001,3.055993
70107,200,737491001,3.055764
29682,200,544290036,3.055595
47764,200,743822002,3.055588
71361,200,517678006,3.055586
54441,200,567424028,3.055436
129,200,763743002,3.055396
71120,200,821023001,3.055275


## Updating with new users

In [17]:
# Fit a fresh matrix-factorization model on the "initial" portion of the
# online-learning split only; the "update" portion is fed in afterwards.
matrix_fact = KernelMF(
    n_epochs=20,
    n_factors=100,
    verbose=1,
    lr=0.001,
    reg=0.005,
)
matrix_fact.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 1.4118712541036114
Epoch  2 / 20  -  train_rmse: 1.407556351702785
Epoch  3 / 20  -  train_rmse: 1.4033906178814701
Epoch  4 / 20  -  train_rmse: 1.3993062357808406
Epoch  5 / 20  -  train_rmse: 1.3952715157441846
Epoch  6 / 20  -  train_rmse: 1.391236233687564
Epoch  7 / 20  -  train_rmse: 1.3872079942625861
Epoch  8 / 20  -  train_rmse: 1.3831311975831788
Epoch  9 / 20  -  train_rmse: 1.3790273783841651
Epoch  10 / 20  -  train_rmse: 1.374858336998315
Epoch  11 / 20  -  train_rmse: 1.3706027753200907
Epoch  12 / 20  -  train_rmse: 1.3662501775819536
Epoch  13 / 20  -  train_rmse: 1.3617800122546369
Epoch  14 / 20  -  train_rmse: 1.3571733744577277
Epoch  15 / 20  -  train_rmse: 1.3524255089916217
Epoch  16 / 20  -  train_rmse: 1.3474931646337411
Epoch  17 / 20  -  train_rmse: 1.342369499360486
Epoch  18 / 20  -  train_rmse: 1.3370508838263426
Epoch  19 / 20  -  train_rmse: 1.33151503288784
Epoch  20 / 20  -  train_rmse: 1.3257228825026273


KernelMF(gamma=0.01, lr=0.001, n_epochs=20, reg=0.005)

In [18]:
%%time
# Update model with new users
matrix_fact.update_users(X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1)
pred = matrix_fact.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

ZeroDivisionError: division by zero

## Sigmoid kernel

In [19]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 2.034874188183689
Epoch  2 / 20  -  train_rmse: 1.5311895298273337
Epoch  3 / 20  -  train_rmse: 1.4023132743573083
Epoch  4 / 20  -  train_rmse: 1.388054913055482
Epoch  5 / 20  -  train_rmse: 1.377821945114798
Epoch  6 / 20  -  train_rmse: 1.368017370514043
Epoch  7 / 20  -  train_rmse: 1.3576458272390048
Epoch  8 / 20  -  train_rmse: 1.3475068120787372
Epoch  9 / 20  -  train_rmse: 1.3371314887985375
Epoch  10 / 20  -  train_rmse: 1.3258812553472845
Epoch  11 / 20  -  train_rmse: 1.3141354643389607
Epoch  12 / 20  -  train_rmse: 1.3015771853612135
Epoch  13 / 20  -  train_rmse: 1.288642896578567
Epoch  14 / 20  -  train_rmse: 1.2743876656553663
Epoch  15 / 20  -  train_rmse: 1.259227777390685
Epoch  16 / 20  -  train_rmse: 1.2428412046116812
Epoch  17 / 20  -  train_rmse: 1.225428999577081
Epoch  18 / 20  -  train_rmse: 1.206715257982113
Epoch  19 / 20  -  train_rmse: 1.1871076327358439
Epoch  20 / 20  -  train_rmse: 1.1658576809341021

Test RMSE: 1.418

## RBF Kernel

In [20]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.4425974069665348
Epoch  2 / 20  -  train_rmse: 1.415180736944855
Epoch  3 / 20  -  train_rmse: 1.387397542224782
Epoch  4 / 20  -  train_rmse: 1.3660038724996897
Epoch  5 / 20  -  train_rmse: 1.3468149880008755
Epoch  6 / 20  -  train_rmse: 1.323789237229002
Epoch  7 / 20  -  train_rmse: 1.298225695599849
Epoch  8 / 20  -  train_rmse: 1.2761348945782167
Epoch  9 / 20  -  train_rmse: 1.2584338594785796
Epoch  10 / 20  -  train_rmse: 1.2316621239239225
Epoch  11 / 20  -  train_rmse: 1.2143760213007737
Epoch  12 / 20  -  train_rmse: 1.1937051448465525
Epoch  13 / 20  -  train_rmse: 1.1722761186357922
Epoch  14 / 20  -  train_rmse: 1.15537748490193
Epoch  15 / 20  -  train_rmse: 1.1375102551208551
Epoch  16 / 20  -  train_rmse: 1.125084739529759
Epoch  17 / 20  -  train_rmse: 1.098260454402989
Epoch  18 / 20  -  train_rmse: 1.09027575294519
Epoch  19 / 20  -  train_rmse: 1.072670558729096
Epoch  20 / 20  -  train_rmse: 1.05931781311627

Test RMSE: 1.5180
CPU

# Scikit-learn compatibility

In [21]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Hyper-parameter grid: 3 kernels x 3 factor counts x 3 epoch counts x 3
# regularisation strengths = 81 candidate settings.
param_grid = dict(
    kernel=['linear', 'sigmoid', 'rbf'],
    n_factors=[10, 20, 50],
    n_epochs=[10, 20, 50],
    reg=[0, 0.005, 0.1],
)

# 5-fold cross-validated search over the grid using all available cores;
# the scorer is the negated RMSE.
grid_search = GridSearchCV(
    estimator=KernelMF(verbose=0),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


GridSearchCV(cv=5, estimator=KernelMF(gamma=0.01, verbose=0), n_jobs=-1,
             param_grid={'kernel': ['linear', 'sigmoid', 'rbf'],
                         'n_epochs': [10, 20, 50], 'n_factors': [10, 20, 50],
                         'reg': [0, 0.005, 0.1]},
             scoring='neg_root_mean_squared_error', verbose=1)

In [22]:
# Best mean cross-validated score found by the search. The scorer is the
# negated RMSE, so values closer to 0 are better.
grid_search.best_score_
# Parameter combination that achieved that score.
grid_search.best_params_

-1.4146688688019728

{'kernel': 'linear', 'n_epochs': 50, 'n_factors': 20, 'reg': 0.1}