In [8]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Other
import os
import random
import sys

# Reload imported code: autoreload mode 2 re-imports modules before each cell runs,
# so edits to local source files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Print all output: display the value of every expression statement in a cell,
# not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
    
# Seed both NumPy's global RNG and the stdlib RNG with the same value
rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

# Load data

In [9]:
from os import sep
from pathlib import Path

# Column names assigned to the three CSV columns read by pd.read_csv below
cols = ['user_id', 'item_id', 'rating']
# Earlier loader for the ml-1m ratings file, left commented out:
# movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')
def get_project_root(path_index: int = 5, levels_up: int = 4) -> Path:
    """Derive a root directory by walking up from an entry of ``sys.path``.

    The defaults reproduce the original hard-coded behaviour
    (``Path(sys.path[5])`` followed by four ``.parent`` hops).

    NOTE(review): the contents and ordering of ``sys.path`` depend on the
    interpreter, environment and launch method, so the default result is
    machine-specific. Pass explicit arguments (or replace this helper with a
    configured path) when running elsewhere.

    Parameters
    ----------
    path_index : int, default 5
        Index into ``sys.path`` of the entry to start from.
    levels_up : int, default 4
        Number of ``.parent`` steps to take from that entry. Values <= 0
        return the entry itself.

    Returns
    -------
    Path
        The directory reached after walking up ``levels_up`` levels.

    Raises
    ------
    IndexError
        If ``sys.path`` has no entry at ``path_index``.
    """
    try:
        anchor = Path(sys.path[path_index])
    except IndexError:
        # Same exception type as before, but say what actually went wrong.
        raise IndexError(
            f'sys.path has {len(sys.path)} entries; no entry at index {path_index}'
        ) from None

    root = anchor
    # Repeated .parent (rather than .parents[n]) so that walking past the
    # filesystem root stays at the root instead of raising, as in the original.
    for _ in range(levels_up):
        root = root.parent
    return root

# Resolve the ratings CSV underneath the machine-specific root.
# NOTE(review): despite its name, DATA_DIR holds the path of the CSV file itself.
ROOT_DIR = get_project_root()
DATA_DIR = os.path.join(
    ROOT_DIR,
    'GitHub/Recommender-System-for-AR-Glasses/data/articles.csv',
)

# Read the first three columns and label them with `cols`
articles_data = pd.read_csv(
    DATA_DIR,
    sep=',',
    names=cols,
    usecols=[0, 1, 2],
    engine='python',
)

# Target is the rating; features are the (user, item) id pair
y = articles_data['rating']
X = articles_data[['user_id', 'item_id']]

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Second, independent split used for the online-learning (update) scenario
(
    X_train_initial, y_train_initial,
    X_train_update, y_train_update,
    X_test_update, y_test_update,
) = train_update_test_split(articles_data, frac_new_users=0.2)

# Preview the loaded ratings
articles_data.head(10)

Unnamed: 0,user_id,item_id,rating
0,20221962,108775015,3
1,20221900,108775044,5
2,20221969,108775051,4
3,20222000,110065001,3
4,20221967,110065002,2
5,20221908,110065011,2
6,20221909,111565001,5
7,20221995,111565003,3
8,20221968,111586001,1
9,20221971,111593001,1


# Simple model with global mean

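Predicting the global mean rating for every user–item pair gives an RMSE roughly equal to the standard deviation of the ratings, so this score is the reference that the models below need to beat.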

In [10]:
# Constant predictor: every test rating is predicted as the mean training rating
global_mean = y_train.mean()
pred = np.full(y_test.shape[0], global_mean)

# RMSE computed as sqrt(MSE). The `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6; the explicit square
# root gives the same value on every version.
rmse = np.sqrt(mean_squared_error(y_test, pred))

# Four decimals, matching the other cells (the previous `:4f` spec set a
# minimum width of 4, not a precision)
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.415903


## SGD

In [11]:
%%time

# Fit BaselineModel with method='sgd' on the training split
baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train, y_train)

# Evaluate on the held-out split. RMSE is computed as sqrt(MSE) because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
pred = baseline_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.401482002396977
Epoch  2 / 20  -  train_rmse: 1.3870512145729617
Epoch  3 / 20  -  train_rmse: 1.3735675982244606
Epoch  4 / 20  -  train_rmse: 1.3597916380837078
Epoch  5 / 20  -  train_rmse: 1.3466218815476627
Epoch  6 / 20  -  train_rmse: 1.3324322044040586
Epoch  7 / 20  -  train_rmse: 1.3195425350836039
Epoch  8 / 20  -  train_rmse: 1.305982530142275
Epoch  9 / 20  -  train_rmse: 1.2923529653462922
Epoch  10 / 20  -  train_rmse: 1.2799232330249988
Epoch  11 / 20  -  train_rmse: 1.2668564030373042
Epoch  12 / 20  -  train_rmse: 1.2548120412928399
Epoch  13 / 20  -  train_rmse: 1.241901703914745
Epoch  14 / 20  -  train_rmse: 1.2292240795265685
Epoch  15 / 20  -  train_rmse: 1.2170179133212535
Epoch  16 / 20  -  train_rmse: 1.2052832306709296
Epoch  17 / 20  -  train_rmse: 1.1927497362637933
Epoch  18 / 20  -  train_rmse: 1.1811533236047884
Epoch  19 / 20  -  train_rmse: 1.1687719839066806
Epoch  20 / 20  -  train_rmse: 1.1580283346195268

Test RMSE: 

In [15]:
# Ask the fitted baseline model for recommendations for a single user; the
# displayed result lists item_id values with their rating_pred, highest first.
# NOTE(review): no items_known is passed here, unlike the KernelMF recommend
# call later in the notebook — confirm whether already-rated items should be excluded.
baseline_model.recommend(user=20221974)

Unnamed: 0,user_id,item_id,rating_pred
31770,20221974,679853011,3.490969
23343,20221974,866501003,3.490728
61917,20221974,433444019,3.489604
79464,20221974,559212002,3.489565
52634,20221974,467302100,3.489476
32508,20221974,816355003,3.489393
62775,20221974,850606010,3.489063
5770,20221974,573773004,3.488802
44574,20221974,619350001,3.488752
32479,20221974,850824001,3.488739


## ALS

In [16]:
%%time

# Fit BaselineModel with method='als' on the training split
# (rebinds `baseline_model`, replacing the SGD-fitted instance)
baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(X_train, y_train)

# Evaluate on the held-out split. RMSE is computed as sqrt(MSE) because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
pred = baseline_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.47094906530057945
Epoch  2 / 20  -  train_rmse: 0.4709490655381043
Epoch  3 / 20  -  train_rmse: 0.47094906576224427
Epoch  4 / 20  -  train_rmse: 0.4709490659407875
Epoch  5 / 20  -  train_rmse: 0.47094906607277853
Epoch  6 / 20  -  train_rmse: 0.4709490661664554
Epoch  7 / 20  -  train_rmse: 0.4709490662314286
Epoch  8 / 20  -  train_rmse: 0.4709490662758738
Epoch  9 / 20  -  train_rmse: 0.47094906630597677
Epoch  10 / 20  -  train_rmse: 0.4709490663262714
Epoch  11 / 20  -  train_rmse: 0.4709490663398614
Epoch  12 / 20  -  train_rmse: 0.47094906634898975
Epoch  13 / 20  -  train_rmse: 0.47094906635509
Epoch  14 / 20  -  train_rmse: 0.4709490663591619
Epoch  15 / 20  -  train_rmse: 0.4709490663618534
Epoch  16 / 20  -  train_rmse: 0.4709490663636705
Epoch  17 / 20  -  train_rmse: 0.4709490663648944
Epoch  18 / 20  -  train_rmse: 0.4709490663656943
Epoch  19 / 20  -  train_rmse: 0.4709490663662296
Epoch  20 / 20  -  train_rmse: 0.4709490663665792

Test 

# Matrix Factorization

## Linear Kernel

In [17]:
%%time 
# Fit KernelMF with its default kernel (no `kernel` argument given) and 100 factors
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(X_train, y_train)

# Evaluate on the held-out split. RMSE is computed as sqrt(MSE) because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
pred = matrix_fact.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.4122170093602162
Epoch  2 / 20  -  train_rmse: 1.4078393219716903
Epoch  3 / 20  -  train_rmse: 1.4036543923444809
Epoch  4 / 20  -  train_rmse: 1.399546876778612
Epoch  5 / 20  -  train_rmse: 1.3954698884184433
Epoch  6 / 20  -  train_rmse: 1.3914213148224133
Epoch  7 / 20  -  train_rmse: 1.3873628796967643
Epoch  8 / 20  -  train_rmse: 1.3832899823336153
Epoch  9 / 20  -  train_rmse: 1.3791809999933626
Epoch  10 / 20  -  train_rmse: 1.3750248114788381
Epoch  11 / 20  -  train_rmse: 1.3708037475692072
Epoch  12 / 20  -  train_rmse: 1.366502267330937
Epoch  13 / 20  -  train_rmse: 1.3621124298345768
Epoch  14 / 20  -  train_rmse: 1.357612875402347
Epoch  15 / 20  -  train_rmse: 1.3529890896267813
Epoch  16 / 20  -  train_rmse: 1.3482354512289825
Epoch  17 / 20  -  train_rmse: 1.3433300936749246
Epoch  18 / 20  -  train_rmse: 1.3382590891069834
Epoch  19 / 20  -  train_rmse: 1.3330103244646048
Epoch  20 / 20  -  train_rmse: 1.327573728527801

Test RMSE: 1

## Getting list of recommendations for a user

In [18]:
# Recommend for one user. The items this user has in the training split are
# passed as items_known (presumably so they are left out of the result — confirm
# against KernelMF.recommend).
user = 20221974
items_known = X_train.loc[X_train['user_id'] == user, 'item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
8330,20221974,600544002,3.707782
49366,20221974,557157002,3.672905
3018,20221974,905853001,3.664849
43702,20221974,546260001,3.65598
52436,20221974,869870001,3.6501
46619,20221974,730454062,3.633433
7526,20221974,507854004,3.626926
74220,20221974,903861002,3.620899
33802,20221974,570474033,3.615735
34469,20221974,665647001,3.593459


## Sigmoid kernel

In [20]:
%%time 
# Fit KernelMF with the sigmoid kernel (rebinds `matrix_fact`)
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')
matrix_fact.fit(X_train, y_train)

# Evaluate on the held-out split. RMSE is computed as sqrt(MSE) because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
pred = matrix_fact.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 2.034792630875378
Epoch  2 / 20  -  train_rmse: 1.5318584776476023
Epoch  3 / 20  -  train_rmse: 1.4033918191980177
Epoch  4 / 20  -  train_rmse: 1.3884784260797114
Epoch  5 / 20  -  train_rmse: 1.3780557089526222
Epoch  6 / 20  -  train_rmse: 1.3680000917976793
Epoch  7 / 20  -  train_rmse: 1.3580014591178764
Epoch  8 / 20  -  train_rmse: 1.3478728614874431
Epoch  9 / 20  -  train_rmse: 1.3373014725655539
Epoch  10 / 20  -  train_rmse: 1.3260128233132566
Epoch  11 / 20  -  train_rmse: 1.3143317123425526
Epoch  12 / 20  -  train_rmse: 1.3019648574207934
Epoch  13 / 20  -  train_rmse: 1.2886580312886176
Epoch  14 / 20  -  train_rmse: 1.2748030645867108
Epoch  15 / 20  -  train_rmse: 1.2595485753419104
Epoch  16 / 20  -  train_rmse: 1.2431010010396197
Epoch  17 / 20  -  train_rmse: 1.2261144136825413
Epoch  18 / 20  -  train_rmse: 1.2074606273114024
Epoch  19 / 20  -  train_rmse: 1.1877955759790588
Epoch  20 / 20  -  train_rmse: 1.166826126931909

Test RMSE:

## RBF Kernel

In [21]:
%%time 
# Fit KernelMF with the RBF kernel (rebinds `matrix_fact`)
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')
matrix_fact.fit(X_train, y_train)

# Evaluate on the held-out split. RMSE is computed as sqrt(MSE) because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
pred = matrix_fact.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.4404052710497923
Epoch  2 / 20  -  train_rmse: 1.4133103054671263
Epoch  3 / 20  -  train_rmse: 1.3872343284092121
Epoch  4 / 20  -  train_rmse: 1.365868715040777
Epoch  5 / 20  -  train_rmse: 1.3437564744255523
Epoch  6 / 20  -  train_rmse: 1.3165578608407875
Epoch  7 / 20  -  train_rmse: 1.2991979515712953
Epoch  8 / 20  -  train_rmse: 1.2736172769858207
Epoch  9 / 20  -  train_rmse: 1.2548833430519848
Epoch  10 / 20  -  train_rmse: 1.2361580261311385
Epoch  11 / 20  -  train_rmse: 1.214890274643896
Epoch  12 / 20  -  train_rmse: 1.1947179586333139
Epoch  13 / 20  -  train_rmse: 1.1702155401759273
Epoch  14 / 20  -  train_rmse: 1.1523408878864134
Epoch  15 / 20  -  train_rmse: 1.137530659887114
Epoch  16 / 20  -  train_rmse: 1.1211523301727286
Epoch  17 / 20  -  train_rmse: 1.1010582912392204
Epoch  18 / 20  -  train_rmse: 1.082337332436458
Epoch  19 / 20  -  train_rmse: 1.0688448300640963
Epoch  20 / 20  -  train_rmse: 1.0576183805577954

Test RMSE: 1

# Scikit-learn compatibility

In [22]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Candidate hyperparameters: 3 kernels x 3 factor counts x 3 epoch counts
# x 3 regularisation strengths = 81 combinations. The learning rate is not
# part of the grid, so it stays at the estimator's default.
param_grid = dict(
    kernel=['linear', 'sigmoid', 'rbf'],
    n_factors=[10, 20, 50],
    n_epochs=[10, 20, 50],
    reg=[0, 0.005, 0.1],
)

# Exhaustive 5-fold cross-validated search, scored by negated RMSE and run on all cores
grid_search = GridSearchCV(
    estimator=KernelMF(verbose=0),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


GridSearchCV(cv=5, estimator=KernelMF(gamma=0.01, verbose=0), n_jobs=-1,
             param_grid={'kernel': ['linear', 'sigmoid', 'rbf'],
                         'n_epochs': [10, 20, 50], 'n_factors': [10, 20, 50],
                         'reg': [0, 0.005, 0.1]},
             scoring='neg_root_mean_squared_error', verbose=1)

In [23]:
# Best mean cross-validated score; it is negative because the search was scored
# with 'neg_root_mean_squared_error' (i.e. minus the RMSE)
grid_search.best_score_
# Hyperparameter combination that achieved that score
grid_search.best_params_

-1.414701153390991

{'kernel': 'linear', 'n_epochs': 50, 'n_factors': 10, 'reg': 0.1}