In [1]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Other
import os
import random
import sys

# Reload imported code 
%load_ext autoreload
%autoreload 2

# Print all output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
    
rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

# Load data

**The MovieLens rating data can be downloaded from https://grouplens.org/datasets/movielens/ (this notebook reads the `ml-100k` release).**

In [10]:
from pathlib import Path

# Column names assigned to the ratings file, in on-disk order.
# (An earlier variant of this cell read '../data/ml-1m/ratings.dat' with the
# same names, using '::' as the separator.)
cols = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]
def get_project_root(marker: str = 'GitHub') -> Path:
    """Return the directory that contains the ``marker`` folder.

    The search starts at the current working directory and walks upward,
    returning the nearest directory that has a sub-directory named
    ``marker``. This replaces a lookup that depended solely on
    ``sys.path[5]``, whose value (and existence) varies between interpreters
    and environments.

    Parameters
    ----------
    marker : str, default 'GitHub'
        Name of the folder expected directly under the returned directory.
        The default matches the first component of the data path built from
        this function's result.

    Returns
    -------
    pathlib.Path
        The nearest ancestor of the working directory (or the working
        directory itself) containing ``marker``; if none is found, the
        result of the legacy ``sys.path``-based heuristic.

    Raises
    ------
    FileNotFoundError
        If no ancestor contains ``marker`` and ``sys.path`` is too short for
        the legacy heuristic.
    """
    cwd = Path.cwd()
    for candidate in (cwd, *cwd.parents):
        if (candidate / marker).is_dir():
            return candidate

    # Legacy heuristic, kept so setups that relied on it keep resolving the
    # same location; guarded so a short sys.path cannot raise IndexError.
    legacy_index = 5
    if len(sys.path) > legacy_index:
        return Path(sys.path[legacy_index]).parent.parent.parent.parent

    raise FileNotFoundError(
        f"Could not find a '{marker}' directory in {cwd} or any of its parents"
    )

# Data location: the MovieLens 100k ratings file inside the local repository checkout
ROOT_DIR = get_project_root()
DATA_DIR = os.path.join(ROOT_DIR, 'GitHub/Recommender-System-for-AR-Glasses/data/ml-100k/u.data')

# The file is tab-separated and column names are supplied explicitly; only the
# first three columns are loaded, so 'timestamp' is dropped at read time.
# A single-character separator is handled by pandas' default parser, so no
# engine override is needed.
movie_data = pd.read_csv(DATA_DIR, names=cols, sep='\t', usecols=[0, 1, 2])

X = movie_data[['user_id', 'item_id']]
y = movie_data['rating']

# Prepare data: hold out 20% of the ratings for testing. The seed is passed
# explicitly so the split does not depend on how much of the global NumPy RNG
# state earlier (possibly re-run) cells have consumed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_seed)

# Prepare data for online learning (initial fit / update with new users / test on those users)
(
    X_train_initial,
    y_train_initial,
    X_train_update,
    y_train_update,
    X_test_update,
    y_test_update,
) = train_update_test_split(movie_data, frac_new_users=0.2)

movie_data.head(10)

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


# Simple model with global mean

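Predicting the global mean rating for every test row gives an RMSE that is approximately the standard deviation of the ratings, so this serves as a reference point for the models below.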

In [9]:
# Naive baseline: predict the training-set mean rating for every test row
global_mean = y_train.mean()
pred = np.full(len(y_test), global_mean)

# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, pred))

# `.4f` (precision), not `4f` (field width), so the output matches the other cells
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.120652


# Baseline Model with biases

## SGD

In [8]:
%%time

# Bias-only baseline model trained with SGD on the training split
baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9683172826185533
Epoch  2 / 20  -  train_rmse: 0.9451415019565567
Epoch  3 / 20  -  train_rmse: 0.9351303325117791
Epoch  4 / 20  -  train_rmse: 0.9296055993119998
Epoch  5 / 20  -  train_rmse: 0.9259708272936896
Epoch  6 / 20  -  train_rmse: 0.9236709816113429
Epoch  7 / 20  -  train_rmse: 0.9218615631410583
Epoch  8 / 20  -  train_rmse: 0.9204950753984653
Epoch  9 / 20  -  train_rmse: 0.919795403994173
Epoch  10 / 20  -  train_rmse: 0.9189174163881855
Epoch  11 / 20  -  train_rmse: 0.9184914381519012
Epoch  12 / 20  -  train_rmse: 0.917986208579641
Epoch  13 / 20  -  train_rmse: 0.9176079181387055
Epoch  14 / 20  -  train_rmse: 0.917261538700951
Epoch  15 / 20  -  train_rmse: 0.9170046482676864
Epoch  16 / 20  -  train_rmse: 0.9168164383929871
Epoch  17 / 20  -  train_rmse: 0.916527192748887
Epoch  18 / 20  -  train_rmse: 0.9163887670520774
Epoch  19 / 20  -  train_rmse: 0.9161326153111765
Epoch  20 / 20  -  train_rmse: 0.9160051739197278

Test RMSE: 0

In [6]:
# Pass the items this user already rated in the training data, mirroring the
# KernelMF recommendation cell further down, so both cells call `recommend`
# the same way.
# NOTE(review): assumes BaselineModel.recommend accepts `items_known` like
# KernelMF.recommend does — confirm against the matrix_factorization package.
user = 200
items_known = X_train.query('user_id == @user')['item_id']
baseline_model.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
388,200,408,5.0
212,200,169,5.0
790,200,114,5.0
338,200,64,5.0
281,200,483,5.0
726,200,513,5.0
188,200,178,5.0
378,200,318,5.0
988,200,1449,5.0
54,200,603,5.0


## ALS

In [7]:
%%time

# Bias-only baseline model trained with ALS on the training split
baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9312489364350157
Epoch  2 / 20  -  train_rmse: 0.9144875214764501
Epoch  3 / 20  -  train_rmse: 0.9134856911195807
Epoch  4 / 20  -  train_rmse: 0.9133800448918423
Epoch  5 / 20  -  train_rmse: 0.9133615794862777
Epoch  6 / 20  -  train_rmse: 0.9133565857003941
Epoch  7 / 20  -  train_rmse: 0.9133544601244424
Epoch  8 / 20  -  train_rmse: 0.9133531004630441
Epoch  9 / 20  -  train_rmse: 0.9133519902067218
Epoch  10 / 20  -  train_rmse: 0.9133509792033206
Epoch  11 / 20  -  train_rmse: 0.9133500175542733
Epoch  12 / 20  -  train_rmse: 0.9133490869495551
Epoch  13 / 20  -  train_rmse: 0.9133481801287349
Epoch  14 / 20  -  train_rmse: 0.9133472939684136
Epoch  15 / 20  -  train_rmse: 0.9133464269599311
Epoch  16 / 20  -  train_rmse: 0.9133455782426871
Epoch  17 / 20  -  train_rmse: 0.9133447472230197
Epoch  18 / 20  -  train_rmse: 0.9133439334215674
Epoch  19 / 20  -  train_rmse: 0.9133431364114416
Epoch  20 / 20  -  train_rmse: 0.9133423557930989

Test RMS

## Updating with new users

In [25]:
# Fit the SGD baseline on the "initial" portion of the online-learning split only
baseline_model = BaselineModel(
    method='sgd',
    n_epochs=20,
    lr=0.01,
    reg=0.05,
    verbose=1,
)
baseline_model.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 0.9645834840497174
Epoch  2 / 20  -  train_rmse: 0.9426788664353346
Epoch  3 / 20  -  train_rmse: 0.933173020480211
Epoch  4 / 20  -  train_rmse: 0.9280176538178754
Epoch  5 / 20  -  train_rmse: 0.9247053572523148
Epoch  6 / 20  -  train_rmse: 0.9225146819558325
Epoch  7 / 20  -  train_rmse: 0.9210361429343735
Epoch  8 / 20  -  train_rmse: 0.9197183131436016
Epoch  9 / 20  -  train_rmse: 0.918955811904028
Epoch  10 / 20  -  train_rmse: 0.9183187677881147
Epoch  11 / 20  -  train_rmse: 0.9176959231504462
Epoch  12 / 20  -  train_rmse: 0.9172373977398547
Epoch  13 / 20  -  train_rmse: 0.916948425669373
Epoch  14 / 20  -  train_rmse: 0.916834745181247
Epoch  15 / 20  -  train_rmse: 0.9164204070644032
Epoch  16 / 20  -  train_rmse: 0.9159426707035059
Epoch  17 / 20  -  train_rmse: 0.9159168764283033
Epoch  18 / 20  -  train_rmse: 0.9156423327116029
Epoch  19 / 20  -  train_rmse: 0.9154243585445992
Epoch  20 / 20  -  train_rmse: 0.9151920735377486


BaselineModel(n_epochs=20, reg=0.05)

In [26]:
%%time
# Update the already-fitted baseline with the "update" portion, then score on the matching test portion
baseline_model.update_users(X_train_update, y_train_update, n_epochs=20, lr=0.001, verbose=1)
pred = baseline_model.predict(X_test_update)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test_update, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0193834108666215
Epoch  2 / 20  -  train_rmse: 1.0026683697314376
Epoch  3 / 20  -  train_rmse: 0.9901769439761929
Epoch  4 / 20  -  train_rmse: 0.9806875040857721
Epoch  5 / 20  -  train_rmse: 0.9733817252991642
Epoch  6 / 20  -  train_rmse: 0.9676198632538002
Epoch  7 / 20  -  train_rmse: 0.9630161417527313
Epoch  8 / 20  -  train_rmse: 0.9592313887131018
Epoch  9 / 20  -  train_rmse: 0.9561021149525532
Epoch  10 / 20  -  train_rmse: 0.9534711264024373
Epoch  11 / 20  -  train_rmse: 0.951223273997874
Epoch  12 / 20  -  train_rmse: 0.9492759622280281
Epoch  13 / 20  -  train_rmse: 0.9475704516646841
Epoch  14 / 20  -  train_rmse: 0.9460696594124194
Epoch  15 / 20  -  train_rmse: 0.944740807364359
Epoch  16 / 20  -  train_rmse: 0.9435485728661793
Epoch  17 / 20  -  train_rmse: 0.9424704089146838
Epoch  18 / 20  -  train_rmse: 0.9414923539812922
Epoch  19 / 20  -  train_rmse: 0.9406025165004821
Epoch  20 / 20  -  train_rmse: 0.9397942459342439

Test RMSE:

# Matrix Factorization

## Linear Kernel

In [27]:
%%time 
# Matrix factorization with the default kernel (no `kernel` argument is passed), 100 latent factors
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0801507140404096
Epoch  2 / 20  -  train_rmse: 1.0473656907296969
Epoch  3 / 20  -  train_rmse: 1.0244903630628588
Epoch  4 / 20  -  train_rmse: 1.0075296857656224
Epoch  5 / 20  -  train_rmse: 0.9942603284722936
Epoch  6 / 20  -  train_rmse: 0.9835111632439683
Epoch  7 / 20  -  train_rmse: 0.974526337601326
Epoch  8 / 20  -  train_rmse: 0.966819553088186
Epoch  9 / 20  -  train_rmse: 0.9600696133965093
Epoch  10 / 20  -  train_rmse: 0.9540596630998097
Epoch  11 / 20  -  train_rmse: 0.9486193990632841
Epoch  12 / 20  -  train_rmse: 0.943639892522998
Epoch  13 / 20  -  train_rmse: 0.9390311757853279
Epoch  14 / 20  -  train_rmse: 0.9347239917770873
Epoch  15 / 20  -  train_rmse: 0.9306733049109561
Epoch  16 / 20  -  train_rmse: 0.9268335313250561
Epoch  17 / 20  -  train_rmse: 0.9231712688864916
Epoch  18 / 20  -  train_rmse: 0.9196596911866853
Epoch  19 / 20  -  train_rmse: 0.9162768688316496
Epoch  20 / 20  -  train_rmse: 0.9130027554530633

Test RMSE: 

## Getting list of recommendations for a user

In [28]:
# Recommend for a single user, handing the model the items that user
# already rated in the training data.
user = 200
items_known = X_train.loc[X_train['user_id'] == user, 'item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
37,200,64,5.0
242,200,357,4.9534
11,200,127,4.915417
61,200,272,4.904673
395,200,480,4.838068
710,200,479,4.836338
275,200,12,4.816772
655,200,427,4.809603
55,200,511,4.804514
17,200,100,4.799981


## Updating with new users

In [29]:
# Fit a fresh factorization model on the "initial" portion of the online-learning split only
matrix_fact = KernelMF(
    n_epochs=20,
    n_factors=100,
    verbose=1,
    lr=0.001,
    reg=0.005,
)
matrix_fact.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 1.070532534908358
Epoch  2 / 20  -  train_rmse: 1.0382550814888798
Epoch  3 / 20  -  train_rmse: 1.016223567186231
Epoch  4 / 20  -  train_rmse: 0.9999259150734995
Epoch  5 / 20  -  train_rmse: 0.9872230783607883
Epoch  6 / 20  -  train_rmse: 0.9769180735139289
Epoch  7 / 20  -  train_rmse: 0.9682961252963588
Epoch  8 / 20  -  train_rmse: 0.960892150084995
Epoch  9 / 20  -  train_rmse: 0.9543918379200625
Epoch  10 / 20  -  train_rmse: 0.9485783384518195
Epoch  11 / 20  -  train_rmse: 0.9433061637143006
Epoch  12 / 20  -  train_rmse: 0.938463263379901
Epoch  13 / 20  -  train_rmse: 0.9339650936805841
Epoch  14 / 20  -  train_rmse: 0.929751847441957
Epoch  15 / 20  -  train_rmse: 0.9257716553695285
Epoch  16 / 20  -  train_rmse: 0.9219882225437939
Epoch  17 / 20  -  train_rmse: 0.9183672795894445
Epoch  18 / 20  -  train_rmse: 0.9148850583690756
Epoch  19 / 20  -  train_rmse: 0.9115196571475493
Epoch  20 / 20  -  train_rmse: 0.908250954121005


KernelMF(gamma=0.01, lr=0.001, n_epochs=20, reg=0.005)

In [30]:
%%time
# Update model with new users, then score on the matching test portion
matrix_fact.update_users(X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1)
pred = matrix_fact.predict(X_test_update)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test_update, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0398438867441708
Epoch  2 / 20  -  train_rmse: 1.0205298587191887
Epoch  3 / 20  -  train_rmse: 1.005898944918304
Epoch  4 / 20  -  train_rmse: 0.9946117622294945
Epoch  5 / 20  -  train_rmse: 0.9857161215586545
Epoch  6 / 20  -  train_rmse: 0.9786001978027283
Epoch  7 / 20  -  train_rmse: 0.9727637689370116
Epoch  8 / 20  -  train_rmse: 0.9678744440780326
Epoch  9 / 20  -  train_rmse: 0.9636880571681337
Epoch  10 / 20  -  train_rmse: 0.9600473744304538
Epoch  11 / 20  -  train_rmse: 0.9568149961083536
Epoch  12 / 20  -  train_rmse: 0.9539207501908638
Epoch  13 / 20  -  train_rmse: 0.9512885302329018
Epoch  14 / 20  -  train_rmse: 0.948871075478375
Epoch  15 / 20  -  train_rmse: 0.9466215968349229
Epoch  16 / 20  -  train_rmse: 0.944521169925318
Epoch  17 / 20  -  train_rmse: 0.9425495193083304
Epoch  18 / 20  -  train_rmse: 0.9406793049352827
Epoch  19 / 20  -  train_rmse: 0.9388980542556391
Epoch  20 / 20  -  train_rmse: 0.9371930705136422

Test RMSE: 

## Sigmoid kernel

In [31]:
%%time 
# Matrix factorization with the sigmoid kernel
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.7254822848969822
Epoch  2 / 20  -  train_rmse: 1.7003142751671472
Epoch  3 / 20  -  train_rmse: 1.6622203537201186
Epoch  4 / 20  -  train_rmse: 1.6210896054829982
Epoch  5 / 20  -  train_rmse: 1.5755969696853245
Epoch  6 / 20  -  train_rmse: 1.523358785200969
Epoch  7 / 20  -  train_rmse: 1.4657473625381845
Epoch  8 / 20  -  train_rmse: 1.4093923649123679
Epoch  9 / 20  -  train_rmse: 1.3583786284889912
Epoch  10 / 20  -  train_rmse: 1.31328765777303
Epoch  11 / 20  -  train_rmse: 1.2739093163622697
Epoch  12 / 20  -  train_rmse: 1.2392970315419771
Epoch  13 / 20  -  train_rmse: 1.208703438927833
Epoch  14 / 20  -  train_rmse: 1.1813865082019266
Epoch  15 / 20  -  train_rmse: 1.1569240600214343
Epoch  16 / 20  -  train_rmse: 1.1348499849610996
Epoch  17 / 20  -  train_rmse: 1.1148490243653801
Epoch  18 / 20  -  train_rmse: 1.0966084892109254
Epoch  19 / 20  -  train_rmse: 1.0798441263248728
Epoch  20 / 20  -  train_rmse: 1.0642930986094383

Test RMSE: 1

## RBF Kernel

In [32]:
%%time 
# Matrix factorization with the RBF kernel
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.261401461583481
Epoch  2 / 20  -  train_rmse: 1.1104716789657518
Epoch  3 / 20  -  train_rmse: 1.0459651301525494
Epoch  4 / 20  -  train_rmse: 1.0041015731544418
Epoch  5 / 20  -  train_rmse: 0.9753249911384554
Epoch  6 / 20  -  train_rmse: 0.9521809987806821
Epoch  7 / 20  -  train_rmse: 0.9348266703462532
Epoch  8 / 20  -  train_rmse: 0.9221728325529471
Epoch  9 / 20  -  train_rmse: 0.91201479599359
Epoch  10 / 20  -  train_rmse: 0.9051753845440291
Epoch  11 / 20  -  train_rmse: 0.9010243233985498
Epoch  12 / 20  -  train_rmse: 0.8937243429616951
Epoch  13 / 20  -  train_rmse: 0.8907050590546616
Epoch  14 / 20  -  train_rmse: 0.8886879944704423
Epoch  15 / 20  -  train_rmse: 0.8866428135120862
Epoch  16 / 20  -  train_rmse: 0.8848659594433683
Epoch  17 / 20  -  train_rmse: 0.8845182030522712
Epoch  18 / 20  -  train_rmse: 0.8833204790414565
Epoch  19 / 20  -  train_rmse: 0.8817838515145145
Epoch  20 / 20  -  train_rmse: 0.8808931018319694

Test RMSE: 

# Scikit-learn compatibility

In [33]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Search space: 3 kernels x 3 factor counts x 3 epoch counts x 3 regularisation values = 81 candidates
param_grid = dict(
    kernel=['linear', 'sigmoid', 'rbf'],
    n_factors=[10, 20, 50],
    n_epochs=[10, 20, 50],
    reg=[0, 0.005, 0.1],
)

# 5-fold cross-validated search over the grid, scored by negated RMSE, using all available cores
grid_search = GridSearchCV(
    estimator=KernelMF(verbose=0),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


GridSearchCV(cv=5, estimator=KernelMF(gamma=0.01, verbose=0), n_jobs=-1,
             param_grid={'kernel': ['linear', 'sigmoid', 'rbf'],
                         'n_epochs': [10, 20, 50], 'n_factors': [10, 20, 50],
                         'reg': [0, 0.005, 0.1]},
             scoring='neg_root_mean_squared_error', verbose=1)

In [34]:
# Best mean cross-validated score. The search was configured with
# scoring='neg_root_mean_squared_error', so this is the negated RMSE.
# Both bare expressions are displayed because ast_node_interactivity is set to "all".
grid_search.best_score_
# Parameter combination that achieved the score above
grid_search.best_params_

-0.9257614456485126

{'kernel': 'linear', 'n_epochs': 50, 'n_factors': 50, 'reg': 0.1}