In [1]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Other
import os
import random
import sys

# Reload imported code: with autoreload mode 2, edits to imported modules are
# picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Print all output: show the value of every expression statement in a cell,
# not only the last one. Later cells rely on this to display several results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
    
# One shared seed for both NumPy's global RNG and Python's `random` module.
rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

# Load data

**The MovieLens rating data is available at <https://grouplens.org/datasets/movielens/>.**

In [2]:
# Column names of the raw ratings file. Only the first three columns are
# loaded (see `usecols`), so `timestamp` is named here but never read.
cols = ['user_id', 'item_id', 'rating', 'timestamp']
# movie_data = pd.read_csv('../data/ml-1m/ratings.dat', names = cols, sep = '::', usecols=[0, 1, 2], engine='python')
movie_data = pd.read_csv('../data/ml-100k/u.data', names = cols, sep = '\t', usecols=[0, 1, 2], engine='python')

# Features are the (user, item) pair; the target is the rating.
X = movie_data[['user_id', 'item_id']]
y = movie_data['rating']

# Prepare data
# Pass the seed explicitly so the split is identical every time this cell is
# run, instead of depending on how much of the global NumPy RNG stream has
# already been consumed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_seed)

# Prepare data for online learning
# Returns an initial training split plus separate update/test splits that are
# used by the "Updating with new users" sections below.
X_train_initial, y_train_initial, X_train_update, y_train_update, X_test_update, y_test_update = train_update_test_split(movie_data, frac_new_users=0.2)

movie_data.head(10)

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


# Simple model with global mean

Predicting the global mean rating for every user–item pair gives a test RMSE that is approximately the standard deviation of the ratings.

In [3]:
# Constant predictor: every test rating is predicted as the mean training rating.
global_mean = y_train.mean()
pred = np.full(y_test.shape[0], global_mean)

mse = mean_squared_error(y_test, pred)
rmse = mse ** 0.5

# Use `.4f` (four decimals) like the other cells in this notebook. The previous
# `4f` spec only set a minimum field width of 4 and left the precision at the
# default of six decimals.
print(f'\nTest RMSE: {rmse:.4f}')


Test RMSE: 1.120652


# Baseline Model with biases

## SGD

In [4]:
%%time

baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
mse = mean_squared_error(y_test, pred)
rmse = mse ** 0.5

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9686423944865309
Epoch  2 / 20  -  train_rmse: 0.945454644947265
Epoch  3 / 20  -  train_rmse: 0.9351504213186972
Epoch  4 / 20  -  train_rmse: 0.9295761495251743
Epoch  5 / 20  -  train_rmse: 0.9258745652248292
Epoch  6 / 20  -  train_rmse: 0.9236050331012716
Epoch  7 / 20  -  train_rmse: 0.9218177186021692
Epoch  8 / 20  -  train_rmse: 0.9207562141689237
Epoch  9 / 20  -  train_rmse: 0.9197623029873546
Epoch  10 / 20  -  train_rmse: 0.9189995712834659
Epoch  11 / 20  -  train_rmse: 0.918451462605598
Epoch  12 / 20  -  train_rmse: 0.9180495122471143
Epoch  13 / 20  -  train_rmse: 0.9175399439606876
Epoch  14 / 20  -  train_rmse: 0.9171618159264271
Epoch  15 / 20  -  train_rmse: 0.9169935669356354
Epoch  16 / 20  -  train_rmse: 0.9168335455723187
Epoch  17 / 20  -  train_rmse: 0.9164828124223623
Epoch  18 / 20  -  train_rmse: 0.9164100229768548
Epoch  19 / 20  -  train_rmse: 0.9162339753489479
Epoch  20 / 20  -  train_rmse: 0.9159991874080741

Test RMSE:

  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)


In [5]:
# Show recommendations for user 200 from the SGD baseline fitted above.
# NOTE(review): unlike the KernelMF recommendation cell further down, no
# `items_known` is passed here, so items the user has already rated are
# presumably not filtered out - confirm against the library.
baseline_model.recommend(user=200)

Unnamed: 0,user_id,item_id,rating_pred
388,200,408,5.0
212,200,169,5.0
790,200,114,5.0
378,200,318,5.0
281,200,483,5.0
338,200,64,5.0
726,200,513,5.0
988,200,1449,5.0
188,200,178,5.0
54,200,603,5.0


## ALS

In [6]:
%%time

baseline_model = BaselineModel(method='als', n_epochs = 20, reg = 0.5, verbose=1)
baseline_model.fit(X_train, y_train)

pred = baseline_model.predict(X_test)
mse = mean_squared_error(y_test, pred)
rmse = mse ** 0.5

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 0.9312489364350157
Epoch  2 / 20  -  train_rmse: 0.9144875214764501
Epoch  3 / 20  -  train_rmse: 0.9134856911195807
Epoch  4 / 20  -  train_rmse: 0.9133800448918423
Epoch  5 / 20  -  train_rmse: 0.9133615794862777
Epoch  6 / 20  -  train_rmse: 0.9133565857003941
Epoch  7 / 20  -  train_rmse: 0.9133544601244424
Epoch  8 / 20  -  train_rmse: 0.9133531004630441
Epoch  9 / 20  -  train_rmse: 0.9133519902067218
Epoch  10 / 20  -  train_rmse: 0.9133509792033206
Epoch  11 / 20  -  train_rmse: 0.9133500175542733
Epoch  12 / 20  -  train_rmse: 0.9133490869495551
Epoch  13 / 20  -  train_rmse: 0.9133481801287349
Epoch  14 / 20  -  train_rmse: 0.9133472939684136
Epoch  15 / 20  -  train_rmse: 0.9133464269599311
Epoch  16 / 20  -  train_rmse: 0.9133455782426871
Epoch  17 / 20  -  train_rmse: 0.9133447472230197
Epoch  18 / 20  -  train_rmse: 0.9133439334215674
Epoch  19 / 20  -  train_rmse: 0.9133431364114416
Epoch  20 / 20  -  train_rmse: 0.9133423557930989

Test RMS

  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)


## Updating with new users

In [7]:
# Fit the SGD baseline on the initial training split only; the update split
# is fed in afterwards via `update_users`.
baseline_model = BaselineModel(
    method='sgd',
    n_epochs=20,
    lr=0.01,
    reg=0.05,
    verbose=1,
)
baseline_model.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 0.964686930194181
Epoch  2 / 20  -  train_rmse: 0.9427319362325008
Epoch  3 / 20  -  train_rmse: 0.9333556569724756
Epoch  4 / 20  -  train_rmse: 0.9281134929077341
Epoch  5 / 20  -  train_rmse: 0.924616252210722
Epoch  6 / 20  -  train_rmse: 0.9224724060783375
Epoch  7 / 20  -  train_rmse: 0.9209601096060667
Epoch  8 / 20  -  train_rmse: 0.9197715584060696
Epoch  9 / 20  -  train_rmse: 0.9189359161243891
Epoch  10 / 20  -  train_rmse: 0.9181645778586561
Epoch  11 / 20  -  train_rmse: 0.9176273893537576
Epoch  12 / 20  -  train_rmse: 0.9170737789422722
Epoch  13 / 20  -  train_rmse: 0.9168711098095811
Epoch  14 / 20  -  train_rmse: 0.9163025570472149
Epoch  15 / 20  -  train_rmse: 0.9162865690867323
Epoch  16 / 20  -  train_rmse: 0.9159995138329357
Epoch  17 / 20  -  train_rmse: 0.9161785855136518
Epoch  18 / 20  -  train_rmse: 0.9156641482148411
Epoch  19 / 20  -  train_rmse: 0.9155810137145801
Epoch  20 / 20  -  train_rmse: 0.9154245995660817


0,1,2
,method,'sgd'
,n_epochs,20
,reg,0.05
,lr,0.01
,min_rating,0
,max_rating,5
,verbose,1


In [8]:
%%time
baseline_model.update_users(X_train_update, y_train_update, n_epochs=20, lr=0.001, verbose=1)
pred = baseline_model.predict(X_test_update)
mse = mean_squared_error(y_test_update, pred)
rmse = mse ** 0.5

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0195780914433232
Epoch  2 / 20  -  train_rmse: 1.002900083577449
Epoch  3 / 20  -  train_rmse: 0.9904109648728807
Epoch  4 / 20  -  train_rmse: 0.9809717028029952
Epoch  5 / 20  -  train_rmse: 0.9736893030283219
Epoch  6 / 20  -  train_rmse: 0.9679197401301578
Epoch  7 / 20  -  train_rmse: 0.9633203390747541
Epoch  8 / 20  -  train_rmse: 0.959540548341889
Epoch  9 / 20  -  train_rmse: 0.9564021050290984
Epoch  10 / 20  -  train_rmse: 0.9537597542578576
Epoch  11 / 20  -  train_rmse: 0.9515121413845853
Epoch  12 / 20  -  train_rmse: 0.9495673919686078
Epoch  13 / 20  -  train_rmse: 0.9478626173370919
Epoch  14 / 20  -  train_rmse: 0.9463652169346675
Epoch  15 / 20  -  train_rmse: 0.9450349543415082
Epoch  16 / 20  -  train_rmse: 0.9438493769950023
Epoch  17 / 20  -  train_rmse: 0.9427772080766882
Epoch  18 / 20  -  train_rmse: 0.941809492904174
Epoch  19 / 20  -  train_rmse: 0.9409270128310528
Epoch  20 / 20  -  train_rmse: 0.9401240030612253

Test RMSE: 

  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)


# Matrix Factorization

## Linear Kernel

In [9]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.005)
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
mse = mean_squared_error(y_test, pred)
rmse = mse ** 0.5

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0801251099047668
Epoch  2 / 20  -  train_rmse: 1.047348115694743
Epoch  3 / 20  -  train_rmse: 1.0245119050469507
Epoch  4 / 20  -  train_rmse: 1.007507162802038
Epoch  5 / 20  -  train_rmse: 0.9942485994051239
Epoch  6 / 20  -  train_rmse: 0.9835016540514843
Epoch  7 / 20  -  train_rmse: 0.9745253004390946
Epoch  8 / 20  -  train_rmse: 0.9668164141621324
Epoch  9 / 20  -  train_rmse: 0.9600677268374247
Epoch  10 / 20  -  train_rmse: 0.9540559701577022
Epoch  11 / 20  -  train_rmse: 0.9486189249653856
Epoch  12 / 20  -  train_rmse: 0.9436393059320735
Epoch  13 / 20  -  train_rmse: 0.9390301282567551
Epoch  14 / 20  -  train_rmse: 0.9347253204516975
Epoch  15 / 20  -  train_rmse: 0.9306723288834663
Epoch  16 / 20  -  train_rmse: 0.9268332372919156
Epoch  17 / 20  -  train_rmse: 0.9231727685802924
Epoch  18 / 20  -  train_rmse: 0.9196608757760587
Epoch  19 / 20  -  train_rmse: 0.9162785037341064
Epoch  20 / 20  -  train_rmse: 0.9130054151762321

Test RMSE:

  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)


## Getting list of recommendations for a user

In [10]:
user = 200
# Items this user rated in the training split; handed to `recommend` as
# `items_known` (presumably so they are left out of the result - confirm
# against the library).
items_known = X_train.loc[X_train['user_id'] == user, 'item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
37,200,64,5.0
242,200,357,4.952953
11,200,127,4.914617
61,200,272,4.903618
395,200,480,4.837317
710,200,479,4.837083
275,200,12,4.814873
655,200,427,4.808881
55,200,511,4.805896
144,200,285,4.797134


## Updating with new users

In [11]:
# Fit a fresh KernelMF on the initial training split only; the update split
# is fed in afterwards via `update_users`.
matrix_fact = KernelMF(
    n_epochs=20,
    n_factors=100,
    verbose=1,
    lr=0.001,
    reg=0.005,
)
matrix_fact.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 1.070538707802871
Epoch  2 / 20  -  train_rmse: 1.0382159716897121
Epoch  3 / 20  -  train_rmse: 1.0162031280663388
Epoch  4 / 20  -  train_rmse: 0.9999204990611213
Epoch  5 / 20  -  train_rmse: 0.9872294909194177
Epoch  6 / 20  -  train_rmse: 0.9769249789892207
Epoch  7 / 20  -  train_rmse: 0.9682958041917087
Epoch  8 / 20  -  train_rmse: 0.9608895565638447
Epoch  9 / 20  -  train_rmse: 0.9543889595540945
Epoch  10 / 20  -  train_rmse: 0.948578659813844
Epoch  11 / 20  -  train_rmse: 0.9433055970501156
Epoch  12 / 20  -  train_rmse: 0.938461641360533
Epoch  13 / 20  -  train_rmse: 0.9339642006418014
Epoch  14 / 20  -  train_rmse: 0.9297502588547513
Epoch  15 / 20  -  train_rmse: 0.9257709608859865
Epoch  16 / 20  -  train_rmse: 0.9219862741469365
Epoch  17 / 20  -  train_rmse: 0.9183655667900865
Epoch  18 / 20  -  train_rmse: 0.9148845245960727
Epoch  19 / 20  -  train_rmse: 0.9115179681015021
Epoch  20 / 20  -  train_rmse: 0.9082505316006381


0,1,2
,n_factors,100
,n_epochs,20
,kernel,'linear'
,gamma,0.01
,reg,0.005
,lr,0.001
,init_mean,0
,init_sd,0.1
,min_rating,0
,max_rating,5


In [12]:
%%time
# Update model with new users
matrix_fact.update_users(X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1)
pred = matrix_fact.predict(X_test_update)
mse = mean_squared_error(y_test_update, pred)
rmse = mse ** 0.5

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.0397450554800525
Epoch  2 / 20  -  train_rmse: 1.0204337379810584
Epoch  3 / 20  -  train_rmse: 1.0058096346155325
Epoch  4 / 20  -  train_rmse: 0.9945487509230668
Epoch  5 / 20  -  train_rmse: 0.9856788852385475
Epoch  6 / 20  -  train_rmse: 0.9785502807148969
Epoch  7 / 20  -  train_rmse: 0.9727211384013853
Epoch  8 / 20  -  train_rmse: 0.9678294345771077
Epoch  9 / 20  -  train_rmse: 0.9636503985446767
Epoch  10 / 20  -  train_rmse: 0.960009401734517
Epoch  11 / 20  -  train_rmse: 0.9567808372197321
Epoch  12 / 20  -  train_rmse: 0.9538862549785115
Epoch  13 / 20  -  train_rmse: 0.9512620535601249
Epoch  14 / 20  -  train_rmse: 0.9488473078085282
Epoch  15 / 20  -  train_rmse: 0.9466082136054945
Epoch  16 / 20  -  train_rmse: 0.9445122840152649
Epoch  17 / 20  -  train_rmse: 0.9425388443923118
Epoch  18 / 20  -  train_rmse: 0.9406656946030653
Epoch  19 / 20  -  train_rmse: 0.9388828051919528
Epoch  20 / 20  -  train_rmse: 0.9371795087892466

Test RMSE

  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)


## Sigmoid kernel

In [13]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.01, reg = 0.005, kernel='sigmoid')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
mse = mean_squared_error(y_test, pred)
rmse = mse ** 0.5

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.7254805205751333
Epoch  2 / 20  -  train_rmse: 1.700298323525964
Epoch  3 / 20  -  train_rmse: 1.662238337802511
Epoch  4 / 20  -  train_rmse: 1.6209458229579268
Epoch  5 / 20  -  train_rmse: 1.5756407606745388
Epoch  6 / 20  -  train_rmse: 1.523380376461499
Epoch  7 / 20  -  train_rmse: 1.46584264477677
Epoch  8 / 20  -  train_rmse: 1.4093903037737305
Epoch  9 / 20  -  train_rmse: 1.3583723368736296
Epoch  10 / 20  -  train_rmse: 1.3132907595663301
Epoch  11 / 20  -  train_rmse: 1.2739409058784805
Epoch  12 / 20  -  train_rmse: 1.2393439462737148
Epoch  13 / 20  -  train_rmse: 1.2087482554275188
Epoch  14 / 20  -  train_rmse: 1.1814329531446852
Epoch  15 / 20  -  train_rmse: 1.1569723519850141
Epoch  16 / 20  -  train_rmse: 1.1349239855313502
Epoch  17 / 20  -  train_rmse: 1.1148887056124974
Epoch  18 / 20  -  train_rmse: 1.096588994313544
Epoch  19 / 20  -  train_rmse: 1.079775939120299
Epoch  20 / 20  -  train_rmse: 1.0642499147786675

Test RMSE: 1.11

  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)


## RBF Kernel

In [14]:
%%time 
matrix_fact = KernelMF(n_epochs = 20, n_factors = 100, verbose = 1, lr = 0.5, reg = 0.005, kernel='rbf')
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
mse = mean_squared_error(y_test, pred)
rmse = mse ** 0.5

print(f'\nTest RMSE: {rmse:.4f}')

Epoch  1 / 20  -  train_rmse: 1.2622950262198738
Epoch  2 / 20  -  train_rmse: 1.1102992660855353
Epoch  3 / 20  -  train_rmse: 1.0463669013800458
Epoch  4 / 20  -  train_rmse: 1.0040136262041024
Epoch  5 / 20  -  train_rmse: 0.9735098603587192
Epoch  6 / 20  -  train_rmse: 0.9523727245812303
Epoch  7 / 20  -  train_rmse: 0.9348634981032051
Epoch  8 / 20  -  train_rmse: 0.9229699775438122
Epoch  9 / 20  -  train_rmse: 0.913491431161034
Epoch  10 / 20  -  train_rmse: 0.9033072087853409
Epoch  11 / 20  -  train_rmse: 0.9002857264773999
Epoch  12 / 20  -  train_rmse: 0.8961511599895111
Epoch  13 / 20  -  train_rmse: 0.8927527284373504
Epoch  14 / 20  -  train_rmse: 0.8900681688285926
Epoch  15 / 20  -  train_rmse: 0.8872865719163149
Epoch  16 / 20  -  train_rmse: 0.8851759267879175
Epoch  17 / 20  -  train_rmse: 0.8838043754937456
Epoch  18 / 20  -  train_rmse: 0.8824818565867462
Epoch  19 / 20  -  train_rmse: 0.8837292387381509
Epoch  20 / 20  -  train_rmse: 0.8810714770613943

Test RMSE

  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)


# Scikit-learn compatibility

In [15]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Search space: 3 kernels x 3 factor counts x 3 epoch counts x 3 regularization
# strengths = 81 candidates, each evaluated with 5-fold cross-validation.
param_grid = dict(
    kernel=['linear', 'sigmoid', 'rbf'],
    n_factors=[10, 20, 50],
    n_epochs=[10, 20, 50],
    reg=[0, 0.005, 0.1],
)

# KernelMF is passed directly as the estimator; scores are negated RMSE, so
# larger (closer to zero) is better.
grid_search = GridSearchCV(
    KernelMF(verbose=0),
    scoring='neg_root_mean_squared_error',
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_map)
  X.loc[:, "item_id"] = X["item_id"].map(self.item_id_ma

0,1,2
,estimator,"KernelMF(gamm...01, verbose=0)"
,param_grid,"{'kernel': ['linear', 'sigmoid', ...], 'n_epochs': [10, 20, ...], 'n_factors': [10, 20, ...], 'reg': [0, 0.005, ...]}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_factors,50
,n_epochs,50
,kernel,'linear'
,gamma,0.01
,reg,0.1
,lr,0.01
,init_mean,0
,init_sd,0.1
,min_rating,0
,max_rating,5


In [16]:
# Best cross-validated score and the parameters that produced it. The score is
# negative because the search used 'neg_root_mean_squared_error'; its absolute
# value is the mean RMSE across folds. Both expressions are displayed because
# the notebook sets ast_node_interactivity = "all".
grid_search.best_score_
grid_search.best_params_

-0.9259453454713402

{'kernel': 'linear', 'n_epochs': 50, 'n_factors': 50, 'reg': 0.1}