In [1]:
import pandas as pd
from surprise import Dataset, Reader

import os

DATA_DIR = "data"
train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
X = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(train_data, reader)

In [22]:
import numpy as np
from surprise import NMF
from surprise.model_selection import RandomizedSearchCV, GridSearchCV

reg_range = list(10 ** (np.arange(-5, 0, 0.5)))
param_grid = {
    # "n_epochs": np.arange(25, 500, 25),
    # "n_factors": np.arange(1, 100, 2),
    "n_epochs": [50],
    "n_factors": [17],
    "biased": [False],
    "reg_pu": [0.03162277660168379],
    "reg_qi": [0.31622776601683794],
    "reg_bu": [3.1622776601683795e-05],
    "reg_bi": [0.1],
    "lr_bu": [0.0001],
    "lr_bi": [3.1622776601683795e-05],
    # "init_low": [0, 0.1, 0.2, 0.3],
    # "init_high": [0.8, 0.9, 1, 1.2, 1.5, 2, 5],
}

rs2 = GridSearchCV(
    NMF, param_grid, measures=["rmse"], cv=4, n_jobs=-1, joblib_verbose=5
)
rs2.fit(data)
print(rs2.best_score["rmse"])
print(rs2.best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   32.9s


0.6948359414428594
{'rmse': {'n_epochs': 50, 'n_factors': 17, 'biased': False, 'reg_pu': 0.03162277660168379, 'reg_qi': 0.31622776601683794, 'reg_bu': 3.1622776601683795e-05, 'reg_bi': 0.1, 'lr_bu': 0.0001, 'lr_bi': 3.1622776601683795e-05}}


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   51.6s finished


In [24]:
import numpy as np
import scipy.stats as st
from surprise import NMF
from surprise.model_selection import RandomizedSearchCV, GridSearchCV

param_grid = {
    "n_epochs": [50],
    "n_factors": [1, 2, 17, 50, 100],
    "biased": [False],
    "reg_pu": st.norm(0.03162277660168379, 0.01),
    "reg_qi": st.norm(0.31622776601683794, 0.1),
    "reg_bu": st.norm(3.1622776601683795e-05, 1e-5),
    "reg_bi": st.norm(0.1, 0.0333),
    "lr_bu": st.norm(0.0001, 0.00003333),
    "lr_bi": st.norm(3.1622776601683795e-05, 1e-05),
    # "init_low": [0, 0.1, 0.2, 0.3],
    # "init_high": [0.8, 0.9, 1, 1.2, 1.5, 2, 5],
}

rs3 = RandomizedSearchCV(
    NMF, param_grid, measures=["rmse"], cv=4, n_jobs=-1, joblib_verbose=5, n_iter=1000
)
rs3.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   58.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 3026 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 3512 tasks      | elapsed: 11.3min


0.6948359414428594
{'rmse': {'n_epochs': 50, 'n_factors': 17, 'biased': False, 'reg_pu': 0.03162277660168379, 'reg_qi': 0.31622776601683794, 'reg_bu': 3.1622776601683795e-05, 'reg_bi': 0.1, 'lr_bu': 0.0001, 'lr_bi': 3.1622776601683795e-05}}


[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed: 12.9min finished


In [25]:
print(rs3.best_score["rmse"])
print(rs3.best_params)

0.6810504625281086
{'rmse': {'biased': False, 'lr_bi': 1.8372587422887998e-05, 'lr_bu': 8.204276919366751e-05, 'n_epochs': 50, 'n_factors': 100, 'reg_bi': 0.025999804510939015, 'reg_bu': 3.5090782977079864e-05, 'reg_pu': 0.036103747378787965, 'reg_qi': 0.29092834001020496}}


In [30]:
import numpy as np
import scipy.stats as st
from surprise import NMF
from surprise.model_selection import RandomizedSearchCV, GridSearchCV

param_grid = {
    "n_epochs": [50],
    "n_factors": np.linspace(50, 150, 11).astype(int),
    "biased": [False],
    "reg_pu": st.norm(0.036103747378787965, 0.01),
    "reg_qi": st.norm(0.29092834001020496, 0.1),
    # "reg_bu": st.norm(3.1622776601683795e-05, 1e-5),
    # "reg_bi": st.norm(0.1 ,0.0333),
    # "lr_bu": st.norm(0.0001, 0.00003333),
    # "lr_bi": st.norm(3.1622776601683795e-05, 1e-05),
    # "init_low": [0, 0.1, 0.2, 0.3],
    # "init_high": [0.8, 0.9, 1, 1.2, 1.5, 2, 5],
}

rs4 = RandomizedSearchCV(
    NMF, param_grid, measures=["rmse"], cv=4, n_jobs=-1, joblib_verbose=5, n_iter=1000
)
rs4.fit(data)
print(rs4.best_score["rmse"])
print(rs4.best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 21.3min
[Parallel(n_jobs=-1)]: Done 3026 tasks      | elapsed: 24.8min
[Parallel(n_jobs=-1)]: Done 3512 tasks      | elapsed: 28.6min


0.6753898816370278
{'rmse': {'biased': False, 'n_epochs': 50, 'n_factors': 140, 'reg_pu': 0.02648205778119174, 'reg_qi': 0.41705140795558326}}


[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed: 64.1min finished


In [31]:
import numpy as np
import scipy.stats as st
from surprise import NMF
from surprise.model_selection import RandomizedSearchCV, GridSearchCV

param_grid = {
    "n_epochs": np.arange(1, 300),
    "n_factors": [140],
    "biased": [False],
    "reg_pu": [0.02648205778119174],
    "reg_qi": [0.41705140795558326],
    # "reg_bu": st.norm(3.1622776601683795e-05, 1e-5),
    # "reg_bi": st.norm(0.1 ,0.0333),
    # "lr_bu": st.norm(0.0001, 0.00003333),
    # "lr_bi": st.norm(3.1622776601683795e-05, 1e-05),
    # "init_low": [0, 0.1, 0.2, 0.3],
    # "init_high": [0.8, 0.9, 1, 1.2, 1.5, 2, 5],
}

rs5 = GridSearchCV(
    NMF, param_grid, measures=["rmse"], cv=4, n_jobs=-1, joblib_verbose=5
)
rs5.fit(data)
print(rs5.best_score["rmse"])
print(rs5.best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 31.1min


0.6779387393164802
{'rmse': {'n_epochs': 52, 'n_factors': 140, 'biased': False, 'reg_pu': 0.02648205778119174, 'reg_qi': 0.41705140795558326}}


[Parallel(n_jobs=-1)]: Done 1196 out of 1196 | elapsed: 96.3min finished


In [32]:
nmf = NMF(**{'n_epochs': 52, 'n_factors': 140, 'biased': False, 'reg_pu': 0.02648205778119174, 'reg_qi': 0.41705140795558326})
nmf.fit(data.build_full_trainset())
from predict import predict
predict(nmf,  "nmf.csv")