In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise import Dataset
from surprise import Reader
from surprise import NMF
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
import pickle

In [2]:
ratings = pd.read_csv('ratings.csv', nrows =1000000)
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
6,1,1175,3.5,1147868826
7,1,1217,3.5,1147878326
8,1,1237,5.0,1147868839
9,1,1250,4.0,1147868414


In [3]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

In [4]:
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

## NMF

In [5]:
trainset, testset = train_test_split(data, test_size=.2)

In [6]:
trainset.n_users, trainset.n_items

(6747, 20384)

In [None]:
# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = NMF(n_factors=160, n_epochs=100)
algo.fit(trainset)

In [None]:
# if you wanted to evaluate on the trainset
train_pred = algo.test(trainset.build_testset())

In [None]:
accuracy.rmse(train_pred)

In [None]:
# run the trained model against the testset
test_pred = algo.test(testset)

# get RMSE
print("User-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

In [None]:
param_grid = {'n_factors': [100, 120], 
              'n_epochs': [50, 100], 
              'reg_pu': [0.001, 0.005],
              'reg_qi': [0.001, 0.005],
              'biased': [False]}

In [None]:
gs = GridSearchCV(NMF, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs = -1)
gs.fit(data)

In [None]:
algo = gs.best_estimator['rmse']  # pass the best model to algo
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
result = pd.DataFrame(gs.cv_results)

In [None]:
result.to_csv("output_NMF.csv")

In [None]:
with open('NMF_model.pickle', 'wb') as f:
    pickle.dump(algo, f)

In [7]:
infile = open('NMF_model.pickle','rb')
algo = pickle.load(infile)
infile.close()

In [8]:
# run the trained model against the testset
test_pred = algo.test(testset)

# get RMSE
print("User-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

User-based Model : Test Set
RMSE: 1.7499


1.7498904487657376