In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read every goodbooks-10k table up front so later cells can use them directly.
# The files are read in the same order as the names on the left-hand side.
books, book_tags, ratings, tags, to_read = (
    pd.read_csv(csv_path)
    for csv_path in (
        "goodbooks-10k/books.csv",
        "goodbooks-10k/book_tags.csv",
        "goodbooks-10k/ratings.csv",
        "goodbooks-10k/tags.csv",
        "goodbooks-10k/to_read.csv",
    )
)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column of `ratings`.
ratings.describe()

Unnamed: 0,user_id,book_id,rating
count,5976479.0,5976479.0,5976479.0
mean,26224.46,2006.477,3.919866
std,15413.23,2468.499,0.9910868
min,1.0,1.0,1.0
25%,12813.0,198.0,3.0
50%,25938.0,885.0,4.0
75%,39509.0,2973.0,5.0
max,53424.0,10000.0,5.0


In [4]:
# I will build a pure collaborative recommender first, then compare it against a hybrid model. First, let's get
# a set of training and testing data. (Should we use more than one split for cross-validation?)

from surprise import Dataset, Reader, SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import cross_validate, GridSearchCV

In [6]:
# Surprise's Dataset.load_from_df expects exactly three columns in the order (user, item, rating),
# so select them by name instead of relying on the column layout of the CSV.
reader = Reader(rating_scale=(1, 5))  # ratings span 1..5 (min/max reported by ratings.describe())
data = Dataset.load_from_df(ratings[["user_id", "book_id", "rating"]], reader=reader)

In [15]:
# Baseline: SVD with its default hyper-parameters, scored by 5-fold cross-validation.
# verbose=True prints the per-fold table; the returned dict of per-fold scores and timings
# is also shown because the call is the last expression in the cell.
algorithm = SVD()

cross_validate(
    algorithm,
    data,
    measures=["RMSE", "MAE"],
    cv=5,
    n_jobs=-3,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8311  0.8299  0.8301  0.8302  0.8300  0.8303  0.0004  
MAE (testset)     0.6413  0.6410  0.6407  0.6410  0.6408  0.6409  0.0002  
Fit time          160.23  158.49  163.63  159.78  161.14  160.65  1.71    
Test time         13.09   13.64   13.77   13.68   13.09   13.45   0.30    


{'test_rmse': array([0.83105239, 0.82991082, 0.83008775, 0.83021145, 0.83002173]),
 'test_mae': array([0.64131644, 0.64095053, 0.64065936, 0.64095351, 0.64079814]),
 'fit_time': (160.22657322883606,
  158.4946985244751,
  163.62677383422852,
  159.7800898551941,
  161.13809657096863),
 'test_time': (13.091276407241821,
  13.642457962036133,
  13.770890712738037,
  13.683136701583862,
  13.0865797996521)}

In [17]:
# Exhaustive search over a small SVD hyper-parameter grid, scored with 3-fold cross-validation.
# NOTE(review): the best RMSE recorded for this grid is worse than the default-parameter SVD run
# earlier in the notebook — presumably because the grid does not include SVD's default settings;
# confirm against the Surprise docs before drawing conclusions from it.
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6],
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# Lowest mean RMSE found across the grid, followed by the parameter combination that produced it.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

0.8669724032141334
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [None]:
# Benchmark every imported Surprise algorithm with 3-fold cross-validated RMSE and rank them.
# NOTE(review): this cell has no execution count in the saved notebook, so no completed run is
# recorded. Presumably the KNN* models need a user-by-user similarity matrix (~53k users here) —
# check memory before running their folds in parallel.
benchmark = []
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(),
                  KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # cross_validate returns a dict of per-fold values (test_rmse, fit_time, test_time).
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False, n_jobs=-2)

    # Average each metric over the folds and label the row with the algorithm's class name.
    # The row is a plain dict: Series.append was removed in pandas 2.0, and appending a string
    # to a float Series would also have forced every column to object dtype.
    row = {metric: np.mean(values) for metric, values in results.items()}
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

# One row per algorithm, best (lowest) RMSE first.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

In [9]:
# Show how many CPUs the multiprocessing module reports for this machine.
import multiprocessing

cpu_total = multiprocessing.cpu_count()
cpu_total

24