In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import surprise

In [4]:
# let's just read in all of the data that we'll need
books = pd.read_csv("goodbooks-10k/books.csv")
book_tags = pd.read_csv("goodbooks-10k/book_tags.csv")
ratings = pd.read_csv("goodbooks-10k/ratings.csv")
tags = pd.read_csv("goodbooks-10k/tags.csv")
to_read = pd.read_csv("goodbooks-10k/to_read.csv")

In [156]:
# Let's start really simple
data = surprise.Dataset.load_from_df(ratings,reader = surprise.Reader(rating_scale=(1,5)))
alg = surprise.SVD(verbose=True)
output = alg.fit(data.build_full_trainset())

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


In [163]:
read_user1 = ratings[ratings['user_id']==1]['book_id']
iid = ratings['book_id'].unique()
unread_user1 =np.setdiff1d(iid,read_user1)

In [164]:
temp = [[1,val,1] for val in unread_user1]
predictions = alg.test(temp)
predictions[:10]

[Prediction(uid=1, iid=1, r_ui=1, est=3.6491611678218225, details={'was_impossible': False}),
 Prediction(uid=1, iid=2, r_ui=1, est=4.162152263349226, details={'was_impossible': False}),
 Prediction(uid=1, iid=3, r_ui=1, est=2.590962943481002, details={'was_impossible': False}),
 Prediction(uid=1, iid=5, r_ui=1, est=3.371438349464442, details={'was_impossible': False}),
 Prediction(uid=1, iid=6, r_ui=1, est=3.593603969448954, details={'was_impossible': False}),
 Prediction(uid=1, iid=7, r_ui=1, est=4.127998034300175, details={'was_impossible': False}),
 Prediction(uid=1, iid=8, r_ui=1, est=3.193479015688997, details={'was_impossible': False}),
 Prediction(uid=1, iid=9, r_ui=1, est=3.298265735442696, details={'was_impossible': False}),
 Prediction(uid=1, iid=12, r_ui=1, est=3.143362245021473, details={'was_impossible': False}),
 Prediction(uid=1, iid=14, r_ui=1, est=3.670174415526104, details={'was_impossible': False})]

In [166]:
pred_ratings = pd.DataFrame(np.array([ [int(pred.iid),pred.est] for pred in predictions]),columns = ['book_id', 'predicted rating'])
pred_ratings.sort_values('predicted rating',ascending = False).head()

Unnamed: 0,book_id,predicted rating
8828,8946.0,4.770566
8958,9076.0,4.682006
3287,3395.0,4.607498
3519,3628.0,4.524609
1015,1106.0,4.509581


In [167]:
pred_ratings = pred_ratings.merge(books[['book_id','goodreads_book_id','title','authors']], on = 'book_id')
    

In [169]:
pred_ratings.sort_values('predicted rating',ascending = False).head(10)

Unnamed: 0,book_id,predicted rating,goodreads_book_id,title,authors
8828,8946.0,4.770566,46292,The Divan,Hafez
8958,9076.0,4.682006,2350129,Preach My Gospel: A Guide To Missionary Service,The Church of Jesus Christ of Latter-day Saints
3287,3395.0,4.607498,71252,The Kindly Ones (The Sandman #9),"Neil Gaiman, Marc Hempel, Richard Case, D'Isra..."
3519,3628.0,4.524609,24812,The Complete Calvin and Hobbes,Bill Watterson
1015,1106.0,4.509581,17267,The Great Divorce,C.S. Lewis
8545,8663.0,4.504512,16164271,"Locke & Key, Vol. 6: Alpha & Omega","Joe Hill, Gabriel Rodríguez"
6802,6920.0,4.499619,24815,The Indispensable Calvin and Hobbes,Bill Watterson
3382,3491.0,4.495754,20342617,Just Mercy: A Story of Justice and Redemption,Bryan Stevenson
1214,1308.0,4.492137,17927395,A Court of Mist and Fury (A Court of Thorns an...,Sarah J. Maas
4595,4708.0,4.480491,9832370,BookRags Summary: A Storm of Swords,BookRags


## Being Better

Great! We succesfully built a recommender. However, we've been pretty bad data scientists so far. We haven't evaluated different models, tuned hyperparameters, or split up our data into train/test sets. Let's do this now!

In [172]:
# first let's tune the hyperparameters first
hyperparams = {'lr_all':[0.001,0.01,0.1], 'reg_all':[0.05,0.1,0.5]}
gs = surprise.model_selection.GridSearchCV(surprise.SVD,hyperparams, measures=['rmse','mae'],cv=3)
gs.fit(data)
print(gs.best_params)

{'rmse': {'lr_all': 0.01, 'reg_all': 0.05}, 'mae': {'lr_all': 0.01, 'reg_all': 0.05}}


In [176]:
import time
alg = surprise.SVD(lr_all = 0.01, reg_all = 0.05)
t1 = time.time()
output = surprise.model_selection.cross_validate(alg, data, verbose = True)
t2 = time.time()

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8139  0.8140  0.8137  0.8138  0.8149  0.8141  0.0004  
MAE (testset)     0.6296  0.6294  0.6296  0.6294  0.6303  0.6297  0.0003  
Fit time          190.78  192.35  194.01  199.24  192.07  193.69  2.96    
Test time         28.68   20.06   26.38   28.24   21.25   24.92   3.59    


In [177]:
t3 = time.time()
output2 = surprise.model_selection.cross_validate(alg, data, verbose = True, n_jobs = -1)
t4= time.time()
print("1: ", t2-t1, "\n2: ", t4-t3)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8145  0.8134  0.8136  0.8148  0.8141  0.8141  0.0005  
MAE (testset)     0.6298  0.6292  0.6293  0.6300  0.6297  0.6296  0.0003  
Fit time          151.31  154.21  158.41  165.96  158.66  157.71  4.95    
Test time         12.28   12.04   11.81   12.25   12.30   12.14   0.19    
1:  1175.8836696147919 
2:  320.0379829406738


In [5]:
# I need to make this fast for exploring purposes. I'm just going to reduce the dataset by 75%
subset = ratings.sample(frac = 0.25)

In [201]:
subset.groupby('user_id').count().sort_values('rating')

Unnamed: 0_level_0,book_id,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
17287,2,2
29841,2,2
29868,2,2
19970,2,2
51725,2,2
...,...,...
3739,61,61
25415,61,61
6634,62,62
11945,63,63


In [202]:
subset.head()

Unnamed: 0,user_id,book_id,rating
3972790,12821,1050,3
511696,9459,44,3
5953497,49250,63,3
5166634,51698,95,5
2060003,18118,6344,4


In [7]:
subset_s = data = surprise.Dataset.load_from_df(subset,reader = surprise.Reader(rating_scale=(1,5)))
benchmark = []
# Iterate over all algorithms... 
# SVD(), SVDpp(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(),
# KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()
for algorithm in [surprise.SVD(),
                  surprise.SVDpp()]:
    # Perform cross validation
    print(algorithm)
    results = surprise.model_selection.cross_validate(algorithm, 
                                                      subset_s, 
                                                      measures=['RMSE'], 
                                                      cv=3, 
                                                      verbose=False,
                                                      n_jobs = -1)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7fdbc590a370>
<surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7fdbc57f8430>


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.875822,233.319899,17.677576
SVD,0.886939,32.167252,4.499893


In [137]:
meg_books = {'percy jackson': ['riordan',5],
             'harry potter and the': ['rowling',4],
             'unwanteds': ['mcmann',4],
             'eragon':['paolini',3],
             'night': ['wiesel',5],
             'witch & wizard':['patterson',4]}
meg_ratings = []
for key,value in meg_books.items():
    for index, row in books.iterrows():
        if ((key in row['title'].lower()) and (value[0] in row['authors'].lower())):
            meg_ratings.append([row['book_id'],value[1]])
meg_rat = pd.DataFrame(meg_ratings,
                       columns = ['book_id','rating'])

In [138]:
meg_rat = meg_rat.merge(books[['book_id','title','authors']], on='book_id')
meg_rat.sort_values('book_id').head(35)

Unnamed: 0,book_id,rating,title,authors
14,2,4,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré"
15,18,4,Harry Potter and the Prisoner of Azkaban (Harr...,"J.K. Rowling, Mary GrandPré, Rufus Beck"
16,21,4,Harry Potter and the Order of the Phoenix (Har...,"J.K. Rowling, Mary GrandPré"
17,23,4,Harry Potter and the Chamber of Secrets (Harry...,"J.K. Rowling, Mary GrandPré"
18,24,4,Harry Potter and the Goblet of Fire (Harry Pot...,"J.K. Rowling, Mary GrandPré"
19,25,4,Harry Potter and the Deathly Hallows (Harry Po...,"J.K. Rowling, Mary GrandPré"
20,27,4,Harry Potter and the Half-Blood Prince (Harry ...,"J.K. Rowling, Mary GrandPré"
0,41,5,The Lightning Thief (Percy Jackson and the Oly...,Rick Riordan
24,53,3,"Eragon (The Inheritance Cycle, #1)",Christopher Paolini
27,87,5,Night (The Night Trilogy #1),"Elie Wiesel, Marion Wiesel"
