In [None]:
# Due to the large size of the dataset
# This implementation utilized the HPC 
# sampled 1/1000 of goodreads_interactions.csv data
# used all goodreads_books.json, user_id_map.csv, and book_id_map.csv data


In [1]:
# read the interactions dataset
import pandas as pd
ratings_whole = pd.read_csv('/data/cmpe256-sp22/013900770/project/goodreads_interactions.csv')
ratings_whole.drop(["is_read", "is_reviewed"], inplace=True, axis=1)
ratings_whole.shape


(228648342, 3)

In [2]:
# check the dataset
ratings_whole.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228648342 entries, 0 to 228648341
Data columns (total 3 columns):
 #   Column   Dtype
---  ------   -----
 0   user_id  int64
 1   book_id  int64
 2   rating   int64
dtypes: int64(3)
memory usage: 5.1 GB


In [3]:
ratings_whole.isnull().sum()

user_id    0
book_id    0
rating     0
dtype: int64

In [4]:
ratings_whole.head(3)

Unnamed: 0,user_id,book_id,rating
0,0,948,5
1,0,947,5
2,0,946,5


In [5]:

n_user_whole = ratings_whole.user_id.unique().shape[0]
n_book_whole = ratings_whole.book_id.unique().shape[0]
n_user_whole, n_book_whole

(876145, 2360650)

In [3]:
# sample the datase 1/1000
                                            
ratings_1000th = ratings_whole.iloc[[x for x in range(ratings_whole.shape[0]) if x%999==0]]

In [7]:
ratings_1000th.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 228878 entries, 0 to 228648123
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   user_id  228878 non-null  int64
 1   book_id  228878 non-null  int64
 2   rating   228878 non-null  int64
dtypes: int64(3)
memory usage: 7.0 MB


In [8]:
ratings_1000th.head(3)

Unnamed: 0,user_id,book_id,rating
0,0,948,5
999,1,704,5
1998,5,6869,0


In [9]:
n_user_1000th = ratings_1000th.user_id.unique().shape[0]
n_book_1000th = ratings_1000th.book_id.unique().shape[0]
n_user_1000th, n_book_1000th

(181862, 114043)

In [4]:
#using the surprise library

from surprise import Reader, Dataset, accuracy
from surprise import SVD, SVDpp, SlopeOne, NormalPredictor, NMF
from surprise.model_selection import cross_validate, GridSearchCV, KFold


In [5]:
reader = Reader(rating_scale=(1.0, 5.0))
data_1000th = Dataset.load_from_df(ratings_1000th[['user_id', 'book_id', 'rating']], reader)


In [18]:
# find the best algorithms

benchmark_1000th = []
for algorithm in [SVD(), SVDpp(), SlopeOne(), NormalPredictor()]:
    results = cross_validate(algorithm, data_1000th, measures=['RMSE'], cv=KFold(3, random_state=2, shuffle=True), verbose=True, n_jobs=-1)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp['Algorithm'] = str(algorithm).split(' ')[0].split('.')[-1]
    benchmark_1000th.append(tmp)
    
pd.DataFrame(benchmark_1000th).set_index('Algorithm').sort_values('test_rmse')


Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.9909  1.9927  1.9887  1.9908  0.0016  
Fit time          9.78    9.55    9.86    9.73    0.13    
Test time         0.56    0.56    0.57    0.56    0.00    
Evaluating RMSE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.9949  1.9947  1.9907  1.9934  0.0019  
Fit time          24.19   19.15   19.82   21.05   2.23    
Test time         0.81    0.70    0.81    0.77    0.05    
Evaluating RMSE of algorithm SlopeOne on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    2.0886  2.0856  2.0846  2.0863  0.0017  
Fit time          207.41  212.57  207.62  209.20  2.38    
Test time         0.69    0.76    0.60    0.68    0.07    
Evaluating RMSE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.990777,9.729697,0.563782
SVDpp,1.993441,21.052103,0.773935
SlopeOne,2.086279,209.200431,0.684142
NormalPredictor,2.499977,0.214864,0.831689


In [17]:
# find the best params for SVD 

param_grid = {'n_epochs': [20,25,30], 'n_factors': [10,15,20],
             'lr_all': [0.007, 0.008, 0.009], 'reg_all': [0.06,0.07,0.08]}
gs1 = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=KFold(3, random_state=2, shuffle=True), n_jobs=-1)
gs1.fit(data_1000th)


In [18]:
print(gs1.best_score['rmse'])
print(gs1.best_params['rmse'])

1.9835282716973557
{'n_epochs': 30, 'n_factors': 15, 'lr_all': 0.007, 'reg_all': 0.08}


In [19]:
# tweak the params for SVD
param_grid = {'n_epochs': [30, 35, 40], 'n_factors': [10, 15, 20],
             'lr_all': [0.005,0.006,0.007], 'reg_all': [0.08,0.09,0.1]}
gs1 = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=KFold(3, random_state=2, shuffle=True), n_jobs=-1)
gs1.fit(data_1000th)


In [20]:
print(gs1.best_score['rmse'])
print(gs1.best_params['rmse'])

1.983510362124577
{'n_epochs': 30, 'n_factors': 10, 'lr_all': 0.007, 'reg_all': 0.1}


In [23]:
# since the improvement is tiny, 
# we'll take these pramas as the best combination
# try to build a SVD model on the sample dataset

trainset_1000th = data_1000th.build_full_trainset()
svd_1000th = gs1.best_estimator['rmse']
svd_1000th.fit(trainset_1000th)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc5dcc99550>

In [24]:
# try to predict 
svd_1000th.predict(3, 5).est

1.8005793479495626

In [25]:
# save the trained SVD model

import joblib
joblib.dump(svd_1000th, 'svd_1000th_trained_joblib')

['svd_1000th_trained_joblib']

In [None]:
# read the book data and user data

In [26]:
import json

metadata = pd.read_json('/data/cmpe256-sp22/013900770/project/goodreads_books.json', lines=True)
metadata.head(3)


Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,312853122.0,1,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",,False,4.0,,...,9.0,,1984,https://www.goodreads.com/book/show/5333265-w-...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film
1,743509986.0,6,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",,False,3.23,B000FC0PBC,...,10.0,Abridged,2001,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,1333909,10,1323437,Good Harbor,Good Harbor
2,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,,...,,Book Club Edition,1987,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,8948723,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."


In [27]:
user_map = pd.read_csv('/data/cmpe256-sp22/013900770/project/user_id_map.csv')
user_map.head(3)

Unnamed: 0,user_id_csv,user_id
0,0,8842281e1d1347389f2ab93d60773d4d
1,1,72fb0d0087d28c832f15776b0d936598
2,2,ab2923b738ea3082f5f3efcbbfacb218


In [28]:
book_map =pd.read_csv('/data/cmpe256-sp22/013900770/project/book_id_map.csv')
book_map.head(3)

Unnamed: 0,book_id_csv,book_id
0,0,34684622
1,1,34536488
2,2,34017076


In [52]:
# recommendation based on user, title, genre

import difflib
import random

def get_user_mapped_id(user_id, user_map):
    user_ids = list(user_map['user_id'].values)
    return user_ids.index(user_id)

def get_book_info(book_id, metadata):
    book_info = metadata[metadata['book_id'] == book_id][['book_id', 'isbn', 'authors', 'title', 'average_rating']]
    return book_info.to_dict(orient='records')

def is_in_genre(book_id, genre):
    shelf = metadata[metadata['book_id']==book_id].values.tolist()[0][5]
    for dic in shelf:
        if dic['name'] == genre: return True
    return False

def generate_recommendation(user_id, title, model, genre, thresh=4):
    if user_id == None: return "Please provide your user_id"
    user_mapped_id = get_user_mapped_id(user_id, user_map)
    book_ids = list(book_map['book_id'].values)
    if title:  
        titles = list(metadata['title'].values)
        random.shuffle(titles)
        closest_titles = difflib.get_close_matches(title, titles)

        for t in closest_titles:
            book_id = metadata[metadata['title'] == t]['book_id'].values[0]
            book_mapped_id = book_ids.index(book_id)
            rating = model.predict(user_mapped_id, book_mapped_id).est
            if rating >= thresh:
                if genre:
                    if is_in_genre(book_id, genre):
                        return get_book_info(book_id, metadata)
                else:
                    return get_book_info(book_id, metadata)
        return "All books with this title will be rated by you below the threshold"
    else:
        if genre:
            random.shuffle(book_ids)
            for book_id in book_ids:
                if is_in_genre(book_id, genre):
                    book_mapped_id = book_ids.index(book_id)
                    rating = model.predict(user_mapped_id, book_mapped_id).est
                    if rating >= thresh: 
                        return get_book_info(book_id, metadata)
        else:
            return "Please provide title and/or genre"
                
                

In [32]:
# Recommend with svd_100th model, given user_id, genre, and threshold

user_id = '8842281e1d1347389f2ab93d60773d4d'
title = None
model = svd_1000th
genre = 'romance'
thresh = 4.0

generate_recommendation(user_id, title, model, genre, thresh)


[{'book_id': 27416023,
  'isbn': '1455536776',
  'authors': [{'author_id': '7084780', 'role': ''}],
  'title': 'Remembering Everly (Lost & Found, #2)',
  'average_rating': '3.70'}]

In [53]:
# Recommend with svd_100th model, given user_id, title, and threshold

user_id = '8842281e1d1347389f2ab93d60773d4d'
title = 'Dog Heaven'
model = svd_1000th
genre = None
thresh = 4.0

generate_recommendation(user_id, title, model, genre, thresh)


'All books with this title will be rated by you below the threshold'

In [None]:
# Works cited:
# Mengting Wan, Julian McAuley, "Item Recommendation on Monotonic Behavior Chains", in RecSys'18. [bibtex]
# Mengting Wan, Rishabh Misra, Ndapa Nakashole, Julian McAuley, "Fine-Grained Spoiler Detection from Large-Scale Review Corpora", in ACL'19. [bibtex]
# Amol Mavuduru, "How you can build simple recommender systems with Surprise"
# Poorna Srinivas Gutta, "Solution to HW1"
# Harshil Bharatkumar Darji, "Solution to HW1"
