In [1]:
# Read the Goodreads user-book interactions dataset into memory.
# NOTE(review): the path below is an absolute, machine-specific location;
# point it at your own copy of the data before re-running.
import pandas as pd
ratings_raw = pd.read_csv('/data/cmpe256-sp22/013900770/project/goodreads_interactions.csv')
# (rows, columns) of the raw frame
ratings_raw.shape


(228648342, 5)

In [13]:
# check the dataset
ratings_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228648342 entries, 0 to 228648341
Data columns (total 5 columns):
 #   Column       Dtype
---  ------       -----
 0   user_id      int64
 1   book_id      int64
 2   is_read      int64
 3   rating       int64
 4   is_reviewed  int64
dtypes: int64(5)
memory usage: 8.5 GB


In [14]:
# Number of interactions whose rating is 0
count = ratings_raw['rating'].eq(0).sum()
count

124096793

In [15]:
# Look at a few of the interactions that have a rating of 0
zero_rows = ratings_raw.loc[ratings_raw['rating'].eq(0)]
zero_rows.head(3)

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
35,0,913,0,0,1
37,0,911,0,0,0
38,0,910,0,0,1


In [16]:
# Keep only the interactions that carry a non-zero rating; the rating == 0 rows
# are excluded from everything that follows.
ratings_whole = ratings_raw[ratings_raw['rating']!=0]

In [17]:
# Keep only the columns needed for rating prediction (user_id, book_id, rating).
# The result of drop() is rebound instead of using inplace=True, because
# ratings_whole was produced by boolean indexing on ratings_raw and in-place
# mutation of such a frame triggers SettingWithCopyWarning.
# errors="ignore" lets the cell be re-run after the columns are already gone.
ratings_whole = ratings_whole.drop(columns=["is_read", "is_reviewed"], errors="ignore")

In [18]:
ratings_whole.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104551549 entries, 0 to 228648341
Data columns (total 3 columns):
 #   Column   Dtype
---  ------   -----
 0   user_id  int64
 1   book_id  int64
 2   rating   int64
dtypes: int64(3)
memory usage: 3.1 GB


In [19]:
ratings_whole.isnull().sum()

user_id    0
book_id    0
rating     0
dtype: int64

In [20]:
ratings_whole.head(3)

Unnamed: 0,user_id,book_id,rating
0,0,948,5
1,0,947,5
2,0,946,5


In [5]:

# Distinct users and distinct books among the rated interactions
n_user_whole = len(ratings_whole['user_id'].unique())
n_book_whole = len(ratings_whole['book_id'].unique())
(n_user_whole, n_book_whole)

(876145, 2360650)

In [22]:
# Sample the dataset: keep every 999th row (roughly 1/1000 of the ratings).
# A positional slice selects exactly the rows at positions 0, 999, 1998, ...
# (the same rows as filtering range(n) on x % 999 == 0) without first
# materialising a Python list with one entry per row of ratings_whole.
ratings_1000th = ratings_whole.iloc[::999]


In [23]:
ratings_1000th.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104657 entries, 0 to 228647995
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   user_id  104657 non-null  int64
 1   book_id  104657 non-null  int64
 2   rating   104657 non-null  int64
dtypes: int64(3)
memory usage: 3.2 MB


In [24]:
ratings_1000th.head(3)

Unnamed: 0,user_id,book_id,rating
0,0,948,5
1809,5,7052,3
8269,11,1528,3


In [9]:
# Distinct users and distinct books that survive in the sample
n_user_1000th = len(ratings_1000th['user_id'].unique())
n_book_1000th = len(ratings_1000th['book_id'].unique())
(n_user_1000th, n_book_1000th)

(181862, 114043)

In [34]:
#using the surprise library

from surprise import Reader, Dataset, accuracy
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.model_selection import cross_validate, GridSearchCV, KFold


In [26]:
# The ratings kept in the sample are non-zero; the reader is told they lie on a 1-5 scale.
reader = Reader(rating_scale=(1.0, 5.0))
# Columns are passed in (user, item, rating) order.
data_1000th = Dataset.load_from_df(ratings_1000th[['user_id', 'book_id', 'rating']], reader)


In [35]:
# Benchmark every candidate algorithm with 3-fold cross-validation on the
# sample and rank them by mean test RMSE.

benchmark_1000th = []
for algorithm in [SVD(), SVDpp(), SlopeOne(), NormalPredictor(), NMF(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore()]:
    results = cross_validate(
        algorithm,
        data_1000th,
        measures=['RMSE'],
        cv=KFold(3, random_state=2, shuffle=True),
        verbose=True,
        n_jobs=-1,
    )
    # Average each metric (test_rmse, fit_time, test_time) over the folds
    mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
    # Label the row with the algorithm's class name, e.g. "SVD"
    mean_scores['Algorithm'] = type(algorithm).__name__
    benchmark_1000th.append(mean_scores)

pd.DataFrame(benchmark_1000th).set_index('Algorithm').sort_values('test_rmse')


Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9773  0.9715  0.9809  0.9765  0.0039  
Fit time          4.29    4.31    4.00    4.20    0.14    
Test time         0.22    0.24    0.22    0.23    0.01    
Evaluating RMSE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9780  0.9718  0.9810  0.9769  0.0038  
Fit time          7.22    7.91    7.23    7.45    0.32    
Test time         0.24    0.24    0.25    0.24    0.00    
Evaluating RMSE of algorithm SlopeOne on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9947  0.9885  0.9962  0.9931  0.0033  
Fit time          40.97   41.89   38.81   40.55   1.29    
Test time         0.21    0.25    0.23    0.23    0.02    
Evaluating RMSE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.976542,4.200119,0.228878
KNNBaseline,0.976702,136.187085,0.234494
SVDpp,0.976941,7.452104,0.243214
KNNBasic,0.988136,137.168154,0.212898
SlopeOne,0.993134,40.554932,0.229957
KNNWithMeans,0.993134,136.498292,0.241831
KNNWithZScore,0.993134,140.01671,0.26098
NMF,0.998656,11.667029,0.202944
NormalPredictor,1.318902,0.074932,0.31437


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [None]:
# Among the best three algorithms (SVD, KNNBaseline, and SVDpp),
# we choose SVD because it has both the lowest RMSE and the shortest fit time.

In [44]:
# Grid-search the SVD hyper-parameters on the sample (3-fold CV, scored by RMSE)

param_grid = {
    'n_epochs': [20, 25, 30],
    'n_factors': [5, 10, 15],
    'lr_all': [0.003, 0.004, 0.005],
    'reg_all': [0.13, 0.14, 0.15],
}
gs1 = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=KFold(3, random_state=2, shuffle=True),
    n_jobs=-1,
)
gs1.fit(data_1000th)


In [45]:
# Best RMSE found by the grid search, followed by the parameter
# combination that achieved it.
print(gs1.best_score['rmse'])
print(gs1.best_params['rmse'])

0.9764644996078665
{'n_epochs': 25, 'n_factors': 10, 'lr_all': 0.004, 'reg_all': 0.14}


In [46]:
# The best parameters all sit in the center of the grid (none lies on an edge),
# so we take them as the best combination.
# Build an SVD model with these parameters on the full sample dataset.

trainset_1000th = data_1000th.build_full_trainset()
svd_1000th = gs1.best_estimator['rmse']
svd_1000th.fit(trainset_1000th)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa443a23460>

In [47]:
# try to predict 
svd_1000th.predict(3, 5).est

3.9398415777253315

In [48]:
# Save the sample-trained SVD model to a file in the current working directory.
# NOTE(review): joblib files are pickle-based; only load ones you created or trust.

import joblib
joblib.dump(svd_1000th, 'svd_1000th_trained_joblib')

['svd_1000th_trained_joblib']

In [3]:
# import joblib
# svd_1000th = joblib.load('old/svd_1000th_trained_joblib')

In [49]:
# Build an SVD model on the whole dataset.
# A fresh SVD is constructed from the best parameters instead of reusing
# gs1.best_estimator['rmse']: that attribute is the very object already bound
# to svd_1000th, so fitting it here would silently replace the sample-trained
# model and leave both names pointing at the same whole-data model.

data_whole = Dataset.load_from_df(ratings_whole[['user_id', 'book_id', 'rating']], reader)
trainset_whole = data_whole.build_full_trainset()
svd_whole = SVD(**gs1.best_params['rmse'])
svd_whole.fit(trainset_whole)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa443a23460>

In [50]:
# try to predict 
svd_whole.predict(3, 5).est

3.6066059410253977

In [51]:
# Save the whole-data SVD model to a file in the current working directory.
# NOTE(review): joblib files are pickle-based; only load ones you created or trust.

import joblib
joblib.dump(svd_whole, 'svd_whole_trained_joblib')

['svd_whole_trained_joblib']

In [1]:
# import joblib 
# svd_whole = joblib.load('svd_whole_trained_joblib')

In [None]:
# read the book data and user data

In [4]:
import pandas as pd
import json

# Load the book metadata. lines=True makes pandas parse the file as one JSON
# object per line.
# NOTE(review): absolute, machine-specific path; adjust before re-running.
metadata = pd.read_json('/data/cmpe256-sp22/013900770/project/goodreads_books.json', lines=True)
metadata.head(3)


Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,312853122.0,1,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",,False,4.0,,...,9.0,,1984,https://www.goodreads.com/book/show/5333265-w-...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film
1,743509986.0,6,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",,False,3.23,B000FC0PBC,...,10.0,Abridged,2001,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,1333909,10,1323437,Good Harbor,Good Harbor
2,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,,...,,Book Club Edition,1987,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,8948723,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."


In [5]:
# Load the user id mapping: user_id_csv is an integer id, user_id a hex string id.
# presumably user_id_csv matches the user_id column of the interactions CSV; verify.
user_map = pd.read_csv('/data/cmpe256-sp22/013900770/project/user_id_map.csv')
user_map.head(3)


Unnamed: 0,user_id_csv,user_id
0,0,8842281e1d1347389f2ab93d60773d4d
1,1,72fb0d0087d28c832f15776b0d936598
2,2,ab2923b738ea3082f5f3efcbbfacb218


In [6]:
# Load the book id mapping: book_id_csv is a small integer id, book_id a larger
# integer id that is also used by the book metadata frame.
# presumably book_id_csv matches the book_id column of the interactions CSV; verify.
book_map =pd.read_csv('/data/cmpe256-sp22/013900770/project/book_id_map.csv')
book_map.head(3)


Unnamed: 0,book_id_csv,book_id
0,0,34684622
1,1,34536488
2,2,34017076


In [52]:
# recommendation based on user, title, genre

import difflib
import random

def get_user_mapped_id(user_id, user_map):
    """Return the row position of ``user_id`` inside ``user_map['user_id']``.

    The position of the first matching entry is returned as a plain ``int``.

    Raises:
        ValueError: if ``user_id`` does not occur in the column.
    """
    known_ids = user_map['user_id'].tolist()
    position = known_ids.index(user_id)
    return position

def get_book_info(book_id, metadata):
    """Return the display fields of every ``metadata`` row matching ``book_id``.

    Args:
        book_id: value compared against the ``book_id`` column.
        metadata: DataFrame with at least the columns ``book_id``, ``isbn``,
            ``authors``, ``title`` and ``average_rating``.

    Returns:
        A list with one dict per matching row (empty when nothing matches),
        restricted to the five columns listed above.
    """
    wanted_columns = ['book_id', 'isbn', 'authors', 'title', 'average_rating']
    matching_rows = metadata.loc[metadata['book_id'] == book_id, wanted_columns]
    return matching_rows.to_dict(orient='records')

def is_in_genre(book_id, genre, books=None):
    """Tell whether ``genre`` is one of the popular shelves of ``book_id``.

    Args:
        book_id: value compared against the ``book_id`` column.
        genre: shelf name to look for (exact string match).
        books: optional DataFrame with ``book_id`` and ``popular_shelves``
            columns. When omitted, the module-level ``metadata`` frame is used,
            which keeps existing two-argument calls working.

    Returns:
        True when the first row matching ``book_id`` has a shelf entry whose
        ``'name'`` equals ``genre``; False otherwise, including when the book
        is not present in the frame or has no usable shelf list.
    """
    if books is None:
        books = metadata

    # Select the shelves by column name rather than by a positional index into
    # the row, so the lookup survives any reordering of the columns.
    shelves_for_book = books.loc[books['book_id'] == book_id, 'popular_shelves']
    if shelves_for_book.empty:
        # Unknown book: nothing to match (previously an IndexError).
        return False

    shelves = shelves_for_book.iloc[0]
    if not isinstance(shelves, (list, tuple)):
        # Missing or malformed shelf data (e.g. NaN) cannot contain the genre.
        return False

    return any(
        isinstance(shelf, dict) and shelf.get('name') == genre
        for shelf in shelves
    )

def generate_recommendation(user_id, title, model, genre, thresh=4):
    """Recommend one book for a user, driven by a title and/or a genre.

    Reads the module-level frames ``user_map``, ``ratings_raw``, ``book_map``
    and ``metadata``.

    Args:
        user_id: the user's id as stored in ``user_map['user_id']``.
        title: approximate book title, or a falsy value to search by genre only.
        model: object whose ``predict(user, item)`` result has an ``est``
            attribute holding the predicted rating.
        genre: shelf name a recommended book must carry, or a falsy value.
        thresh: minimum predicted rating a book needs to be recommended.

    Returns:
        The list of dicts produced by ``get_book_info`` for the first suitable
        book, or a string message when no book qualifies or required input is
        missing.

    Raises:
        ValueError: propagated from ``get_user_mapped_id`` when ``user_id`` is
            not present in ``user_map``.
    """
    if user_id is None:
        return "Please provide your user_id"
    user_mapped_id = get_user_mapped_id(user_id, user_map)
    if not title and not genre:
        # Nothing to search by; answer before doing any expensive filtering.
        return "Please provide title and/or genre"

    # Mapped ids of the books this user has already read. One combined mask
    # avoids materialising an intermediate copy of every read interaction, and
    # a set gives constant-time membership tests below.
    read_mask = (ratings_raw['is_read'] == 1) & (ratings_raw['user_id'] == user_mapped_id)
    read_books = set(ratings_raw.loc[read_mask, 'book_id'].tolist())

    # Original book id -> mapped id (row position in book_map). The first
    # occurrence wins, matching list.index(). This is built before any
    # shuffling: shuffling the id list and then looking positions up in it
    # would yield random mapped ids.
    mapped_id_by_book = {}
    for position, known_book_id in enumerate(book_map['book_id'].tolist()):
        mapped_id_by_book.setdefault(known_book_id, position)

    if title:
        # Only real strings can be fuzzy-matched. De-duplicating keeps several
        # editions sharing one title from filling every close-match slot.
        titles = list(dict.fromkeys(
            t for t in metadata['title'].values if isinstance(t, str)
        ))
        random.shuffle(titles)
        closest_titles = difflib.get_close_matches(title, titles)

        for t in closest_titles:
            book_id = metadata[metadata['title'] == t]['book_id'].values[0]
            book_mapped_id = mapped_id_by_book.get(book_id)
            if book_mapped_id is None:
                # The book has no mapped id, so the model cannot score it.
                continue
            rating = model.predict(user_mapped_id, book_mapped_id).est
            if rating >= thresh:
                if genre:
                    if is_in_genre(book_id, genre) and book_mapped_id not in read_books:
                        return get_book_info(book_id, metadata)
                else:
                    # NOTE(review): a title-only request may return a book the
                    # user has already read; kept as in the original behaviour.
                    return get_book_info(book_id, metadata)
        return "All books with this title will be rated by you below the threshold"

    # Genre-only search: visit the books in random order. The cheap filters
    # (already read, predicted rating) run before the metadata genre lookup;
    # a book is returned only when all three conditions hold.
    candidates = list(mapped_id_by_book.items())
    random.shuffle(candidates)
    for book_id, book_mapped_id in candidates:
        if book_mapped_id in read_books:
            continue
        rating = model.predict(user_mapped_id, book_mapped_id).est
        if rating >= thresh and is_in_genre(book_id, genre):
            return get_book_info(book_id, metadata)
    return "All books in this genre will be rated by you below the threshold"
                
                

In [53]:
# Recommend by svd_1000th model: genre-only request (no title) for one user,
# asking for a book with a predicted rating of at least 4.0.

user_id = '8842281e1d1347389f2ab93d60773d4d'
title = None
model = svd_1000th
genre = 'romance'
thresh = 4.0

generate_recommendation(user_id, title, model, genre, thresh)


[{'book_id': 17261561,
  'isbn': '',
  'authors': [{'author_id': '997129', 'role': ''}],
  'title': 'Melting Point (Seattle Steam, #2.5)',
  'average_rating': '3.34'}]

In [10]:
# Recommend by svd_1000th model: title-only request (no genre) for the same user.
user_id = '8842281e1d1347389f2ab93d60773d4d'
title = 'Dog Heaven'
model = svd_1000th
genre = None
thresh = 4.0

generate_recommendation(user_id, title, model, genre, thresh)


[{'book_id': 89378,
  'isbn': '0590417010',
  'authors': [{'author_id': '5411', 'role': ''}],
  'title': 'Dog Heaven',
  'average_rating': '4.43'}]

In [13]:
# Recommend by svd_whole model: genre-only request (no title) for one user,
# asking for a book with a predicted rating of at least 4.0.

user_id = '8842281e1d1347389f2ab93d60773d4d'
title = None
model = svd_whole
genre = 'romance'
thresh = 4.0

generate_recommendation(user_id, title, model, genre, thresh)


[{'book_id': 29396562,
  'isbn': '1513708929',
  'authors': [{'author_id': '14874116', 'role': ''}],
  'title': 'The Beloved (Faerie Tale, #2)',
  'average_rating': '4.43'}]

In [12]:
# Recommend by svd_whole model: title-only request (no genre) for the same user.
user_id = '8842281e1d1347389f2ab93d60773d4d'
title = 'Dog Heaven'
model = svd_whole
genre = None
thresh = 4.0

generate_recommendation(user_id, title, model, genre, thresh)

[{'book_id': 89378,
  'isbn': '0590417010',
  'authors': [{'author_id': '5411', 'role': ''}],
  'title': 'Dog Heaven',
  'average_rating': '4.43'}]

In [None]:
# Works cited:
# Mengting Wan, Julian McAuley, "Item Recommendation on Monotonic Behavior Chains", in RecSys'18. [bibtex]
# Mengting Wan, Rishabh Misra, Ndapa Nakashole, Julian McAuley, "Fine-Grained Spoiler Detection from Large-Scale Review Corpora", in ACL'19. [bibtex]
# Amol Mavuduru, "How you can build simple recommender systems with Surprise"
# Poorna Srinivas Gutta, "Solution to HW1"
# Harshil Bharatkumar Darji, "Solution to HW1"
