In [2]:
import pandas as pd
ratings = pd.read_csv('goodreads_interactions.csv', names=['user_id', 'book_id', 'is_read', 'rating', 'is_reviewed'])
ratings.drop(["is_read", "is_reviewed"], inplace=True, axis=1)
ratings.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(228648343, 3)

In [3]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228648343 entries, 0 to 228648342
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   user_id  object
 1   book_id  object
 2   rating   object
dtypes: object(3)
memory usage: 5.1+ GB


In [4]:
ratings.isnull().sum()

user_id    0
book_id    0
rating     0
dtype: int64

In [5]:
n_user = ratings.user_id.unique().shape[0]
n_book = ratings.book_id.unique().shape[0]
n_user, n_book

(876147, 2431885)

In [None]:
rating_matrix = ratings.pivot_table(index=["user_id"], columns=["book_id"], values="rating")
rating_matrix.head()

In [None]:
from surprise import Reader, Dataset, NormalPredictor, accuracy
from surprise import SVD, SVDpp, NMF, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.model_selection import cross_validate, GridSearchCV, KFold

In [None]:
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)

In [None]:
benchmark = []

for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore()]:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=KFold(10, random_state=2, shuffle=True), verbose=True, n_jobs=-1)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')


In [None]:
#SVDpp
param_grid = {'n_epochs': [20,25,30,40], 'n_factors': [5,8,10,15,20,25],
             'lr_all': [0.005, 0.007], 'reg_all': [0.02, 0.01, 0.04]}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=KFold(6, random_state=2, shuffle=True), n_jobs=-1)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])


In [None]:
svdpp = gs.best_estimator['rmse']
trainset = data.build_full_trainset()
svdpp.fit(trainset)

In [None]:
svdpp.predict(3, 5).est

In [None]:
import json

metadata = pd.read_json('goodreads_books.json')
metadata.head(3)


In [None]:
user_map = pd.read_csv('user_id_map.csv')
user_map.head(3)

In [None]:
book_map =pd.read_csv('book_id_map')
book_map.head(3)

In [None]:
# recommendation based on user, title, genre
import difflib
import random

def get_user_mapped_id(user_id, user_map):
    user_ids = list(user_map['user_id'].values)
    return user_ids.index(user_id)

def get_book_info(book_id, metadata):
    book_info = metadata[metadata['book_id'] == book_id][['book_id', 'isbn', 'authors', 'title']]
    return book_info.to_dict(orient='records')

def is_in_genre(book_id, genre):
    shelf = metadata[metadata['book_id']==book_id].popular_shelves
    for dic in shelf:
        if dic['name']==genre: return True
    return False

def generate_recommendation(user_id, title, model, genre, thresh=4):
    if user_id == None: return "Please provide your user_id"
    user_mapped_id = get_user_mapped_id(user_id, user_map)
    book_ids = list(book_map['book_id'].values)
    if title:  
        titles = list(metadata['title'].values)
        random.shuffle(titles)
        closest_titles = difflib.get_close_matches(title, titles)

        for t in closest_titles:
            book_id = metadata[metadata['title'] == t]['book_id'].values[0]
            book_mapped_id = book_ids.index(book_id)
            rating = model.predict(user_mapped_id, book_mapped_id).est
            if rating >= thresh:
                if genre:
                    if is_in_genre(book_id, genre):
                        return get_book_info(book_id, metadata)
                else:
                    return get_book_info(book_id, metadata)
    else:
        if genre:
            random.shuffle(book_ids)
            for book_id in book_ids:
                if is_in_genre(book_id, genre):
                    book_mapped_id = book_ids.index(book_id)
                    rating = model.predict(user_mapped_id, book_mapped_id)
                    if rating >= thresh: 
                        return get_book_info(book_id, metadata)
        else:
            return "Please provide title and/or genre"
                
                
        

In [None]:
user_id = None
title = None
model = svdpp
genre = None
thresh = 4
generate_recommendation(user_id, title, model, genre, thresh)

In [None]:
# Works cited:
# Mengting Wan, Julian McAuley, "Item Recommendation on Monotonic Behavior Chains", in RecSys'18. [bibtex]
# Mengting Wan, Rishabh Misra, Ndapa Nakashole, Julian McAuley, "Fine-Grained Spoiler Detection from Large-Scale Review Corpora", in ACL'19. [bibtex]
# Amol Mavuduru, "How you can build simple recommender systems with Surprise"
# Poorna Srinivas Gutta, "Solution to HW1"
# Harshil Bharatkumar Darji, "Solution to HW1"