In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings('ignore')
# The bundled seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6 and the
# old aliases were removed in 3.8; use whichever name this installation provides.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')
plt.rcParams.update({'font.size': 15}) 
%matplotlib inline

In [None]:
!pip install scikit-surprise

In [None]:
# Load the three semicolon-delimited tables (users, books, ratings) from the working directory.
# NOTE(review): no encoding or bad-line handling is specified here; if these are raw dataset dumps
# they may need e.g. encoding='latin-1' -- confirm against the actual files.
users = pd.read_csv('BX-Users.csv', sep=';')
books = pd.read_csv('BX-Books.csv', sep=';')
book_ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';')

In [None]:
users.head()

In [None]:
books.head()

In [None]:
book_ratings.head()

In [None]:
print(f'Users: {len(users)}\nBooks: {len(books)}\nRatings: {len(book_ratings)}')

In [None]:
users.columns = users.columns.str.strip().str.lower().str.replace('-', '_')
users.head()

In [None]:
users.dtypes

In [None]:
uniq_users = users.user_id.nunique()
all_users = users.user_id.count()
print(f'No. of unique user_id entries: {uniq_users} | Total user_id entries: {all_users}')

In [None]:
print(sorted(users.age.unique()))

In [None]:
users.loc[(users.age<5) | (users.age>100), 'age'] = np.nan

In [None]:
users.head()

In [None]:
print(sorted(users.age.unique()))

In [None]:
ax = users.age.hist(bins=10, figsize=(12,5))
ax.set_xlabel('Age')
ax.set_ylabel('counts')
ax.set_xticks(range(0,110,10))
plt.show()

In [None]:
# Number of users at each reported age, ordered by age.
u = users.age.value_counts().sort_index()
plt.rcParams.update({'font.size': 15})
age_fig, age_ax = plt.subplots(figsize=(20, 10))
age_ax.bar(u.index, u.values)
age_ax.set_xlabel('Age')
age_ax.set_ylabel('counts')
plt.show()

In [None]:
age_null = users.age.isnull().sum()
all_users = users.user_id.count()
print(f'There are {age_null} empty age values in our set of {all_users} users (or {(age_null/all_users)*100:.2f}%).')

In [None]:
# Split the free-text location into at most three parts (city, state, country).
# `n` has to be passed by keyword: since pandas 2.0 every argument of str.split except `pat`
# is keyword-only, so the positional form raises a TypeError.
user_location_expanded = users.location.str.split(',', n=2, expand=True)
user_location_expanded.columns = ['city', 'state', 'country']
# Splitting on a bare ',' keeps whatever whitespace followed the comma; trim it so that
# e.g. ' usa' and 'usa' are counted as the same value downstream.
user_location_expanded = user_location_expanded.apply(lambda col: col.str.strip())
users = users.join(user_location_expanded)

In [None]:
users.drop(columns=['location'], inplace=True)
users.head()

In [None]:
top_cities = users.city.value_counts()[:10]
print(f'The 10 cities with the most users are:\n{top_cities}')

In [None]:
top_countries = users.country.value_counts()[:10]
print(f'The 10 countries with the most users are:\n{top_countries}')

In [None]:
empty_string_country = users[users.country == ''].country.count()
nan_country = users.country.isnull().sum()
print(f'There are {empty_string_country} entries with empty strings, and {nan_country} NaN entries in the Country field')

In [None]:
# Assign the result back instead of calling an inplace method through `users.country`:
# with pandas copy-on-write the chained inplace call only updates a temporary Series
# and leaves `users` unchanged.
users['country'] = users['country'].replace('', np.nan)

In [None]:
books.head()

In [None]:
books.columns = books.columns.str.strip().str.lower().str.replace('-', '_')
books.drop(columns=['image_url_s', 'image_url_m', 'image_url_l'], inplace=True)
books.head()

In [None]:
books.dtypes

In [None]:
# Convert years to float
books.year_of_publication = pd.to_numeric(books.year_of_publication, errors='coerce')

In [None]:

zero_yr = books[books.year_of_publication == 0].year_of_publication.count()
nan_yr = books.year_of_publication.isnull().sum()
print(f'There are {zero_yr} entries as \'0\', and {nan_yr} NaN entries in the Year of Publication field')

In [None]:
# Treat a publication year of 0 as missing. Assign back rather than using a chained
# `inplace=True`, which does not modify `books` under pandas copy-on-write.
books['year_of_publication'] = books['year_of_publication'].replace(0, np.nan)

In [None]:
yr = books.year_of_publication.value_counts().sort_index()
yr = yr.where(yr>5) 
plt.figure(figsize=(20, 10))
plt.rcParams.update({'font.size': 15}) 
plt.bar(yr.index, yr.values)
plt.xlabel('Year of Publication')
plt.ylabel('counts')
plt.show()

In [None]:
historical_books = books[books.year_of_publication<1900] 
books_from_the_future = books[books.year_of_publication>2018] 

hist_books_mini = historical_books[['book_title', 'year_of_publication']]
future_books_mini = books_from_the_future[['book_title', 'year_of_publication']]
print(f'Historical books:\n{hist_books_mini}')
print('\n')
print(f'Future books:\n{future_books_mini}')

In [None]:
# Remove every book whose ISBN belongs to the "historical" or "future" selections made above.
n_books_before = len(books)
print(f'Length of books dataset before removal: {n_books_before}')
isbns_to_drop = pd.concat([historical_books.isbn, books_from_the_future.isbn])
books = books.loc[~books.isbn.isin(isbns_to_drop)]
print(f'Length of books dataset after removal: {len(books)}')

In [None]:
# Un-escape the HTML ampersand entity in publisher names. The entity is '&amp;' (with the
# trailing semicolon); replacing only '&amp' turned 'A &amp; B' into 'A &; B'. The optional
# ';' in the pattern also covers values where the semicolon is missing.
books['publisher'] = books['publisher'].str.replace(r'&amp;?', '&', regex=True)
books.head()

In [None]:
uniq_books = books.isbn.nunique()
all_books = books.isbn.count()
print(f'No. of unique books: {uniq_books} | All book entries: {all_books}')

In [None]:
top_publishers = books.publisher.value_counts()[:10]
print(f'The 10 publishers with the most entries in the books table are:\n{top_publishers}')

In [None]:
top_authors = books.book_author.value_counts()[:10]
print(f'The 10 authors with the most entries in the books table are:\n{top_authors}')

In [None]:
empty_string_publisher = books[books.publisher == ''].publisher.count()
nan_publisher = books.publisher.isnull().sum()
print(f'There are {empty_string_publisher} entries with empty strings, and {nan_publisher} NaN entries in the Publisher field')

In [None]:
empty_string_author = books[books.book_author == ''].book_author.count()
nan_author = books.book_author.isnull().sum()
print(f'There are {empty_string_author} entries with empty strings, and {nan_author} NaN entries in the Author field')

In [None]:
top_titles = books.book_title.value_counts()[:10]
print(f'The 10 book titles with the most entries in the books table are:\n{top_titles}')

In [None]:
books[books.book_title=='Jane Eyre']

In [None]:
book_ratings.columns = book_ratings.columns.str.strip().str.lower().str.replace('-', '_')
book_ratings.head()

In [None]:
book_ratings.dtypes

In [None]:
super_users = book_ratings.groupby('user_id').isbn.count().sort_values(ascending=False)
print(f'The 20 users with the most ratings:\n{super_users[:20]}')

In [None]:
user_hist = super_users.where(super_users<50)
user_hist.hist(bins=30)
plt.xlabel('No. of ratings')
plt.ylabel('count')
plt.show()

In [None]:
super_user_hist = super_users.where(super_users>1000)
super_user_hist.hist(bins=30)
plt.xlabel('No. of ratings (min. 1000)')
plt.ylabel('count')
plt.show()

In [None]:
rtg = book_ratings.book_rating.value_counts().sort_index()

plt.figure(figsize=(10, 5))
plt.rcParams.update({'font.size': 15}) 
plt.bar(rtg.index, rtg.values)
plt.xlabel('Rating')
plt.ylabel('counts')
plt.show()

In [None]:
print(f'Size of book_ratings before removing zero ratings: {len(book_ratings)}')
book_ratings = book_ratings[book_ratings.book_rating != 0]
print(f'Size of book_ratings after removing zero ratings: {len(book_ratings)}')

In [None]:
rtg = book_ratings.book_rating.value_counts().sort_index()

plt.figure(figsize=(10, 5))
plt.rcParams.update({'font.size': 15}) 
plt.bar(rtg.index, rtg.values)
plt.xlabel('Rating')
plt.ylabel('counts')
plt.show()

In [None]:
print(f'Books table size: {len(books)}')
print(f'Ratings table size: {len(book_ratings)}')
books_with_ratings = book_ratings.join(books.set_index('isbn'), on='isbn')
print(f'New table size: {len(books_with_ratings)}')

In [None]:
books_with_ratings.head()

In [None]:
# Ratings whose ISBN found no match in `books` have a null title after the join.
n_missing_titles = books_with_ratings.book_title.isnull().sum()
print(f'There are {n_missing_titles} books with no title/author information.')
# Share of the table = missing / total * 100 (the ratio used to be inverted and unscaled).
print(f'This represents {n_missing_titles/len(books_with_ratings)*100:.2f}% of the ratings dataset.')

In [None]:
books_with_ratings.info()

In [None]:
books_with_ratings.dropna(subset=['book_title'], inplace=True) # remove rows with missing title/author data

In [None]:
# Sum of all rating points per title, highest first; keep the first ten.
rating_totals = books_with_ratings.groupby('book_title').book_rating.sum()
cm_rtg = rating_totals.sort_values(ascending=False)[:10]
idx = cm_rtg.index.tolist()
vals = cm_rtg.values.tolist()

bar_positions = range(len(idx))
plt.rcParams.update({'font.size': 15})
cm_fig, cm_ax = plt.subplots(figsize=(10, 5))
cm_ax.bar(bar_positions, vals)
# One tick per bar, labelled with the book title and rotated so long titles stay readable.
cm_ax.set_xticks(bar_positions)
cm_ax.set_xticklabels(idx, rotation='vertical')
cm_ax.set_ylabel('cumulative rating score')
plt.show()

In [None]:
cutoff = books_with_ratings.book_title.value_counts()
mean_rtg = books_with_ratings[books_with_ratings.book_title.isin(cutoff[cutoff>50].index)].groupby('book_title')['book_rating'].mean()
mean_rtg.sort_values(ascending=False)[:10] # show only top 10

In [None]:
mean_rtg.sort_values(ascending=False)[-10:] 

In [None]:
books_with_ratings.groupby('book_title').isbn.nunique().sort_values(ascending=False)[:10]

In [None]:
multiple_isbns = books_with_ratings.groupby('book_title').isbn.nunique()
multiple_isbns.value_counts()

In [None]:
has_mult_isbns = multiple_isbns.where(multiple_isbns>1)
has_mult_isbns.dropna(inplace=True) 

In [None]:
print(f'There are {len(has_mult_isbns)} book titles with multiple ISBN numbers which we will try to re-assign to a unique identifier')

In [None]:
has_mult_isbns['Jane Eyre']

In [None]:
# Load `multiple_isbn_dict` from a pickle file that this notebook reads but does not create.
# Later cells index it by book title and take element [0] of the stored value.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -- only load a
# pickle that you produced yourself or otherwise trust.
with open('multiple_isbn_dict.pickle', 'rb') as handle:
    multiple_isbn_dict = pickle.load(handle)

In [None]:
print(f'There are now {len(multiple_isbn_dict)} books in the ISBN dictionary that have multiple ISBN numbers')

In [None]:
print(f'Length of Jane Eyre dict entry: {len(multiple_isbn_dict["Jane Eyre"])}\n')
multiple_isbn_dict['Jane Eyre']

In [None]:
def add_unique_isbn_col(df, isbn_lookup=None):
    """Add a ``unique_isbn`` column that maps every edition of a title onto one ISBN.

    For a row whose ``book_title`` is a key of the lookup, ``unique_isbn`` is the first
    ISBN stored for that title; for every other row it is the row's own ``isbn``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``book_title`` and ``isbn`` columns. It is modified in place.
    isbn_lookup : mapping of title -> sequence of ISBNs, optional
        Defaults to the notebook-level ``multiple_isbn_dict``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.
    """
    if isbn_lookup is None:
        isbn_lookup = multiple_isbn_dict

    # Resolve each title to its first ISBN once, up front. Titles with an empty ISBN list are
    # skipped so their rows fall back to the row's own ISBN instead of raising IndexError.
    first_isbn = {title: isbns[0] for title, isbns in isbn_lookup.items() if len(isbns) > 0}

    # A plain dict lookup per row replaces DataFrame.apply(axis=1), which builds a Series for
    # every row and fails when `df` is empty.
    df['unique_isbn'] = [
        first_isbn.get(title, isbn) for title, isbn in zip(df['book_title'], df['isbn'])
    ]
    return df

%time books_with_ratings = add_unique_isbn_col(books_with_ratings)

In [None]:
books_with_ratings.head()

In [None]:
books_with_ratings[books_with_ratings.book_title=='Jane Eyre'].head()

In [None]:
print(f'Books+Ratings table size: {len(books_with_ratings)}')
print(f'Users table size: {len(users)}')
books_users_ratings = books_with_ratings.join(users.set_index('user_id'), on='user_id')
print(f'New "books_users_ratings" table size: {len(books_users_ratings)}')

Inspect the new table.

In [None]:
books_users_ratings.head()

In [None]:
books_users_ratings.info()

# Keep only the first 20,000 rows. Everything derived from `books_users_ratings` after this
# point (e.g. `user_item_rating`) is built from this subset, not the full joined table.
books_users_ratings = books_users_ratings[:20000]

In [None]:
books_users_ratings.shape

In [None]:
user_item_rating = books_users_ratings[['user_id', 'unique_isbn', 'book_rating']]
user_item_rating.head()

In [None]:
rtg = user_item_rating.book_rating.value_counts().sort_index()

plt.figure(figsize=(10, 5))
plt.rcParams.update({'font.size': 15}) 
plt.bar(rtg.index, rtg.values)
plt.xlabel('Rating')
plt.ylabel('counts')
plt.show()

Looks perfect! Continue.

In [None]:
from sklearn import model_selection
# Import the splitter under its own name as well: a later cell imports `model_selection` from
# surprise, which rebinds that name and would break this cell if it were re-run afterwards.
from sklearn.model_selection import train_test_split as sk_train_test_split

# Fixed seed so the split, and every metric computed from it, is reproducible across runs.
train_data, test_data = sk_train_test_split(user_item_rating, test_size=0.20, random_state=42)

In [None]:
print(f'Training set size: {len(train_data)}')
print(f'Testing set size: {len(test_data)}')
print(f'Test set is {(len(test_data)/(len(train_data)+len(test_data))*100):.0f}% of the full dataset.')

In [None]:
u_unique_train = train_data.user_id.unique()  
train_data_user2idx = {o:i for i, o in enumerate(u_unique_train)}

b_unique_train = train_data.unique_isbn.unique() 
train_data_book2idx = {o:i for i, o in enumerate(b_unique_train)}


u_unique_test = test_data.user_id.unique() 
test_data_user2idx = {o:i for i, o in enumerate(u_unique_test)}

b_unique_test = test_data.unique_isbn.unique()  
test_data_book2idx = {o:i for i, o in enumerate(b_unique_test)}

In [None]:

train_data['u_unique'] = train_data['user_id'].map(train_data_user2idx)
train_data['b_unique'] = train_data['unique_isbn'].map(train_data_book2idx)


test_data['u_unique'] = test_data['user_id'].map(test_data_user2idx)
test_data['b_unique'] = test_data['unique_isbn'].map(test_data_book2idx)


train_data = train_data[['u_unique', 'b_unique', 'book_rating']]
test_data = test_data[['u_unique', 'b_unique', 'book_rating']]

In [None]:
train_data.tail()

In [None]:
train_data.dtypes

In [None]:
n_users = train_data['u_unique'].nunique()
n_books = train_data['b_unique'].nunique()


train_matrix = np.zeros((n_users, n_books))


for entry in train_data.itertuples(): 
    train_matrix[entry[1]-1, entry[2]-1] = entry[3] 

In [None]:
train_matrix.shape

In [None]:
n_users = test_data['u_unique'].nunique()
n_books = test_data['b_unique'].nunique()


test_matrix = np.zeros((n_users, n_books))


for entry in test_data.itertuples(): 
    test_matrix[entry[1]-1, entry[2]-1] = entry[3] 

In [None]:
test_matrix.shape

In [None]:

# Cap both matrices at 10,000 x 10,000 to bound the size of the pairwise computations.
train_matrix_small = train_matrix[:10000, :10000]
test_matrix_small = test_matrix[:10000, :10000]

from sklearn.metrics.pairwise import pairwise_distances
# pairwise_distances(metric='cosine') returns the cosine DISTANCE (1 - cosine similarity).
# Using it directly as a weight gives the least similar neighbours the most influence,
# so convert it to a similarity first.
user_similarity = 1 - pairwise_distances(train_matrix_small, metric='cosine')
item_similarity = 1 - pairwise_distances(train_matrix_small.T, metric='cosine')

In [None]:
def predict(ratings, similarity, type='user'):
    """Predict a full rating matrix with memory-based collaborative filtering.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D array indexed as ``ratings[user, item]``.
    similarity : numpy.ndarray
        Square weight matrix: user x user for ``type='user'``, item x item for ``type='item'``.
    type : {'user', 'item'}
        ``'user'``: each user's mean rating plus the similarity-weighted average of the other
        users' deviations from their own means.
        ``'item'``: similarity-weighted average of the user's ratings over items.

    Returns
    -------
    numpy.ndarray
        Predictions with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'`` (previously this surfaced as an
        UnboundLocalError on the return statement).
    """
    if type not in ('user', 'item'):
        raise ValueError(f"type must be 'user' or 'item', got {type!r}")

    # Normalising constant per row of the similarity matrix. A total of zero means that row
    # has no non-zero weights; dividing by 1 instead avoids NaN/inf (for a symmetric
    # similarity matrix the matching numerator is zero as well, so the result is 0).
    weight_totals = np.abs(similarity).sum(axis=1)
    weight_totals = np.where(weight_totals == 0, 1.0, weight_totals)

    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # Work on deviations from each user's mean so that generous and harsh raters are comparable.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        weighted_diff = similarity.dot(ratings_diff) / weight_totals[:, np.newaxis]
        return mean_user_rating[:, np.newaxis] + weighted_diff

    return ratings.dot(similarity) / weight_totals[np.newaxis, :]

In [None]:
item_prediction = predict(train_matrix_small, item_similarity, type='item')
user_prediction = predict(train_matrix_small, user_similarity, type='user')

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, test_matrix):
    """Root-mean-squared error restricted to the non-zero cells of ``test_matrix``.

    Both arrays are indexed with the positions returned by ``test_matrix.nonzero()``, so
    ``prediction`` must be addressable at every one of those positions.
    """
    rated_cells = test_matrix.nonzero()
    predicted = prediction[rated_cells].flatten()
    observed = test_matrix[rated_cells].flatten()
    return sqrt(mean_squared_error(predicted, observed))


print(f'User-based CF RMSE: {rmse(user_prediction, test_matrix_small)}')
print(f'Item-based CF RMSE: {rmse(item_prediction, test_matrix_small)}')

In [None]:
from surprise import Reader, Dataset

In [None]:
user_item_rating.head() # take a look at our data

In [None]:
reader = Reader(rating_scale=(1, 10))

data = Dataset.load_from_df(user_item_rating, reader)

In [None]:
from surprise import SVD, NMF, model_selection, accuracy

### SVD model

In [None]:
model = SVD()


%time model_selection.cross_validate(model, data, measures=['RMSE'], cv=5, verbose=True)

In [None]:
trainset, testset = model_selection.train_test_split(data, test_size=0.2)

model = SVD()

model.fit(trainset)
predictions = model.test(testset)

accuracy.rmse(predictions)

### NMF model

In [None]:
model = NMF()

%time model_selection.cross_validate(model, data, measures=['RMSE'], cv=5, verbose=True)

In [None]:
trainset, testset = model_selection.train_test_split(data, test_size=0.2)

In [None]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [80, 100, 120], 'lr_all': [0.001, 0.005, 0.01], 'reg_all': [0.01, 0.02, 0.04]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [None]:
%time gs.fit(data)

In [None]:
model = gs.best_estimator['rmse']

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

In [None]:
model_selection.cross_validate(model, data, measures=['rmse', 'mae'], cv=5, verbose=True)

In [None]:
model = SVD(n_factors=80, lr_all=0.005, reg_all=0.04)
model.fit(trainset) 
test_pred = model.test(testset)
print("SVD : Test Set")
accuracy.rmse(test_pred, verbose=True)

In [None]:

uid = 276744  
iid = '038550120X' 

pred = model.predict(uid, iid, verbose=True)

In [None]:
print(f'The estimated rating for the book with the "unique_isbn" code {pred.iid} from user #{pred.uid} is {pred.est:.2f}.\n')
actual_rtg = user_item_rating[(user_item_rating.user_id==pred.uid) & (user_item_rating.unique_isbn==pred.iid)].book_rating.values[0]
print(f'The real rating given for this was {actual_rtg:.2f}.')

In [None]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` highest estimates.

    ``predictions`` is an iterable of 5-element records
    ``(user id, item id, true rating, estimated rating, extra)``.
    Returns a ``defaultdict(list)`` mapping each user id to a list of
    ``(item id, estimated rating)`` pairs sorted by estimate, highest first.
    """
    top_n = defaultdict(list)
    for user, item, _true_rating, estimate, _extra in predictions:
        top_n[user].append((item, estimate))

    # Rank each user's candidates by estimated rating and truncate to the best n.
    for user in top_n:
        ranked = sorted(top_n[user], key=lambda pair: pair[1], reverse=True)
        top_n[user] = ranked[:n]

    return top_n

In [None]:
pred = model.test(testset)
top_n = get_top_n(pred)

In [None]:
def get_reading_list(userid, recommendations=None):
    """Return ``{book title: estimated rating}`` for one user's recommended books.

    Parameters
    ----------
    userid
        Key to look up in ``recommendations``.
    recommendations : mapping of user id -> list of (unique_isbn, estimate), optional
        Defaults to the notebook-level ``top_n``. The function used to rebuild this mapping
        on every call from the global ``predictions``, which holds the output of the first,
        untuned model rather than the model evaluated just before this cell.

    Returns
    -------
    dict
        Titles in recommendation order, taken from ``books_users_ratings``. An ISBN with no
        row in that frame is listed under the ISBN itself. A user with no recommendations
        gets an empty dict.
    """
    if recommendations is None:
        recommendations = top_n

    reading_list = {}
    # .get avoids inserting an empty entry for unknown users into a defaultdict.
    for book, rating in recommendations.get(userid, []):
        titles = books_users_ratings.loc[books_users_ratings.unique_isbn == book, 'book_title'].unique()
        title = titles[0] if len(titles) > 0 else book
        reading_list[title] = rating
    return reading_list

In [None]:
# Show the recommendations for one example user, one "title: estimate" line per book.
example_reading_list = get_reading_list(userid=6251)
report_lines = ["Recommended books are:"]
report_lines.extend(f'{title}: {score}' for title, score in example_reading_list.items())
print('\n'.join(report_lines))