# Project - Book Rental Recommendation

# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Read the books dataset and explore it

In [2]:
book = pd.read_csv('BX-Books.csv',encoding='latin-1')

In [3]:
user = pd.read_csv('BX-Users.csv',encoding='latin-1')

In [4]:
ratings = pd.read_csv('BX-Book-Ratings.csv',encoding='latin-1',nrows=10000)

In [5]:
user.head()

Unnamed: 0,user_id,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [6]:
user.isnull().sum()

Unnamed: 0,0
user_id,0
Location,1
Age,110763


In [7]:
user.shape

(278859, 3)

In [8]:
user.dtypes

Unnamed: 0,0
user_id,object
Location,object
Age,float64


In [9]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278859 entries, 0 to 278858
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   user_id   278859 non-null  object 
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), object(2)
memory usage: 6.4+ MB


In [10]:
user.describe()

Unnamed: 0,Age
count,168096.0
mean,34.751434
std,14.428097
min,0.0
25%,24.0
50%,32.0
75%,44.0
max,244.0


# Clean up NaN values

In [11]:
user1 = user.dropna()

In [12]:
user1.isnull().sum()

Unnamed: 0,0
user_id,0
Location,0
Age,0


In [13]:
book.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [14]:
book.shape

(271379, 5)

In [15]:
book.dtypes

Unnamed: 0,0
isbn,object
book_title,object
book_author,object
year_of_publication,object
publisher,object


In [16]:
book.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 271379 non-null  object
 1   book_title           271379 non-null  object
 2   book_author          271377 non-null  object
 3   year_of_publication  271379 non-null  object
 4   publisher            271377 non-null  object
dtypes: object(5)
memory usage: 10.4+ MB


In [17]:
book.describe()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
count,271379,271379,271377,271379,271377
unique,271379,242150,102041,202,16823
top,195153448,Selected Poems,Agatha Christie,2002,Harlequin
freq,1,27,632,17145,7535


In [18]:
book.isnull().sum()

Unnamed: 0,0
isbn,0
book_title,0
book_author,2
year_of_publication,0
publisher,2


In [19]:
book1 = book.dropna()

In [20]:
book1.isnull().sum()

Unnamed: 0,0
isbn,0
book_title,0
book_author,0
year_of_publication,0
publisher,0


# Read the data where ratings are given by users

In [21]:
ratings.head()

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [22]:
ratings.shape

(10000, 3)

In [23]:
ratings.dtypes

Unnamed: 0,0
user_id,int64
isbn,object
rating,int64


In [24]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  10000 non-null  int64 
 1   isbn     10000 non-null  object
 2   rating   10000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 234.5+ KB


In [25]:
ratings.describe()

Unnamed: 0,user_id,rating
count,10000.0,10000.0
mean,265844.3796,1.9747
std,56937.189618,3.424884
min,2.0,0.0
25%,277478.0,0.0
50%,278418.0,0.0
75%,278418.0,4.0
max,278854.0,10.0


In [26]:
ratings.isnull().sum()

Unnamed: 0,0
user_id,0
isbn,0
rating,0


In [27]:
df = pd.merge(ratings,book,on='isbn')

In [28]:
df.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press


# Take a quick look at the number of unique users and books

In [29]:
n_users = df['user_id'].nunique()
print('Number of Unique User :',n_users)

Number of Unique User : 828


In [30]:
n_books = df['isbn'].nunique()
print('Number of Unique Books :',n_books)

Number of Unique Books : 8051


# Convert ISBN variables to numeric numbers in the correct order

In [31]:
isbn_list = df['isbn'].unique()

In [32]:
print('Length of isbn list :',len(isbn_list))

Length of isbn list : 8051


In [33]:
def get_isbn_numeric_id(isbn):
    itemindex = np.where(isbn_list==isbn)
    return itemindex[0][0]

# Convert the user_id variable to numeric numbers in the correct order

In [34]:
userid_list = df['user_id'].unique()

In [35]:
print('Length of User id list :',len(userid_list))

Length of User id list : 828


In [36]:
def get_user_id_numeric_id(user_id):
    itemindex = np.where(userid_list==user_id)
    return itemindex[0][0]

# Convert both user_id and ISBN to the ordered list, i.e., from 0...n-1

In [37]:
df['user_id_order'] = df['user_id'].apply(get_user_id_numeric_id)

In [38]:
df['isbn_id'] = df['isbn'].apply(get_isbn_numeric_id)

In [39]:
df.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,user_id_order,isbn_id
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,0,0
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle,1,1
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,2,2
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,3,2
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,4,3


# Re-index the columns to build a matrix

In [40]:
new_col_order = ['user_id_order','isbn_id','rating','book_title','book_author','year_of_publication','publisher','isbn','user_id']

In [41]:
df = df.reindex(columns=new_col_order)
df.head()

Unnamed: 0,user_id_order,isbn_id,rating,book_title,book_author,year_of_publication,publisher,isbn,user_id
0,0,0,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,034545104X,276725
1,1,1,5,Rites of Passage,Judith Rae,2001,Heinle,155061224,276726
2,2,2,0,The Notebook,Nicholas Sparks,1996,Warner Books,446520802,276727
3,3,2,0,The Notebook,Nicholas Sparks,1996,Warner Books,446520802,278418
4,4,3,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,052165615X,276729


# Split your data into two sets (training and testing)

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
train_data,test_data = train_test_split(df,test_size=0.30)

In [44]:
train_data_matrix = np.zeros((n_users,n_books))

In [45]:
for line in train_data.itertuples():
    train_data_matrix[line[1]-1,line[2]-1] = line[3]

In [46]:
test_data_matrix = np.zeros((n_users,n_books))

In [47]:
for line in test_data.itertuples():
    test_data_matrix[line[1]-1,line[2]-1] = line[3]

In [48]:
from sklearn.metrics.pairwise import pairwise_distances

In [49]:
user_similarity = pairwise_distances(train_data_matrix,metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T,metric='cosine')

In [50]:
user_similarity

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [51]:
item_similarity

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

# Make predictions based on user and item variables

In [52]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:,np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [53]:
item_prediction = predict(train_data_matrix,item_similarity,type='item')

In [54]:
user_prediction = predict(train_data_matrix,user_similarity,type='user')

# Use RMSE to evaluate the predictions

In [55]:
from sklearn.metrics import mean_squared_error
import math

In [56]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return np.sqrt(mean_squared_error(prediction,ground_truth))

In [57]:
print('User-based collaborative filtering RMSE :',rmse(user_prediction,test_data_matrix))

User-based collaborative filtering RMSE : 7.71803354110672


In [58]:
print('Item-based collaborative filtering RMSE :',rmse(item_prediction,test_data_matrix))

Item-based collaborative filtering RMSE : 7.717608190702198
