### Book Recommender System using clustering | Collaborative filtering

In [1]:
# Libraries used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Importing the books dataset

In [2]:
# The BX dump is semicolon-delimited and latin-1 encoded. Some rows parse to
# more than the expected 8 fields; those malformed rows are skipped (and
# reported) instead of aborting the load.
# `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`, which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
books = pd.read_csv('data/BX-Books.csv', sep=';', on_bad_lines='warn', encoding='latin-1')



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  exec(code_obj, sel

In [3]:
books.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [4]:
# Showing the data shape
books.shape

(271360, 8)

In [5]:
# Displaying all the columns in the books dataset
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

### Selecting the needed columns for analysis

In [6]:
# Keep the ISBN, the bibliographic columns and the large cover-image URL;
# the small and medium image URL columns are dropped.
wanted_columns = [
    'ISBN',
    'Book-Title',
    'Book-Author',
    'Year-Of-Publication',
    'Publisher',
    'Image-URL-L',
]
books = books[wanted_columns]

In [7]:
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...


### Renaming the columns for easy analysis

In [8]:
# Replace the hyphenated source headers with short snake_case names.
# The result is rebound instead of using `inplace=True`: `books` was produced
# by a column selection, and mutating such a frame in place can raise
# SettingWithCopyWarning.
books = books.rename(columns={
    'Book-Title': 'title',
    'Book-Author': 'author',
    'Year-Of-Publication': 'year',
    'Publisher': 'publisher',
    'Image-URL-L': 'img_url',
})

In [9]:
books.head(3)

Unnamed: 0,ISBN,title,author,year,publisher,img_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...


### Importing the user dataset

In [10]:
# Same file format as the books dump (semicolon-delimited, latin-1).
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; malformed rows are skipped and reported.
users = pd.read_csv('data/BX-Users.csv', sep=';', on_bad_lines='warn', encoding='latin-1')



  exec(code_obj, self.user_global_ns, self.user_ns)


In [11]:
users.head(3)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [12]:
# Renaming columns to snake_case. The renamed frame is rebound rather than
# mutated with `inplace=True`, which keeps the cell chainable and side-effect free.
users = users.rename(columns={
    'User-ID': 'user_id',
    'Location': 'location',
    'Age': 'age',
})

In [13]:
users.head(3)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [14]:
users.shape

(278858, 3)

### Import the ratings dataset

In [15]:
# Same file format as the other two dumps (semicolon-delimited, latin-1).
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; malformed rows are skipped and reported.
ratings = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', on_bad_lines='warn', encoding='latin-1')



  exec(code_obj, self.user_global_ns, self.user_ns)


In [16]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [17]:
ratings.shape

(1149780, 3)

In [18]:
# Renaming columns to snake_case ('ISBN' is left as is). The renamed frame is
# rebound rather than mutated with `inplace=True`.
ratings = ratings.rename(columns={
    'User-ID': 'user_id',
    'Book-Rating': 'rating',
})

In [19]:
ratings.head(2)

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5


### From the datasets we can observe missing data

In [20]:
# (rows, columns) of the three frames, in the order they were loaded.
for frame in (books, users, ratings):
    print(frame.shape)

(271360, 6)
(278858, 3)
(1149780, 3)


In [21]:
ratings['user_id'].value_counts()

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: user_id, Length: 105283, dtype: int64

### Based on the cell above we can infer that some users are more active than others in reading and rating books
### Users who have rated only one book (or very few) would not add much to our analysis, so we drop them in the filtering step below

### Number of unique users that have rated a book

In [22]:
ratings['user_id'].unique().shape

(105283,)

### We are only considering the users who have rated more than 200 books

In [23]:
# Boolean Series indexed by user_id: True where that user has strictly more
# than 200 rating rows (a user with exactly 200 is excluded).
x = ratings['user_id'].value_counts() > 200

In [24]:
x

11676      True
198711     True
153662     True
98391      True
35859      True
          ...  
116180    False
116166    False
116154    False
116137    False
276723    False
Name: user_id, Length: 105283, dtype: bool

In [25]:
x[x].shape

(899,)

### Getting the index of all the users who have rated over 200 books

In [26]:
# x[x] keeps only the True entries of the mask; its index holds the user_ids
# of the users that passed the activity threshold.
y = x[x].index

In [27]:
y

Int64Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352,
            110973, 235105,
            ...
            260183,  73681,  44296, 155916,   9856, 274808,  28634,  59727,
            268622, 188951],
           dtype='int64', length=899)

### Keeping only the rows of the ratings data frame whose user_id is in variable 'y', then displaying the resulting data

In [28]:
# Keep only the rating rows written by the active users collected in `y`.
is_active_user = ratings['user_id'].isin(y)
ratings = ratings[is_active_user]

In [29]:
ratings.head()

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [30]:
ratings.shape

(526356, 3)

### Merging the ratings dataset with the books dataset

In [31]:
# Attach the book metadata to every rating row through the shared ISBN column.
# This is pandas' default inner join, so rating rows whose ISBN has no match
# in `books` do not appear in the result.
ratings_with_books = pd.merge(ratings, books, on='ISBN')

In [32]:
ratings_with_books.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,img_url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...


In [33]:
ratings_with_books.shape 

(487671, 8)

### Getting the number of times a book has been rated
#### Books that have been rated less than 50 times by users are discarded

In [34]:
# Count the rating rows per title, then turn the title index back into a
# regular column so the result can be merged on 'title'.
ratings_per_title = ratings_with_books.groupby('title')['rating'].count()
num_rating = ratings_per_title.reset_index()

In [35]:
num_rating.head()

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [36]:
# Rename the count column from 'rating' to 'num_rating' so it does not collide
# with the per-user 'rating' column when the counts are merged back on 'title'.
# The renamed frame is rebound rather than mutated with `inplace=True`.
num_rating = num_rating.rename(columns={'rating': 'num_rating'})

In [37]:
num_rating.head(2)

Unnamed: 0,title,num_rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1


### Merging the number of rating data with the books rating data

In [38]:
# Add each title's rating count as a 'num_rating' column on every rating row.
final_rating = pd.merge(ratings_with_books, num_rating, on='title')

In [39]:
final_rating.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,img_url,num_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82


In [40]:
final_rating.shape

(487671, 9)

### Books that have been rated 50 or more times


In [41]:
# Keep only rows belonging to titles with at least 50 rating rows.
rated_often_enough = final_rating['num_rating'] >= 50
final_rating = final_rating[rated_often_enough]

In [42]:
final_rating.sample(10)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,img_url,num_rating
126720,77809,0312983824,0,Faking It,Jennifer Crusie,2003,St. Martin's Paperbacks,http://images.amazon.com/images/P/0312983824.0...,97
6663,13552,0440170796,10,The Promise,Danielle Steel,1978,Dell,http://images.amazon.com/images/P/0440170796.0...,62
36860,247752,0446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books,http://images.amazon.com/images/P/0446364193.0...,134
41224,200226,051512608X,0,The Reef,Nora Roberts,1999,Jove Books,http://images.amazon.com/images/P/051512608X.0...,70
131632,255489,0671027360,9,Angels &amp; Demons,Dan Brown,2001,Pocket Star,http://images.amazon.com/images/P/0671027360.0...,193
109565,102275,0449910237,0,Moo,Jane Smiley,1996,Ballantine Books,http://images.amazon.com/images/P/0449910237.0...,80
74275,180917,0446608653,0,The Alibi,Sandra Brown,2000,Warner Books,http://images.amazon.com/images/P/0446608653.0...,64
7474,252071,0440225701,0,The Street Lawyer,JOHN GRISHAM,1999,Dell,http://images.amazon.com/images/P/0440225701.0...,186
2363,268330,0316666343,0,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",http://images.amazon.com/images/P/0316666343.0...,270
125731,169233,0425162443,0,The Rapture of Canaan,Sheri Reynolds,1997,Berkley Publishing Group,http://images.amazon.com/images/P/0425162443.0...,87


### Dropping duplicate (user_id, title) pairs, i.e. books that a user has rated more than once

In [43]:
# Keep the first row for every (user_id, title) pair.
# The result is rebound instead of using `inplace=True`: `final_rating` was
# produced by a boolean-mask selection, and mutating such a frame in place can
# raise SettingWithCopyWarning.
final_rating = final_rating.drop_duplicates(subset=['user_id', 'title'])

In [44]:
final_rating.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,img_url,num_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82


In [45]:
final_rating.shape

(59850, 9)

### Creating a pivot table
#### The clustering will work by analyzing users who have similar ratings and grouping them together in a cluster, such that the system recommends the books they have rated highly among those in the cluster

In [46]:
# One row per title, one column per user_id; each cell holds that user's
# rating of that title and is NaN where no such rating row exists.
book_pivot = final_rating.pivot_table(
    index='title',
    columns='user_id',
    values='rating',
)

In [47]:
book_pivot.head()

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 Blondes,,,,,,,,,,0.0,...,,,,,,,,,,
84 Charing Cross Road,,,,,,,,,,,...,,,,,,10.0,,,,


In [48]:
# Fill the NaN values (title/user pairs with no rating row) with 0.
# The filled frame is rebound rather than mutated with `inplace=True`.
book_pivot = book_pivot.fillna(0)

In [51]:
book_pivot.head(7)

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Case of Need,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
book_pivot.shape

(742, 888)

### Based on the pivot table, it is seen that there are a lot of 0 values, so I used a csr_matrix (compressed sparse row matrix), which stores only the non-zero elements, to improve the computation efficiency of the system

In [52]:
from scipy.sparse import csr_matrix

In [53]:
# Compressed-sparse-row copy of the pivot's values: only the non-zero cells
# are stored explicitly.
book_sparse = csr_matrix(book_pivot.values)

In [54]:
book_sparse

<742x888 sparse matrix of type '<class 'numpy.float64'>'
	with 14942 stored elements in Compressed Sparse Row format>

### Importing the  clustering algorithm

#### The nearest neighbor clustering algorithm
The Nearest Neighbor algorithm is used to find the data point(s) that are closest to a given query point based on a specified distance metric. In its simplest form, it involves calculating the distance between the query point and all other data points in the dataset and selecting the one(s) with the smallest distance.

#### Brute force calculates the distance to every data point and selects the smallest distances

In [55]:
from sklearn.neighbors import NearestNeighbors
# Unsupervised nearest-neighbour searcher. algorithm='brute' compares a query
# against every fitted row; the distance metric is left at scikit-learn's default.
model = NearestNeighbors(algorithm = 'brute')

In [58]:
# Fitting the model on the sparse title-by-user matrix: every book (row) is a
# point described by the ratings stored in its user columns.
model.fit(book_sparse)

NearestNeighbors(algorithm='brute')

#### Creating the distance and suggestion variables to store the nearest distances and their indices

In [63]:
# Query the fitted model with one book's rating vector (reshaped to a
# single-row 2-D array) and ask for its 6 nearest rows.
# The row is looked up by title instead of the hard-coded position 237, so the
# cell keeps pointing at the same book if the filtering thresholds above change
# the pivot's row order.
query_title = 'Harry Potter and the Chamber of Secrets (Book 2)'
query_row = book_pivot.index.get_loc(query_title)
distance, suggestion = model.kneighbors(book_pivot.iloc[query_row, :].values.reshape(1, -1), n_neighbors=6)

In [64]:
distance

array([[ 0.        , 68.78953409, 69.5413546 , 72.64296249, 76.83098333,
        77.28518616]])

In [65]:
suggestion

array([[237, 240, 238, 241, 184, 536]], dtype=int64)

#### Getting the names of the suggested books 

In [66]:
# `suggestion` holds one row of neighbour positions per query row; translate
# each row of positions back into titles through the pivot's index.
for neighbor_positions in suggestion:
    print(book_pivot.index[neighbor_positions])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='title')


In [69]:
 book_pivot.index

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       '84 Charing Cross Road', 'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Cry In The Night',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='title', length=742)

In [67]:
# Storing all the book names (the pivot's title index) in a variable;
# it is pickled alongside the model further down.
books_name = book_pivot.index

### Pickle module saves objects/data structures to files


In [68]:
import pickle
from pathlib import Path

# Persist the fitted model and the objects needed to interpret its output.
artifacts_dir = Path('data/artifacts')
# open(..., 'wb') does not create missing folders, so make sure the target exists.
artifacts_dir.mkdir(parents=True, exist_ok=True)

artifacts = {
    'model.pkl': model,
    'books_name.pkl': books_name,
    'final_rating.pkl': final_rating,
    'book_pivot.pkl': book_pivot,
}
for file_name, obj in artifacts.items():
    # `with` closes (and flushes) every file handle as soon as its dump is done;
    # the bare open() calls used before left the handles open.
    with open(artifacts_dir / file_name, 'wb') as fh:
        pickle.dump(obj, fh)

#### Book Recommender Function

In [71]:
def recommend_books(book_name, n_neighbors=6):
    """Print the titles whose rating vectors are closest to that of `book_name`.

    The function finds the row of `book_name` in the global `book_pivot`,
    queries the global fitted `model` for that row's nearest neighbours and
    prints one title per line, in the order returned by the model. In the runs
    shown in this notebook the first printed title is the queried book itself.

    Parameters
    ----------
    book_name : str
        Title exactly as it appears in `book_pivot.index`.
    n_neighbors : int, default 6
        Number of rows requested from the model.

    Raises
    ------
    IndexError
        If `book_name` is not present in `book_pivot.index`.
    """
    matches = np.where(book_pivot.index == book_name)[0]
    if matches.size == 0:
        # Same exception type the bare `[0][0]` lookup raised before, but with
        # a message that names the missing title.
        raise IndexError(f"Book title not found in book_pivot: {book_name!r}")
    book_id = matches[0]

    query = book_pivot.iloc[book_id, :].values.reshape(1, -1)
    _, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)

    # A single query row yields a single row of neighbour positions.
    for title in book_pivot.index[suggestion[0]]:
        print(title)


In [73]:
# Testing the system with a title that is present in book_pivot.index
book_name = 'A Civil Action'
recommend_books(book_name)

A Civil Action
No Safe Place
Long After Midnight
Exclusive
Lake Wobegon days
Pleading Guilty
