# Book Recommendation System Using Clustering

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the book catalogue from the working directory and preview its first row.
df = pd.read_csv('Books.csv')
df.head(1)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...


In [3]:
df.shape

(271360, 8)

In [4]:
df.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [5]:
# Narrow the catalogue to the identifying fields and the large cover image;
# the small and medium image URL columns are dropped.
df = df.loc[:, [
    'ISBN',
    'Book-Title',
    'Book-Author',
    'Year-Of-Publication',
    'Publisher',
    'Image-URL-L',
]]
df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [6]:
# Shorten the column names used throughout the rest of the notebook.
# Reassign instead of renaming in place: `df` was produced by a column
# selection, and rebinding the name keeps the step free of in-place mutation
# on a derived frame. Renaming is a no-op for columns that are already renamed,
# so this cell can safely be re-run.
df = df.rename(columns={
    'Book-Title': 'Title',
    'Book-Author': 'Author',
    'Year-Of-Publication': 'Year',
    'Image-URL-L': 'Image-URL',
})

In [7]:
df.head(2)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Image-URL
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [8]:
# Read the user table (User-ID, Location, Age) and preview the first two rows.
users = pd.read_csv('Users.csv')
users.head(2)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [9]:
# Replace missing ages with the column mean, then cast the whole column to
# integers (the cast drops the fractional part of the imputed mean).
# NOTE(review): `users` is not referenced again in the visible cells - confirm
# whether this table is still needed.
users['Age'] = users['Age'].fillna(users.Age.mean()).astype('int')
users.head(2)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",34
1,2,"stockton, california, usa",18


In [10]:
users.shape

(278858, 3)

In [11]:
# Read the (User-ID, ISBN, Book-Rating) table and preview the first two rows.
ratings = pd.read_csv('Ratings.csv')
ratings.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5


In [32]:
ratings['User-ID'].value_counts()

User-ID
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: count, Length: 105283, dtype: int64

In [33]:
ratings['User-ID'].unique().shape

(105283,)

In [34]:
# Boolean Series indexed by User-ID: True for users with more than 200 rating rows.
x=ratings['User-ID'].value_counts()>200

In [35]:
x[x].shape

(899,)

In [37]:
# Keep only the True entries of the mask; their index is the set of User-IDs
# that passed the >200 ratings threshold.
y=x[x].index
y

Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352, 110973,
       235105,
       ...
       260183,  73681,  44296, 155916,   9856, 274808,  28634,  59727, 268622,
       188951],
      dtype='int64', name='User-ID', length=899)

In [38]:
# Restrict the ratings to the users selected in `y`.
# Note: this rebinds `ratings`, so the unfiltered table is no longer available
# without re-reading Ratings.csv.
ratings=ratings[ratings['User-ID'].isin(y)]

In [39]:
ratings.shape

(526356, 3)

In [40]:
# Attach book metadata to every rating by ISBN. No `how` is given, so pandas
# uses its default inner join: ratings whose ISBN is absent from the catalogue
# are dropped.
data = df.merge(ratings,on='ISBN')
data.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Image-URL,User-ID,Book-Rating
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,11676,8
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,85526,0
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,96054,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,177458,0
4,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,110912,10


In [41]:
# Count how many rating rows each title received among the retained users.
num_rating = (
    data
    .groupby('Title')['Book-Rating']
    .count()
    .reset_index()
)
num_rating.shape

(160269, 2)

In [42]:
# The counted column is no longer a rating; give it a name that says so.
num_rating = num_rating.rename(columns={'Book-Rating': 'num_of_rating'})

In [43]:
# Add the per-title rating count to every rating row (joined on Title).
final_rating = data.merge(num_rating,on='Title')
print(final_rating.shape)
final_rating.head()


(487671, 9)


Unnamed: 0,ISBN,Title,Author,Year,Publisher,Image-URL,User-ID,Book-Rating,num_of_rating
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,11676,8,4
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,85526,0,4
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,96054,0,4
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,177458,0,4
4,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,110912,10,3


In [44]:
# Keep only titles with at least 50 rating rows; `num_of_rating` was counted
# on `data`, i.e. after the ratings were restricted to the heavy raters.
final_rating = final_rating[final_rating['num_of_rating']>=50]
final_rating

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Image-URL,User-ID,Book-Rating,num_of_rating
7,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,11676,9,111
8,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,36836,0,111
9,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,46398,9,111
10,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,113270,0,111
11,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,113519,0,111
...,...,...,...,...,...,...,...,...,...
486552,1878702831,Echoes,Nancy Morse,1992,Meteor Publishing Corporation,http://images.amazon.com/images/P/1878702831.0...,238781,0,54
486653,0394429869,I Know Why the Caged Bird Sings,Maya Angelou,1996,Random House,http://images.amazon.com/images/P/0394429869.0...,239594,8,82
486655,0449001164,The Promise,CHAIM POTOK,1997,Ballantine Books,http://images.amazon.com/images/P/0449001164.0...,239594,7,62
486788,0743527631,The Pillars of the Earth,Ken Follett,2002,Encore,http://images.amazon.com/images/P/0743527631.0...,240144,0,85


In [45]:
final_rating.sample(5)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Image-URL,User-ID,Book-Rating,num_of_rating
20552,380789019,Neverwhere,Neil Gaiman,1998,Avon,http://images.amazon.com/images/P/0380789019.0...,157273,0,72
16345,440211727,A Time to Kill,JOHN GRISHAM,1992,Dell,http://images.amazon.com/images/P/0440211727.0...,187517,0,210
40505,743412028,Pay It Forward,Catherine Ryan Hyde,2000,Pocket,http://images.amazon.com/images/P/0743412028.0...,175003,0,70
59989,515119784,Range of Motion,Elizabeth Berg,1996,Jove Books,http://images.amazon.com/images/P/0515119784.0...,222296,0,54
48180,140119906,Love in the Time of Cholera (Penguin Great Boo...,Gabriel Garcia Marquez,1994,Penguin Books,http://images.amazon.com/images/P/0140119906.0...,204864,10,62


In [46]:
# Keep a single row per (user, title) pair; the first occurrence wins.
# Duplicates are possible because the merge keys on ISBN while this check keys
# on Title. Reassign rather than using inplace=True: `final_rating` is a
# boolean-mask slice of a larger frame, and mutating such a slice in place is
# what pandas' SettingWithCopy warning (silenced globally at the top of this
# notebook) is meant to flag.
final_rating = final_rating.drop_duplicates(subset=['User-ID', 'Title'])

In [47]:
final_rating.shape

(59850, 9)

In [48]:
# Build the title x user matrix: one row per Title, one column per User-ID,
# cells hold Book-Rating. (title, user) pairs without a rating come out as NaN.
book_pivot = final_rating.pivot_table(columns='User-ID',index='Title',values='Book-Rating')
book_pivot.head()

User-ID,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 Blondes,,,,,,,,,,0.0,...,,,,,,,,,,
84 Charing Cross Road,,,,,,,,,,,...,,,,,,10.0,,,,


In [49]:
# Replace the NaN cells with 0 so the matrix is fully numeric.
# NOTE(review): Book-Rating itself contains 0 values, so after this step a
# stored rating of 0 and "no rating at all" can no longer be told apart.
book_pivot.fillna(0,inplace=True)
book_pivot

User-ID,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
from scipy.sparse import csr_matrix

In [66]:
# Convert the pivot to compressed sparse row format; only the non-zero cells
# are stored.
book_sparse = csr_matrix(book_pivot)
book_sparse

<742x888 sparse matrix of type '<class 'numpy.float64'>'
	with 14940 stored elements in Compressed Sparse Row format>

In [67]:
from sklearn.neighbors import NearestNeighbors

In [68]:
# Brute-force nearest-neighbour search; no metric is passed, so scikit-learn's
# default distance metric is used.
model = NearestNeighbors(algorithm='brute')

In [69]:
# Fit on the sparse matrix: each row (one book) becomes a searchable point.
model.fit(book_sparse)

In [70]:
# Spot check: query the model with row 237 of the pivot, reshaped to a single
# (1, n_users) sample, and ask for its 5 nearest rows.
distance, suggestion = model.kneighbors(book_pivot.iloc[237,:].values.reshape(1,-1),n_neighbors=5)

In [71]:
distance

array([[ 0.        , 69.5413546 , 69.69935437, 72.74613392, 76.83098333]])

In [72]:
suggestion

array([[237, 238, 240, 241, 184]], dtype=int64)

In [73]:
# `suggestion` has one row of neighbour positions per query row; translate
# each row of positions back into titles via the pivot's index.
for i in range(len(suggestion)):
    print(book_pivot.index[suggestion[i]])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive'],
      dtype='object', name='Title')


In [74]:
book_pivot.index[3]

'4 Blondes'

In [75]:
# Second spot check: same query as before, this time for row 3 of the pivot.
distance, suggestion = model.kneighbors(book_pivot.iloc[3,:].values.reshape(1,-1),n_neighbors=5)

In [76]:
distance

array([[ 0.        , 22.93468988, 23.        , 24.08318916, 24.33105012]])

In [77]:
suggestion

array([[  3, 372, 393, 320, 184]], dtype=int64)

In [78]:
# Translate the neighbour positions for this query back into titles.
for i in range(len(suggestion)):
    print(book_pivot.index[suggestion[i]])

Index(['4 Blondes', 'No Safe Place', 'Pleading Guilty', 'Long After Midnight',
       'Exclusive'],
      dtype='object', name='Title')


In [79]:
# The pivot's index is the list of titles the model can recommend from.
books_name = book_pivot.index

In [80]:
books_name

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       '84 Charing Cross Road', 'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Cry In The Night',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Title', length=742)

In [81]:
import pickle

In [82]:
# Persist the fitted model and its lookup tables for reuse outside the notebook.
# Every file is opened in a `with` block so its handle is flushed and closed as
# soon as the dump finishes; the previous bare open(...) calls never closed
# their handles.
# NOTE: only load these files back from a trusted location - unpickling can
# execute arbitrary code.
with open('model.pkl', 'wb') as fh:
    pickle.dump(model, fh)
with open('books_name.pkl', 'wb') as fh:
    pickle.dump(books_name, fh)
with open('final_rating.pkl', 'wb') as fh:
    pickle.dump(final_rating, fh)
with open('book_pivot.pkl', 'wb') as fh:
    pickle.dump(book_pivot, fh)

In [84]:
def recommend_book(book_name, n_neighbors=5):
    """Print the titles closest to ``book_name`` according to the fitted model.

    The row of ``book_name`` is looked up in the notebook-level ``book_pivot``
    table and passed to the notebook-level ``model``; the titles of the
    returned neighbours are printed one per line, nearest first. The queried
    title normally appears as the first line, since its distance to itself
    is zero.

    Parameters
    ----------
    book_name : str
        Title exactly as it appears in ``book_pivot.index``.
    n_neighbors : int, default 5
        How many titles to print (the count includes the query itself).

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``book_pivot.index``.
    """
    matches = np.flatnonzero(book_pivot.index == book_name)
    if matches.size == 0:
        # Previously this surfaced as an opaque "index 0 is out of bounds";
        # keep the exception type but say which title was missing.
        raise IndexError(
            "Book title not found in book_pivot: {!r}".format(book_name)
        )
    book_id = matches[0]

    # kneighbors expects a 2-D array, so the row is reshaped to one sample.
    query = book_pivot.iloc[book_id, :].values.reshape(1, -1)
    _, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)

    # Exactly one sample was queried, so only the first row of `suggestion`
    # carries neighbour positions.
    for title in book_pivot.index[suggestion[0]]:
        print(title)

In [85]:
recommend_book('A Bend in the Road')

A Bend in the Road
Exclusive
No Safe Place
Family Album
Lake Wobegon days
