In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [39]:
# Load the book catalogue. The file is ';'-delimited and latin-1 encoded;
# lines the parser cannot handle are skipped instead of raising.
books = pd.read_csv(
    'data/BX-books.csv',
    sep=";",
    encoding='latin-1',
    on_bad_lines='skip',
    low_memory=False,
)
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [40]:
# (rows, columns) of the freshly loaded book catalogue.
books.shape

(271360, 8)

In [41]:
# Inspect the available column names.
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [42]:
# Keep only the columns used later on; of the three image URLs only the large one is kept.
# .copy() makes `books` an independent frame rather than a selection of the raw
# table, so modifying it afterwards cannot emit SettingWithCopyWarning.
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
               'Image-URL-L']].copy()
books.shape

(271360, 6)

In [43]:
# Rename the columns to short, lower-case names.
# The result is assigned back instead of using inplace=True: an in-place rename
# on a frame derived from a column selection can emit SettingWithCopyWarning.
books = books.rename(
    columns={
        "Book-Title": "title",
        "Book-Author": "author",
        "Year-Of-Publication": "year",
        "Publisher": "publisher",
        "Image-URL-L": "image",
    }
)

In [44]:
# Show the first row to check the column names.
books.head(1)

Unnamed: 0,ISBN,title,author,year,publisher,image
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...


In [45]:
# Load the user table with the same parsing options as the other BX files.
users = pd.read_csv(
    'data/BX-Users.csv',
    sep=";",
    encoding='latin-1',
    on_bad_lines='skip',
    low_memory=False,
)
users.head(3)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [46]:
# Load the ratings table: one row per (user, ISBN) rating.
ratings = pd.read_csv(
    'data/BX-Book-Ratings.csv',
    sep=";",
    encoding='latin-1',
    on_bad_lines='skip',
    low_memory=False,
)
ratings.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [47]:
# Print the (rows, columns) shape of each table.
# NOTE: this only reports sizes; it does not check for missing values.
for frame in (books, users, ratings):
    print(frame.shape)

(271360, 6)
(278858, 3)
(1149780, 3)


In [55]:
# Shorten the column names of the ratings and users tables.
# Assigning the result (instead of inplace=True) keeps the cell a plain
# "new frame from old frame" step and avoids in-place mutation.
ratings = ratings.rename(columns={"User-ID": "id", "Book-Rating": "rating"})
users = users.rename(columns={"User-ID": "id", "Location": "location", "Age": "age"})

In [64]:
# Flag the users who have more than 200 ratings; these are the users kept below.
# `x` is a boolean Series indexed by user id; the displayed shape is the number of such users.
x = ratings['id'].value_counts().gt(200)
x.loc[x].shape

(899,)

In [65]:
# Ids of the users flagged True in `x`.
y = x.loc[x].index
y

Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352, 110973,
       235105,
       ...
       260183,  73681,  44296, 155916,   9856, 274808,  28634,  59727, 268622,
       188951],
      dtype='int64', name='id', length=899)

In [71]:
# Restrict the ratings to rows whose user id is in `y`.
ratings = ratings.loc[ratings['id'].isin(y)]
ratings.shape

(526356, 3)

In [72]:
# Reminder of the book columns ahead of the merge on ISBN.
books.head(1)

Unnamed: 0,ISBN,title,author,year,publisher,image
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...


In [75]:
# Attach book metadata to every rating by joining on ISBN
# (default inner join: ratings whose ISBN is not in `books` are dropped).
ratings_with_books = pd.merge(ratings, books, on="ISBN")
ratings_with_books.head(1)

Unnamed: 0,id,ISBN,rating,title,author,year,publisher,image
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...


In [87]:
# Count the ratings per title, most-rated titles first.
num_rating = (
    ratings_with_books
    .groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

In [90]:
# The count column still carries the name "rating"; give it a name that says
# what it holds. Assigned back rather than renamed in place.
num_rating = num_rating.rename(columns={"rating": "num_of_rating"})

In [91]:
# The five most-rated titles.
num_rating.head()

Unnamed: 0,title,num_of_rating
0,Wild Animus,363
1,Bridget Jones's Diary,277
2,The Lovely Bones: A Novel,270
3,The Notebook,241
4,The Pelican Brief,236


In [93]:
# Add each title's rating count to every rating row.
final_rating = pd.merge(ratings_with_books, num_rating, on='title')

In [94]:
# Check that num_of_rating was appended as the last column.
final_rating.head(2)

Unnamed: 0,id,ISBN,rating,title,author,year,publisher,image,num_of_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...,7


In [97]:
# Keep only the rows of books that have at least 50 ratings.
mask = final_rating['num_of_rating'].ge(50)
final_rating = final_rating.loc[mask]

In [100]:
# Spot-check three random rows. random_state is fixed so that the displayed
# rows are the same on every Restart & Run All.
final_rating.sample(3, random_state=42)

Unnamed: 0,id,ISBN,rating,title,author,year,publisher,image,num_of_rating
318836,185233,515130389,0,Carolina Moon,Nora Roberts,2001,Jove Books,http://images.amazon.com/images/P/0515130389.0...,88
154868,89602,1573228214,0,High Fidelity,Nick Hornby,2000,Riverhead Books,http://images.amazon.com/images/P/1573228214.0...,62
92925,51883,425081818,0,The Talisman,Stephen King,1985,Berkley Publishing Group,http://images.amazon.com/images/P/0425081818.0...,84


In [102]:
# Size before duplicate removal.
final_rating.shape

(61853, 9)

In [101]:
# Remove duplicate ratings: keep one row per (user id, title) pair.

In [104]:
# Keep one row per (user id, title) pair; drop_duplicates keeps the first
# occurrence by default.
# The result is assigned back instead of using inplace=True: final_rating is
# the product of a boolean filter, and dropping rows in place on such a frame
# can emit SettingWithCopyWarning.
final_rating = final_rating.drop_duplicates(['id', 'title'])

In [105]:
# Size after duplicate removal.
final_rating.shape

(59850, 9)

In [107]:
# Build the title x user matrix of ratings: one row per title, one column per
# user id. Combinations with no rating come out as NaN.
book_pivot = final_rating.pivot_table(
    index='title',
    columns='id',
    values='rating',
)

In [108]:
# Display the pivot table (pandas truncates the rendering); NaN marks a missing rating.
book_pivot

id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 Blondes,,,,,,,,,,0.0,...,,,,,,,,,,
84 Charing Cross Road,,,,,,,,,,,...,,,,,,10.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,7.0,,...,,,,,,0.0,,,,
You Belong To Me,,,,,,,,,,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,,,0.0,...,,,,,,0.0,,,,
Zoya,,,,,,,,,,,...,,,,,,,,,,


In [109]:
# Replace the missing (title, user) ratings with 0 so the matrix is fully numeric.
# Assigned back rather than filled in place.
book_pivot = book_pivot.fillna(0)

In [110]:
# Confirm that the NaNs were replaced by 0.0.
book_pivot.head(2)

id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
# csr_matrix stores only the non-zero entries of the pivot table, which saves memory; the 0s that replaced missing ratings are still treated as the value 0 when distances are computed.
from scipy.sparse import csr_matrix

In [115]:
# Compressed sparse row copy of the pivot table's values.
book_sparse = csr_matrix(book_pivot.to_numpy())

In [116]:
from sklearn.neighbors import NearestNeighbors
# Brute-force nearest-neighbour search. No metric is passed, so scikit-learn's
# default distance metric is used.
model = NearestNeighbors(algorithm='brute')

In [118]:
# Fit on the sparse matrix: every row (one per title) becomes a point to search among.
model.fit(book_sparse)

In [122]:
# Example query: the six rows nearest to the title at position 237.
# The query row is part of the fitted data, so it is returned as well.
distance, suggestion = model.kneighbors(
    book_pivot.iloc[[237]].to_numpy(),
    n_neighbors=6,
)

In [123]:
# Distances from the query row to each returned neighbour.
distance

array([[ 0.        , 67.75691847, 68.05145112, 72.277244  , 75.81556568,
        76.30203143]])

In [124]:
# Row positions (into book_pivot) of the returned neighbours.
suggestion

array([[237, 238, 240, 241, 184, 536]], dtype=int64)

In [126]:
# Translate the neighbour positions back into book titles.
for neighbor_positions in suggestion:
    print(book_pivot.index[neighbor_positions])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='title')


In [127]:
# Title index of the pivot table; position i here is row i of book_pivot.
books_name = book_pivot.index

In [128]:
import os
import pickle

# Serialize the fitted model, the title index, the filtered ratings and the
# pivot table into Artifacts/.
# The directory is created first so the dumps do not fail when it is missing.
os.makedirs('Artifacts', exist_ok=True)

# Each file is opened in a `with` block so its handle is flushed and closed as
# soon as the dump finishes, instead of being left open.
for artifact_path, artifact in (
    ('Artifacts/model.pkl', model),
    ('Artifacts/books_name.pkl', books_name),
    ('Artifacts/final_rating.pkl', final_rating),
    ('Artifacts/book_pivot.pkl', book_pivot),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [129]:
def recommend_book(book_name, n_neighbors=6):
    """Print the titles whose rating vectors are closest to ``book_name``.

    The row of ``book_pivot`` for ``book_name`` is used as the query for
    ``model.kneighbors``. The query row is itself part of the fitted data, so
    it normally comes back as the first neighbour and is printed first.

    Parameters
    ----------
    book_name : str
        Title to look up; must match an entry of ``book_pivot.index`` exactly.
    n_neighbors : int, default 6
        Number of neighbours requested from the model.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``book_pivot.index``.
    """
    matches = np.where(book_pivot.index == book_name)[0]
    if matches.size == 0:
        # Same exception type as indexing the empty match array would raise,
        # but with a message that names the missing title.
        raise IndexError(f"Book title not found in book_pivot: {book_name!r}")
    book_id = matches[0]

    # kneighbors expects a 2-D array: a single query row of shape (1, n_users).
    query = book_pivot.iloc[book_id].values.reshape(1, -1)
    _, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)

    # One query row was passed, so `suggestion` holds exactly one row of positions.
    for title in book_pivot.index[suggestion[0]]:
        print(title)