In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
books = pd.read_csv('data/BX-Books.csv', sep=";", on_bad_lines='skip', encoding='latin-1', low_memory=False)

In [3]:
books.head(1)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...


In [4]:
books.shape

(271360, 8)

In [5]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [6]:
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-L']]

In [7]:
books.head(1)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...


In [8]:
books.rename(columns={
    "Book-Title": "title",
    "Book-Author": "author",
    "Year-Of-Publication": "year",
    "Publisher": "publisher",
    "Image-URL-L": "image_url"
}, inplace = True)

In [9]:
books.head(1)

Unnamed: 0,ISBN,title,author,year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...


In [10]:
users = pd.read_csv('data/BX-Users.csv', sep=";", on_bad_lines='skip', encoding='latin-1', low_memory=False)

In [11]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [12]:
users.shape

(278858, 3)

In [13]:
ratings = pd.read_csv('data/BX-Book-Ratings.csv', sep=";", on_bad_lines='skip', encoding='latin-1', low_memory=False)

In [14]:
ratings.head(1)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0


In [15]:
ratings.shape

(1149780, 3)

In [16]:
print(books.shape)
print(users.shape)
print(ratings.shape)

(271360, 6)
(278858, 3)
(1149780, 3)


In [17]:
ratings.rename(columns={
    "User-ID": "user_id",
    "Book-Rating": "rating",
}, inplace = True)

In [18]:
ratings.head(1)

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0


In [19]:
ratings['user_id'].value_counts()

user_id
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
119573        1
276706        1
276697        1
276679        1
276676        1
Name: count, Length: 105283, dtype: int64

In [20]:
ratings['user_id'].unique().shape

(105283,)

In [21]:
x = ratings['user_id'].value_counts() > 200

In [22]:
x[x].shape

(899,)

In [23]:
 y = x[x].index

In [24]:
y

Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352, 110973,
       235105,
       ...
       116122,  44296,  28634,  59727,  73681, 274808, 188951,   9856, 155916,
       268622],
      dtype='int64', name='user_id', length=899)

In [25]:
ratings = ratings[ratings['user_id'].isin(y)]

In [26]:
ratings.head()

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [27]:
ratings.shape

(526356, 3)

In [28]:
books.head(1)

Unnamed: 0,ISBN,title,author,year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...


In [29]:
rating_with_books = ratings.merge(books, on = 'ISBN')

In [30]:
rating_with_books.head(1)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...


In [31]:
rating_with_books.shape

(487671, 8)

In [32]:
num_rating = rating_with_books.groupby('title')['rating'].count().reset_index()

In [33]:
num_rating.head(1)

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,2


In [34]:
final_rating = rating_with_books.merge(num_rating, on = 'title')

In [35]:
final_rating.head(1)

Unnamed: 0,user_id,ISBN,rating_x,title,author,year,publisher,image_url,rating_y
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82


In [36]:
final_rating.shape

(487671, 9)

In [37]:
final_rating = final_rating[final_rating['num_of_rating']>=50]

KeyError: 'num_of_rating'

In [None]:
final_rating.sample(1)

In [None]:
final_rating.shape

In [None]:
final_rating.drop_duplicates(['user_id', 'title'], inplace = True)

In [None]:
final_rating.shape

In [None]:
book_pivot = final_rating.pivot_table(columns = 'user_id', index = 'title', values = 'rating')

In [None]:
book_pivot

In [None]:
book_pivot.shape

In [None]:
book_pivot.fillna(0, inplace = True)

In [None]:
book_pivot

In [None]:
from scipy.sparse import csr_matrix

In [None]:
book_sparse = csr_matrix(book_pivot)

In [None]:
book_sparse

In [None]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm = 'brute')

In [None]:
model.fit(book_sparse)

In [None]:
distance, suggestion = model.kneighbors(book_pivot.iloc[237, :].values.reshape(1, -1), n_neighbors = 6)

In [None]:
distance

In [None]:
suggestion

In [None]:
for i in range(len(suggestion)):
    print(book_pivot.index[suggestion[i]])
book_pivot.index[237]
book_pivot.index


In [None]:
import pickle
pickle.dump(model, open('artifacts/model.pkl', 'wb'))
pickle.dump(books_name, open('artifacts/books_name.pkl', 'wb'))
pickle.dump(final_rating, open('artifacts/final_rating.pkl', 'wb'))
pickle.dump(book_pivot, open('artifacts/book_pivot.pkl', 'wb'))


In [None]:
def recommend_book(book_name):
    book_id = np.where(book_pivot.index == book_name)[0][0]
    distance, suggestion = model.kneighbors(book_pivot.iloc[book_id, :].values.reshape(1, -1), n_neighbors=6)

    for i in range(len(suggestion)):
        books = book_pivot.index[suggestion[i]]
        for j in books:
            print(j)
