In [1]:
import pymongo
import json
import os
import pandas as pd
import sys
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle

In [2]:
# Import the connection credentials from the `secret` module.
# '../' is put on sys.path only for the duration of the import (presumably
# because secret.py sits in the parent directory) and removed right after.
sys.path.append('../')
from secret import credentials
sys.path.remove('../')

In [3]:
## mongo API connection
# Build the credentials object and read its `connect_string` attribute, which
# is handed to MongoClient as the connection URI.
sec = credentials()
mongo_uri = sec.connect_string

# Client handle used by the following cells to reach the databases.
myclient = pymongo.MongoClient(mongo_uri)

In [4]:
# Sanity check: print the names of the databases visible to this client.
print(myclient.list_database_names())

['bookdb', 'admin', 'local']


In [5]:
# Open the 'bookdb' database and take one handle per collection used below.
mydb = myclient['bookdb']
col_books, col_ratings, col_users = (
    mydb[collection_name] for collection_name in ('books', 'ratings', 'users')
)

In [6]:
def _collection_to_frame(collection):
    """Return every document of ``collection`` as the rows of a DataFrame.

    ``collection`` only needs a ``find`` method that accepts a filter dict;
    the empty filter ``{}`` is passed, so no document is filtered out.
    """
    return pd.DataFrame(list(collection.find({})))


# Load the three collections in full. Going through the helper keeps the raw
# document lists local, so no large temporary list is left behind in the
# notebook namespace after this cell has run.
books = _collection_to_frame(col_books)
ratings = _collection_to_frame(col_ratings)
users = _collection_to_frame(col_users)

In [7]:
# Drop the '_id' column from each frame. The result is assigned back and
# errors='ignore' is used so that this cell can be re-run safely: a second
# execution no longer raises KeyError once '_id' has already been removed.
books = books.drop(columns='_id', errors='ignore')
ratings = ratings.drop(columns='_id', errors='ignore')
users = users.drop(columns='_id', errors='ignore')

## Data Preprocessing 

In [9]:
# Inspect the raw column names of the books frame before selecting/renaming.
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [10]:
# Keep only the columns needed downstream (the small and medium image URLs are
# dropped). The explicit .copy() makes `books` an independent frame, so later
# in-place operations on it do not trigger pandas' SettingWithCopyWarning.
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-L']].copy()

In [11]:
# Rename the verbose source columns to short names. The renamed frame is
# assigned back instead of using inplace=True: `books` is a column subset at
# this point, and mutating such a subset in place triggers pandas'
# SettingWithCopyWarning.
books = books.rename(columns={
    'Book-Title': 'title',
    'Book-Author': 'author',
    'Year-Of-Publication': 'year',
    'Image-URL-L': 'Image_URL'
})

In [12]:
# Preview the first rows of the books frame after selection and renaming.
books.head()

Unnamed: 0,ISBN,title,author,year,Publisher,Image_URL
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [13]:
# Give the ratings columns short snake_case names ('ISBN' is left as is).
ratings = ratings.rename(
    columns={'User-ID': 'user_id', 'Book-Rating': 'rating'},
)

In [14]:
# Preview the first rows of the ratings frame after renaming.
ratings.head()

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [15]:
# Align the users key column with the name used in the ratings frame.
users = users.rename(columns={'User-ID': 'user_id'})

In [16]:
# Preview the first rows of the users frame after renaming.
users.head()

Unnamed: 0,user_id,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


## Data Cleaning 

In [17]:
# Number of rating rows per user_id, most active users first.
ratings['user_id'].value_counts()

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: user_id, Length: 105283, dtype: int64

In [18]:
# Keep only active users: build a boolean filter, indexed by user_id, that is
# True for users who have more than 200 rating rows.

flit = ratings['user_id'].value_counts().gt(200)

In [19]:
# user_ids for which the filter is True.
flit_index = flit.index[flit.values]

In [20]:
# Restrict the ratings to rows written by the selected active users.
ratings = ratings.loc[ratings['user_id'].isin(flit_index)]

In [21]:
# Ratings that remain after filtering down to the active users.
ratings

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0
...,...,...,...
1147612,275970,3829021860,0
1147613,275970,4770019572,0
1147614,275970,896086097,0
1147615,275970,9626340762,8


In [22]:
# Attach book metadata to every rating; the inner join on ISBN discards
# ratings whose ISBN has no match in the books frame.
ratings_with_books = pd.merge(ratings, books, how='inner', on='ISBN')

In [23]:
# Inspect the merged ratings + book metadata frame.
ratings_with_books

Unnamed: 0,user_id,ISBN,rating,title,author,year,Publisher,Image_URL
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
...,...,...,...,...,...,...,...,...
487666,275970,1892145022,0,Here Is New York,E. B. White,1999,Little Bookroom,http://images.amazon.com/images/P/1892145022.0...
487667,275970,1931868123,0,There's a Porcupine in My Outhouse: Misadventu...,Mike Tougias,2002,Capital Books (VA),http://images.amazon.com/images/P/1931868123.0...
487668,275970,3411086211,10,Die Biene.,Sybil GrÃ?Â¤fin SchÃ?Â¶nfeldt,1993,"Bibliographisches Institut, Mannheim",http://images.amazon.com/images/P/3411086211.0...
487669,275970,3829021860,0,The Penis Book,Joseph Cohen,1999,Konemann,http://images.amazon.com/images/P/3829021860.0...


In [24]:
# Count the rating rows per title; as_index=False keeps 'title' as a regular
# column next to the count (still named 'rating' at this point).
raing_countBy_books = ratings_with_books.groupby('title', as_index=False)['rating'].count()


In [25]:
# The 'rating' column holds a count here, so name it accordingly.
raing_countBy_books = raing_countBy_books.rename(columns={'rating': 'num_of_rating'})

In [26]:
# Number of ratings per book title (first rows).
raing_countBy_books.head()

Unnamed: 0,title,num_of_rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [27]:
# Add the per-title rating count to every individual rating row.
final_rating = pd.merge(raing_countBy_books, ratings_with_books, on='title')

In [28]:
# Rating rows enriched with the per-title rating count (first rows).
final_rating.head()

Unnamed: 0,title,num_of_rating,user_id,ISBN,rating,author,year,Publisher,Image_URL
0,A Light in the Storm: The Civil War Diary of ...,2,35859,590567330,0,Karen Hesse,1999,Hyperion Books for Children,http://images.amazon.com/images/P/0590567330.0...
1,A Light in the Storm: The Civil War Diary of ...,2,96448,590567330,9,Karen Hesse,1999,Hyperion Books for Children,http://images.amazon.com/images/P/0590567330.0...
2,Always Have Popsicles,1,172742,964147726,0,Rebecca Harvin,1994,Rebecca L. Harvin,http://images.amazon.com/images/P/0964147726.0...
3,Apple Magic (The Collector's series),1,198711,942320093,0,Martina Boudreau,1984,Amer Cooking Guild,http://images.amazon.com/images/P/0942320093.0...
4,Beyond IBM: Leadership Marketing and Finance ...,1,11601,962295701,0,Lou Mobley,1989,"Teleonet, Incorporated",http://images.amazon.com/images/P/0962295701.0...


In [29]:
# Filter the ratings: keep only titles that received more than 30 ratings.
final_rating = final_rating.loc[final_rating['num_of_rating'].gt(30)]

In [30]:
# (rows, columns) left after the popularity filter.
final_rating.shape

(96548, 9)

In [31]:
# Keep a single row per (title, user_id) pair (the first occurrence wins).
# The result is assigned back instead of using inplace=True because
# final_rating is a filtered slice at this point, and mutating a slice in
# place triggers pandas' SettingWithCopyWarning.
final_rating = final_rating.drop_duplicates(subset=['title', 'user_id'])

In [32]:
# Final ratings frame after cleaning (popularity filter + de-duplication).
final_rating

Unnamed: 0,title,num_of_rating,user_id,ISBN,rating,author,year,Publisher,Image_URL
290,10 Lb. Penalty,35,39281,0515123471,0,Dick Francis,1998,Jove Books,http://images.amazon.com/images/P/0515123471.0...
291,10 Lb. Penalty,35,49109,0515123471,5,Dick Francis,1998,Jove Books,http://images.amazon.com/images/P/0515123471.0...
292,10 Lb. Penalty,35,69405,0515123471,0,Dick Francis,1998,Jove Books,http://images.amazon.com/images/P/0515123471.0...
293,10 Lb. Penalty,35,76352,0515123471,0,Dick Francis,1998,Jove Books,http://images.amazon.com/images/P/0515123471.0...
294,10 Lb. Penalty,35,83287,0515123471,10,Dick Francis,1998,Jove Books,http://images.amazon.com/images/P/0515123471.0...
...,...,...,...,...,...,...,...,...,...
487432,"\O\"" Is for Outlaw""",105,234359,0805059555,0,Sue Grafton,1999,Henry Holt &amp; Company,http://images.amazon.com/images/P/0805059555.0...
487433,"\O\"" Is for Outlaw""",105,234623,0805059555,0,Sue Grafton,1999,Henry Holt &amp; Company,http://images.amazon.com/images/P/0805059555.0...
487434,"\O\"" Is for Outlaw""",105,236283,0805059555,10,Sue Grafton,1999,Henry Holt &amp; Company,http://images.amazon.com/images/P/0805059555.0...
487435,"\O\"" Is for Outlaw""",105,238120,0805059555,0,Sue Grafton,1999,Henry Holt &amp; Company,http://images.amazon.com/images/P/0805059555.0...


## Feature engineering

In [33]:
# Build the title x user_id rating matrix; (title, user) pairs without a
# rating come out as NaN.
pivot = pd.pivot_table(final_rating, values='rating', index='title', columns='user_id')

In [34]:
# Treat missing ratings as 0.
pivot = pivot.fillna(0)

In [35]:
# Inspect the filled title x user_id matrix.
pivot

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# The matrix is mostly zeros, so store it in Compressed Sparse Row form.
book_sparse = csr_matrix(pivot.values)

In [37]:
# Show the shape, dtype and number of stored elements of the sparse matrix.
book_sparse

<1652x893 sparse matrix of type '<class 'numpy.float64'>'
	with 22543 stored elements in Compressed Sparse Row format>

## Data Modeling 

In [38]:
# Brute-force nearest-neighbour search. No metric is passed, so scikit-learn's
# default distance applies (Minkowski with p=2, i.e. Euclidean, per the
# scikit-learn docs - confirm for the installed version).
model = NearestNeighbors(algorithm='brute')

In [39]:
# Fit on the sparse matrix: one sample per title, one feature per user.
model.fit(book_sparse)

NearestNeighbors(algorithm='brute')

In [40]:
# Find the 6 closest neighbours of the title at row position 1. The row is
# part of the fitted data, so the title itself is expected among the results.
distance, suggestion = model.kneighbors(pivot.iloc[[1]].values, n_neighbors=6)

In [41]:
# Distances to the 6 neighbours, in the order returned by kneighbors.
distance

array([[ 0.        , 13.85640646, 15.45962483, 15.90597372, 16.18641406,
        16.2788206 ]])

In [42]:
# Row positions (into pivot) of the 6 neighbours.
suggestion

array([[   1,  574, 1515, 1522, 1236, 1093]], dtype=int64)

In [43]:
# One query row, six neighbours.
suggestion.shape

(1, 6)

In [44]:
# Titles of the 6 neighbours found for the queried row: the queried book
# itself plus the 5 books recommended for it.
for neighbor_position in suggestion[0]:
    print(pivot.index[neighbor_position])

16 Lighthouse Road
Hurricane Bay
This Matter Of Marriage
Three Weeks in Paris
The Experiment
Tapestry


In [45]:
# Title labels, in the same order as the rows of pivot.
books_name = pivot.index

In [46]:
# All book titles that are available for recommendation.
books_name

Index(['10 Lb. Penalty', '16 Lighthouse Road', '1984', '1st to Die: A Novel',
       '2010: Odyssey Two', '204 Rosewood Lane', '24 Hours', '2nd Chance',
       '3rd Degree', '4 Blondes',
       ...
       'Word Freak: Heartbreak, Triumph, Genius, and Obsession in the World of Competitive Scrabble Players',
       'World of Pies : A Novel', 'Writ of Execution', 'Wuthering Heights',
       'Wuthering Heights (Penguin Classics)', 'Year of Wonders',
       'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='title', length=1652)

In [47]:
# create the recommendation system

def recommend_book(book_name, n_neighbors=6):
    """Print the titles whose rating vectors are closest to ``book_name``.

    The row of ``pivot`` labelled ``book_name`` is used as the query for the
    fitted ``model``; the titles of the returned neighbours are printed one
    per line, in the order ``model.kneighbors`` returns them.

    Parameters
    ----------
    book_name : str
        Title exactly as it appears in ``pivot.index``.
    n_neighbors : int, default 6
        Number of titles to print. The queried row belongs to the fitted
        data, so the queried title normally counts towards this number.
        The value is capped at the number of rows in ``pivot``.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pivot.index``.
    """
    matches = np.flatnonzero(pivot.index == book_name)
    if matches.size == 0:
        # Same exception type as the bare [0][0] lookup used to raise, but
        # with a message that names the missing title.
        raise IndexError('book {!r} not found in pivot.index'.format(book_name))
    book_id = matches[0]

    # Never ask for more neighbours than there are fitted rows.
    k = min(n_neighbors, len(pivot.index))
    query = pivot.iloc[book_id].values.reshape(1, -1)
    _, suggestion = model.kneighbors(query, n_neighbors=k)

    for neighbor_id in suggestion[0]:
        print(pivot.index[neighbor_id])


In [48]:
# Smoke test: print the neighbours of a title known to be in pivot.index.
recommend_book('You Belong To Me')

You Belong To Me
The Cradle Will Fall
Exclusive
Loves Music, Loves to Dance
The Anastasia Syndrome
Gates of Paradise


In [61]:
# Export the title x user_id matrix as CSV, with the title index written as
# the first column. The target directory is created first so that the export
# does not fail when '../py_objects' does not exist yet.
os.makedirs('../py_objects', exist_ok=True)
pivot.to_csv('../py_objects/pivot.csv', index=True)

In [66]:
# Export the cleaned ratings frame as CSV, including its row index. The target
# directory is created first so that the export does not fail when
# '../py_objects' does not exist yet.
os.makedirs('../py_objects', exist_ok=True)
final_rating.to_csv('../py_objects/final_rating.csv', index=True)

In [152]:
# Save the fitted model and the supporting Python objects as pickles.
# The output directory is created first so that this cell also works when
# '../py_objects' does not exist yet.
os.makedirs('../py_objects', exist_ok=True)

# One '<name>.pkl' file per object, written in this order.
for pkl_name, pkl_object in (
    ('model', model),
    ('books_name', books_name),
    ('final_rating', final_rating),
    ('pivot', pivot),
):
    with open('../py_objects/' + pkl_name + '.pkl', 'wb') as file:
        pickle.dump(pkl_object, file)