In [214]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from sklearn.neighbors import NearestNeighbors

In [76]:
books_df = pd.read_csv('Books.csv', on_bad_lines='skip')
users_df = pd.read_csv('Users.csv', on_bad_lines='skip')
ratings_df = pd.read_csv('Ratings.csv', on_bad_lines='skip')

print(books_df.shape)
print(users_df.shape)
print(ratings_df.shape)

  books_df = pd.read_csv('Books.csv', on_bad_lines='skip')


(271360, 8)
(278858, 3)
(1149780, 3)


In [77]:
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [78]:
users_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [79]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [80]:
print(books_df.columns)
print(users_df.columns)
print(ratings_df.columns)

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')
Index(['User-ID', 'Location', 'Age'], dtype='object')
Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')


In [81]:
books_df.rename(columns={
    'ISBN': 'isbn',
    'Book-Title':'title',
    'Book-Author':'author',
    'Year-Of-Publication':'year',
    'Publisher':'publisher',
}, inplace=True)

users_df.rename(columns={
    'User-ID':'userid',
    'User-Age':'age',
    'Location':'location',
}, inplace=True)

ratings_df.rename(columns={
    'User-ID':'userid',
    'ISBN':'isbn',
    'Book-Rating':'rating',
}, inplace=True)

print(books_df.columns)
print(users_df.columns)
print(ratings_df.columns)

Index(['isbn', 'title', 'author', 'year', 'publisher', 'Image-URL-S',
       'Image-URL-M', 'Image-URL-L'],
      dtype='object')
Index(['userid', 'location', 'Age'], dtype='object')
Index(['userid', 'isbn', 'rating'], dtype='object')


In [82]:
books_df = books_df[['isbn', 'title', 'author', 'year', 'publisher']]
books_df.head(2)

Unnamed: 0,isbn,title,author,year,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada


In [83]:
users_df.head(2)

Unnamed: 0,userid,location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [84]:
ratings_df.head(2)

Unnamed: 0,userid,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5


In [277]:
users_with_min_200_ratings = ratings_df['userid'].value_counts() >= 100
users_with_min_200_ratings = users_with_min_200_ratings[users_with_min_200_ratings].index
users_with_min_200_ratings.shape

(899,)

In [278]:
ratings_df = ratings_df[ratings_df['userid'].isin(users_with_min_200_ratings)]
ratings_df.shape

(526356, 3)

In [279]:
ratings_df.head()

Unnamed: 0,userid,isbn,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [280]:
ratings_with_books_df = ratings_df.merge(books_df, on='isbn')
ratings_with_books_df.head()

Unnamed: 0,userid,isbn,rating,title,author,year,publisher
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books


In [281]:
book_ratings = ratings_with_books_df.groupby('title')['rating'].count().reset_index()
book_ratings.rename(columns={
    'rating':'number_of_ratings'
}, inplace=True)
book_ratings.head()

Unnamed: 0,title,number_of_ratings
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [284]:
books_with_rating_count_df = ratings_with_books_df.merge(book_ratings, on='title')
books_with_rating_count_df.shape

(487671, 8)

In [285]:
books_with_rating_count_df = books_with_rating_count_df[books_with_rating_count_df['number_of_ratings'] >= 50]
books_with_rating_count_df.shape

(61853, 8)

In [286]:
books_with_rating_count_df.drop_duplicates(['userid', 'title'], inplace=True)
books_with_rating_count_df.shape

(59850, 8)

In [287]:
books_pivot = books_with_rating_count_df.pivot_table(columns='userid', index='title', values='rating')
books_pivot.shape

(742, 888)

In [288]:
books_pivot.fillna(0, inplace=True)
books_pivot

userid,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [289]:
books_sparse = csr_matrix(books_pivot)
type(books_sparse)

scipy.sparse._csr.csr_matrix

In [290]:
model = NearestNeighbors(algorithm='brute')

In [291]:
model.fit(books_sparse) 

In [339]:
def get_book_recommendations(book_name):
    if(book_name == ''):
        print('provide a book name')
        return
    
    book_id = np.where(books_pivot.index.str.lower().str.contains(book_name.lower()))[0][0]
    distances, suggestions = model.kneighbors(books_pivot.iloc[book_id, :].values.reshape(1, -1), n_neighbors=6)
    for i in range(len(suggestions)):
        print(books_pivot.index[suggestions[i]])

In [340]:
get_book_recommendations('game')

Index(['Ender's Game (Ender Wiggins Saga (Paperback))', 'Exclusive',
       'Hearts in Atlantis', 'The First Counsel', 'Absolute Power',
       'Foucault's Pendulum'],
      dtype='object', name='title')
