In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

books = pd.read_csv('datasets/book datasets/books.csv', sep = ';', error_bad_lines = False, encoding = 'latin1',
                   usecols = ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher'])
# books.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis = 1, inplace = True)
ratings = pd.read_csv('datasets/book datasets/ratings.csv', sep = ';', error_bad_lines = False, encoding = 'latin1')
users = pd.read_csv('datasets/book datasets/users.csv', sep = ';', error_bad_lines = False, encoding = 'latin1')

In [2]:
pd.merge(books, ratings, on = 'ISBN', how = 'inner')

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,41385,0
...,...,...,...,...,...,...,...
1031170,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463,7
1031171,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,276579,4
1031172,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,276680,0
1031173,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,276680,0


In [3]:
books.shape, ratings.shape, users.shape

((271379, 5), (1149780, 3), (278858, 3))

In [4]:
# books.head()

In [5]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [6]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [7]:
merged = pd.merge(books, ratings, how = 'inner', on = 'ISBN')
merged = pd.merge(merged, users, on = 'User-ID')
merged['Age'].fillna(merged['Age'].mean(), inplace = True) # fill NaN 'Age' entries with age mean
vc = merged['ISBN'].value_counts()
top_books = vc[vc >= 100].keys().to_list() # filter out books obscure/less-read books
merged = merged[merged['ISBN'].isin(top_books)]
merged.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Location,Age
19,786868716,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,11400,9,"ottawa, ontario, canada",49.0
20,151008116,Life of Pi,Yann Martel,2002,Harcourt,11400,6,"ottawa, ontario, canada",49.0
21,671021001,She's Come Undone (Oprah's Book Club),Wally Lamb,1998,Pocket,11400,0,"ottawa, ontario, canada",49.0
22,312195516,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA,11400,7,"ottawa, ontario, canada",49.0
23,446364193,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books,11400,0,"ottawa, ontario, canada",49.0


In [10]:
test = merged.pivot_table(index = 'Book-Title', columns = 'User-ID', 
                          values = 'Book-Rating', aggfunc = 'sum').fillna(0)
test.head()

User-ID,9,14,16,26,39,42,44,51,67,75,...,278800,278807,278813,278819,278828,278832,278836,278843,278844,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# we can see that our pivot table is mostly 0 entries, so let's convert this into a sparse matrix for more efficient
# computation and to save memory

from scipy.sparse import csr_matrix # compressed sparse row matrix

test_sparse = csr_matrix(test.values)

In [13]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn.fit(test_sparse)

NearestNeighbors(algorithm='brute', metric='cosine')

In [15]:
initial_book = np.random.choice(test.shape[0])
distances, indices = knn.kneighbors(test.iloc[initial_book].values.reshape(1, -1), n_neighbors = 10)
# initial_ISBN = test.iloc[1343].name
# liked_book = merged[merged['ISBN'] == initial_ISBN]['Book-Title'].value_counts().index[0]

In [16]:
count = 1
print('Other recommendations if you enjoyed {}\n'.format(test.iloc[initial_book].name))
for recommendations in zip(indices.flatten().tolist(), distances.flatten().tolist()):
    print('{}. {}, with a distance of {}\n'.format(count, test.iloc[recommendations[0]].name, recommendations[1]))
    count += 1

Other recommendations if you enjoyed All Around the Town

1. All Around the Town, with a distance of 0.0

2. You Belong To Me, with a distance of 0.8364839487163842

3. I'll Be Seeing You, with a distance of 0.8390079817283184

4. Let Me Call You Sweetheart, with a distance of 0.8531018367907175

5. Moonlight Becomes You, with a distance of 0.8553121590491972

6. Remember Me, with a distance of 0.8571648165160501

7. Hide &amp; Seek, with a distance of 0.8651652093866798

8. Before I Say Good-Bye, with a distance of 0.8667353084283282

9. The Lottery Winner : Alvirah And Willy Stories, with a distance of 0.8719381226004121

10. Loves Music, Loves to Dance, with a distance of 0.8721297550336378

