In [1]:
import numpy as np
import pandas as pd

In [2]:
ratings = pd.read_csv('dataset/reviews/BX-Book-Ratings.csv', sep=';')
books = pd.read_csv('dataset/reviews/BX_Books.csv', sep=';')

books = books.drop(books.columns[2:], axis=1)
books.head()

Unnamed: 0,ISBN,Book-Title
0,195153448,Classical Mythology
1,2005018,Clara Callan
2,60973129,Decision in Normandy
3,374157065,Flu: The Story of the Great Influenza Pandemic...
4,393045218,The Mummies of Urumchi


In [3]:
df = pd.merge(ratings, books, on='ISBN')
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,276725,034545104X,0,Flesh Tones: A Novel
1,2313,034545104X,5,Flesh Tones: A Novel
2,6543,034545104X,0,Flesh Tones: A Novel
3,8680,034545104X,5,Flesh Tones: A Novel
4,10314,034545104X,9,Flesh Tones: A Novel


In [4]:
book_num_ratings = (
    df.groupby(by=["Book-Title"])["Book-Rating"]
    .count()
    .reset_index()
    .rename(columns={'Book-Rating': 'num_ratings'})
    [['Book-Title', 'num_ratings']]
)
book_num_ratings.head()

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [5]:
combined_df = df.merge(book_num_ratings, on='Book-Title', how='left')
combined_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,num_ratings
0,276725,034545104X,0,Flesh Tones: A Novel,60
1,2313,034545104X,5,Flesh Tones: A Novel,60
2,6543,034545104X,0,Flesh Tones: A Novel,60
3,8680,034545104X,5,Flesh Tones: A Novel,60
4,10314,034545104X,9,Flesh Tones: A Novel,60


In [6]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(book_num_ratings['num_ratings'].describe())

count   241090.000
mean         4.277
std         16.738
min          1.000
25%          1.000
50%          1.000
75%          3.000
max       2502.000
Name: num_ratings, dtype: float64


In [9]:
popularity_threshold = 100
rating_popular_books = combined_df[combined_df['num_ratings'] >= popularity_threshold]
rating_popular_books.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,num_ratings
62,276727,446520802,0,The Notebook,650
63,278418,446520802,0,The Notebook,650
64,638,446520802,0,The Notebook,650
65,3363,446520802,0,The Notebook,650
66,7158,446520802,10,The Notebook,650


In [10]:
rating_popular_books.shape

(183800, 5)

In [10]:
book_features_df = rating_popular_books.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating').fillna(0)
book_features_df.head()

User-ID,8,9,14,16,17,19,23,26,32,39,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from scipy.sparse import csr_matrix
book_features_df_matrix = csr_matrix(book_features_df.values)

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(book_features_df_matrix)

In [12]:
book_features_df.shape

(2444, 47994)

In [None]:
query_index = np.random.choice(book_features_df.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(book_features_df.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)

781


In [33]:
query_index = 2162
distances, indices = model_knn.kneighbors(book_features_df.iloc[query_index, :].values.reshape(1, -1), n_neighbors=21)

for i in range(len(distances.flatten())):
    if i == 0:
        print(f'Recommendations for {book_features_df.index[query_index]}:\n')
    else:
        print(f'{i}: {book_features_df.index[indices.flatten()[i]]}, with distance of {distances.flatten()[i]}')

Recommendations for The Stand: Complete and Uncut:

1: Pet Sematary, with distance of 0.8529169272065685
2: Four Past Midnight, with distance of 0.8548820532202399
3: The Dark Half, with distance of 0.8568049250978269
4: Cujo, with distance of 0.8700173010308165
5: The Dead Zone, with distance of 0.8797015595811015
6: Insomnia, with distance of 0.8863191548177562
7: Different Seasons (Signet), with distance of 0.8914687474523878
8: Misery, with distance of 0.8925939973488407
9: Carrie, with distance of 0.8959366148922875
10: Lovers, with distance of 0.8971801173531649
11: The Talisman, with distance of 0.8972092859680657
12: The Drawing of the Three (The Dark Tower, Book 2), with distance of 0.9015128457312231
13: Wizard and Glass (The Dark Tower, Book 4), with distance of 0.9057267989135563
14: Desperation, with distance of 0.906179325426637
15: Nightmares & Dreamscapes, with distance of 0.90892774255048
16: Gerald's Game, with distance of 0.9107243084975871
17: Black House, with dist

In [None]:
for i in range(len(distances.flatten())):
    if i == 0:
        print(f'Recommendations for {book_features_df.index[query_index]}:\n')
    else:
        print(f'{i}: {book_features_df.index[indices.flatten()[i]]}, with distance of {distances.flatten()[i]}')

Recommendations for Hamlet:

1: Macbeth, with distance of 0.7726397974758158
2: The Secret Garden, with distance of 0.8694229413235309
3: The Love Letter, with distance of 0.883535062636157
4: The Odyssey, with distance of 0.8838949354321393
5: Poland, with distance of 0.893358889503112
