Dataset Source: https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the three Kaggle CSVs (book metadata, user profiles, user->book ratings).
books, users, ratings = map(pd.read_csv, ('Books.csv', 'Users.csv', 'Ratings.csv'))

In [6]:
# Report (rows, columns) for each of the freshly loaded frames.
for label, frame in (('Books', books), ('Users', users), ('Ratings', ratings)):
    print(f'{label} shape: ', frame.shape)

Books shape:  (55042, 8)
Users shape:  (278858, 3)
Ratings shape:  (805641, 3)


## Data observation

In [7]:
# Number of missing values in each column of the book metadata.
books.isna().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    1
Publisher              1
Image-URL-S            1
Image-URL-M            1
Image-URL-L            1
dtype: int64

In [10]:
# Count rows of `books` that are exact duplicates of an earlier row
# (all columns equal); 0 means every row is unique.
books.duplicated().sum()

0

In [13]:
# Attach book metadata to every rating. The default inner join on ISBN drops
# ratings whose ISBN has no matching row in `books`.
ratings_with_name = pd.merge(ratings, books, on='ISBN')

In [17]:
# Number of ratings each title received (one row per Book-Title).
num_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index()
    .rename(columns={'Book-Rating': 'num_ratings'})
)
num_rating_df

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Beyond IBM: Leadership Marketing and Finance ...,1
2,Earth Prayers From around the World: 365 Pray...,5
3,Final Fantasy Anthology: Official Strategy Gu...,4
4,It Takes Two,2
...,...,...
49855,Â¿QuiÃ©n se ha llevado mi queso?,1
49856,"Â¿QuÃ© me quieres, amor?",6
49857,Ã?Â?ber die Freiheit.,1
49858,Ã?Â?bermorgen.,1


In [18]:
# Mean rating per title (one row per Book-Title).
# The rating column is selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the string columns (author,
# publisher, image URLs, ...), which emits a FutureWarning on pandas 1.x and
# raises TypeError on pandas 2.x.
avg_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index()
    .rename(columns={'Book-Rating': 'avg_rating'})
)
avg_rating_df

  avg_rating_df = ratings_with_name.groupby('Book-Title').mean()['Book-Rating'].reset_index()


Unnamed: 0,Book-Title,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,2.25
1,Beyond IBM: Leadership Marketing and Finance ...,0.00
2,Earth Prayers From around the World: 365 Pray...,6.60
3,Final Fantasy Anthology: Official Strategy Gu...,5.00
4,It Takes Two,0.00
...,...,...
49855,Â¿QuiÃ©n se ha llevado mi queso?,7.00
49856,"Â¿QuÃ© me quieres, amor?",2.00
49857,Ã?Â?ber die Freiheit.,7.00
49858,Ã?Â?bermorgen.,0.00


In [29]:
# Combine the per-title rating count and average rating into one frame.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='Book-Title')
popular_df.shape

(49860, 3)

In [30]:
# Keep only titles with at least 250 ratings, best average rating first.
has_enough_ratings = popular_df['num_ratings'] >= 250
popular_df = popular_df.loc[has_enough_ratings].sort_values(by='avg_rating', ascending=False)
popular_df.shape

(80, 3)

In [31]:
# Add author and cover image to each popular title.
#
# The left side is projected to its own three columns first so that this cell
# can be re-run safely: without the projection, a second execution would merge
# a frame that already carries 'Book-Author' / 'Image-URL-M', pandas would
# suffix them with _x / _y, and the final column selection would raise KeyError.
#
# `books` is de-duplicated on title *before* the merge (keeping the first
# edition of each title, as before), so the join stays one row per title
# instead of expanding to every edition and being trimmed afterwards.
popular_df = (
    popular_df[['Book-Title', 'num_ratings', 'avg_rating']]
    .merge(
        books.drop_duplicates('Book-Title')[['Book-Title', 'Book-Author', 'Image-URL-M']],
        on='Book-Title',
    )
    [['Book-Title', 'Book-Author', 'Image-URL-M', 'num_ratings', 'avg_rating']]
)
popular_df.shape

(80, 5)

In [40]:
# Restrict to "frequent" raters: users with more than 200 ratings on record.
ratings_per_user = ratings_with_name.groupby('User-ID')['Book-Rating'].count()
x = ratings_per_user.gt(200)          # boolean mask indexed by User-ID
frequent_users = x.loc[x].index       # the User-IDs where the mask is True
by_frequent_user = ratings_with_name['User-ID'].isin(frequent_users)
filtered_rating = ratings_with_name.loc[by_frequent_user]
filtered_rating.shape

(147181, 10)

In [46]:
# Among the frequent raters' ratings, keep titles rated at least 50 times.
ratings_per_title = filtered_rating.groupby('Book-Title')['Book-Rating'].count()
y = ratings_per_title.ge(50)          # boolean mask indexed by Book-Title
famous_books = y.loc[y].index         # the titles where the mask is True
is_famous = filtered_rating['Book-Title'].isin(famous_books)
final_ratings = filtered_rating.loc[is_famous]

In [76]:
# Book x user rating matrix: one row per title, one column per user.
# pivot_table averages duplicate (title, user) pairs by default; cells with
# no rating are filled with 0.
pt = (
    final_ratings
    .pivot_table(index="Book-Title", columns='User-ID', values='Book-Rating')
    .fillna(0)
)
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,189334,189835,190708,190925,193499,194600,194669,195694,277427,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Is for Alibi (Kinsey Millhone Mysteries (Paperback)),0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
A Map of the World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Whispers,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
White Oleander : A Novel,0.0,0.0,0.0,7.0,0.0,0.0,0.0,8.0,0.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
White Oleander : A Novel (Oprah's Book Club),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wicked: The Life and Times of the Wicked Witch of the West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
from sklearn.metrics.pairwise import cosine_similarity

Cosine similarity of two vectors $A$ and $B$: $\cos\theta = \dfrac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$
![image](https://www.machinelearningplus.com/wp-content/uploads/2018/10/Cosine-Similarity-Formula-1.png)



In [75]:
# Pairwise cosine similarity between the rows of `pt` (one row per book title),
# giving a square matrix where entry [i, j] compares book i with book j.
# Row/column order follows `pt.index`.
similarity_score = cosine_similarity(pt)
similarity_score

array([[1.        , 0.22143596, 0.1699959 , ..., 0.12691682, 0.13790727,
        0.07661507],
       [0.22143596, 1.        , 0.12204727, ..., 0.        , 0.10507763,
        0.07079691],
       [0.1699959 , 0.12204727, 1.        , ..., 0.06634019, 0.12574219,
        0.13730456],
       ...,
       [0.12691682, 0.        , 0.06634019, ..., 1.        , 0.13416218,
        0.        ],
       [0.13790727, 0.10507763, 0.12574219, ..., 0.13416218, 1.        ,
        0.16197767],
       [0.07661507, 0.07079691, 0.13730456, ..., 0.        , 0.16197767,
        1.        ]])

In [59]:
def recommend(book_name, n=5):
    """Return the titles most similar to ``book_name``.

    Similarity is read from the module-level ``similarity_score`` matrix,
    whose rows and columns are ordered like ``pt.index``.

    Parameters
    ----------
    book_name : str
        Exact title as it appears in ``pt.index``.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``n`` titles, most similar first. The queried title is never
        included. Ties keep the order of ``pt.index``.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pt.index``.
    """
    matches = np.where(pt.index == book_name)[0]
    if len(matches) == 0:
        raise IndexError(
            f"{book_name!r} is not among the titles in the similarity matrix"
        )
    index = matches[0]
    scores = similarity_score[index]

    # Exclude the queried book by position instead of assuming it sorts first:
    # when another book ties with it (identical rating vectors, or an all-zero
    # row whose self-similarity is 0), slicing off the first sorted element
    # would drop the wrong book and recommend the query itself.
    candidates = [i for i in range(len(scores)) if i != index]
    # list.sort is stable, also with reverse=True, so ties keep index order.
    candidates.sort(key=lambda i: scores[i], reverse=True)
    return [pt.index[i] for i in candidates[:max(n, 0)]]

In [64]:
# Example: the titles whose rating patterns are closest to this book's.
recommend('1st to Die: A Novel')

['One for the Money (Stephanie Plum Novels (Paperback))',
 'Three To Get Deadly : A Stephanie Plum Novel (A Stephanie Plum Novel)',
 'Along Came a Spider (Alex Cross Novels)',
 'High Five (A Stephanie Plum Novel)',
 'Violets Are Blue']