In [2]:
!pip install opendatasets -q

In [3]:
import opendatasets as od

dataset_url = "https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset"
od.download(dataset_url)

Downloading book-recommendation-dataset.zip to ./book-recommendation-dataset


100%|██████████| 23.8M/23.8M [00:01<00:00, 13.8MB/s]





In [4]:
# importing the dependancies
import numpy as np 
import pandas as pd


In [5]:
# reading the csv file 
books = pd.read_csv('book-recommendation-dataset/Books.csv')
users = pd.read_csv('book-recommendation-dataset/Users.csv')
ratings = pd.read_csv('book-recommendation-dataset/Ratings.csv')


  books = pd.read_csv('book-recommendation-dataset/Books.csv')


In [6]:
# checking the dataset 
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [7]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [8]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [9]:
# shape of the dataset 
print(f"Shape of books dataset: {books.shape}")
print(f"Shape of users dataset: {users.shape}")
print(f"Shape of ratings dataset: {ratings.shape}")

Shape of books dataset: (271360, 8)
Shape of users dataset: (278858, 3)
Shape of ratings dataset: (1149780, 3)


In [10]:
# checking for null values in the dataset 
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [11]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [12]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [13]:
print(f"Duplicated rows in the books datasets: {books.duplicated().sum()}")
print(f"Duplicated rows in the users datasets: {users.duplicated().sum()}")
print(f"Duplicated rows in the ratings datasets: {ratings.duplicated().sum()}")

Duplicated rows in the books datasets: 0
Duplicated rows in the users datasets: 0
Duplicated rows in the ratings datasets: 0


### Popularity Based Recommender System

- Display the top 50 books in dataset based on avg.ratings
- Will consider only those books, who has more than 250 votes 

In [14]:
# merging the books and ratings dataset 
ratings_with_name = ratings.merge(books, on='ISBN')

# random 5 rows in the dataset
ratings_with_name.sample(5)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
461617,14422,0380771713,8,No Other Man,Shannon Drake,1995,Avon,http://images.amazon.com/images/P/0380771713.0...,http://images.amazon.com/images/P/0380771713.0...,http://images.amazon.com/images/P/0380771713.0...
690556,171118,0886779006,0,"Flightless Falcon (Daw Book Collectors, No. 1156)",Mickey Zucker Reichert,2000,Daw Hardcover,http://images.amazon.com/images/P/0886779006.0...,http://images.amazon.com/images/P/0886779006.0...,http://images.amazon.com/images/P/0886779006.0...
511743,12538,044145125X,0,The Knight and Knave of Swords,Fritz Leiber,1990,Ace Books,http://images.amazon.com/images/P/044145125X.0...,http://images.amazon.com/images/P/044145125X.0...,http://images.amazon.com/images/P/044145125X.0...
179624,249111,0671038184,9,Jewel,Bret Lott,1999,Washington Square Press,http://images.amazon.com/images/P/0671038184.0...,http://images.amazon.com/images/P/0671038184.0...,http://images.amazon.com/images/P/0671038184.0...
57193,41123,0446608890,0,Saving Faith,David Baldacci,2000,Warner Vision,http://images.amazon.com/images/P/0446608890.0...,http://images.amazon.com/images/P/0446608890.0...,http://images.amazon.com/images/P/0446608890.0...


In [15]:
# calculating number of ratings per book
# performing groupby operation on 'Book-Title' column 
num_rating_df = ratings_with_name.groupby('Book-Title').count()['Book-Rating'].reset_index()

# renaming the column name for simplicity 
num_rating_df.rename(columns={'Book-Rating':'num_ratings'}, inplace=True)

# checking any random 5 rows from the dataframe
num_rating_df.sample(5)

Unnamed: 0,Book-Title,num_ratings
94440,It All Began With Daisy,1
164578,Space: Above and Beyond - A Novel (Book 1),2
172884,Teaching Reading With Literature: Case Studies...,1
24859,Blackwater,6
62830,"Expert Guide to Windows 98 (Minaki, Mark. Mark...",1


In [16]:
# calculating avg book rating for each book 
avg_rating_df = ratings_with_name.groupby('Book-Title').mean()['Book-Rating'].reset_index()

# renaming the column name for simpliciy 
avg_rating_df.rename(columns={'Book-Rating':'avg_ratings'}, inplace=True)

# checking any random 5 rows from the dataset
avg_rating_df.sample(5)  

  avg_rating_df = ratings_with_name.groupby('Book-Title').mean()['Book-Rating'].reset_index()


Unnamed: 0,Book-Title,avg_ratings
236602,"With Love, With Connie",9.0
55079,Dragons of a Fallen Sun (Dragonlance: The War ...,4.8
136824,POWER TAROT : MORE THAN 100 SPREADS THAT GIVE ...,7.0
183505,The Cut Direct: A Leonidas Witherall Mystery,0.0
108061,"Life, Liberty and the Pursuit of Happiness",0.0


In [17]:
# Now we will merge 'num_rating_df' and 'avg_rating_df' 
popular_df = num_rating_df.merge(avg_rating_df, on='Book-Title')

# checking the first 5 rows of the dataset
popular_df.head()

Unnamed: 0,Book-Title,num_ratings,avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.25
1,Always Have Popsicles,1,0.0
2,Apple Magic (The Collector's series),1,0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.0
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.0


In [18]:
# filtering top 50 books with more than 250 ratings counts, and sorting the books based on 'avg ratings'
popular_df = popular_df[popular_df['num_ratings']>=250].sort_values('avg_ratings', ascending=False).head(50)
popular_df

Unnamed: 0,Book-Title,num_ratings,avg_ratings
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453
191612,The Hobbit : The Enchanting Prelude to The Lor...,281,5.007117
187377,The Fellowship of the Ring (The Lord of the Ri...,368,4.94837
80445,Harry Potter and the Sorcerer's Stone (Harry P...,575,4.895652
211384,"The Two Towers (The Lord of the Rings, Part 2)",260,4.880769
219741,To Kill a Mockingbird,510,4.7


In [19]:
# mergin popular_df with books dataset for more details about the books 
popular_df = popular_df.merge(books, on='Book-Title').drop_duplicates('Book-Title')[['Book-Title', 'Book-Author', 'Image-URL-M', 'num_ratings', 'avg_ratings']]
popular_df

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,num_ratings,avg_ratings
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.852804
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.824289
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,278,5.73741
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,347,5.501441
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...,556,5.183453
16,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339681.0...,281,5.007117
17,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339703.0...,368,4.94837
26,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,http://images.amazon.com/images/P/059035342X.0...,575,4.895652
28,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339711.0...,260,4.880769
39,To Kill a Mockingbird,Harper Lee,http://images.amazon.com/images/P/0446310786.0...,510,4.7


## Collaborative Filtering Based Recommender System

In [20]:
# Considering only the users who has voted more than 200 books
x = ratings_with_name.groupby('User-ID').count()['Book-Rating']>200
frequent_readers = x[x].index

In [21]:
# filtering out only books which has one of the reader from requent readers list
filtered_rating = ratings_with_name[ratings_with_name['User-ID'].isin(frequent_readers)]

In [22]:
# filtering out the books which has more than 50 votes
y = filtered_rating.groupby('Book-Title').count()['Book-Rating'] >= 50
famous_books = y[y].index

In [23]:
famous_books

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

In [24]:
# filtering famous books from the dataset 
final_ratings = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]

In [25]:
print(f"Number of duplicate rows in the final rating dataframe: {final_ratings.duplicated().sum()}")

Number of duplicate rows in the final rating dataframe: 0


In [26]:
# Applying pivot tabel over final_rating dataframe
pt = final_ratings.pivot_table(index='Book-Title',
                               columns='User-ID',
                               values='Book-Rating')
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,10.0,,,,,,0.0,,,
1st to Die: A Novel,,,,,,,,,,9.0,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,0.0,...,,,,,,0.0,,,0.0,
4 Blondes,,,,,,,,0.0,,,...,,,,,,,,,,
A Bend in the Road,0.0,,7.0,,,,,,,,...,,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,,0.0,...,,9.0,,,,,0.0,,,
You Belong To Me,,,,,,,,,0.0,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,0.0,,,...,,,,,,,0.0,,,
Zoya,,,,,,,,,,,...,,0.0,,,,,,,,


In [27]:
print(f"pivote tabel has {pt.shape[0]} Books and {pt.shape[1]} User-IDs")

pivote tabel has 706 Books and 810 User-IDs


In [28]:
# we will fill the NaN values with 0 for further calculation 
pt.fillna(0, inplace=True)
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


[Similarity_score_Explaination](https://youtu.be/1YoD0fg3_EM?start=2622&end=2713)

article on cosine similarity --> https://www.machinelearningplus.com/nlp/cosine-similarity/

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# calcuating similarity score of 706 books with 706 books
cosine_similarity(pt).shape

(706, 706)

In [31]:
similarity_score = cosine_similarity(pt)

In [32]:
def recommend(book_name):

  index = np.where(pt.index==book_name)[0][0]
  similar_item = sorted(list(enumerate(similarity_score[index])), key=lambda x:x[1], reverse=True)[1:6]
  
  for i in similar_item:
    print(pt.index[i[0]])

In [33]:
# The numpy.where() function returns the indices of elements in an input array where the given condition is satisfied.
np.where(pt.index=='Zoya')[0][0]

704

In [34]:
sorted(list(enumerate(similarity_score[0])), key=lambda x:x[1], reverse=True)[1:6]

[(47, 0.2702651417103732),
 (545, 0.2639619371123496),
 (82, 0.2366937434740099),
 (634, 0.23299389358170397),
 (551, 0.2262639743141286)]

- `enumerate(similarity_score[0])` - Enumerates (i.e., assigns an index) to each element of the similarity_score list's first element. The result is a list of tuples, where the first element of each tuple is the index and the second element is the corresponding value from the list.
- `list(...)` - Converts the enumerated list to a regular Python list.
- `sorted(..., key=lambda x: x[1], reverse=True)` - Sorts the list of tuples by the second element (i.e., the similarity score) in descending order. This means that the items with the highest similarity scores will come first in the list.
- `[1:6]` - Returns a new list containing the elements at positions 1 through 5 (i.e., excluding the first element) of the sorted list. This means that the top 5 most similar items, excluding the most similar item, will be returned.

In [35]:
# This function recommends books similar to the given book name based on a pre-calculated similarity score matrix.

# Parameters:
# book_name (str): The name of the book to find similar books for.

# Returns:
# None
def recommend(book_name):
    # Find the index of the book in the Pandas DataFrame that matches the given book_name
    index = np.where(pt.index==book_name)[0][0]
    
    # Calculate similarity scores between the book and all other books, and return the indices of the top 5 most similar books
    similar_item = sorted(list(enumerate(similarity_score[index])), key=lambda x:x[1], reverse=True)[1:6]
    
    # Print the names of the top 5 most similar books
    for i in similar_item:
        print(pt.index[i[0]])


In [36]:
recommend('Message in a Bottle')

Nights in Rodanthe
The Mulberry Tree
A Walk to Remember
River's End
Nightmares &amp; Dreamscapes


In [37]:
recommend('The Notebook')

A Walk to Remember
The Rescue
One Door Away from Heaven
Toxin
The Five People You Meet in Heaven


In [38]:
recommend('The Hobbit : The Enchanting Prelude to The Lord of the Rings')

The Two Towers (The Lord of the Rings, Part 2)
The Fellowship of the Ring (The Lord of the Rings, Part 1)
Where the Red Fern Grows
One for the Money (A Stephanie Plum Novel)
The Return of the King (The Lord of the Rings, Part 3)


In [39]:
recommend("Harry Potter and the Chamber of Secrets (Book 2)")

Harry Potter and the Prisoner of Azkaban (Book 3)
Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
Harry Potter and the Sorcerer's Stone (Book 1)
Harry Potter and the Order of the Phoenix (Book 5)


In [40]:
popular_df['avg_ratings'] = round(popular_df['avg_ratings'], 2)

In [41]:
popular_df.head()

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,num_ratings,avg_ratings
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.85
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.82
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,278,5.74
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,347,5.5
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...,556,5.18


In [42]:
import pickle
pickle.dump(popular_df,open('popular.pkl', 'wb'))

In [43]:
print(pd.__version__)

1.5.3


In [44]:
def recommend(book_name):

  index = np.where(pt.index==book_name)[0][0]
  similar_item = sorted(list(enumerate(similarity_score[index])), key=lambda x:x[1], reverse=True)[1:6]
  
  data = []
  for i in similar_item:
    # print(pt.index[i[0]])
    item = []
    temp_df = books[books['Book-Title'] == pt.index[i[0]]]
    item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Title'].values))
    item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Author'].values))
    item.extend(list(temp_df.drop_duplicates('Book-Title')['Image-URL-M'].values))

    data.append(item)
  return data

In [45]:
recommend('The Notebook')

[['A Walk to Remember',
  'Nicholas Sparks',
  'http://images.amazon.com/images/P/0446608955.01.MZZZZZZZ.jpg'],
 ['The Rescue',
  'Nicholas Sparks',
  'http://images.amazon.com/images/P/0446610399.01.MZZZZZZZ.jpg'],
 ['One Door Away from Heaven',
  'Dean R. Koontz',
  'http://images.amazon.com/images/P/0553582755.01.MZZZZZZZ.jpg'],
 ['Toxin',
  'Robin Cook',
  'http://images.amazon.com/images/P/0425166619.01.MZZZZZZZ.jpg'],
 ['The Five People You Meet in Heaven',
  'Mitch Albom',
  'http://images.amazon.com/images/P/0786868716.01.MZZZZZZZ.jpg']]

In [46]:
pickle.dump(pt, open('pt.pkl', 'wb'))
pickle.dump(books, open('books.pkl', 'wb'))
pickle.dump(similarity_score, open('similarity_score.pkl', 'wb'))