In [3]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

import os, sys
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Load Dataset

In [4]:
# Load dataset: book.csv
df_books = pd.read_csv('data/Books.csv')
df_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0374157065.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0374157065.01.LZZZZZZZ.jpg
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0393045218.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0393045218.01.LZZZZZZZ.jpg


In [5]:
#check column names
df_books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [6]:
#check dataset shape
df_books.shape

(271360, 8)

In [7]:
# remove un wanted columns 
df_books = df_books[['ISBN', 'Book-Title', 'Book-Author']]

In [8]:
# Load dataset: ratings
df_ratings = pd.read_csv('data/Ratings.csv')
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [9]:
#check dataset shape
df_ratings.shape

(1149780, 3)

## Check Null values

In [10]:
# Check Null values:
df_books.isnull().sum()

ISBN           0
Book-Title     0
Book-Author    2
dtype: int64

In [11]:
df_ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

## Drop Null Values

In [12]:
df_books.dropna(inplace=True)

In [13]:
df_books.isnull().sum()

ISBN           0
Book-Title     0
Book-Author    0
dtype: int64

In [14]:
#check dataset shape
df_books.shape

(271358, 3)

In [15]:
# df_books.to_csv('df_books.csv')

## Sort Ratings

In [16]:
# Calculate the count of ratings given by each user and store it in the 'ratings' Series
ratings = df_ratings['User-ID'].value_counts()
# Sort the 'ratings' Series in descending order based on the counts of user IDs
ratings.sort_values(ascending=False).head()

User-ID
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
Name: count, dtype: int64

In [17]:
ratings.shape

(105283,)

## check users with less than 200 ratings

In [18]:
len(ratings[ratings < 200])

104378

In [19]:
df_ratings['User-ID'].isin(ratings[ratings < 200].index).sum()


622224

In [20]:
df_ratings_rm = df_ratings[
  ~df_ratings['User-ID'].isin(ratings[ratings < 200].index)
]
df_ratings_rm.shape

(527556, 3)

## Check books with less than 100 ratings

In [21]:
ratings = df_ratings['ISBN'].value_counts() 
ratings.sort_values(ascending=False).head()

ISBN
0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
Name: count, dtype: int64

In [22]:
len(ratings[ratings < 100])

339825

In [23]:
df_books['ISBN'].isin(ratings[ratings < 100].index).sum()

269422

In [24]:
df_ratings_rm = df_ratings_rm[
  ~df_ratings_rm['ISBN'].isin(ratings[ratings < 100].index)
]
df_ratings_rm.shape

(49781, 3)

In [25]:
# These should exist
books = ["Where the Heart Is (Oprah's Book Club (Paperback))",
        "I'll Be Seeing You",
        "The Weight of Water",
        "The Surgeon",
        "I Know This Much Is True"]

for book in books:
    print(df_ratings_rm['ISBN'].isin(df_books[df_books['Book-Title'] == book]['ISBN']).sum())

183
75
49
57
77


In [26]:
# df_ratings_rm.to_csv('df_ratings_rm.csv')


## Preprocess Dataset For Machine Learning

In [27]:
df_ratings_rm.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1469,277427,0060930535,0
1471,277427,0060934417,0
1474,277427,0061009059,9
1484,277427,0140067477,0


In [28]:
df_ratings_rm['User-ID'].nunique()

888

In [29]:
df_ratings_rm['ISBN'].nunique()

731

In [30]:
df_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [32]:
df = df_ratings_rm.pivot_table(index=['User-ID'],columns=['ISBN'],values='Book-Rating').fillna(0).T
df.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
df.shape

(731, 888)

In [34]:
df.index = df.join(df_books.set_index('ISBN'))['Book-Title'] 

In [36]:
df = df.sort_index()
df.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
df.loc["The Queen of the Damned (Vampire Chronicles (Paperback))"][:5]

User-ID
254     0.0
2276    0.0
2766    0.0
2977    0.0
3363    0.0
Name: The Queen of the Damned (Vampire Chronicles (Paperback)), dtype: float64

In [43]:
len(df.index.values)

731

In [None]:
# df.to_csv('Recomendation_dataset.csv', index=False)

## Model

In [44]:
model = NearestNeighbors(metric='cosine')
model.fit(df.values)

In [45]:
title = 'The Queen of the Damned (Vampire Chronicles (Paperback))'
df.loc[title].shape

(888,)

In [46]:
distance, indice = model.kneighbors([df.loc[title].values], n_neighbors=10)

print(distance[0])
print(indice[0])

[2.22044605e-16 5.17841186e-01 5.37633845e-01 7.34506886e-01
 7.44865700e-01 7.93983542e-01 7.95716638e-01 8.05232193e-01
 8.14139374e-01 8.31365720e-01]
[612 660 648 272 667 110 297 547 345 628]


In [47]:
df.iloc[indice[0]].index.values

array(['The Queen of the Damned (Vampire Chronicles (Paperback))',
       'The Vampire Lestat (Vampire Chronicles, Book II)',
       'The Tale of the Body Thief (Vampire Chronicles (Paperback))',
       'Interview with the Vampire',
       'The Witching Hour (Lives of the Mayfair Witches)', 'Catch 22',
       'Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches)',
       'The Gunslinger (The Dark Tower, Book 1)',
       'Neuromancer (Remembering Tomorrow)', 'The Search'], dtype=object)

In [48]:
pd.DataFrame({
    'title'   : df.iloc[indice[0]].index.values,
    'distance': distance[0]
}).sort_values(by='distance', ascending=False)

Unnamed: 0,title,distance
9,The Search,0.8313657
8,Neuromancer (Remembering Tomorrow),0.8141394
7,"The Gunslinger (The Dark Tower, Book 1)",0.8052322
6,Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches),0.7957166
5,Catch 22,0.7939835
4,The Witching Hour (Lives of the Mayfair Witches),0.7448657
3,Interview with the Vampire,0.7345069
2,The Tale of the Body Thief (Vampire Chronicles (Paperback)),0.5376338
1,"The Vampire Lestat (Vampire Chronicles, Book II)",0.5178412
0,The Queen of the Damned (Vampire Chronicles (Paperback)),2.220446e-16


In [89]:
# function to return recommended books - this will be tested
def get_recommends(title = ""):
  try:
    book = df.loc[title]
  except KeyError as e:
    print('The given book', e, 'does not exist')
    return

  distance, indice = model.kneighbors([book.values], n_neighbors=5)

  recommended_books = pd.DataFrame({
      'title'   : df.iloc[indice[0]].index.values,
      'distance': distance[0]
    }) \
    .sort_values(by='distance', ascending=False) \
    .head(5).values

  return [title, recommended_books]

In [90]:
recommendations = get_recommends("A Fine Balance")

if recommendations:
  query_book, recommended_books = recommendations
  print("Recommended books for", query_book, ":")
  for title, distance in recommended_books:
    print(f"- {title} (Similarity: {distance:.2f})")
else:
  print("Book not found.")


Recommended books for A Fine Balance :
- How to Make an American Quilt (Similarity: 0.76)
- The Phantom Tollbooth (Similarity: 0.72)
- Atonement: A Novel (Similarity: 0.70)
- Three Junes (Similarity: 0.68)
- A Fine Balance (Similarity: 0.00)


## Save Model


In [92]:
import pickle

# Save the model to a file
with open('Knn_Recomendation_model.pkl', 'wb') as f:
    pickle.dump(model, f)


In [87]:
# Load the saved model using pickle
with open('Knn_Recomendation_model.pkl', 'rb') as f:
      model = pickle.load(f)
      

In [95]:
from joblib import dump
filename = 'Knn_Recomendation_model.joblib'
dump(model, filename)

['Knn_Recomendation_model.joblib']

In [98]:
from joblib import load

# Load the saved model using joblib
filename = 'Knn_Recomendation_model.joblib'
model = load(filename)