## Enhancing Books Recommendation System Using Clustering | Collaborative based

In [3]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [4]:
# Load the books table from BX-Books.csv (';'-separated, latin-1 encoded).
# on_bad_lines='skip' drops rows that pandas cannot parse instead of raising.
books = pd.read_csv('BX-Books.csv', sep=';', on_bad_lines='skip', encoding='latin-1', low_memory=False)

# Show the available column names.
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [5]:
# Retain only the columns used later. `.copy()` makes `books` an independent
# frame, so later in-place edits do not act on a slice of the raw table
# (which can raise SettingWithCopyWarning).
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-L']].copy()

In [6]:
books.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [7]:
# Rename the columns to short lowercase names for easier use.
# The result is assigned back rather than using inplace=True: `books` was
# produced by column selection, and mutating such a frame in place can raise
# SettingWithCopyWarning.
books = books.rename(columns={
    "Book-Title": "title",
    "Book-Author": "author",
    "Year-Of-Publication": "year",
    "Publisher": "publisher",
    "Image-URL-L": "img_url"
})

books.head(2)

Unnamed: 0,ISBN,title,author,year,publisher,img_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [8]:
# Load the users table from BX-Users.csv (';'-separated, latin-1 encoded);
# rows that cannot be parsed are skipped.
users = pd.read_csv('BX-Users.csv', sep=';', on_bad_lines='skip', encoding='latin-1')

In [9]:
# Switch the user table to short lowercase column names.
users = users.rename(
    columns={"User-ID": "user_id", "Location": "location", "Age": "age"}
)

users.head()

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [10]:
# Load the book ratings from BX-Book-Ratings.csv (';'-separated, latin-1
# encoded); rows that cannot be parsed are skipped.
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', on_bad_lines='skip', encoding='latin-1')

In [11]:
# Switch the ratings table to short lowercase column names (ISBN is kept as is).
ratings = ratings.rename(
    columns={"User-ID": "user_id", "Book-Rating": "rating"}
)

ratings.head()

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [12]:
# Report (rows, columns) for each loaded table, one per line:
# books, then users, then ratings.
for table in (books, users, ratings):
    print(table.shape)

(271360, 6)
(278858, 3)
(1149780, 3)


## Data Exploration

In [14]:
# Count how many ratings each user submitted; value_counts lists the most
# active users first.
ratings['user_id'].value_counts()

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: user_id, Length: 105283, dtype: int64

In [15]:
ratings["user_id"].unique().shape

(105283,)

In [16]:
# Boolean Series indexed by user_id: True where the user has more than 200 ratings.
x =ratings['user_id'].value_counts() > 200

In [17]:
x[x].shape

(899,)

In [18]:
# user_ids whose flag in x is True, i.e. the users with more than 200 ratings.
y = x[x].index

In [19]:
y

Int64Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352,
            110973, 235105,
            ...
            260183,  73681,  44296, 155916,   9856, 274808,  28634,  59727,
            268622, 188951],
           dtype='int64', length=899)

- Reducing the amount of data by keeping only the ratings from users who have rated more than 200 books

In [21]:
# Keep only the ratings submitted by the users listed in y
# (users with more than 200 ratings).
active_user_mask = ratings['user_id'].isin(y)
ratings = ratings[active_user_mask]
ratings.head()

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [22]:
ratings.shape

(526356, 3)

In [23]:
# Attach book metadata to every rating by joining on ISBN. The default
# inner join drops ratings whose ISBN does not appear in `books`.
ratings_with_books = pd.merge(ratings, books, on="ISBN")
ratings_with_books.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,img_url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...


In [24]:
ratings_with_books.shape

(487671, 8)

In [25]:
# Count how many ratings each title received, then turn the result into a
# two-column frame (title, rating) with one row per title.
ratings_per_title = ratings_with_books.groupby('title')['rating'].count()
num_rating = ratings_per_title.reset_index()
num_rating.head()

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [26]:
# Rename the count column so it is clearly a per-title rating count and not
# an individual rating value.
num_rating = num_rating.rename(columns={"rating": "num_of_rating"})
num_rating.head()

Unnamed: 0,title,num_of_rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [27]:
# Add each title's rating count (num_of_rating) to every rating row.
final_rating = pd.merge(ratings_with_books, num_rating, on='title')
final_rating.head(5)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,img_url,num_of_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82


In [28]:
final_rating.shape

(487671, 9)

In [29]:
# Keep only the rows whose title has at least 50 ratings.
popular_title_mask = final_rating['num_of_rating'] >= 50
final_rating = final_rating[popular_title_mask]
final_rating.shape

(61853, 9)

In [30]:
# Keep a single row per (user_id, title) pair so each user contributes at
# most one rating per title to the pivot table.
# The result is assigned back rather than using inplace=True: `final_rating`
# is a boolean-filtered slice, and mutating it in place raises
# SettingWithCopyWarning.
final_rating = final_rating.drop_duplicates(['user_id', 'title'])
final_rating.shape

(59850, 9)

### Pivot Table

The pivot table builds a title-by-user matrix of ratings: one row per book, one column per user, and each cell holds that user's rating of the book (empty when the user has not rated it).

In [32]:
# One row per title, one column per user_id, cell = that user's rating.
# Pairs with no rating are left as NaN at this stage.
book_pivot = pd.pivot_table(
    final_rating,
    index='title',
    columns='user_id',
    values='rating',
)
book_pivot

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 Blondes,,,,,,,,,,0.0,...,,,,,,,,,,
84 Charing Cross Road,,,,,,,,,,,...,,,,,,10.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,7.0,,...,,,,,,0.0,,,,
You Belong To Me,,,,,,,,,,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,,,0.0,...,,,,,,0.0,,,,
Zoya,,,,,,,,,,,...,,,,,,,,,,


In [33]:
# Replace the missing (user, title) cells with 0 so the table is fully numeric.
book_pivot = book_pivot.fillna(0)
book_pivot

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Convert the title-by-user table to SciPy CSR (compressed sparse row) format;
# this is the matrix the neighbour model is fitted on.
book_sparse = csr_matrix(book_pivot)
book_sparse

<742x888 sparse matrix of type '<class 'numpy.float64'>'
	with 14942 stored elements in Compressed Sparse Row format>

In [35]:
# Fit a brute-force nearest-neighbour index over the book rows. This is a
# neighbour search, not a clustering step. No metric is passed, so
# scikit-learn's default distance is used.
model = NearestNeighbors(algorithm='brute')
model.fit(book_sparse)

In [36]:
# Look up the 6 nearest rows to the book at position 237 of book_pivot.
# kneighbors expects a 2-D array, hence the reshape to a single-row matrix.
query_row = book_pivot.iloc[237, :].values.reshape(1, -1)
distance, suggestion = model.kneighbors(query_row, n_neighbors=6)

# Show the neighbour positions (row numbers in book_pivot); `distance` holds
# the matching distances in the same order.
suggestion

array([[237, 240, 238, 241, 184, 536]], dtype=int64)

In [37]:
# suggestion holds one row of neighbour positions per query row; translate
# each row of positions back into book titles.
for neighbour_positions in suggestion:
    print(book_pivot.index[neighbour_positions])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='title')


In [38]:
book_pivot.index[237]

'Harry Potter and the Chamber of Secrets (Book 2)'

In [39]:
# Keep the pivot table's title index under its own name; it is saved as
# books_name.pkl further down.
books_name = book_pivot.index
books_name

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       '84 Charing Cross Road', 'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Cry In The Night',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='title', length=742)

In [40]:
# Persist the fitted model and the tables it depends on under artifacts/.
# The directory is created if missing so the cell works on a fresh checkout.
ARTIFACTS_DIR = Path('artifacts')
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

artifacts = {
    'model.pkl': model,
    'books_name.pkl': books_name,
    'final_rating.pkl': final_rating,
    'book_pivot.pkl': book_pivot,
}

# Each file is opened in a `with` block so the handle is flushed and closed
# before the next artifact is written.
for filename, obj in artifacts.items():
    with open(ARTIFACTS_DIR / filename, 'wb') as fh:
        pickle.dump(obj, fh)

In [41]:
def recommend_book(book_name, n_neighbors=6):
    """Print the titles nearest to ``book_name`` in the fitted neighbour model.

    Reads the module-level ``book_pivot`` (title-by-user rating table) and
    ``model`` (fitted NearestNeighbors).

    Args:
        book_name: Title to look up; must match an entry of
            ``book_pivot.index`` exactly.
        n_neighbors: Number of titles to print. The queried title is itself
            a row of the fitted data, so it is normally among them.

    Raises:
        IndexError: If ``book_name`` is not a title in ``book_pivot``.
    """
    matches = np.where(book_pivot.index == book_name)[0]
    if matches.size == 0:
        # Same exception type as the bare [0][0] lookup, but with a message
        # that names the missing title.
        raise IndexError(f"{book_name!r} is not a title in book_pivot")
    book_id = matches[0]

    # kneighbors expects a 2-D array, hence the reshape to a single-row matrix.
    query_row = book_pivot.iloc[book_id, :].values.reshape(1, -1)
    _, suggestion = model.kneighbors(query_row, n_neighbors=n_neighbors)

    # suggestion has one row of neighbour positions per query row.
    for neighbour_positions in suggestion:
        for title in book_pivot.index[neighbour_positions]:
            print(title)

In [42]:
book_name = "Harry Potter and the Chamber of Secrets (Book 2)"
recommend_book(book_name)

Harry Potter and the Chamber of Secrets (Book 2)
Harry Potter and the Prisoner of Azkaban (Book 3)
Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Sorcerer's Stone (Book 1)
Exclusive
The Cradle Will Fall


### Using the recommended books to calculate the accuracy of the recommendations and the relevance

In [44]:
# Titles printed by recommend_book for "Harry Potter and the Chamber of
# Secrets (Book 2)", copied here by hand.
# NOTE(review): the list goes stale if the data or model changes; consider
# building it from the model output instead.
recommended_books = ["Harry Potter and the Chamber of Secrets (Book 2)",
"Harry Potter and the Prisoner of Azkaban (Book 3)",
"Harry Potter and the Goblet of Fire (Book 4)",
"Harry Potter and the Sorcerer's Stone (Book 1)",
"Exclusive",
"The Cradle Will Fall"]

# Select the rating rows of final_rating that belong to the recommended
# titles; .copy() gives an independent frame that is safe to modify in place.
recommended_books_info = final_rating.loc[final_rating['title'].isin(recommended_books)].copy()

# Keep one row per distinct (title, rating) combination, then display the table.
# NOTE(review): this discards repeated rating values for a title, so any mean
# computed from this frame is over distinct rating values rather than over
# all user ratings. Confirm this is intended.
recommended_books_info.drop_duplicates(['title', 'rating'], inplace=True)
recommended_books_info

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,img_url,num_of_rating
6205,277427,0439064872,0,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...,183
6206,254,0439064872,9,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...,183
6209,8245,0439064872,8,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...,183
6218,28523,0439064872,10,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...,183
6251,126736,0439064872,5,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...,183
6268,156269,0439064872,7,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...,183
6388,277427,0439136369,0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,2001,Scholastic,http://images.amazon.com/images/P/0439136369.0...,138
6389,254,0439136369,9,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,2001,Scholastic,http://images.amazon.com/images/P/0439136369.0...,138
6392,9856,0439136369,10,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,2001,Scholastic,http://images.amazon.com/images/P/0439136369.0...,138
6397,21576,0439136369,8,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,2001,Scholastic,http://images.amazon.com/images/P/0439136369.0...,138


In [45]:
# Mean rating per ISBN over the rows kept in recommended_books_info.
ratings_by_isbn = recommended_books_info.groupby('ISBN')['rating']
average_ratings = ratings_by_isbn.mean()

# Relevance cut-off applied to the mean rating (ratings in this data reach 10).
threshold = 5.0

# An ISBN is labelled relevant (True) when its mean rating reaches the cut-off.
ground_truth_labels = average_ratings.ge(threshold)

# Show the resulting True/False label for every ISBN.
print(ground_truth_labels)

ISBN
0439064872     True
0439136350     True
0439136369     True
0439139597     True
0439139600     True
043936213X     True
0440115450    False
0446604232     True
0590353403     True
0671741195     True
Name: rating, dtype: bool


## Precision, Recall and F1 Score of The Recommendations

In [47]:
# Relevance label per ISBN (True = relevant), copied from the labels printed above.
ground_truth_labels = {
    '0439064872': True,
    '0439136350': True,
    '0439136369': True,
    '0439139597': True,
    '0439139600': True,
    '043936213X': True,
    '0440115450': False,
    '0446604232': True,
    '0590353403': True,
    '0671741195': True
}

# ISBNs treated as the recommender's suggestions.
suggested_books = [
    '0439064872',
    '0439136350',
    '0439136369',
    '0439139597',
    '0439139600',
    '043936213X',
    '0440115450',
    '0446604232',
    '0590353403',
    '0671741195',
]

# Label of every suggested ISBN, in suggestion order (unknown ISBNs count as False).
ground_truth_labels_list = []
for isbn in suggested_books:
    ground_truth_labels_list.append(ground_truth_labels.get(isbn, False))

# Set form of the suggestions for constant-time membership tests.
suggested_books_set = set(suggested_books)

# TP: suggested and relevant. FP: suggested but not relevant.
# FN: relevant but never suggested.
true_positives = sum(ground_truth_labels.get(isbn, False) for isbn in suggested_books_set)
false_positives = len(suggested_books_set) - true_positives
false_negatives = sum(
    is_relevant and isbn not in suggested_books_set
    for isbn, is_relevant in ground_truth_labels.items()
)

# Precision and recall fall back to 0 when their denominator is empty.
if (true_positives + false_positives) > 0:
    precision = true_positives / (true_positives + false_positives)
else:
    precision = 0

if (true_positives + false_negatives) > 0:
    recall = true_positives / (true_positives + false_negatives)
else:
    recall = 0

# F1 is the harmonic mean of precision and recall (0 when both are 0).
if (precision + recall) > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0


print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")

Precision: 0.9
Recall: 1.0
F1 Score: 0.9473684210526316


## Calculating Accuracy Metric for The Recommended Books 

In [49]:
#Calculating Accuracy 
from sklearn.metrics import accuracy_score, confusion_matrix

def evaluate_accuracy(ground_truth_labels, suggested_books):
    """Return the share of labelled books whose label matches the suggestions.

    Every book in ``ground_truth_labels`` is treated as predicted relevant
    when it appears in ``suggested_books`` and predicted not relevant
    otherwise. The accuracy is the fraction of those predictions that equal
    the book's ground-truth label.

    Args:
        ground_truth_labels: Mapping of book identifier to a truthy/falsy
            relevance label.
        suggested_books: Iterable of suggested book identifiers. Identifiers
            that are not keys of ``ground_truth_labels`` do not affect the
            result.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when ``ground_truth_labels`` is
        empty.
    """
    if not ground_truth_labels:
        return 0.0

    # Set form of the suggestions for constant-time membership tests.
    suggested_books_set = set(suggested_books)

    # A prediction is correct when "was suggested" equals "is relevant".
    # Counting matches directly also works when only one class is present.
    correct = sum(
        bool(is_relevant) == (book in suggested_books_set)
        for book, is_relevant in ground_truth_labels.items()
    )
    return correct / len(ground_truth_labels)
    
# Evaluation cases: each entry pairs a ground-truth label mapping
# (ISBN -> relevant?) with the list of ISBNs that were suggested.
dataset = [
    (
        {
            '0439064872': True,
            '0439136350': True,
            '0439136369': True,
            '0439139597': True,
            '0439139600': True,
            '043936213X': True,
            '0440115450': False,
            '0446604232': True,
            '0590353403': True,
            '0671741195': True
        },
        [
            '0439064872', '0439136350', '0439136369', '0439139597',
            '0439139600', '043936213X', '0440115450', '0446604232',
            '0590353403', '0671741195',
        ],
    )
]

# Score every case and collect the accuracies in order.
results = []
for ground_truth_labels, suggested_books in dataset:
    results.append(evaluate_accuracy(ground_truth_labels, suggested_books))

# Report one line per evaluated case.
for accuracy in results:
    print(f"Accuracy: {accuracy}")


Accuracy: 0.9
