# BOOK RECOMENDATION SYSTEM

## IMPORT LIB

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
books = pd.read_csv(
    "../data/raw/BX-Books.csv",
    sep=";",
    encoding="latin-1",
    on_bad_lines="skip",
    low_memory=False
)

books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
books["Image-URL-L"][141]

'http://images.amazon.com/images/P/1900850303.01.LZZZZZZZ.jpg'

In [4]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [5]:
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher','Image-URL-L']]

In [6]:
books.head(n=2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [7]:
books.rename(
    columns={
        'ISBN': 'isbn',
        'Book-Title': 'title',
        'Book-Author': 'author',
        'Year-Of-Publication': 'year',
        'Publisher': 'publisher',
        'Image-URL-L': 'image_url'
    },
    inplace=True
)


In [8]:
books.head()

Unnamed: 0,isbn,title,author,year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [9]:
users = pd.read_csv(
    "../data/raw/BX-Users.csv",
    sep=";",
    encoding="latin-1",
    on_bad_lines="skip"
)
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [10]:
users.shape

(278858, 3)

In [11]:
# Adjusting User Columns
users.rename(
    columns={
        "User-ID":"user_id",
        "Location":"location",
        "age":"age"
    }
)

Unnamed: 0,user_id,location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278853,278854,"portland, oregon, usa",
278854,278855,"tacoma, washington, united kingdom",50.0
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",


In [12]:
ratings = pd.read_csv(
    "../data/raw/BX-Book-Ratings.csv",
    sep=";",
    encoding="latin-1",
    on_bad_lines="skip"
)
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [13]:
print(f"User Shape :{users.shape}, Ratings Shape {ratings.shape}, Books Shapes {books.shape}")

User Shape :(278858, 3), Ratings Shape (1149780, 3), Books Shapes (271360, 6)


In [14]:
# Lets store users who had at least rated more than 200 books
x = ratings["User-ID"].value_counts() > 200
x.shape

(105283,)

In [15]:
x.shape

(105283,)

In [16]:
x[x].shape

(899,)

In [17]:
ratings.rename(
    columns= {
        'ISBN': 'isbn',
        'User-ID': 'user_id',
        'Book-Rating': 'book_rating',
    },
    inplace=True
)

ratings.columns

Index(['user_id', 'isbn', 'book_rating'], dtype='object')

In [18]:
ratings_with_books = ratings.merge(books, on="isbn")

In [19]:
ratings.shape

(1149780, 3)

In [20]:
ratings_with_books.shape

(1031136, 8)

In [21]:
ratings_with_books.head()

Unnamed: 0,user_id,isbn,book_rating,title,author,year,publisher,image_url
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...


In [22]:
number_rating = ratings_with_books.groupby("title")["book_rating"].count().reset_index()

In [23]:
number_rating

Unnamed: 0,title,book_rating
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [24]:
number_rating.value_counts()

title                                                                                                       book_rating
 A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)  4              1
 Always Have Popsicles                                                                                      1              1
 Apple Magic (The Collector's series)                                                                       1              1
 Ask Lily (Young Women of Faith: Lily Series, Book 5)                                                       1              1
 Beyond IBM: Leadership Marketing and Finance for the 1990s                                                 1              1
                                                                                                                          ..
Ã?Â?lpiraten.                                                                                               2              1
Ã?Â?r

In [25]:
number_rating.rename(
    columns={
        "book_rating":"num_of_rating"
    },
    inplace=True
)

number_rating

Unnamed: 0,title,num_of_rating
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [26]:
number_rating.shape

(241071, 2)

In [27]:
final_rating = ratings_with_books.merge(number_rating, on="title")
final_rating.head()

Unnamed: 0,user_id,isbn,book_rating,title,author,year,publisher,image_url,num_of_rating
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,60
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,14
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,650
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,1
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,1


In [28]:
final_rating.shape

(1031136, 9)

In [29]:
final_rating = final_rating[final_rating["num_of_rating"] > 50]

In [30]:
final_rating.shape

(285590, 9)

In [31]:
final_rating.drop_duplicates(["user_id", "title"], inplace=True)
final_rating.shape

(282389, 9)

In [32]:
final_rating.isnull().sum()

user_id          0
isbn             0
book_rating      0
title            0
author           0
year             0
publisher        0
image_url        0
num_of_rating    0
dtype: int64

# BUILDING A RECOMMENDATION SYSTEM USING A PIVOT TABLE (RATINGS)

A recommendation system is designed to suggest items to users based on past interactions. In this project, the interaction data consists of **book ratings provided by users**, and the goal is to identify patterns that can be used to recommend similar books.

## Understanding the Ratings Data

The ratings dataset contains three main components:
- **User-ID**: identifies the user
- **Book-ID (ISBN)**: identifies the book
- **Book-Rating**: the rating given by the user

This raw data is typically sparse because:
- Many users rate only a few books
- Many books receive very few ratings

To ensure meaningful recommendations, the dataset must be filtered.

## Filtering Active Users and Popular Books

To reduce noise and improve recommendation quality:
- **Active users** are selected based on a minimum number of ratings (e.g., users with more than 200 ratings).
- **Popular books** are selected based on how many users have rated them.

This filtering ensures that similarity calculations are based on sufficient interaction data.

## Creating the Pivot Table (User–Item Matrix)

After filtering, the ratings data is reshaped into a **pivot table**, also known as a **user–item matrix**.

In this matrix:
- Each **row** represents a book
- Each **column** represents a user
- Each **cell** contains the rating a user gave to a book

If a user has not rated a book, the value is missing (`NaN`).

## Handling Missing Values

Most similarity algorithms cannot work with missing values. Therefore:
- All missing ratings are replaced with `0`
- A value of `0` indicates no interaction, not dislike

This step prepares the data for mathematical operations.

## Using the Pivot Table for Recommendations

The pivot table serves as the foundation for **collaborative filtering**:
- Books can be compared based on rating patterns across users
- Similarity measures (such as cosine similarity) are applied to find books with similar rating behavior

If two books are rated similarly by many users, they are considered similar.

## Generating Recommendations

When a user interacts with a book:
- The system identifies books with high similarity scores
- These similar books are recommended to the user

## Summary

The pivot table transforms raw rating data into a structured numerical format that enables similarity-based analysis. It is a critical step that connects raw user behavior to the recommendation logic, making collaborative filtering possible.


In [33]:
final_rating.head(n=1)

Unnamed: 0,user_id,isbn,book_rating,title,author,year,publisher,image_url,num_of_rating
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,60


In [34]:
book_pivot = final_rating.pivot_table(columns="user_id", index="title", values="book_rating")
book_pivot

user_id,8,9,14,16,17,19,23,26,32,39,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,,,,,,,,,,,...,,,,,,,,,,
16 Lighthouse Road,,,,,,,,,,,...,,,,,,,,,,
1984,,,,,,,,,,,...,,,,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2010: Odyssey Two,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,,,,,,,...,,,,,,,,,,
Zoya,,,,,,,,,,,...,,,,,,,,,,
"\O\"" Is for Outlaw""",,,,,,,,,,,...,,,,,,,,,,
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",,,,,,,,,,,...,,,,,,,,,,


In [35]:
book_pivot.shape

(2381, 47740)

In [36]:
book_pivot.fillna(0, inplace=True)

In [37]:
book_pivot.head()

user_id,8,9,14,16,17,19,23,26,32,39,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# CSR MATRIX

In [38]:
from scipy.sparse import csr_matrix

In [39]:
book_sparse = csr_matrix(book_pivot)
book_sparse

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 112306 stored elements and shape (2381, 47740)>

In [40]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm="brute")

In [41]:
model.fit(book_sparse)

0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'minkowski'
,p,2
,metric_params,
,n_jobs,


In [42]:
book_pivot.iloc[7,:]

user_id
8         0.0
9         0.0
14        0.0
16        0.0
17        0.0
         ... 
278843    0.0
278844    0.0
278846    0.0
278851    0.0
278854    0.0
Name: 24 Hours, Length: 47740, dtype: float64

In [43]:
distance_suggestion = model.kneighbors(book_pivot.iloc[7,:].values.reshape(1,-1), n_neighbors=5)
distance_suggestion

(array([[ 0.        , 49.12229636, 49.53786431, 49.55804677, 50.04997502]]),
 array([[   7,  748,  275, 2361,   96]]))

In [44]:
for i in range(len(distance_suggestion[1].flatten())) : 
    print(book_pivot.index[distance_suggestion[1].flatten()[i]])

24 Hours
Ground Zero and Beyond
Blood and Gold (Rice, Anne, Vampire Chronicles.)
Women in His Life
About Face


In [45]:
final_rating.head(n=1)

Unnamed: 0,user_id,isbn,book_rating,title,author,year,publisher,image_url,num_of_rating
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,60


In [46]:
np.where(book_pivot.index == "4 Blondes")[0][0]

np.int64(10)

In [47]:
ids = np.where(final_rating["title"] == "Harry Potter and the Chamber of Secrets (Book 2)")[0][0]

In [48]:
final_rating.iloc[ids]["image_url"]

'http://images.amazon.com/images/P/0439064864.01.LZZZZZZZ.jpg'

In [49]:
book_name = []

for book_id in distance_suggestion[1]:
    book_name.append(book_pivot.index[book_id])


In [50]:
book_name

[Index(['24 Hours', 'Ground Zero and Beyond',
        'Blood and Gold (Rice, Anne, Vampire Chronicles.)', 'Women in His Life',
        'About Face'],
       dtype='object', name='title')]

In [51]:
ids_index = [] 
for name in book_name[0] : 
    ids = np.where(final_rating["title"] == name)[0][0]
    ids_index.append(ids)

In [52]:
for idx in ids_index: 
    url = final_rating.iloc[idx]["image_url"]
    print(url)

http://images.amazon.com/images/P/0451203593.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/1401088945.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0345409329.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0345345738.01.LZZZZZZZ.jpg
http://images.amazon.com/images/P/0821770209.01.LZZZZZZZ.jpg


In [53]:
import pickle as pickel 
pickel.dump(model, open("../artifacts/model.pkl", "wb"))
pickel.dump(book_name, open("../artifacts/book_names.pkl", "wb"))
pickel.dump(final_rating, open("../artifacts/final_rating.pkl", "wb"))
pickel.dump(book_pivot, open("../artifacts/book_pivot.pkl", "wb"))

# TESTING MODE

In [54]:
def recommend_book(book_name):
    book_id = np.where(book_pivot.index == book_name)[0][0]
    distance, suggestion = model.kneighbors(book_pivot.iloc[book_id,:].values.reshape(1,-1), n_neighbors=6 )
    
    for i in range(len(suggestion)):
        books = book_pivot.index[suggestion[i]]
        for j in books:
            if j == book_name:
                print(f"You searched '{book_name}'\n")
                print("The suggestion books are: \n")
            else:
                print(j)

In [55]:
book_name = "2nd Chance"
recommend_book(book_name)

You searched '2nd Chance'

The suggestion books are: 

Breaking Point (Tom Clancy's Net Force, No. 4)
Ground Zero and Beyond
The Juror
The Undertaker's Widow
About Face
