# Google Books Recommender System

In [90]:
import pandas as pd
import numpy as np
import string
import re
import requests

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import faiss
from scipy.spatial import distance

In [43]:
# Read the book catalogue CSV into a DataFrame and preview its first rows.
df = pd.read_csv('book_data_all.csv')
df.head()

Unnamed: 0,title,author,rating,voters,price,currency,description,publisher,page_count,language,published_date,genre,ISBN_13
0,La Chartreuse De Parme,Stendhal,,,,,Reproduction Of The Original.,Bod – Books On Demand,629.0,English,2022-09-28,Fiction,9783368304492
1,"Erewhon; Or, Over The Range",Samuel Butler,,,,,Reproduction Of The Original.,Bod – Books On Demand,302.0,English,2022-10-31,Fiction,9783368314804
2,Heart Bones,Colleen Hoover,,,,,Ung Kærlighed Med Udfordringer I Rørende Colle...,Lindhardt Og Ringhof,309.0,Danish,2023-02-15,Fiction,9788727012858
3,The Power Of Movement In Plants,Charles Darwin,,,,,"Reprint Of The Original, First Published In 1898.",Bod – Books On Demand,605.0,English,2022-10-03,Fiction,9783368272722
4,Babbitt,Sinclair Lewis,,,,,A Novel That Stood The Test Of Time “But I Do ...,Xist Publishing,434.0,English,2015-08-07,Fiction,9781681951737


In [44]:
# Show per-column dtypes and non-null counts to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2046 entries, 0 to 2045
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           2046 non-null   object 
 1   author          2019 non-null   object 
 2   rating          682 non-null    float64
 3   voters          682 non-null    object 
 4   price           227 non-null    float64
 5   currency        227 non-null    object 
 6   description     1965 non-null   object 
 7   publisher       1879 non-null   object 
 8   page_count      1997 non-null   float64
 9   language        2046 non-null   object 
 10  published_date  2039 non-null   object 
 11  genre           1950 non-null   object 
 12  ISBN_13         2045 non-null   object 
dtypes: float64(3), object(10)
memory usage: 207.9+ KB


In [45]:
# ISBN_13 -> nullable Int64; entries that are not numeric are coerced to <NA>
df['ISBN_13'] = df['ISBN_13'].pipe(pd.to_numeric, errors='coerce').astype('Int64')

# published_date -> datetime64; unparseable dates are coerced to NaT
df['published_date'] = df['published_date'].pipe(pd.to_datetime, errors='coerce')

In [15]:
# get book cover images

def get_cover_url(isbn, timeout=10):
    """Look up a book's cover thumbnail through the Google Books volumes API.

    Parameters
    ----------
    isbn : int, str or missing value
        ISBN to search for. Missing values (None, NaN, pd.NA) are not sent
        to the API.
    timeout : float, default 10
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    str or None
        The ``thumbnail`` link of the first matching volume. None is returned
        when the ISBN is missing, the request fails or times out, the response
        is not valid JSON, no volume matches, or the volume has no thumbnail.
    """
    # A missing ISBN would otherwise be queried literally (e.g. "isbn:<NA>").
    if isbn is None or pd.isna(isbn):
        return None

    url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # Best effort: this is applied row by row, so one failed lookup
        # should yield "no cover" rather than abort the whole column.
        return None

    items = data.get("items") or []
    if data.get("totalItems", 0) <= 0 or not items:
        return None

    return items[0].get("volumeInfo", {}).get("imageLinks", {}).get("thumbnail")

# Add a new column 'cover_url' to the DataFrame.
# NOTE: get_cover_url is called once per row, i.e. one HTTP request per book.
df['cover_url'] = df['ISBN_13'].apply(get_cover_url)

In [23]:
# # get book cover images

# def get_book_cover(isbn13):
#     url = f"https://covers.openlibrary.org/b/isbn/{isbn13}-L.jpg"
#     return url

# # Add a new column for book cover URLs
# df['cover_url_ol'] = df['ISBN_13'].apply(get_book_cover)

In [22]:
# Distinct cover URLs (including None for books without a cover)
df['cover_url'].unique().tolist()

['http://books.google.com/books/content?id=zNKPEAAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/books/content?id=yuCYEAAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/books/content?id=XWmtEAAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/books/content?id=sG-SEAAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/books/content?id=u-EmCwAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/books/content?id=RSBikuephAIC&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/books/content?id=y1LVEAAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 None,
 'http://books.google.com/books/content?id=_XedEAAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api',
 'http://books.google.com/books/content?id=pa4LUt0FCB0C&printsec=

In [51]:
# Work on a copy so the text preprocessing below leaves `df` untouched.
df1 = df.copy()

In [53]:
# text preprocessing function
# Lazily-built NLTK resources shared by every preprocess_text call.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def preprocess_text(text):
    """Normalise one text value into a space-separated string of lemmas.

    Steps: lowercase, strip URLs / mentions / hashtags, strip non-word
    characters, drop standalone numbers, tokenize, remove English stop words
    and lemmatize.

    Parameters
    ----------
    text : str, float or None
        Raw cell value.

    Returns
    -------
    str
        The cleaned text. None, missing values and *any* float (not only NaN)
        yield an empty string.
    """
    # Handle NaN or float values
    if isinstance(text, float) or text is None or pd.isnull(text):
        return ''

    # Lowercase the text
    text = text.lower()

    # Remove URLs, hashtags, mentions, and special characters
    text = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", text)
    text = re.sub(r"[^\w\s]", "", text)

    # Remove standalone numbers together with the whitespace that follows them
    text = re.sub(r'\b[0-9]+\b\s*', '', text)

    # Remove remaining punctuation; after the regex above this is the
    # underscore, which \w treats as a word character.
    text = ''.join(char for char in text if char not in string.punctuation)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # The stop-word set and lemmatizer are built once and reused, rather than
    # being reloaded for every cell of every column.
    stop_words, lemmatizer = _get_text_tools()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Join tokens back into a single string
    return ' '.join(tokens)

# Columns that feed the combined text representation of each book
features_to_preprocess = ['title', 'description', 'author', 'genre', 'publisher', 'language', 'rating', 'page_count']

# Clean every listed column element by element.
# NOTE: preprocess_text returns '' for float input, so the float64 columns
# (rating, page_count) become empty strings here.
for feature in features_to_preprocess:
    df1[feature] = df1[feature].map(preprocess_text)

In [47]:
# Re-attach the cached cover URLs after `df` has been re-read from the CSV.
# Guarded on the column being absent: in a top-to-bottom run `df` already
# carries 'cover_url' and the `cover_url` variable is only assigned in a later
# cell, so an unconditional assignment would raise NameError.
if 'cover_url' not in df.columns:
    df['cover_url'] = cover_url

In [41]:
# Keep the fetched cover URLs in their own variable so they can be re-attached
# to `df` without calling the API again.
cover_url = df['cover_url']

In [58]:
# Merge the cleaned columns into one space-separated document per book
df1['combined_features'] = df1[features_to_preprocess].apply(' '.join, axis=1)

# Learn the TF-IDF vocabulary and vectorise every document in one step
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df1['combined_features'])

print("TF-IDF Matrix shape:", tfidf_matrix.shape)

TF-IDF Matrix shape: (2046, 25568)


In [63]:
# Pairwise cosine similarity between all book vectors
# (with a single argument the matrix is compared against itself)
cosine_sim = cosine_similarity(tfidf_matrix)

# Report the shape, then the matrix itself
print("Cosine Similarity Matrix shape:", cosine_sim.shape)
print("Cosine Similarity Matrix:", cosine_sim, sep="\n")

Cosine Similarity Matrix shape: (2046, 2046)
Cosine Similarity Matrix:
[[1.00000000e+00 3.26712493e-01 3.35407131e-02 ... 2.06669561e-03
  7.72921700e-04 2.38160265e-02]
 [3.26712493e-01 1.00000000e+00 2.55213604e-03 ... 2.33323632e-03
  8.72605029e-04 2.68875676e-02]
 [3.35407131e-02 2.55213604e-03 1.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [2.06669561e-03 2.33323632e-03 0.00000000e+00 ... 1.00000000e+00
  4.15346224e-02 5.19108622e-02]
 [7.72921700e-04 8.72605029e-04 0.00000000e+00 ... 4.15346224e-02
  1.00000000e+00 1.61450544e-03]
 [2.38160265e-02 2.68875676e-02 0.00000000e+00 ... 5.19108622e-02
  1.61450544e-03 1.00000000e+00]]


In [88]:
def display_books_with_pagination(book_titles, items_per_page=5):
    """Print book titles one page at a time, driven by keyboard input.

    After each page the user is prompted for 'n' (next page), 'p' (previous
    page) or 'q' (quit). Input is case-insensitive and surrounding whitespace
    is ignored.

    Parameters
    ----------
    book_titles : sequence of str
        Titles to list; they are numbered from 1 in the given order.
    items_per_page : int, default 5
        Number of titles printed per page.

    Returns
    -------
    None
    """
    total_books = len(book_titles)
    if total_books == 0:
        # Nothing to page through; avoid printing "Page 1/0" and prompting.
        print("No books to display.")
        return

    # Ceiling division: a partially filled last page still counts as a page.
    num_pages = (total_books + items_per_page - 1) // items_per_page

    page = 1
    while True:
        print(f"Page {page}/{num_pages}:")
        start_idx = (page - 1) * items_per_page
        end_idx = min(start_idx + items_per_page, total_books)
        for i in range(start_idx, end_idx):
            print(f"{i+1}. {book_titles[i]}")

        choice = input("Enter 'n' for next page, 'p' for previous page, or 'q' to quit: ").strip().lower()
        if choice == 'q':
            break
        elif choice == 'n':
            if page < num_pages:
                page += 1
            else:
                print("Already on the last page.")
        elif choice == 'p':
            if page > 1:
                page -= 1
            else:
                print("Already on the first page.")
        else:
            print("Invalid choice. Please try again.")

# Shuffle all titles. frac=1 samples every row whatever the dataset size
# (a hard-coded row count fails once the data has fewer rows), and the fixed
# random_state makes the order reproducible across runs.
book_titles = df['title'].sample(frac=1, random_state=42).tolist()

display_books_with_pagination(book_titles)

Page 1/410:
1. Candide
2. A Brief History Of Mechanical Engineering
3. Heart And Brain: An Awkward Yeti Collection
4. By Dark
5. Animal Short Stories


Enter 'n' for next page, 'p' for previous page, or 'q' to quit:  n


Page 2/410:
6. A Game Of Thrones: A Song Of Ice And Fire: Book One
7. Research Traditions In Marketing
8. Emerging Issues And Challenges In Business & Economics
9. Murderous Schemes
10. The Art Of Timing


Enter 'n' for next page, 'p' for previous page, or 'q' to quit:  n


Page 3/410:
11. Emotions
12. The Tale Of Mr Jeremy Fisher
13. The Key To Rebecca
14. Social Justice, Multicultural Counseling, And Practice
15. Hard-Boiled


Enter 'n' for next page, 'p' for previous page, or 'q' to quit:  n


Page 4/410:
16. Culture, Innovation, And Growth Dynamics
17. Rambles Beyond Railways
18. The Seven Spiritual Laws Of Success For Parents
19. Batman And Robin Vol. 1: Batman Reborn
20. "A" Is For Alibi


Enter 'n' for next page, 'p' for previous page, or 'q' to quit:  q


In [89]:
def recommend_books(book_title, cosine_sim, df, top_n=5):
    """Recommend the books most similar to `book_title` by cosine similarity.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``df['title']``; the first match is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column order matches the row
        order of `df`.
    df : pandas.DataFrame
        Book table with 'title', 'author', 'genre' and 'publisher' columns.
    top_n : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    list of dict
        One dict per recommendation with keys 'title', 'author', 'genre',
        'publisher' and 'similarity_score', ordered from most to least
        similar. Empty (after printing "Book not found.") when the title is
        not in `df`.
    """
    # Row *position* of the first matching title; cosine_sim is positional,
    # so this stays correct even if df does not have a default RangeIndex.
    matches = np.flatnonzero((df['title'] == book_title).to_numpy())
    if matches.size == 0:
        print("Book not found.")
        return []
    book_pos = int(matches[0])

    scores = np.asarray(cosine_sim[book_pos])

    # Descending by score; the stable sort keeps ties in row order.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the query by position rather than assuming it sorts first:
    # another book with an identical vector also scores 1.0 and may precede it.
    ranked = ranked[ranked != book_pos][:max(top_n, 0)]

    similar_books_info = []
    for pos in ranked:
        similar_books_info.append({
            'title': df['title'].iloc[pos],
            'author': df['author'].iloc[pos],
            'genre': df['genre'].iloc[pos],
            'publisher': df['publisher'].iloc[pos],
            'similarity_score': scores[pos]
        })

    return similar_books_info

# Example: cosine-similarity recommendations for a single title.
book_title = "Batman And Robin Vol. 1: Batman Reborn"
recommended_books = recommend_books(book_title, cosine_sim, df)

# Print one group of fields per recommended book, separated by a dashed line.
print("Recommended Books for '{}':\n".format(book_title))
for book_info in recommended_books:
    print(f"Title: {book_info['title']}")
    print(f"Author: {book_info['author']}")
    print(f"Genre: {book_info['genre']}")
    print(f"Publisher: {book_info['publisher']}")
    print(f"Similarity Score: {book_info['similarity_score']}")
    print("----------")

Recommended Books for 'Batman And Robin Vol. 1: Batman Reborn':

Title: Dark Nights: Metal: Deluxe Edition: Issues 1-6
Author: Scott Snyder
Genre: nan
Publisher: Dc Comics
Similarity Score: 0.2938523206950436
----------
Title: Batman Vs. Superman: The Greatest Battles
Author: Geoff Johns
Genre: nan
Publisher: Dc
Similarity Score: 0.26478793873359213
----------
Title: Batman: Arkham Knight Vol. 3
Author: Peter J. Tomasi
Genre: nan
Publisher: Dc
Similarity Score: 0.25075427342978385
----------
Title: Batman: The Man Who Laughs
Author: Ed Brubaker
Genre: nan
Publisher: Dc
Similarity Score: 0.25028732940687376
----------
Title: Batman: Arkham Unhinged Vol. 1
Author: Derek Fridolfs
Genre: nan
Publisher: Dc
Similarity Score: 0.23408684560631066
----------


In [78]:
# Densify the TF-IDF matrix directly as float32 (the dtype added to the index).
# Casting while still sparse avoids first materialising a float64 dense copy
# of the whole matrix and then copying it again.
vectors = tfidf_matrix.astype('float32').toarray()

# Build the index using faiss
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)

# Function to get nearest neighbors using faiss
def get_nearest_neighbors(query_vector, k=10):
    """Return the row ids of the `k` nearest vectors in the global faiss `index`.

    `query_vector` is reshaped to a single float32 row before searching; the
    distances returned by the search are discarded.
    """
    query_row = query_vector.reshape(1, -1).astype('float32')
    _distances, neighbor_ids = index.search(query_row, k)
    return neighbor_ids[0]

In [81]:
# Function to recommend books using nearest neighbors
def recommend_books(book_title, df, top_n=5):
    """Recommend books for `book_title` using the Faiss nearest-neighbour index.

    This reuses the name of the cosine-similarity ``recommend_books`` defined
    earlier in the notebook and replaces it once this cell runs; note the
    different signature. It relies on the notebook-level `vectors`,
    `cosine_sim` and `get_nearest_neighbors`.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``df['title']``; the first match is used.
    df : pandas.DataFrame
        Book table with 'title', 'author', 'genre' and 'publisher' columns,
        in the same row order as `vectors`.
    top_n : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    list of dict
        One dict per recommendation with keys 'title', 'author', 'genre',
        'publisher' and 'similarity_score' (taken from `cosine_sim`). Empty
        (after printing "Book not found.") when the title is not in `df`.
    """
    # Row *position* of the first match; `vectors` and `cosine_sim` are positional.
    matches = np.flatnonzero((df['title'] == book_title).to_numpy())

    # If the book title is not found, return an empty list
    if matches.size == 0:
        print("Book not found.")
        return []

    book_index = int(matches[0])

    # Ask for one extra neighbour because the query book is normally among the hits
    nearest_neighbors = get_nearest_neighbors(vectors[book_index], top_n + 1)

    recommended_books_info = []
    for index in nearest_neighbors:
        # Faiss pads with -1 when it finds fewer than k neighbours; passing
        # that to .iloc would silently return the last row.
        if index < 0 or index == book_index:
            continue
        # The query may be missing from the hits (e.g. tied duplicates), so
        # cap the list explicitly instead of returning top_n + 1 items.
        if len(recommended_books_info) >= top_n:
            break

        recommended_books_info.append({
            'title': df['title'].iloc[index],
            'author': df['author'].iloc[index],
            'genre': df['genre'].iloc[index],
            'publisher': df['publisher'].iloc[index],
            'similarity_score': cosine_sim[book_index][index]
        })

    return recommended_books_info

# Example usage: Faiss-based recommendations for a single title.
book_title = "The Art Of Seduction"
recommended_books_info = recommend_books(book_title, df, top_n=5)

# Print one group of fields per recommended book, followed by blank lines.
print("Recommended Books using Faiss for '{}':\n".format(book_title))
for book_info in recommended_books_info:
    print("Title:", book_info['title'])
    print("Author:", book_info['author'])
    print("Genre:", book_info['genre'])
    print("Publisher:", book_info['publisher'])
    print("Similarity Score:", book_info['similarity_score'])
    print("\n")

Recommended Books using Faiss for 'The Art Of Seduction':

Title: The Complete Art Of War
Author: Sun Tzu
Genre: Political Science , Political Ideologies , General
Publisher: Simon And Schuster
Similarity Score: 0.10881453667666077


Title: Military Strategy Of Middle Powers
Author: Håkan Edström', 'Jacob Westberg
Genre: History , Military , Strategy
Publisher: Routledge
Similarity Score: 0.09400320280192608


Title: The Seducer'S Diary
Author: Søren Kierkegaard
Genre: Religion
Publisher: Princeton University Press
Similarity Score: 0.08778386403214333


Title: Game Art
Author: Matt Sainsbury
Genre: Art , Video Game Art
Publisher: No Starch Press
Similarity Score: 0.08740777080041978


Title: Art And Videogames
Author: Debora Ferrari', 'Luca Traini
Genre: Art , Video Game Art
Publisher: nan
Similarity Score: 0.08401806865097462




In [101]:
class RecommenderSystem:
    """Content-based book recommender backed by a Faiss exact L2 index.

    Parameters
    ----------
    data : pandas.DataFrame
        Book table with 'title', 'author', 'genre' and 'publisher' columns.
    tfidf_matrix : scipy sparse matrix
        One TF-IDF row per book, in the same row order as `data`.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix in the same order; used only to report a
        similarity score for each recommendation.
    """

    def __init__(self, data, tfidf_matrix, cosine_sim):
        self.data = data
        self.tfidf_matrix = tfidf_matrix
        self.cosine_sim = cosine_sim

        # Build the index using Faiss. The vectors are added as dense float32;
        # casting while still sparse avoids materialising a float64 dense copy
        # of the whole matrix first.
        self.vectors = self.tfidf_matrix.astype('float32').toarray()
        self.index = faiss.IndexFlatL2(self.vectors.shape[1])
        self.index.add(self.vectors)

    def recommend_books(self, book_title, top_n=5):
        """Return up to `top_n` nearest-neighbour recommendations for a title.

        Parameters
        ----------
        book_title : str
            Exact title to look up in ``data['title']``; the first match is used.
        top_n : int, default 5
            Maximum number of recommendations.

        Returns
        -------
        list of dict
            One dict per recommendation with keys 'title', 'author', 'genre',
            'publisher' and 'similarity_score'. Empty (after printing
            "Book not found.") when the title is unknown.
        """
        # Row *position* of the first match; vectors and cosine_sim are positional.
        matches = np.flatnonzero((self.data['title'] == book_title).to_numpy())

        # If the book title is not found, return an empty list
        if matches.size == 0:
            print("Book not found.")
            return []

        book_index = int(matches[0])

        # Ask for one extra neighbour because the query book is normally among the hits
        query = self.vectors[book_index].reshape(1, -1).astype('float32')
        _, nearest_neighbors = self.index.search(query, top_n + 1)

        recommended_books_info = []
        for index in nearest_neighbors[0]:
            # Faiss pads with -1 when it finds fewer than k neighbours; passing
            # that to .iloc would silently return the last row.
            if index < 0 or index == book_index:
                continue
            # The query may be missing from the hits (e.g. tied duplicates), so
            # cap the list explicitly instead of returning top_n + 1 items.
            if len(recommended_books_info) >= top_n:
                break

            recommended_books_info.append({
                'title': self.data['title'].iloc[index],
                'author': self.data['author'].iloc[index],
                'genre': self.data['genre'].iloc[index],
                'publisher': self.data['publisher'].iloc[index],
                'similarity_score': self.cosine_sim[book_index][index]
            })

        return recommended_books_info

# Build the recommender from the notebook-level data, TF-IDF matrix and
# cosine-similarity matrix.
recommender_system = RecommenderSystem(df, tfidf_matrix, cosine_sim)

# Interactive loop: keeps prompting for titles until the user enters 'q'.
# NOTE: this blocks on input(), so the cell needs a person at the keyboard.
while True:
    book_title = input("Enter a book title (type 'q' to quit): ")

    if book_title.lower() == 'q':
        break

    # The title must match a value in the 'title' column exactly.
    recommended_books_info = recommender_system.recommend_books(book_title, top_n=5)

    if len(recommended_books_info) == 0:
        print("No recommendations found for '{}'.".format(book_title))
    else:
        print("Recommended books for '{}':".format(book_title))
        for book_info in recommended_books_info:
            print("Title:", book_info['title'])
            print("Author:", book_info['author'])
            print("Genre:", book_info['genre'])
            print("Publisher:", book_info['publisher'])
            print("Similarity Score:", book_info['similarity_score'])
            print("\n")

Enter a book title (type 'q' to quit):  The Seducer'S Diary


Recommended books for 'The Seducer'S Diary':
Title: Kierkegaard On Self, Ethics, And Religion
Author: Roe Fremstedal
Genre: Philosophy , History &Amp, Surveys , Modern
Publisher: Cambridge University Press
Similarity Score: 0.3514385650551825


Title: The Art Of Seduction
Author: Robert Greene
Genre: Self-Help , Personal Growth , General
Publisher: Profile Books
Similarity Score: 0.08778386403214333


Title: Women And Appletrees
Author: Moa Martinson
Genre: Fiction , Women
Publisher: Feminist Press At Cuny
Similarity Score: 0.08508663488027866


Title: Twas The Nightshift Before Christmas: Festive Hospital Diaries From The Author Of Million-Copy Hit This Is Going To Hurt
Author: Adam Kay
Genre: Biography &Amp, Autobiography , Medical (Incl. Patients)
Publisher: Pan Macmillan
Similarity Score: 0.07105611101014746


Title: Wrecking Ball (Diary Of A Wimpy Kid Book 14)
Author: Jeff Kinney
Genre: nan
Publisher: Abrams
Similarity Score: 0.06692141267657047




Enter a book title (type 'q' to quit):  q


In [111]:
class RecommenderSystem:
    """Content-based book recommender backed by a Faiss exact L2 index.

    This definition replaces the earlier ``RecommenderSystem`` in the notebook;
    it additionally reports each recommendation's cover image URL.

    Parameters
    ----------
    data : pandas.DataFrame
        Book table with 'title', 'author', 'genre' and 'publisher' columns and,
        optionally, a 'cover_url' column.
    tfidf_matrix : scipy sparse matrix
        One TF-IDF row per book, in the same row order as `data`.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix in the same order; used only to report a
        similarity score for each recommendation.
    """

    def __init__(self, data, tfidf_matrix, cosine_sim):
        self.data = data
        self.tfidf_matrix = tfidf_matrix
        self.cosine_sim = cosine_sim

        # Build the index using Faiss. The vectors are added as dense float32;
        # casting while still sparse avoids materialising a float64 dense copy
        # of the whole matrix first.
        self.vectors = self.tfidf_matrix.astype('float32').toarray()
        self.index = faiss.IndexFlatL2(self.vectors.shape[1])
        self.index.add(self.vectors)

    def recommend_books(self, book_title, top_n=5):
        """Return up to `top_n` nearest-neighbour recommendations for a title.

        Parameters
        ----------
        book_title : str
            Exact title to look up in ``data['title']``; the first match is used.
        top_n : int, default 5
            Maximum number of recommendations.

        Returns
        -------
        list of dict
            One dict per recommendation with keys 'title', 'author', 'genre',
            'publisher', 'similarity_score' and 'image_url' (None when `data`
            has no 'cover_url' column). Empty (after printing
            "Book not found.") when the title is unknown.
        """
        # Row *position* of the first match; vectors and cosine_sim are positional.
        matches = np.flatnonzero((self.data['title'] == book_title).to_numpy())

        # If the book title is not found, return an empty list
        if matches.size == 0:
            print("Book not found.")
            return []

        book_index = int(matches[0])

        # Ask for one extra neighbour because the query book is normally among the hits
        query = self.vectors[book_index].reshape(1, -1).astype('float32')
        _, nearest_neighbors = self.index.search(query, top_n + 1)

        has_cover = 'cover_url' in self.data.columns

        recommended_books_info = []
        for index in nearest_neighbors[0]:
            # Faiss pads with -1 when it finds fewer than k neighbours; passing
            # that to .iloc would silently return the last row.
            if index < 0 or index == book_index:
                continue
            # The query may be missing from the hits (e.g. tied duplicates), so
            # cap the list explicitly instead of returning top_n + 1 items.
            if len(recommended_books_info) >= top_n:
                break

            recommended_books_info.append({
                'title': self.data['title'].iloc[index],
                'author': self.data['author'].iloc[index],
                'genre': self.data['genre'].iloc[index],
                'publisher': self.data['publisher'].iloc[index],
                'similarity_score': self.cosine_sim[book_index][index],
                'image_url': self.data['cover_url'].iloc[index] if has_cover else None
            })

        return recommended_books_info

# Build the recommender from the notebook-level data, TF-IDF matrix and
# cosine-similarity matrix.
recommender_system = RecommenderSystem(df, tfidf_matrix, cosine_sim)

# Interactive loop: keeps prompting for titles until the user enters 'q'.
# NOTE: this blocks on input(), so the cell needs a person at the keyboard.
while True:
    book_title = input("Enter a book title (type 'q' to quit): ")

    if book_title.lower() == 'q':
        break

    # The title must match a value in the 'title' column exactly.
    recommended_books_info = recommender_system.recommend_books(book_title, top_n=5)

    if len(recommended_books_info) == 0:
        print("No recommendations found for '{}'.".format(book_title))
    else:
        print("Recommended books for '{}':".format(book_title))
        for book_info in recommended_books_info:
            print("Title:", book_info['title'])
            print("Author:", book_info['author'])
            print("Genre:", book_info['genre'])
            print("Publisher:", book_info['publisher'])
            print("Similarity Score:", book_info['similarity_score'])
            # Cover image link stored in the 'image_url' field of each result.
            print("Image_URL:", book_info['image_url'])
            print("\n")

Enter a book title (type 'q' to quit):  The Seducer'S Diary


Recommended books for 'The Seducer'S Diary':
Title: Kierkegaard On Self, Ethics, And Religion
Author: Roe Fremstedal
Genre: Philosophy , History &Amp, Surveys , Modern
Publisher: Cambridge University Press
Similarity Score: 0.3514385650551825
Image_URL: http://books.google.com/books/content?id=Ii1TEAAAQBAJ&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api


Title: The Art Of Seduction
Author: Robert Greene
Genre: Self-Help , Personal Growth , General
Publisher: Profile Books
Similarity Score: 0.08778386403214333
Image_URL: http://books.google.com/books/content?id=3pcQosflgoQC&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api


Title: Women And Appletrees
Author: Moa Martinson
Genre: Fiction , Women
Publisher: Feminist Press At Cuny
Similarity Score: 0.08508663488027866
Image_URL: http://books.google.com/books/content?id=mDmzrepyOVkC&printsec=frontcover&img=1&zoom=1&edge=curl&source=gbs_api


Title: Twas The Nightshift Before Christmas: Festive Hospital Diaries From The 

Enter a book title (type 'q' to quit):  q


In [114]:
# Every description as a plain Python list
df['description'].tolist()

['Reproduction Of The Original.',
 'Reproduction Of The Original.',
 'Ung Kærlighed Med Udfordringer I Rørende Colleen Hoover-Roman. Et Uforudset Dødsfald Tvinger Beyah Til At Bo Med Sin Far I Texas Hen Over Sommeren, Selvom Hun Knapt Nok Kender Ham. Beyahs Plan Er At Holde Lav Profil Og Lade Sommeren Gå Af Sig Selv, Men Hendes Nye Nabo Samson Vender Op Og Ned På Det Hele. På Overfladen Har Samson Og Beyah Intet Til Fælles. Hun Kommer Fra Fattigdom Og Misrøgt, Han Kommer Fra En Familie Med Rigdom Og Privilegier. Men Én Ting Har De Til Fælles: De Tiltrækkes Af Triste Ting. Hvilket Betyder, At De Tiltrækker Hinanden. Deres Forbindelse Er Næsten Øjeblikkelig Og Fuldstændig Unægtelig, Men Beyah Og Samson Bliver Enige Om En Ukompliceret Sommerflirt. Hvad Beyah Ikke Ved, Er At Tunge Skyer Er På Vej Ind, Og At Hendes Hjerte Snart Kommer Ud I Stormvejr.',
 'Reprint Of The Original, First Published In 1898.',
 'A Novel That Stood The Test Of Time “But I Do Know That About Ten Times As Many Peop

## Model Evaluation

In [85]:
# Column names currently present in df
print(list(df.columns))

['title', 'author', 'rating', 'voters', 'price', 'currency', 'description', 'publisher', 'page_count', 'language', 'published_date', 'genre', 'ISBN_13', 'cover_url']


In [106]:
# Save the table, including the cover_url column, without writing the row index.
df.to_csv('recommender_books_with_url.csv', index=False)