In [1]:
# Data from: https://en.wikipedia.org/wiki/List_of_best-selling_books

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 200)
# URL of the page to scrape
url = "https://en.m.wikipedia.org/wiki/List_of_best-selling_books"

# Send a GET request to the URL.
# - Wikimedia's User-Agent policy asks automated clients to identify themselves,
#   so a descriptive User-Agent header is sent.
# - The timeout keeps the cell from hanging indefinitely on a stalled connection.
headers = {"User-Agent": "best-selling-books-notebook/1.0 (python-requests)"}
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on a non-2xx answer. Silently skipping the parse would leave
# `books` undefined and only surface later as a NameError.
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")

# Find the tables containing the book data
tables = soup.find_all("table", class_="wikitable")

# One list of stripped cell texts per <tr> (header rows included), across all
# tables in document order. Rows are deliberately not filtered here, so the
# row count seen by the later positional slicing is unchanged.
books = [
    [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
    for table in tables
    for row in table.find_all("tr")
]

In [3]:
# Sanity check: number of table rows collected by the scraping cell
len(books)

345

In [4]:
# Column labels for the scraped rows; the last one is an empty-string label
# for a seventh column.
column_names = [
    'Book',
    'Author(s)',
    'Original language',
    'First published',
    'Approximate sales',
    'Genre',
    '',
]
df = pd.DataFrame(books, columns=column_names)

In [5]:
# Discard the unnamed column and the row labelled 0
# (presumably the first table's header row -- verify against the scraped data).
df = df.drop(columns=[''], index=[0])

#### We're only interested in the "List of best-selling individual books" tables, since they include each book's genre, which will be used as a feature for our recommendation system.

In [7]:
# Keep only the first 173 rows (positional slice).
df = df.iloc[:173]

In [8]:
# Remove rows whose 'Book' cell is the literal text 'Book'.
is_header_row = df['Book'] == 'Book'
df = df[~is_header_row]

In [9]:
# Extract the first number from each sales string, keeping any decimal part
# (an integer-only pattern would turn "43.5" into 43).
# expand=False returns a Series rather than a one-column DataFrame, and float
# conversion tolerates cells with no number (they become NaN instead of
# raising during an int cast).
df['Approximate sales'] = (
    df['Approximate sales']
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)

In [10]:
# Renumber the rows 0..n-1 (the old index is discarded, not kept as a column)
df = df.reset_index(drop=True)

In [11]:
# Multiply the extracted sales figure by one million
# (presumably the scraped figures are expressed in millions of copies -- verify).
# NOTE: this cell is not idempotent; re-running it multiplies the column again.
df['Approximate sales'] = df['Approximate sales']*1_000_000

In [12]:
# Number of rows remaining after the cleaning steps
len(df)

170

In [13]:
# Label the index 'book_id'
df.index.name='book_id'

In [14]:
# Display the cleaned table
df

Unnamed: 0_level_0,Book,Author(s),Original language,First published,Approximate sales,Genre
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,A Tale of Two Cities,Charles Dickens,English,1859,200000000,Historical fiction
1,The Little Prince(Le Petit Prince),Antoine de Saint-Exupéry,French,1943,200000000,"Fantasy,children's fiction"
2,The Alchemist(O Alquimista),Paulo Coelho,Portuguese,1988,150000000,Fantasy
3,Harry Potter and the Philosopher's Stone,J. K. Rowling,English,1997,120000000,"Fantasy,children's fiction"
4,And Then There Were None,Agatha Christie,English,1939,100000000,Mystery
5,Dream of the Red Chamber(紅樓夢),Cao Xueqin,Chinese,1791,100000000,Family saga
6,The Hobbit,J. R. R. Tolkien,English,1937,100000000,"Fantasy,children's fiction"
7,Alice's Adventures in Wonderland,Lewis Carroll,English,1865,100000000,"Fantasy,absurdist fiction"
8,"The Lion, the Witch and the Wardrobe",C.S. Lewis,English,1950,85000000,"Fantasy,children's fiction"
9,She: A History of Adventure,H. Rider Haggard,English,1887,83000000,Adventure


In [15]:
# Count missing values per column
df.isna().sum()

Book                 0
Author(s)            0
Original language    0
First published      0
Approximate sales    0
Genre                0
dtype: int64

In [16]:
# Import TensorFlow and scikit-learn libraries
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Lambda, TextVectorization
from tensorflow.keras.models import Model

In [17]:
# Preprocessing
# Parse the publication year first and drop rows where it cannot be parsed, so
# every vocabulary / encoder / scaler below is fitted on exactly the rows that
# remain. The index is renumbered afterwards because the recommend() function
# defined later in this notebook uses a row's index label as a position into
# the prediction array.
df['First published'] = pd.to_numeric(df['First published'], errors='coerce')
df = df.dropna(subset=['First published']).reset_index(drop=True)
df.index.name = 'book_id'

vectorizer = TextVectorization(output_mode='int', max_tokens=5000) # Converts text to integer sequences
vectorizer.adapt(df['Book'])

# Ensure 'Author(s)' column contains only strings.
# fillna must run BEFORE astype(str): astype(str) turns missing values into the
# literal string "nan", after which fillna has nothing left to fill.
df['Author(s)'] = df['Author(s)'].fillna("Unknown").astype(str)

# Now apply text vectorization
author_vectorizer = TextVectorization(output_mode='int', max_tokens=5000)
author_vectorizer.adapt(df['Author(s)'])

# One encoder per categorical column, so the mapping fitted for genres is not
# overwritten when the language column is fitted.
genre_encoder = LabelEncoder()
df['Genre'] = genre_encoder.fit_transform(df['Genre'])
language_encoder = LabelEncoder()
df['Original language'] = language_encoder.fit_transform(df['Original language'])
# Backward-compatible alias: previously a single encoder was reused and ended
# up fitted on the language column.
label_encoder = language_encoder

# Likewise one scaler per numeric column (both rescaled to the [0, 1] range).
published_scaler = MinMaxScaler()
df['First published'] = published_scaler.fit_transform(df[['First published']])
sales_scaler = MinMaxScaler()
df['Approximate sales'] = sales_scaler.fit_transform(df[['Approximate sales']])
# Backward-compatible alias: previously a single scaler was reused and ended up
# fitted on the sales column.
scaler = sales_scaler

In [18]:
# Symbolic model inputs, one value per example each.
# The two text inputs are declared as strings; the other four are left at the
# Input layer's default dtype.
book_input, author_input = (
    Input(shape=(1,), dtype=tf.string, name=input_name)
    for input_name in ("Book", "Author")
)
genre_input, lang_input, published_input, sales_input = (
    Input(shape=(1,), name=input_name)
    for input_name in ("Genre", "Language", "Published", "Sales")
)

In [19]:
# Map the raw strings to integer token sequences with the adapted vectorizers
book_vectorized = vectorizer(book_input)
author_vectorized = author_vectorizer(author_input)

# Separate 32-dimensional token embedding tables for titles and author names;
# 5000 rows each, matching the vectorizers' max_tokens.
title_token_embedder = Embedding(input_dim=5000, output_dim=32)
author_token_embedder = Embedding(input_dim=5000, output_dim=32)
book_embedding = title_token_embedder(book_vectorized)
author_embedding = author_token_embedder(author_vectorized)

In [20]:
# Compute mean embeddings: average the per-token vectors along axis 1
book_mean = Lambda(lambda x: tf.reduce_mean(x, axis=1))(book_embedding)
author_mean = Lambda(lambda x: tf.reduce_mean(x, axis=1))(author_embedding)

# Genre and Language Embedding
# The table needs one row per possible code, i.e. (largest code + 1).
# nunique() undercounts when some codes no longer occur in the column (the
# preprocessing drops rows after label-encoding), which would make the largest
# code an out-of-range lookup.
n_genre_codes = int(df['Genre'].max()) + 1
n_language_codes = int(df['Original language'].max()) + 1
genre_embedding = Embedding(input_dim=n_genre_codes, output_dim=16)(genre_input)
lang_embedding = Embedding(input_dim=n_language_codes, output_dim=16)(lang_input)




In [21]:
def _squeeze_axis_one(tensor):
    """Remove the size-1 axis at position 1 from `tensor`."""
    return tf.squeeze(tensor, axis=1)


# Drop the extra axis from the genre and language embeddings
genre_squeezed = Lambda(_squeeze_axis_one)(genre_embedding)
lang_squeezed = Lambda(_squeeze_axis_one)(lang_embedding)

In [22]:
# Join all per-book features into a single vector, in this fixed order
feature_parts = [
    book_mean,
    author_mean,
    genre_squeezed,
    lang_squeezed,
    published_input,
    sales_input,
]
combined_features = Concatenate()(feature_parts)

In [23]:
# NOTE: this cell repeats, in one piece, the steps of cells In [17]-In [22].
# It runs after them, so the inputs and layers defined here are the ones the
# model in the next cell is built from. If In [17] already ran, 'Genre' and
# 'Original language' already hold integer codes and the year/sales columns are
# already scaled; re-fitting here then maps them onto themselves.

# Preprocessing
# Parse the publication year first and drop rows where it cannot be parsed, so
# every vocabulary / encoder / scaler below is fitted on exactly the rows that
# remain. The index is renumbered afterwards because the recommend() function
# defined later in this notebook uses a row's index label as a position into
# the prediction array.
df['First published'] = pd.to_numeric(df['First published'], errors='coerce')
df = df.dropna(subset=['First published']).reset_index(drop=True)
df.index.name = 'book_id'

vectorizer = TextVectorization(output_mode='int', max_tokens=5000) # Converts text to integer sequences
vectorizer.adapt(df['Book'])

# Ensure 'Author(s)' column contains only strings.
# fillna must run BEFORE astype(str): astype(str) turns missing values into the
# literal string "nan", after which fillna has nothing left to fill.
df['Author(s)'] = df['Author(s)'].fillna("Unknown").astype(str)

# Now apply text vectorization
author_vectorizer = TextVectorization(output_mode='int', max_tokens=5000)
author_vectorizer.adapt(df['Author(s)'])

# One encoder per categorical column, so fitting the language column does not
# overwrite the mapping fitted for genres.
genre_encoder = LabelEncoder()
df['Genre'] = genre_encoder.fit_transform(df['Genre'])
language_encoder = LabelEncoder()
df['Original language'] = language_encoder.fit_transform(df['Original language'])
label_encoder = language_encoder  # backward-compatible alias for the old shared encoder

# Likewise one scaler per numeric column (both rescaled to the [0, 1] range).
published_scaler = MinMaxScaler()
df['First published'] = published_scaler.fit_transform(df[['First published']])
sales_scaler = MinMaxScaler()
df['Approximate sales'] = sales_scaler.fit_transform(df[['Approximate sales']])
scaler = sales_scaler  # backward-compatible alias for the old shared scaler

# Define Model Inputs
book_input = Input(shape=(1,), dtype=tf.string, name="Book")
author_input = Input(shape=(1,), dtype=tf.string, name="Author")
genre_input = Input(shape=(1,), name="Genre")
lang_input = Input(shape=(1,), name="Language")
published_input = Input(shape=(1,), name="Published")
sales_input = Input(shape=(1,), name="Sales")

# Text Vectorization and Embedding
book_vectorized = vectorizer(book_input)
author_vectorized = author_vectorizer(author_input)

# 5000 rows each, matching the vectorizers' max_tokens
book_embedding = Embedding(input_dim=5000, output_dim=32)(book_vectorized)
author_embedding = Embedding(input_dim=5000, output_dim=32)(author_vectorized)

# Compute mean embeddings: average the per-token vectors along axis 1
book_mean = Lambda(lambda x: tf.reduce_mean(x, axis=1))(book_embedding)
author_mean = Lambda(lambda x: tf.reduce_mean(x, axis=1))(author_embedding)

# Genre and Language Embedding
# The table needs one row per possible code, i.e. (largest code + 1);
# nunique() would undercount if some codes no longer occur in the column.
n_genre_codes = int(df['Genre'].max()) + 1
n_language_codes = int(df['Original language'].max()) + 1
genre_embedding = Embedding(input_dim=n_genre_codes, output_dim=16)(genre_input)
lang_embedding = Embedding(input_dim=n_language_codes, output_dim=16)(lang_input)

# Squeeze the embeddings to remove the extra dimension
genre_squeezed = Lambda(lambda x: tf.squeeze(x, axis=1))(genre_embedding)
lang_squeezed = Lambda(lambda x: tf.squeeze(x, axis=1))(lang_embedding)

# Combine Features
combined_features = Concatenate()([
    book_mean,
    author_mean,
    genre_squeezed,
    lang_squeezed,
    published_input,
    sales_input
])

In [24]:
# Two ReLU dense layers on top of the concatenated features; the second one
# produces the 64-dimensional vector used as the model output.
dense_layer = Dense(128, activation='relu')(combined_features)
output_layer = Dense(64, activation='relu')(dense_layer)

# Assemble and compile the model
model_inputs = [book_input, author_input, genre_input, lang_input, published_input, sales_input]
model = Model(inputs=model_inputs, outputs=output_layer)
model.compile(optimizer='adam', loss='mse')

# Input arrays, one per model input (text columns are forced to str first)
book_data, author_data = (
    df[column].astype(str).to_numpy()
    for column in ('Book', 'Author(s)')
)
genre_data, lang_data, published_data, sales_data = (
    df[column].to_numpy()
    for column in ('Genre', 'Original language', 'First published', 'Approximate sales')
)

# Run a forward pass to get one output vector per book.
# NOTE: this is predict(), not fit() -- no training call appears in this cell.
input_arrays = [book_data, author_data, genre_data, lang_data, published_data, sales_data]
book_vectors = model.predict(input_arrays)

# Recommendation Function
def recommend(book_title, top_n=5):
    """Return the titles of the books most similar to `book_title`.

    Every row of the global `df` is passed through the global `model`; the
    resulting vectors are ranked by cosine similarity to the vector of the
    requested book.

    Args:
        book_title: Exact value of the 'Book' column to look up. If several
            rows match, the first one is used.
        top_n: Maximum number of titles to return.

    Returns:
        A list of up to `top_n` titles, most similar first. The requested row
        itself is never included.

    Raises:
        IndexError: If no row of `df` has 'Book' equal to `book_title`.
    """
    # Locate the book by POSITION. The prediction array is positional, while
    # df's index labels may have gaps (rows are dropped during preprocessing),
    # so a label must not be used to index into the predictions.
    matches = np.flatnonzero((df['Book'] == book_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Book not found in dataset: {book_title!r}")
    query_pos = int(matches[0])

    # Convert all inputs to tensors (local names chosen so they do not shadow
    # the model's Input placeholders)
    feature_tensors = [
        tf.convert_to_tensor(df['Book'].values, dtype=tf.string),
        tf.convert_to_tensor(df['Author(s)'].values, dtype=tf.string),
        tf.convert_to_tensor(df['Genre'].values, dtype=tf.int32),
        tf.convert_to_tensor(df['Original language'].values, dtype=tf.int32),
        tf.convert_to_tensor(df['First published'].values, dtype=tf.float32),
        tf.convert_to_tensor(df['Approximate sales'].values, dtype=tf.float32),
    ]

    # Use the model to generate one vector per book
    vectors = np.asarray(model.predict(feature_tensors))
    query_vector = vectors[query_pos]

    # Cosine similarity. Rows whose norm product is zero (possible because the
    # output layer is ReLU) would give 0/0 = NaN, and NaN sorts to the FRONT of
    # a reversed argsort; rank such rows last instead.
    dots = vectors @ query_vector
    norm_products = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vector)
    similarities = np.full(len(vectors), -np.inf)
    valid = norm_products > 0
    similarities[valid] = dots[valid] / norm_products[valid]

    # Exclude the query row explicitly rather than assuming it ranks first
    # (ties or duplicate vectors could put another row at rank 0).
    ranked = np.argsort(similarities)[::-1]
    ranked = ranked[ranked != query_pos][:top_n]

    # Return the titles of the recommended books
    return df.iloc[ranked]['Book'].tolist()

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step


In [25]:
# Example: the ten closest titles to the first Harry Potter book, one per line
query_title = "Harry Potter and the Philosopher's Stone"
recommendations = recommend(query_title, top_n=10)
for recommended_title in recommendations:
    print(recommended_title)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
The Hobbit
The Lion, the Witch and the Wardrobe
And Then There Were None
The Alchemist(O Alquimista)
Alice's Adventures in Wonderland
She: A History of Adventure
Harry Potter and the Chamber of Secrets
Dream of the Red Chamber(紅樓夢)
The Da Vinci Code
Harry Potter and the Goblet of Fire
