# Machine Learning 🚀

## Import Libraries

In [None]:
import pandas as pd
import json
import numpy as np
import requests
import matplotlib.pyplot as plt                                                           
import seaborn as sns
import plotly.express as px
import bertopic
import re
import torch
from sklearn.decomposition import PCA
from nltk.stem import SnowballStemmer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer, WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer, word_tokenize, sent_tokenize
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from bertopic import BERTopic
from transformers import BertTokenizer, BertModel
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
import re
import gensim.downloader as api
from umap import UMAP
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle
# Show every column when a DataFrame is displayed (no column limit)
pd.set_option('display.max_columns', None)

## Load Data

In [None]:
# Loading the dataframe containing the information in regards to authors
# (lines=True: the file is read as one JSON object per line)
df_authors = pd.read_json(r"C:\Users\biave\Desktop\goodreads_book_authors.json", lines=True)

# Loading the dataframe containing the books related to history and biography
df_books = pd.read_json(r"C:\Users\biave\Desktop\goodreads_books_history_biography.json", lines=True)


## Clean & Transform Data 

#### a. Books & Authors

In [None]:
# Look at one raw 'authors' entry to see its structure before parsing it
df_books['authors'].iloc[1]

In [None]:
def extract_author_ids(authors):
    """Collect the ``author_id`` value of every entry in ``authors``.

    Entries without an ``"author_id"`` key are skipped. Any input that is
    not a list yields an empty list.
    """
    if not isinstance(authors, list):
        return []
    return [entry["author_id"] for entry in authors if "author_id" in entry]


# Build a per-book list of author IDs from the raw 'authors' column
df_books["author_ids"] = df_books["authors"].apply(extract_author_ids)

In [None]:
def get_first_author_id(author_ids):
    """Return the first author ID, or ``None`` when there is none.

    Args:
        author_ids: Expected to be a list of author IDs.

    Returns:
        The first element of ``author_ids``; ``None`` if the input is not a
        list or is an empty list (indexing an empty list would raise
        ``IndexError``).
    """
    if isinstance(author_ids, list) and author_ids:
        return author_ids[0]
    return None

# Keep only the first listed author of each book as the merge key
df_books['first_author_id'] = df_books['author_ids'].apply(get_first_author_id)

In [None]:
# Dropping unnecessary columns
columns_to_drop = [
    "text_reviews_count",
    "series",
    "country_code",
    "popular_shelves",
    "asin",
    "kindle_asin",
    "edition_information",
    "url",
    "work_id",
    "link",
    "publication_day",
    "publication_month",
    "title_without_series",
    "publisher",
    "isbn13",
    "author_ids",
    "authors",
    "similar_books",
    "image_url",
]
df_books.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Converting both columns from both DFs to strings to be able to merge them
# NOTE(review): a missing first author (None) is presumably turned into the
# text 'None' by astype(str) and will then simply find no match — confirm
df_books['first_author_id'] = df_books['first_author_id'].astype(str)
df_authors['author_id'] = df_authors['author_id'].astype(str)

In [None]:
# Merging both DFs to get the authors' names
# (left join: every book row is kept, matched on first_author_id == author_id)
df_books = pd.merge(df_books, df_authors, left_on='first_author_id', right_on='author_id', how='left')


In [None]:
# Renaming the columns: the *_x names become book-level names and the *_y
# names become author-level names
column_renames = {
    'average_rating_x': 'avg_rating_books',
    'ratings_count_x': 'rating_count_books',
    'average_rating_y': 'avg_rating_authors',
    'ratings_count_y': 'rating_count_authors',
}
df_books.rename(columns=column_renames, inplace=True)

In [None]:
# Choosing the most convenient column order
# (columns that are not listed here are dropped by the selection below)
desided_order = [
    'isbn',
    "book_id",
    "title",
    "description",
    "publication_year",
    "avg_rating_books",
    "rating_count_books",
    "name",
    "first_author_id",
    "avg_rating_authors",
    "format",
    "num_pages",
    "language_code",
]

df_books = df_books[desided_order]

In [None]:
# Inspect the language codes present before filtering
print(df_books['language_code'].unique())
english_codes = {'eng', 'en-GB', 'en-US', 'en-CA', 'en', 'aus'}

# Keep only rows whose language code is in the set above, renumber the index,
# then discard every row that still has a missing value in any column
is_english = df_books['language_code'].isin(english_codes)
df_books = df_books.loc[is_english].reset_index(drop=True)
df_books = df_books.dropna()

In [None]:
# Check which language codes remain after the filter
print(df_books['language_code'].unique())

In [None]:
# Display the filtered dataframe
df_books

In [None]:
# Cast the year to text and keep only the part before the first '.'
# (e.g. a trailing '.0' is removed)
df_books['publication_year'] = df_books['publication_year'].astype(str).str.split('.').str[0]

In [None]:
# Checking for rows with empty strings
# (counts, per column, the cells that are exactly '')
(df_books == '').sum()

In [None]:
# Deleting instances in which the rows contain empty strings:
# flag every row holding at least one '' cell and keep the others
has_empty_cell = (df_books == '').any(axis=1)
df_books = df_books[~has_empty_cell]

In [None]:
# Save the cleaned dataframe without the index column.
# NOTE(review): this path is relative to the working directory, while the
# later read_csv uses an absolute Desktop path — confirm both are the same file
df_books.to_csv('books.csv', index=False)

In [None]:
# Preview the first five rows of both dataframes
print("Books Dataframe:")
display(df_books.head(5))
print("Authors Dataframe:")
display(df_authors.head(5))

## Explore Data

In [None]:
def data_exploration(df):
    """Print an exploratory overview of ``df``.

    Shows, in order: the first rows, dtypes, the ``info()`` report, summary
    statistics, missing-value counts, per-column counts of empty strings,
    missing-value percentages, and the number and share of duplicated rows.

    Args:
        df: The DataFrame to inspect. Only this frame is examined.
    """
    display(df.head())
    print("Data Types:")
    display(df.dtypes)
    print("Data Overview:")
    # info() prints its own report and returns None, so it is not passed to display()
    df.info()
    print("Summary Statistics:")
    display(df.describe())
    print("Missing Values:")
    display(df.isnull().sum())
    print("Rows with empty Strings:")
    # Inspect the frame that was passed in rather than the global df_books
    print((df == '').sum())
    print("Missing Values %:")
    missing_percentage = df.isnull().mean() * 100
    display(missing_percentage)
    print("Duplicates:")
    duplicates = df.duplicated().sum()
    display(duplicates)
    print("Duplicates %:")
    # Guard against dividing by zero when the frame has no rows
    percentage = (duplicates / len(df)) * 100 if len(df) else 0.0
    print(f'The dataset contains {duplicates} duplicate rows, making up {percentage}% of the total number of records.')

# Run the exploration report on the cleaned books dataframe
data_exploration(df_books)

## Working with the clean data

In [None]:
# Reload the cleaned books from CSV.
# NOTE(review): absolute Desktop path, while the earlier to_csv wrote
# 'books.csv' relative to the working directory — confirm it is the same file
df_books = pd.read_csv(r"C:\Users\biave\Desktop\books.csv")

In [None]:
# Display the reloaded dataframe
df_books

### Trying SBERT + BERTopic

In [None]:
# Download the necessary NLTK resources
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Instantiate the tokenizer, the English stopword set, the lemmatizer and the
# SBERT model ('all-mpnet-base-v2') used by the functions below
tokenizer_nltk = WordPunctTokenizer()
stopwords_set = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
sbert_model = SentenceTransformer('all-mpnet-base-v2')

#### Using Lemmatizer

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the tag selects adjective, verb or adverb; every other tag falls back
    to noun.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    return wordnet.NOUN

def clean_text(text):
    """Return ``text`` as a space-joined string of lemmatized tokens.

    HTML-like tags are removed and the text is lowercased before tokenizing.
    Tokens that are not alphanumeric or that appear in ``stopwords_set`` are
    dropped; the rest are lemmatized with their WordNet part of speech.
    """
    without_tags = re.sub('<[^>]*>', '', text)
    lemmas = []
    for token in tokenizer_nltk.tokenize(without_tags.lower()):
        if not token.isalnum() or token in stopwords_set:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)

In [None]:
def get_sbert_embedding(text):
    """Encode ``text`` with the module-level ``sbert_model`` and return the result."""
    embedding = sbert_model.encode(text)
    return embedding

In [None]:
# Build the cleaned description column when the loaded file does not carry it
# yet; otherwise the dropna below fails on a missing 'clean_description' key
if 'clean_description' not in df_books.columns:
    df_books['clean_description'] = df_books['description'].apply(clean_text)

# Drop missing descriptions
df_books = df_books.dropna(subset=['clean_description'])
# Work on a 20% sample; the fixed random_state keeps the sample repeatable
df_books = df_books.sample(frac=0.2, random_state=42)

In [None]:
# Renumber the sampled rows from 0 and discard the old index values
df_books = df_books.reset_index(drop=True)

In [None]:
# Generate embeddings from description only: one vector per cleaned
# description, stacked into a single array in dataframe order
description_vectors = [get_sbert_embedding(text) for text in df_books['clean_description']]
embeddings = np.array(description_vectors)

#### Initializing and fitting the BERTopic model

In [None]:
# Dimensionality reduction step for BERTopic. A fixed random_state makes the
# projection, and therefore the resulting topics, repeatable between runs,
# in line with the random_state=42 used elsewhere in this notebook.
umap_model = UMAP(n_neighbors=5, n_components=5, metric='cosine', random_state=42)

In [None]:
# Clustering step handed to BERTopic below (clusters of at least 10 points).
# NOTE(review): prediction_data=True is presumably required so that unseen
# documents can be assigned to a topic later — confirm
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [None]:
# Assemble the topic model from the components configured above
topic_model = BERTopic(
    embedding_model=sbert_model,  # SBERT model, used when no embeddings are supplied
    umap_model=umap_model,        # dimensionality reduction
    hdbscan_model=hdbscan_model,  # clustering
)


In [None]:
# Fit the topic model on the cleaned descriptions, reusing the precomputed
# embeddings, and store each document's topic next to it
documents = df_books['clean_description']
topics, _ = topic_model.fit_transform(documents, embeddings)
df_books['topic'] = topics

In [None]:
# NOTE(review): fits the same model again on the same documents, this time
# without the precomputed `embeddings`; this appears to supersede the
# fit_transform of the previous cell — confirm that it is intended
topic_model.fit(df_books['clean_description'].tolist())

In [None]:
# Overwrite the 'topic' column with element 0 of what transform() returns
# for the cleaned descriptions
df_books['topic'] = topic_model.transform(df_books['clean_description'].tolist())[0]

In [None]:
# Save the sampled books together with their assigned topic (no index column)
df_books.to_csv('books_w_topics.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
import umap

# Compare 2-D UMAP projections of the embeddings for several neighbourhood
# sizes. The projection model gets its own name so that the `umap_model`
# configured for BERTopic (and pickled at the end of the notebook) is not
# overwritten by this exploratory loop.
for n_neighbors in [5, 15, 50]:
    projection_model = UMAP(n_neighbors=n_neighbors, n_components=2)
    reduced_data = projection_model.fit_transform(embeddings)

    plt.figure(figsize=(8, 6))
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], label=f"n_neighbors={n_neighbors}")
    plt.title(f"UMAP with n_neighbors={n_neighbors}")
    plt.legend()
    plt.show()

In [None]:
def recommend_books_bertopic(user_description, df_books, top_n=10, min_rating=4.0):
    """Recommend books that fall in the same BERTopic topic as a text request.

    Uses the module-level ``embeddings`` and ``topic_model``. Row ``i`` of
    ``embeddings`` must belong to row ``i`` of ``df_books`` (by position).

    Args:
        user_description: Free-text description of what the user wants.
        df_books: Books dataframe with at least the columns ``topic``,
            ``title``, ``avg_rating_books``, ``name`` and ``description``.
        top_n: Maximum number of rows to return.
        min_rating: Minimum ``avg_rating_books`` a book must have.

    Returns:
        A new DataFrame with the columns title, avg_rating_books, name,
        description, similarity and topic, sorted by similarity and then
        rating, both descending.

    Raises:
        ValueError: If ``df_books`` and ``embeddings`` differ in length.
    """
    if len(df_books) != len(embeddings):
        raise ValueError(
            "df_books and embeddings must have the same number of rows "
            f"({len(df_books)} != {len(embeddings)})"
        )

    # Get the user's input embedding
    user_input_embedding = get_sbert_embedding(user_description).reshape(1, -1)

    # Calculate similarities between user input and book descriptions
    similarities = cosine_similarity(user_input_embedding, embeddings).flatten()

    # Topic that the model assigns to the user's input
    user_topic = topic_model.transform([user_description])[0][0]

    # Positional boolean mask: selects the matching rows in both the dataframe
    # and the similarity array regardless of the dataframe's index labels
    in_topic = (df_books['topic'] == user_topic).to_numpy()

    # Copy so that adding a column never writes into a slice of df_books
    recommendations = df_books[in_topic].copy()
    recommendations['similarity'] = similarities[in_topic]

    # Filter by rating
    recommendations = recommendations[recommendations['avg_rating_books'] >= min_rating]

    # Sort by similarity and rating
    recommendations = recommendations.sort_values(by=['similarity', 'avg_rating_books'], ascending=[False, False])

    # Return top N books
    return recommendations[['title', 'avg_rating_books', 'name', 'description', 'similarity', 'topic']].head(top_n)

# Example Usage: ask for the five best matches for a free-text request
# (min_rating keeps its default of 4.0)
user_description = "Books about LGBTQ+ Activism"
recommended_books = recommend_books_bertopic(user_description, df_books, top_n=5)
print(recommended_books)

### Trying BERT - Just to test

In [None]:
# Download the NLTK stopwords and punkt resources, then (re)create the
# tokenizer, lemmatizer and English stopword set used by clean_text
nltk.download('stopwords')
nltk.download('punkt')
tokenizer_nltk = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()
stopwords_set = set(stopwords.words('english'))

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and model
# (`tokenizer` here is the BERT tokenizer, separate from `tokenizer_nltk`)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
def get_wordnet_pos(word):
    """Map the NLTK tag of a single ``word`` to a WordNet part of speech.

    Only the first letter of the tag is used: J, V and R select adjective,
    verb and adverb; anything else (including N) gives noun.
    """
    tag_initial = nltk.pos_tag([word])[0][1][0].upper()
    if tag_initial == "J":
        return wordnet.ADJ
    if tag_initial == "V":
        return wordnet.VERB
    if tag_initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

def clean_text(text):
    """Return ``text`` as a space-joined string of lemmatized tokens.

    Steps: remove HTML tags, lowercase, tokenize with ``tokenizer_nltk``,
    drop non-alphanumeric tokens and stopwords, lemmatize what is left.
    """
    stripped = re.sub('<[^>]*>', '', text)  # Remove HTML tags
    lowered = stripped.lower()  # Convert to lowercase
    kept = []
    for token in tokenizer_nltk.tokenize(lowered):
        if not token.isalnum() or token in stopwords_set:
            continue
        kept.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(kept)

In [None]:
# Apply clean_text to every description and store the result in a new column
df_books['clean_description'] = df_books['description'].apply(clean_text)

In [None]:
def get_bert_embedding(text):
    """Return a NumPy vector for ``text`` computed with the BERT ``model``.

    The text is tokenized (truncated to at most 512 tokens) and run through
    the model without gradient tracking; the last hidden state at token
    position 0 is returned after squeezing out singleton dimensions.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    first_token_state = hidden_states[:, 0, :]
    return first_token_state.squeeze().numpy()

In [None]:
# Drop missing descriptions, then keep a 30% sample of what remains
# (the fixed random_state keeps the sample repeatable)
has_clean_description = df_books['clean_description'].notna()
df_books = df_books[has_clean_description].sample(frac=0.3, random_state=42)

In [None]:
# Save the sampled dataframe for reuse (no index column)
df_books.to_csv('reduced_sample.csv', index=False)

In [None]:
# Renumber the sampled rows from 0 and discard the old index values, so that
# row positions line up with the embedding array built next
df_books = df_books.reset_index(drop=True)

In [None]:
# Generate embeddings from description only: one BERT vector per cleaned
# description, stacked into a single array (this rebinds `embeddings`)
description_vectors = [get_bert_embedding(text) for text in df_books['clean_description']]
embeddings = np.array(description_vectors)

#### Lemmatizer

In [None]:
def recommend_books_bert(user_description, df_books, top_n=10, preferred_format=None, min_rating=0):
    """Rank books by cosine similarity between a text request and ``embeddings``.

    Uses the module-level ``embeddings``; its rows are assigned to the rows
    of ``df_books`` by position.

    Args:
        user_description: Free text that is embedded with ``get_bert_embedding``.
        df_books: Books dataframe; it is copied, not modified.
        top_n: Maximum number of rows to return.
        preferred_format: When truthy, keep only rows whose ``format`` equals it.
        min_rating: Minimum ``avg_rating_books`` a book must have.

    Returns:
        DataFrame with the columns title, avg_rating_books, name,
        description, similarity and format, best matches first.
    """
    # Embed the user's text and score it against every book embedding
    query_vector = get_bert_embedding(user_description).reshape(1, -1)
    scores = cosine_similarity(query_vector, embeddings).flatten()

    # Work on a copy that carries the similarity scores
    ranked = df_books.copy()
    ranked['similarity'] = scores

    # Rating filter, optionally combined with the format filter
    keep = ranked['avg_rating_books'] >= min_rating
    if preferred_format:
        keep &= ranked['format'] == preferred_format
    ranked = ranked[keep]

    # Most similar first; higher ratings break ties
    ranked = ranked.sort_values(by=['similarity', 'avg_rating_books'], ascending=[False, False])

    output_columns = ['title', 'avg_rating_books', 'name', 'description', 'similarity', 'format']
    return ranked[output_columns].head(top_n)

# Example: the user describes the kind of book they are looking for
user_description = "Biographies of writers, artists, and philosophers."

# Top 5 paperbacks rated 4.0 or higher
recommended_books = recommend_books_bert(user_description, df_books, top_n=5, preferred_format="Paperback", min_rating=4.0)

# Display recommendations
print(recommended_books)

#### Stemmer

In [None]:
# Porter stemmer and English stopword set used by the stemming clean_text below
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))
def clean_text(text):
    """Return ``text`` as a space-joined string of Porter stems.

    HTML-like tags are removed and the text is lowercased, then tokenized
    with ``word_tokenize``. Non-alphanumeric tokens and entries of
    ``stopwords_set`` are dropped before stemming.
    """
    lowered = re.sub('<[^>]*>', '', text).lower()
    stems = (
        stemmer.stem(token)
        for token in word_tokenize(lowered)
        if token.isalnum() and token not in stopwords_set
    )
    return " ".join(stems)

In [None]:
def recommend_books_bert(user_input_title, df_books, top_n=10, preferred_format=None, min_rating=0):
    """Rank books by cosine similarity between the user's text and ``embeddings``.

    Uses the module-level ``embeddings``; its rows are assigned to the rows
    of ``df_books`` by position.

    Args:
        user_input_title: Text supplied by the user. It is embedded as given
            with ``get_bert_embedding``; no title lookup is performed.
        df_books: Books dataframe; it is copied, not modified.
        top_n: Maximum number of rows to return.
        preferred_format: When truthy, keep only rows whose ``format`` equals it.
        min_rating: Minimum ``avg_rating_books`` a book must have.

    Returns:
        DataFrame with the columns title, avg_rating_books, name,
        description, similarity and format, best matches first.
    """
    # Embed the text passed by the caller (not the global user_description)
    user_input_embedding = get_bert_embedding(user_input_title).reshape(1, -1)

    # Finding the similarities
    similarities = cosine_similarity(user_input_embedding, embeddings).flatten()

    # Creating a DF with the similarities
    recommendations = df_books.copy()
    recommendations['similarity'] = similarities

    # Filtering by format
    if preferred_format:
        recommendations = recommendations[recommendations['format'] == preferred_format]

    # Filtering by rating
    recommendations = recommendations[recommendations['avg_rating_books'] >= min_rating]

    # Sort by similarity and rating (higher ratings first)
    recommendations = recommendations.sort_values(by=['similarity', 'avg_rating_books'], ascending=[False, False])

    # Return the top N recommended books
    return recommendations[['title', 'avg_rating_books', 'name', 'description', 'similarity', 'format']].head(top_n)

# Example: the user describes the kind of book they are looking for
user_description = "Biographies of writers, artists, and philosophers."

# Call the BERT recommender defined in this notebook with that description
# (the names recommend_books_sbert and user_input_title are not defined here)
recommended_books = recommend_books_bert(user_description, df_books, top_n=5, preferred_format="Paperback", min_rating=4.0)

# Display recommendations
print(recommended_books)

#### Clustering

In [None]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

# Initialize KMeans model
kmeans_model = KMeans(random_state=42)

# Elbow plot over the k range given as (2, 20) to help choose the cluster count
visualizer = KElbowVisualizer(kmeans_model, k=(2, 20)) 

# Fit on the embeddings and render the plot
visualizer.fit(embeddings)
visualizer.show()

In [None]:
# Applying KMeans clustering
# Change the number of clusters based on the Elbow visualizer above
kmeans = KMeans(n_clusters=5, random_state=42)
labels = kmeans.fit_predict(embeddings)

# Project the embeddings onto two principal components for plotting
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(embeddings)

In [None]:
# Visualize clusters
# KMeans was fitted on the full-dimensional embeddings, so its centroids are
# projected with the same fitted PCA before being drawn on the 2-D plot;
# using their first two raw dimensions would place them in the wrong space.
centroids_2d = pca.transform(kmeans.cluster_centers_)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=reduced_data[:, 0], y=reduced_data[:, 1], hue=labels, palette='PuRd', s=50, edgecolor='k')
plt.scatter(centroids_2d[:, 0], centroids_2d[:, 1], c='lightgreen', marker='X', s=200, label='Centroids')
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Cluster Visualization using PCA")
plt.legend()
plt.show()

### Save the model

In [None]:
# Pickle, in this order: the KMeans model, the PCA model, the embeddings and
# the BERTopic model.
# NOTE(review): `embeddings` is whatever the name holds when this cell runs;
# if the BERT embedding cell ran last, these are BERT vectors despite the
# 'sbert_embeddings.pkl' file name — confirm
for pickle_path, artifact in [
    ("kmeans_model.pkl", kmeans),
    ("pca_model.pkl", pca),
    ("sbert_embeddings.pkl", embeddings),
    ('bertopic_model.pkl', topic_model),
]:
    with open(pickle_path, "wb") as f:
        pickle.dump(artifact, f)

# Save SentenceTransformer (SBERT) model
sbert_model.save('sbert_model')

# Pickle the UMAP and HDBSCAN models
for pickle_path, artifact in [
    ('umap_model.pkl', umap_model),
    ('hdbscan_model.pkl', hdbscan_model),
]:
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)