# Enhanced Book Genre Classification with ML and DL Models

This notebook implements a comprehensive comparison of various machine learning and deep learning models for book genre classification, including:
- Classic ML models (SVM, Random Forest, Naive Bayes, Logistic Regression)
- Neural Networks (MLP, CNN, LSTM, GRU)
- Transformer models (BERT, RoBERTa, DistilBERT, XLM-RoBERTa), used as frozen feature extractors feeding a logistic-regression classifier
- PCA visualization and dimensionality reduction
- Comprehensive performance analysis with confusion matrices

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Data processing
import re
import string
from collections import Counter
import langdetect
from langdetect import detect

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Deep Learning
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Conv1D, GlobalMaxPooling1D, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Transformers
import torch
from transformers import (
    AutoTokenizer, AutoModel, 
    BertTokenizer, BertModel,
    RobertaTokenizer, RobertaModel,
    DistilBertTokenizer, DistilBertModel,
    XLMRobertaTokenizer, XLMRobertaModel
)
from torch.utils.data import Dataset, DataLoader

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Time tracking
import time
from datetime import datetime

# Confirm the import cell ran to completion and record the framework versions in the output
print("All libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")
print(f"PyTorch version: {torch.__version__}")

## 1. Data Loading and Preprocessing

In [None]:
# Read the raw Goodreads export, then report its dimensions and column names
data = pd.read_csv("../data/goodreads_data.csv")
raw_shape = data.shape
raw_columns = data.columns.tolist()
print(f"Original dataset shape: {raw_shape}")
print(f"Columns: {raw_columns}")
data.head()

In [None]:
# Remove unnamed column if exists (errors='ignore' makes the drop a no-op when it is absent)
data = data.drop(columns='Unnamed: 0', errors='ignore')

# Remove rows containing any null value, then exact duplicate rows
data = data.dropna().drop_duplicates()
print(f"After cleaning: {data.shape}")

# Display basic info
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output
data.info()
print("\nFirst few rows:")
data.head()

### English Language Filtering
Keep only books whose description is detected as English; each retained book keeps its multilabel genre annotations.

In [None]:
def is_english_text(text, min_english_ratio=0.7):
    """
    Check if text is primarily in English

    The input is converted with str(), every character that is not an ASCII
    letter or whitespace is replaced by a space, and runs of whitespace are
    collapsed before the cleaned text is handed to langdetect's ``detect``.

    Args:
        text: Value to test; anything accepted by str().
        min_english_ratio: Currently unused by the implementation; kept so
            existing callers that pass it keep working.

    Returns:
        bool: True only when the detected language code is 'en'. False when
        the cleaned text is shorter than 10 characters or detection fails.
    """
    try:
        # Clean text for language detection
        clean_text = re.sub(r'[^a-zA-Z\s]', ' ', str(text))
        clean_text = ' '.join(clean_text.split())

        if len(clean_text) < 10:  # Too short for reliable detection
            return False

        return detect(clean_text) == 'en'
    except Exception:
        # Best effort: an undetectable description counts as non-English.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so a long .apply() over the dataset can still be interrupted.
        return False

def filter_english_books(data, min_english_ratio=0.7):
    """
    Return a copy of ``data`` restricted to rows whose 'Description' is English.

    Each description is tested with ``is_english_text`` (``min_english_ratio``
    is forwarded to it); before/after row counts are printed as a progress log.
    """
    print("Filtering English books...")

    # One boolean per row: True when the description is detected as English
    is_english = data['Description'].apply(is_english_text, args=(min_english_ratio,))
    total_books = len(data)

    print(f"Books before filtering: {total_books}")
    print(f"English books detected: {is_english.sum()}")

    # Independent copy so later column assignments do not touch `data`
    kept_books = data.loc[is_english].copy()
    kept_count = len(kept_books)

    print(f"Books after filtering: {kept_count}")
    print(f"Filtered out: {total_books - kept_count} books")

    return kept_books

# langdetect's detector is randomised by default, so short or ambiguous descriptions
# can be classified differently from one run to the next. Fixing the seed before the
# first detection makes the filtered dataset reproducible under Restart & Run All.
langdetect.DetectorFactory.seed = 0

# Apply English filtering
english_data = filter_english_books(data)
print("\nSample of filtered data:")
english_data[['Book', 'Author', 'Description']].head()

### Genre Processing and Analysis

In [None]:
def extract_genres(genre_string):
    """
    Extract genres from string format

    Parses the textual form of a Python list of strings, e.g.
    "['Fiction', 'Classics']" or "[\"Children's\", 'Fiction']".

    Args:
        genre_string: The stringified genre list. Any non-string value
            (None, NaN, ...) yields an empty list.

    Returns:
        list[str]: Genre names with surrounding whitespace and quote
        characters removed; entries that are empty after cleaning are dropped.
    """
    if not isinstance(genre_string, str):
        return []

    # Remove brackets and split by comma
    raw_items = genre_string.strip().strip("[]").split(",")

    # Strip both quote styles: repr() switches to double quotes for names that
    # contain an apostrophe, so stripping only "'" would leave '"Children\'s"'.
    cleaned_items = (item.strip(" \t\n'\"") for item in raw_items)

    # Filter after cleaning so entries such as '' do not survive as empty genres
    return [genre for genre in cleaned_items if genre]

# Parse every raw 'Genres' string into a Python list, one list per book
english_data['genres_list'] = english_data['Genres'].apply(extract_genres)

# Collect the distinct genre names across all books
all_genres = {
    genre_name
    for book_genres in english_data['genres_list']
    for genre_name in book_genres
}

print(f"Total unique genres: {len(all_genres)}")
print(f"Sample genres: {list(all_genres)[:20]}")

# Tally how many books carry each genre
genre_counts = Counter(
    genre_name
    for book_genres in english_data['genres_list']
    for genre_name in book_genres
)

# Keep the 20 most frequent genres, ordered from most to least common
top_genres = dict(genre_counts.most_common(20))
print(f"\nTop 20 genres:")
for genre_name, n_books in top_genres.items():
    print(f"{genre_name}: {n_books}")

In [None]:
# Horizontal bar chart of the top genres, most common at the top
genres = list(top_genres)
counts = [top_genres[name] for name in genres]

fig, ax = plt.subplots(figsize=(15, 8))
ax.barh(genres, counts)
ax.set_xlabel('Number of Books')
ax.set_title('Top 20 Most Common Genres')
ax.invert_yaxis()  # barh draws the first entry at the bottom by default
fig.tight_layout()
plt.show()

# Summary of how many genre tags each book carries
genre_lengths = english_data['genres_list'].map(len).tolist()
print(f"\nGenre statistics:")
print(f"Average genres per book: {np.mean(genre_lengths):.2f}")
print(f"Min genres: {min(genre_lengths)}")
print(f"Max genres: {max(genre_lengths)}")
print(f"Median genres: {np.median(genre_lengths):.2f}")

### Text Preprocessing

In [None]:
def preprocess_text(text):
    """
    Clean and preprocess text for better vectorization

    Steps, in order: replace HTML tags and URLs with a space, drop every
    character outside letters, digits, whitespace and basic punctuation
    (. , ! ? ; : " ' -), collapse whitespace runs, lowercase, trim.

    Args:
        text: Raw description; NaN/None yields "". Other values go through str().

    Returns:
        str: The cleaned, lowercased text with single spaces between tokens.
    """
    if pd.isna(text):
        return ""

    # Convert to string
    text = str(text)

    # Remove HTML tags; substitute a space so "line<br>next" does not become "linenext"
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove URLs (the http\S+ alternative already covers https links)
    text = re.sub(r'http\S+|www\S+', ' ', text)

    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,!?;:"\'-]', '', text)

    # Collapse whitespace last: the removals above can leave runs of spaces
    # behind (e.g. "a — b" -> "a  b"), so doing this earlier would miss them
    text = re.sub(r'\s+', ' ', text)

    # Convert to lowercase
    return text.lower().strip()

# Preprocess descriptions
english_data['clean_description'] = english_data['Description'].apply(preprocess_text)

# Drop descriptions that are empty or very short (50 characters or fewer after cleaning).
# .copy() gives an independent frame so later column assignments never target a slice.
english_data = english_data[english_data['clean_description'].str.len() > 50].copy()

print(f"After text preprocessing: {len(english_data)} books")
print("\nSample preprocessed descriptions:")
# Show at most 3 samples; guard against datasets with fewer than 3 remaining rows
for i in range(min(3, len(english_data))):
    print(f"\nBook {i+1}:")
    print(f"Original: {english_data.iloc[i]['Description'][:200]}...")
    print(f"Cleaned: {english_data.iloc[i]['clean_description'][:200]}...")

### Multilabel Classification Setup

In [None]:
# Restrict the label space to the most frequent genres (keeps the class count manageable)
top_n_genres = 15
selected_genres = list(genre_counts.most_common(top_n_genres))
selected_genre_names = [genre_name for genre_name, _ in selected_genres]

print(f"Selected {top_n_genres} most common genres for classification:")
for rank, (genre, count) in enumerate(selected_genres, 1):
    print(f"{rank:2d}. {genre:20s} ({count:4d} books)")

# Keep only books tagged with at least one selected genre
def has_selected_genre(genres_list):
    """True when the book's genre list shares at least one entry with selected_genre_names."""
    return not set(selected_genre_names).isdisjoint(genres_list)

keep_mask = english_data['genres_list'].map(has_selected_genre)
filtered_data = english_data.loc[keep_mask].copy()
print(f"\nBooks with selected genres: {len(filtered_data)}")

# Binarise over every genre seen, then keep only the selected columns,
# ordered like selected_genre_names (most common first)
mlb = MultiLabelBinarizer()
y_multilabel = mlb.fit_transform(filtered_data['genres_list'])

class_position = {label: position for position, label in enumerate(mlb.classes_)}
selected_indices = [class_position[genre_name] for genre_name in selected_genre_names]
y_selected = y_multilabel[:, selected_indices]

print(f"\nMultilabel shape: {y_selected.shape}")
print(f"Selected genres: {selected_genre_names}")

# Model inputs: cleaned description text; targets: multi-hot genre matrix
X_text = filtered_data['clean_description'].values
y = y_selected

print(f"\nFinal dataset:")
print(f"Features shape: {X_text.shape}")
print(f"Targets shape: {y.shape}")
print(f"Number of classes: {y.shape[1]}")

## 2. Classic Machine Learning Models

In [None]:
# Hold out 20% of the books for testing. No stratification: the targets are
# multilabel, so stratify is left at its default (None).
X_train, X_test, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# TF-IDF over unigrams and bigrams; the vocabulary is learned from the training split only
tfidf_params = dict(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
)
vectorizer = TfidfVectorizer(**tfidf_params)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

vocabulary_size = len(vectorizer.vocabulary_)
print(f"\nTF-IDF features shape: {X_train_tfidf.shape}")
print(f"Vocabulary size: {vocabulary_size}")

In [None]:
# Define classic ML models; each is wrapped one-vs-rest so it fits one binary
# classifier per genre column of the multilabel target
classic_models = {
    'Logistic Regression': OneVsRestClassifier(LogisticRegression(random_state=42, max_iter=1000)),
    'Random Forest': OneVsRestClassifier(RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
    'SVM': OneVsRestClassifier(SVC(kernel='linear', random_state=42, probability=True)),
    'Naive Bayes': OneVsRestClassifier(MultinomialNB())
}

# Train and evaluate classic models
classic_results = {}

for name, model in classic_models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()

    # Train model
    model.fit(X_train_tfidf, y_train)

    # Stop the clock right after fit() so the reported "Training Time" does not
    # include test-set inference and metric computation
    training_time = time.time() - start_time

    # Make predictions
    y_pred = model.predict(X_test_tfidf)
    y_pred_proba = model.predict_proba(X_test_tfidf)

    # Calculate metrics. With multilabel targets accuracy_score is subset accuracy:
    # a sample counts as correct only when every genre label matches.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Store results
    classic_results[name] = {
        'model': model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'training_time': training_time,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

    print(f"{name} Results:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  Training Time: {training_time:.2f}s")

## 3. Neural Network Models

In [None]:
# Sequence-model inputs: vocabulary cap and fixed sequence length
max_words, max_length = 10000, 200

# Fit the word index on the training texts only; unseen words map to '<OOV>'
tokenizer = Tokenizer(oov_token='<OOV>', num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad at the end ('post') up to max_length
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (X_train_seq, X_test_seq)
)

print("Neural network data shapes:")
for array_label, array in (
    ("X_train_padded", X_train_padded),
    ("X_test_padded", X_test_padded),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{array_label}: {array.shape}")
print(f"Vocabulary size: {len(tokenizer.word_index)}")

In [None]:
def create_mlp_model(input_dim, output_dim, vocab_size, embedding_dim=128):
    """
    Create Multi-Layer Perceptron model

    Args:
        input_dim: Length of each padded token-id sequence.
        output_dim: Number of output units (one sigmoid per label).
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.

    Returns:
        An uncompiled Keras Sequential model.
    """
    model = Sequential([
        # An explicit Input replaces the deprecated Embedding(input_length=...)
        # argument and builds the model at construction time, so attributes
        # such as model.input are defined without relying on a prior fit()
        Input(shape=(input_dim,)),
        Embedding(vocab_size, embedding_dim),
        # Max over the sequence axis turns the token embeddings into one vector
        GlobalMaxPooling1D(),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.2),
        # Sigmoid (not softmax): labels are independent in multilabel classification
        Dense(output_dim, activation='sigmoid')
    ])
    return model

def create_cnn_model(input_dim, output_dim, vocab_size, embedding_dim=128):
    """
    Create Convolutional Neural Network model

    Args:
        input_dim: Length of each padded token-id sequence.
        output_dim: Number of output units (one sigmoid per label).
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.

    Returns:
        An uncompiled Keras Sequential model.
    """
    model = Sequential([
        # An explicit Input replaces the deprecated Embedding(input_length=...)
        # argument and builds the model at construction time, so attributes
        # such as model.input are defined without relying on a prior fit()
        Input(shape=(input_dim,)),
        Embedding(vocab_size, embedding_dim),
        # 128 filters over windows of 5 consecutive tokens
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.3),
        # Sigmoid (not softmax): labels are independent in multilabel classification
        Dense(output_dim, activation='sigmoid')
    ])
    return model

def create_lstm_model(input_dim, output_dim, vocab_size, embedding_dim=128):
    """
    Create LSTM model

    Args:
        input_dim: Length of each padded token-id sequence.
        output_dim: Number of output units (one sigmoid per label).
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.

    Returns:
        An uncompiled Keras Sequential model.
    """
    model = Sequential([
        # An explicit Input replaces the deprecated Embedding(input_length=...)
        # argument and builds the model at construction time, so attributes
        # such as model.input are defined without relying on a prior fit()
        Input(shape=(input_dim,)),
        Embedding(vocab_size, embedding_dim),
        # First layer returns the full sequence so the second LSTM can consume it
        LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(256, activation='relu'),
        Dropout(0.5),
        # Sigmoid (not softmax): labels are independent in multilabel classification
        Dense(output_dim, activation='sigmoid')
    ])
    return model

def create_gru_model(input_dim, output_dim, vocab_size, embedding_dim=128):
    """
    Create GRU model

    Args:
        input_dim: Length of each padded token-id sequence.
        output_dim: Number of output units (one sigmoid per label).
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.

    Returns:
        An uncompiled Keras Sequential model.
    """
    model = Sequential([
        # An explicit Input replaces the deprecated Embedding(input_length=...)
        # argument and builds the model at construction time, so attributes
        # such as model.input are defined without relying on a prior fit()
        Input(shape=(input_dim,)),
        Embedding(vocab_size, embedding_dim),
        # First layer returns the full sequence so the second GRU can consume it
        GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        GRU(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(256, activation='relu'),
        Dropout(0.5),
        # Sigmoid (not softmax): labels are independent in multilabel classification
        Dense(output_dim, activation='sigmoid')
    ])
    return model

# Registry of display name -> builder function; the training loop iterates it in this order
neural_models = dict(
    MLP=create_mlp_model,
    CNN=create_cnn_model,
    LSTM=create_lstm_model,
    GRU=create_gru_model,
)

print("Neural network model architectures defined!")

In [None]:
# Train neural network models
neural_results = {}

for name, model_func in neural_models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()

    # Re-seed TensorFlow before building each network so weight initialisation
    # and dropout are repeatable from run to run
    tf.random.set_seed(42)

    # Create model: (sequence length, number of labels, vocabulary size)
    model = model_func(max_length, y_train.shape[1], max_words)

    # Compile model; binary cross-entropy treats each label as an independent yes/no
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Callbacks (both watch the validation loss, their default monitor)
    callbacks = [
        EarlyStopping(patience=3, restore_best_weights=True),
        ReduceLROnPlateau(factor=0.5, patience=2)
    ]

    # Train model
    history = model.fit(
        X_train_padded, y_train,
        validation_split=0.2,
        epochs=20,
        batch_size=32,
        callbacks=callbacks,
        verbose=0
    )

    # Stop the clock right after fit() so the reported "Training Time" does not
    # include test-set inference and metric computation
    training_time = time.time() - start_time

    # Make predictions; threshold each sigmoid output at 0.5
    y_pred_proba = model.predict(X_test_padded)
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Calculate metrics. With multilabel targets accuracy_score is subset accuracy:
    # a sample counts as correct only when every genre label matches.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Store results
    neural_results[name] = {
        'model': model,
        'history': history,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'training_time': training_time,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

    print(f"{name} Results:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  Training Time: {training_time:.2f}s")
    print(f"  Epochs: {len(history.history['loss'])}")

## 4. Transformer Models

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# (display name, tokenizer class, model class, pretrained checkpoint id)
transformer_specs = [
    ('BERT', BertTokenizer, BertModel, 'bert-base-uncased'),
    ('RoBERTa', RobertaTokenizer, RobertaModel, 'roberta-base'),
    ('DistilBERT', DistilBertTokenizer, DistilBertModel, 'distilbert-base-uncased'),
    ('XLM-RoBERTa', XLMRobertaTokenizer, XLMRobertaModel, 'xlm-roberta-base'),
]

# Load each tokenizer/encoder pair up front, keyed by display name
transformer_configs = {
    display_name: {
        'tokenizer': tokenizer_cls.from_pretrained(checkpoint),
        'model': model_cls.from_pretrained(checkpoint),
    }
    for display_name, tokenizer_cls, model_cls, checkpoint in transformer_specs
}

print("Transformer models loaded successfully!")

In [None]:
class TransformerDataset(Dataset):
    """
    Map-style dataset that tokenizes one text per item and pairs it with its label vector.

    Args:
        texts: Indexable collection of texts; each item is passed through str().
        labels: Indexable collection of per-sample label vectors.
        tokenizer: Callable accepting (text, truncation, padding, max_length,
            return_tensors) and returning a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_length: Every item is truncated/padded to this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 1-D 'input_ids', 'attention_mask' and float32 'labels' tensors."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; flatten removes it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # as_tensor with an explicit dtype converts integer label arrays to float32;
            # the legacy torch.FloatTensor(...) constructor is discouraged for existing data
            'labels': torch.as_tensor(label, dtype=torch.float32)
        }

def extract_features_with_transformer(texts, tokenizer, model, device, batch_size=16):
    """
    Extract features using transformer model

    Args:
        texts: Sliceable sequence of texts (list, NumPy array, ...).
        tokenizer: Tokenizer called on each batch of texts.
        model: Encoder whose output exposes ``last_hidden_state``; the caller
            is responsible for placing it on ``device`` beforehand.
        device: Device the tokenized batch is moved to.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        np.ndarray: One row per text holding the hidden state at sequence
        position 0.
    """
    model.eval()
    features = []

    for i in range(0, len(texts), batch_size):
        # The tokenizer only accepts str or list/tuple input, so a NumPy slice
        # (what DataFrame.values / train_test_split hand over) must be converted
        batch_texts = [str(text) for text in texts[i:i + batch_size]]

        # Tokenize batch; padding=True pads to the longest sequence in this batch
        encodings = tokenizer(
            batch_texts,
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors='pt'
        )

        # Move to device
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)

        # Extract features without tracking gradients (inference only)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Use [CLS] token representation
            batch_features = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            features.extend(batch_features)

    return np.array(features)

print("Transformer dataset and feature extraction functions defined!")

In [None]:
# Train transformer models: each pretrained encoder is used as a frozen feature
# extractor, and a one-vs-rest logistic regression is fitted on its features
transformer_results = {}

for name, config in transformer_configs.items():
    print(f"\nProcessing {name}...")
    start_time = time.time()

    # Loop-local names, so the notebook-level `tokenizer` (the Keras Tokenizer) and
    # `model` from earlier sections are not silently rebound by this cell
    hf_tokenizer = config['tokenizer']
    hf_model = config['model'].to(device)

    # Extract features
    print(f"  Extracting features for {name}...")
    X_train_features = extract_features_with_transformer(X_train, hf_tokenizer, hf_model, device)

    print(f"  Training classifier for {name}...")
    # Train a simple classifier on the features
    classifier = OneVsRestClassifier(LogisticRegression(random_state=42, max_iter=1000))
    classifier.fit(X_train_features, y_train)

    # Training cost = encoding the training texts + fitting the classifier.
    # The clock stops here so test-set inference is not counted as training time.
    training_time = time.time() - start_time

    X_test_features = extract_features_with_transformer(X_test, hf_tokenizer, hf_model, device)

    # Make predictions
    y_pred = classifier.predict(X_test_features)
    y_pred_proba = classifier.predict_proba(X_test_features)

    # Calculate metrics. With multilabel targets accuracy_score is subset accuracy:
    # a sample counts as correct only when every genre label matches.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Store results
    transformer_results[name] = {
        'classifier': classifier,
        'tokenizer': hf_tokenizer,
        'transformer_model': hf_model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'training_time': training_time,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        # Note: these are the TEST-set features (one row per X_test sample)
        'features': X_test_features
    }

    print(f"{name} Results:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  Training Time: {training_time:.2f}s")
    print(f"  Feature Dimension: {X_train_features.shape[1]}")

## 5. PCA Visualization and Dimensionality Reduction

In [None]:
# Apply PCA to different feature representations.
# random_state is fixed on every PCA: when scikit-learn selects a randomized or
# arpack solver the projection would otherwise differ between runs.
pca_results = {}

# 1. TF-IDF features (training split)
print("Applying PCA to TF-IDF features...")
pca_tfidf = PCA(n_components=2, random_state=42)
# .toarray() materialises the sparse matrix as a dense (n_samples x n_features) array
X_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf.toarray())
pca_results['TF-IDF'] = {
    'pca': pca_tfidf,
    'components': X_tfidf_pca,
    'explained_variance_ratio': pca_tfidf.explained_variance_ratio_
}

# 2. Neural network features (using the best performing model, training split)
print("Applying PCA to neural network features...")
best_neural_model = max(neural_results.items(), key=lambda x: x[1]['f1'])
print(f"Using {best_neural_model[0]} features for PCA")

# Extract features from the best neural model: the activations of its
# second-to-last layer, i.e. the layer feeding the sigmoid output
neural_model = best_neural_model[1]['model']
feature_extractor = Model(inputs=neural_model.input, outputs=neural_model.layers[-2].output)
X_neural_features = feature_extractor.predict(X_train_padded)

pca_neural = PCA(n_components=2, random_state=42)
X_neural_pca = pca_neural.fit_transform(X_neural_features)
pca_results['Neural Network'] = {
    'pca': pca_neural,
    'components': X_neural_pca,
    'explained_variance_ratio': pca_neural.explained_variance_ratio_
}

# 3. Transformer features (using the best performing transformer).
# The stored 'features' are the TEST-set features, so this projection has one
# row per test sample, unlike the two training-split projections above.
print("Applying PCA to transformer features...")
best_transformer = max(transformer_results.items(), key=lambda x: x[1]['f1'])
print(f"Using {best_transformer[0]} features for PCA")

X_transformer_features = best_transformer[1]['features']
pca_transformer = PCA(n_components=2, random_state=42)
X_transformer_pca = pca_transformer.fit_transform(X_transformer_features)
pca_results['Transformer'] = {
    'pca': pca_transformer,
    'components': X_transformer_pca,
    'explained_variance_ratio': pca_transformer.explained_variance_ratio_
}

print("\nPCA Results:")
for name, result in pca_results.items():
    print(f"{name}:")
    print(f"  Explained variance ratio: {result['explained_variance_ratio']}")
    print(f"  Total explained variance: {sum(result['explained_variance_ratio']):.4f}")

In [None]:
# Create PCA visualization plots

# Get the most common genre for each sample for coloring
def get_primary_genre(genres_list, selected_genres):
    """Return the first entry of genres_list that is in selected_genres, else 'Other'."""
    for genre in genres_list:
        if genre in selected_genres:
            return genre
    return 'Other'

primary_genres = np.array(
    [get_primary_genre(genres, selected_genre_names) for genres in filtered_data['genres_list']],
    dtype=object,
)

# The projections hold only the TRAIN rows (TF-IDF, neural) or only the TEST rows
# (transformer), in shuffled order, whereas primary_genres covers every row of
# filtered_data. Indexing a projection with a full-length mask fails on the length
# mismatch, so recover the row positions of each split: the same sample count,
# test_size and random_state as the original split give the same partition.
train_idx, test_idx = train_test_split(
    np.arange(len(filtered_data)), test_size=0.2, random_state=42
)
primary_genres_train = primary_genres[train_idx]
primary_genres_test = primary_genres[test_idx]
assert len(primary_genres_train) == X_tfidf_pca.shape[0] == X_neural_pca.shape[0], \
    "train split used for colouring does not match the PCA inputs"
assert len(primary_genres_test) == X_transformer_pca.shape[0], \
    "test split used for colouring does not match the transformer features"

# sorted() keeps the genre -> colour assignment stable between runs
unique_genres = sorted(set(primary_genres))
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_genres)))
genre_color_map = dict(zip(unique_genres, colors))

def scatter_by_genre(ax, points, sample_genres):
    """Draw one scatter series per genre; works for 2-D and 3-D axes alike."""
    for genre in unique_genres:
        mask = sample_genres == genre
        coords = [points[mask, dim] for dim in range(points.shape[1])]
        ax.scatter(*coords, c=[genre_color_map[genre]], label=genre, alpha=0.6, s=20)

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (axis, projected points, genre per row, fitted PCA, title)
panels = [
    (axes[0], X_tfidf_pca, primary_genres_train, pca_tfidf, 'TF-IDF Features PCA'),
    (axes[1], X_neural_pca, primary_genres_train, pca_neural, f'{best_neural_model[0]} Features PCA'),
    (axes[2], X_transformer_pca, primary_genres_test, pca_transformer, f'{best_transformer[0]} Features PCA'),
]
for panel_ax, points, sample_genres, fitted_pca, panel_title in panels:
    scatter_by_genre(panel_ax, points, sample_genres)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(f'PC1 ({fitted_pca.explained_variance_ratio_[0]:.2%})')
    panel_ax.set_ylabel(f'PC2 ({fitted_pca.explained_variance_ratio_[1]:.2%})')

# All panels share one colour map, so a single legend beside the last panel is
# enough; a legend on every panel would be drawn over the neighbouring plot
axes[-1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# Create 3D PCA visualization for the best model
print("\nCreating 3D PCA visualization...")
pca_3d = PCA(n_components=3, random_state=42)
X_3d = pca_3d.fit_transform(X_transformer_features)

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# X_transformer_features are test-set features, hence the test-split genres
scatter_by_genre(ax, X_3d, primary_genres_test)

ax.set_title(f'{best_transformer[0]} Features 3D PCA')
ax.set_xlabel(f'PC1 ({pca_3d.explained_variance_ratio_[0]:.2%})')
ax.set_ylabel(f'PC2 ({pca_3d.explained_variance_ratio_[1]:.2%})')
ax.set_zlabel(f'PC3 ({pca_3d.explained_variance_ratio_[2]:.2%})')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

## 6. Confusion Matrix Analysis

In [None]:
def plot_confusion_matrix(y_true, y_pred, class_names, title, ax=None):
    """
    Plot confusion matrix for multilabel classification

    One 2x2 binary confusion matrix is drawn per class (column of the label
    matrices), laid out on a grid with at most 5 heatmaps per row.

    Args:
        y_true: 2-D multi-hot array of true labels, one column per class.
        y_pred: 2-D multi-hot array of predicted labels, same layout as y_true.
        class_names: Class name for each column, in column order.
        title: Figure-level title.
        ax: Optional existing axis. When given, only the first class is drawn
            on it and the figure is not shown.
    """
    n_classes = len(class_names)

    # labels=[0, 1] forces a 2x2 matrix even when a class is never present or
    # never predicted in this sample; otherwise the matrix collapses to 1x1
    # and the TN/FP/FN/TP lookups below raise IndexError
    cm_per_class = [
        confusion_matrix(y_true[:, i], y_pred[:, i], labels=[0, 1])
        for i in range(n_classes)
    ]

    if ax is None:
        # max(1, ...) keeps the grid valid (and avoids dividing by zero) for 0 classes
        n_cols = max(1, min(5, n_classes))
        n_rows = max(1, (n_classes + n_cols - 1) // n_cols)
        # squeeze=False always yields a 2-D array of axes, so one flatten() covers every grid shape
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3*n_rows), squeeze=False)
        axes = axes.flatten()
    else:
        axes = [ax]
        fig = ax.figure

    # zip stops at the shorter sequence, so a single supplied axis gets only the first class
    for cm, class_name, class_ax in zip(cm_per_class, class_names, axes):
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=class_ax)
        class_ax.set_title(f'{class_name}\n(TN: {cm[0,0]}, FP: {cm[0,1]}, FN: {cm[1,0]}, TP: {cm[1,1]})')
        class_ax.set_xlabel('Predicted')
        class_ax.set_ylabel('Actual')

    # Hide unused subplots
    for spare_ax in axes[n_classes:]:
        spare_ax.set_visible(False)

    # Title the figure that owns the axes rather than whichever figure is current
    fig.suptitle(title, fontsize=16)
    fig.tight_layout()

    if ax is None:
        plt.show()

# Draw the per-genre confusion matrices for every trained model
print("Creating confusion matrices for all models...")

# The three result dictionaries share the 'predictions' key, so a single nested
# loop handles classic ML, neural network and transformer models in that order
for results_by_model in (classic_results, neural_results, transformer_results):
    for name, result in results_by_model.items():
        plot_confusion_matrix(
            y_test,
            result['predictions'],
            selected_genre_names,
            f'{name} Confusion Matrix',
        )

## 7. Performance Comparison and Analysis

In [None]:
# Merge the three result dictionaries; later entries would win on a name clash
all_results = {**classic_results, **neural_results, **transformer_results}

# One summary row per model, ranked by F1-Score (best first)
performance_data = [
    {
        'Model': name,
        'Accuracy': result['accuracy'],
        'Precision': result['precision'],
        'Recall': result['recall'],
        'F1-Score': result['f1'],
        'Training Time (s)': result['training_time'],
    }
    for name, result in all_results.items()
]
performance_df = pd.DataFrame(performance_data).sort_values('F1-Score', ascending=False)

print("\n=== PERFORMANCE COMPARISON ===")
print(performance_df.to_string(index=False, float_format='%.4f'))

# 2x2 dashboard: three ranked bar charts plus a precision/recall scatter
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# (axis, column to rank by, panel title, x-axis label)
bar_panels = [
    (axes[0, 0], 'Accuracy', 'Model Accuracy Comparison', 'Accuracy'),
    (axes[0, 1], 'F1-Score', 'Model F1-Score Comparison', 'F1-Score'),
    (axes[1, 0], 'Training Time (s)', 'Model Training Time Comparison', 'Training Time (seconds)'),
]
for panel_ax, column, panel_title, x_label in bar_panels:
    # Ascending sort puts the largest value at the top of the horizontal bars
    performance_df_sorted = performance_df.sort_values(column, ascending=True)
    panel_ax.barh(performance_df_sorted['Model'], performance_df_sorted[column])
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)

# Precision vs Recall scatter plot
ax4 = axes[1, 1]
scatter = ax4.scatter(
    performance_df['Precision'],
    performance_df['Recall'],
    s=100,
    alpha=0.7,
    c=performance_df['F1-Score'],
    cmap='viridis',
)
ax4.set_xlabel('Precision')
ax4.set_ylabel('Recall')
ax4.set_title('Precision vs Recall (colored by F1-Score)')

# Label every point with its model name
for model, point_precision, point_recall in zip(
    performance_df['Model'], performance_df['Precision'], performance_df['Recall']
):
    ax4.annotate(model, (point_precision, point_recall),
                 xytext=(5, 5), textcoords='offset points', fontsize=8)

plt.colorbar(scatter, ax=ax4, label='F1-Score')
plt.tight_layout()
plt.show()

# Model category analysis
print("\n=== MODEL CATEGORY ANALYSIS ===")

# Iterate over the result dictionaries directly instead of rebinding `classic_models`
# and `neural_models` to name lists: those names hold the model registries used by
# the training cells, and overwriting them breaks any re-run of those cells.
category_results = [
    ("Classic ML Models", classic_results),
    ("Neural Network Models", neural_results),
    ("Transformer Models", transformer_results),
]

for category_label, results_by_model in category_results:
    # Rows keep performance_df's order, i.e. best F1-Score first
    category_df = performance_df[performance_df['Model'].isin(list(results_by_model))]
    print(f"\n{category_label} (avg F1: {category_df['F1-Score'].mean():.4f}):")
    for model_name, f1_value in zip(category_df['Model'], category_df['F1-Score']):
        print(f"  {model_name}: {f1_value:.4f}")

## 8. Detailed Analysis and Insights

In [None]:
# Best model analysis
# NOTE(review): takes row 0 of performance_df as the best model, which presumes
# the frame was sorted best-first when it was built — confirm in that cell.
best_model_name = performance_df.iloc[0]['Model']
best_model_result = all_results[best_model_name]

print(f"\n=== BEST MODEL ANALYSIS: {best_model_name} ===")
# Headline metrics share one format, so print them from a label -> key table.
for metric_label, metric_key in (
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-Score", "f1"),
):
    print(f"{metric_label}: {best_model_result[metric_key]:.4f}")
print(f"Training Time: {best_model_result['training_time']:.2f}s")

# Per-class performance for best model: score each genre column independently.
print(f"\n=== PER-CLASS PERFORMANCE FOR {best_model_name} ===")
y_pred_best = best_model_result['predictions']

for i, genre in enumerate(selected_genre_names):
    y_true_col = y_test[:, i]
    y_pred_col = y_pred_best[:, i]
    # zero_division=0 reports 0 instead of warning when a genre has no
    # predicted (or no true) positives.
    precision = precision_score(y_true_col, y_pred_col, zero_division=0)
    recall = recall_score(y_true_col, y_pred_col, zero_division=0)
    f1 = f1_score(y_true_col, y_pred_col, zero_division=0)

    print(f"{genre:20s}: P={precision:.3f}, R={recall:.3f}, F1={f1:.3f}")

# Speed vs Accuracy analysis: rank models by training time, by F1, and by
# F1 gained per second of training, then print one recommendation per criterion.
print("\n=== SPEED vs ACCURACY ANALYSIS ===")
print("Fastest models (by training time):")
fastest_models = performance_df.nsmallest(3, 'Training Time (s)')
for _, row in fastest_models.iterrows():
    print(f"  {row['Model']:20s}: {row['Training Time (s)']:6.2f}s, F1={row['F1-Score']:.4f}")

print("\nMost accurate models (by F1-Score):")
most_accurate = performance_df.nlargest(3, 'F1-Score')
for _, row in most_accurate.iterrows():
    print(f"  {row['Model']:20s}: F1={row['F1-Score']:.4f}, Time={row['Training Time (s)']:6.2f}s")

# Efficiency analysis (F1-Score per second)
# A training time of 0 would turn the ratio into inf and let that model win the
# ranking spuriously. Mask non-positive times to NaN so those rows get a NaN
# efficiency instead of an infinite one.
training_seconds = performance_df['Training Time (s)']
performance_df['Efficiency'] = performance_df['F1-Score'] / training_seconds.where(training_seconds > 0)
print("\nMost efficient models (F1-Score per second):")
efficient_models = performance_df.nlargest(3, 'Efficiency')
for _, row in efficient_models.iterrows():
    print(f"  {row['Model']:20s}: {row['Efficiency']:.6f} F1/s")

# Model recommendations
print("\n=== MODEL RECOMMENDATIONS ===")
print(f"For maximum accuracy: {most_accurate.iloc[0]['Model']}")
print(f"For fastest training: {fastest_models.iloc[0]['Model']}")
print(f"For best efficiency: {efficient_models.iloc[0]['Model']}")
# NOTE(review): "balanced" is simply row 0 of performance_df, i.e. whatever
# ordering that frame was built with — confirm this is the intended criterion.
print(f"For balanced performance: {performance_df.iloc[0]['Model']}")

## 9. Conclusions and Recommendations

### Key Findings:

1. **Model Performance**: The analysis shows varying performance across different model types, with transformer models generally achieving higher accuracy due to their ability to capture complex linguistic patterns.

2. **Speed vs Accuracy Trade-off**: Classic ML models are fastest to train but may sacrifice some accuracy, while transformer models provide the best accuracy but require more computational resources.

3. **Feature Representation**: PCA visualization shows how the features produced by each extraction method (TF-IDF, neural networks, transformers) are distributed once projected into a lower-dimensional space.

4. **Multilabel Classification**: The confusion matrices show how well each model performs on individual genre classification tasks.

### Recommendations:

- **For Production**: Use the best performing transformer model for maximum accuracy
- **For Real-time Applications**: Consider classic ML models for faster inference
- **For Balanced Performance**: Choose the model with the best efficiency score (F1-score per second of training time)

### Future Improvements:

1. **Data Augmentation**: Increase training data through text augmentation techniques
2. **Ensemble Methods**: Combine multiple models for improved performance
3. **Hyperparameter Tuning**: Optimize model parameters for better results
4. **Feature Engineering**: Explore additional text features beyond TF-IDF
5. **Model Compression**: Use techniques like quantization for faster inference

In [None]:
# Save results for future reference
import json


def _to_jsonable(value):
    """Fallback for ``json.dump``: convert NumPy scalars/arrays to built-in types.

    ``json`` only calls this for objects it cannot serialize itself. NumPy
    scalars that are not Python ``float`` subclasses (e.g. ``np.float32``,
    ``np.int64``) and ``np.ndarray`` values would otherwise raise ``TypeError``.

    Raises:
        TypeError: for any other unsupported type, matching json's own behavior.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Everything worth keeping from the run: dataset sizes, the full comparison
# table, the winning model's metrics, and the PCA explained-variance ratios.
results_summary = {
    'dataset_info': {
        'total_books': len(english_data),
        'filtered_books': len(filtered_data),
        'selected_genres': selected_genre_names,
        'num_classes': len(selected_genre_names)
    },
    'performance_results': performance_df.to_dict('records'),
    'best_model': {
        'name': best_model_name,
        'metrics': {
            'accuracy': best_model_result['accuracy'],
            'precision': best_model_result['precision'],
            'recall': best_model_result['recall'],
            'f1_score': best_model_result['f1'],
            'training_time': best_model_result['training_time']
        }
    },
    'pca_results': {
        'tfidf_variance': pca_results['TF-IDF']['explained_variance_ratio'].tolist(),
        'neural_variance': pca_results['Neural Network']['explained_variance_ratio'].tolist(),
        'transformer_variance': pca_results['Transformer']['explained_variance_ratio'].tolist()
    }
}

# Explicit encoding keeps the output independent of the platform default.
with open('genre_classification_results.json', 'w', encoding='utf-8') as f:
    json.dump(results_summary, f, indent=2, default=_to_jsonable)

print("\nResults saved to 'genre_classification_results.json'")
print("\n=== ANALYSIS COMPLETE ===")
print(f"Total models evaluated: {len(all_results)}")
print(f"Best performing model: {best_model_name}")
print(f"Best F1-Score: {performance_df.iloc[0]['F1-Score']:.4f}")
# NOTE(review): requires `datetime` to be the class (from datetime import datetime);
# that import is not visible in this part of the notebook — confirm.
print(f"Analysis completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")