In [None]:
# ----------------------- Environment Variable Configuration -----------------------
import os

# Cap the thread count of each numeric backend at two so that no library grabs
# every CPU core. The variables are written before the numeric packages are
# imported further down in this file.
for _thread_env_var in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[_thread_env_var] = "2"

# ----------------------- Imports -----------------------
import sys
import subprocess
import re
import json
import logging
import psutil
import string
from typing import List, Dict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, hamming_loss, f1_score
from sklearn.feature_selection import SelectKBest, chi2

from rich.logging import RichHandler
from rich.console import Console
from rich.traceback import install as install_rich_traceback

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

from tqdm import tqdm

from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from wordcloud import WordCloud

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ----------------------- Logging Configuration -----------------------
# Install rich's traceback handler and create the console shared by the log
# handler and the direct console.print calls later in the script.
install_rich_traceback()
console = Console()

# Handler: routes records through the rich console and lets every level through.
rich_handler = RichHandler(console=console, rich_tracebacks=True)
rich_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    fmt='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
rich_handler.setFormatter(formatter)

# Logger: attach the handler only once, so executing this section again in the
# same interpreter does not produce duplicated log lines.
logger = logging.getLogger('NaiveBayesLogger')
logger.setLevel(logging.DEBUG)
if len(logger.handlers) == 0:
    logger.addHandler(rich_handler)

# ----------------------- Utility Functions -----------------------
def log_memory_usage(stage: str):
    """Log the resident set size (RSS) of the current process.

    The value is reported as bytes divided by 1024**2, formatted with two
    decimals and labelled "MB" in the message.

    Args:
        stage: Name of the pipeline step that just finished; interpolated
            into the log message.
    """
    bytes_per_mb = 1024 ** 2
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    mem = rss_bytes / bytes_per_mb
    logger.info(f'Memory Usage after {stage}: {mem:.2f} MB')

def handle_exception(e: Exception, stage: str):
    """Log an exception at ERROR level together with its traceback.

    Args:
        e: The exception being reported; its string form goes into the message.
        stage: Name of the pipeline step in which the exception occurred.
    """
    # logger.exception is logger.error with exc_info=True.
    logger.exception(f'Exception in {stage}: {e}')

# ----------------------- NLTK Resource Setup -----------------------
def setup_nltk():
    """Make sure the NLTK resources used by this script are installed.

    Every resource is looked up locally first and downloaded only when the
    lookup fails. A download that reports failure is logged as a warning
    instead of being ignored, so a later tokenizer/lemmatizer error can be
    traced back to the missing resource.
    """
    # (lookup path, package id for nltk.download, label used in the log line)
    required_resources = (
        ('corpora/stopwords', 'stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet', 'WordNet'),
        ('corpora/omw-1.4', 'omw-1.4', 'omw-1.4'),
        ('tokenizers/punkt', 'punkt', 'Punkt tokenizer'),
        # Newer NLTK releases load the Punkt model from 'punkt_tab' rather
        # than 'punkt'; requesting both keeps word_tokenize usable on either.
        ('tokenizers/punkt_tab', 'punkt_tab', 'Punkt tokenizer tables'),
    )
    for resource_path, package_id, label in required_resources:
        if _nltk_resource_available(resource_path):
            continue
        logger.info(f"Downloading NLTK {label}...")
        # nltk.download returns False (it does not raise) when it fails.
        if not nltk.download(package_id):
            logger.warning(f"NLTK download of '{package_id}' did not succeed.")


def _nltk_resource_available(resource_path: str) -> bool:
    """Return True if *resource_path* resolves, either unpacked or as a zip.

    Some corpora (e.g. wordnet, omw-1.4) may be kept on disk only as
    ``<name>.zip``; probing the bare path alone then raises LookupError on
    every run and triggers a pointless "already up-to-date" download.
    """
    for candidate in (resource_path, f'{resource_path}.zip'):
        try:
            nltk.data.find(candidate)
        except LookupError:
            continue
        return True
    return False

# Fetch any missing NLTK data, then build the shared text-processing helpers.
setup_nltk()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Load custom stopwords if needed
file_path = 'stopwords.txt'
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        # Entries are lower-cased because preprocess_text lower-cases the text
        # before comparing tokens against this list; blank lines are skipped
        # because they can never match a token.
        stop_lines = [
            entry
            for entry in (line.strip().lower() for line in file)
            if entry
        ]
    logger.info(f"Loaded {len(stop_lines)} custom stopwords from '{file_path}'.")
except FileNotFoundError:
    # A missing file is not fatal: continue with an empty custom list.
    logger.error(f"The stopwords file '{file_path}' was not found.")
    stop_lines = []
except Exception as e:
    handle_exception(e, f'Reading Stopwords from {file_path}')
    stop_lines = []

# Compiled once: matches every character that is neither an ASCII letter nor
# whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Combined stopword set, built lazily on the first preprocess_text call.
_STOPWORD_LOOKUP = None


def _stopword_lookup():
    """Return the set of stopwords to drop, building it on first use.

    The set is the union of the NLTK English stopwords (``stop_words``) and
    the custom entries read from the stopwords file (``stop_lines``). A set
    gives O(1) membership tests instead of scanning a list for every token.
    """
    global _STOPWORD_LOOKUP
    if _STOPWORD_LOOKUP is None:
        _STOPWORD_LOOKUP = frozenset(stop_words).union(
            entry.strip().lower() for entry in stop_lines
        )
    return _STOPWORD_LOOKUP


def preprocess_text(text: str, debug: bool=False) -> str:
    """Normalise raw text into a space-separated string of lemmatised tokens.

    Steps: replace non-letter characters with spaces, collapse whitespace,
    lower-case, tokenize, drop stopwords and tokens of length <= 2, then
    lemmatise each remaining token.

    Args:
        text: Raw document text. Any non-string value is treated as empty.
        debug: Currently unused; kept so existing callers keep working.

    Returns:
        The cleaned text, or ``''`` for non-string input or when any step
        raises (the exception is logged, not propagated).
    """
    try:
        if not isinstance(text, str):
            return ''

        # Letters only, single spaces, lower case.
        cleaned = _NON_LETTER_RE.sub(' ', text)
        cleaned = ' '.join(cleaned.split()).lower()

        tokens = word_tokenize(cleaned)

        # Remove stopwords and short words.
        stopword_set = _stopword_lookup()
        kept_tokens = [
            token for token in tokens
            if len(token) > 2 and token not in stopword_set
        ]

        return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)
    except Exception as e:
        # Best effort per document: log and fall back to an empty string so
        # one bad record does not abort the whole preprocessing pass.
        handle_exception(e, "Preprocessing Text")
        return ''

# ----------------------- Data Loading -----------------------
# Read the collected novels from disk; any failure is logged and leaves an
# empty list so the status check below reports the problem.
file_path = "collected_books.json"
try:
    with open(file_path, encoding='utf-8') as f:
        novels = json.load(f)
    logger.info(f"Loaded {len(novels)} novels from {file_path}.")
    log_memory_usage("Data Loading")
except Exception as e:
    handle_exception(e, "Loading Data")
    novels = []

# Report whether anything usable was loaded.
if novels:
    logger.info("Data loaded successfully.")
else:
    logger.error("No data loaded. Please check the file path and JSON format.")

# ----------------------- DataFrame Conversion & Preprocessing -----------------------
# Turn the loaded records into a DataFrame, one row per novel.
data = pd.DataFrame(novels)

# Abort early when a column the rest of the pipeline needs is absent.
required_columns = {'id', 'title', 'genres', 'text'}
if required_columns <= set(data.columns):
    logger.info("All required columns are present in the data.")
else:
    missing = required_columns - set(data.columns)
    logger.error(f"The following required columns are missing from the data: {missing}")
    raise ValueError(f"Missing columns: {missing}")

logger.info(f"DataFrame shape: {data.shape}")
log_memory_usage("Data Preprocessing")

# Clean every document with preprocess_text; tqdm's pandas integration
# provides progress_apply so the pass shows a progress bar (the original note
# says this mirrors the approach used in BasicModels).
logger.info("Starting text preprocessing using DataFrame and progress_apply...")
tqdm.pandas(desc="Preprocessing Text")
raw_text_column = data['text']
data['processed_text'] = raw_text_column.progress_apply(preprocess_text)
logger.info("Text preprocessing completed.")
log_memory_usage("Text Preprocessing Completed")

# Genre Binarization
# Encode the per-novel genre collections as a binary indicator matrix `y`
# with one column per distinct genre.
# NOTE(review): assumes each 'genres' entry is an iterable of labels (e.g. a
# list of strings) — confirm against the JSON schema.
try:
    mlb = MultiLabelBinarizer()
    genre_column = data["genres"]
    y = mlb.fit_transform(genre_column)
    genre_labels = mlb.classes_
    logger.info(
        f"Genres have been binarized. Number of genres: {len(genre_labels)}."
    )
    log_memory_usage("Genre Binarization")
except Exception as e:
    # Leave `y` as None so the later split step can detect the failure.
    handle_exception(e, "Genre Binarization")
    y = None

# Calculate TF-IDF features
# Vectorise the cleaned text with uni-, bi- and tri-grams, keeping at most
# 20000 terms that occur in at least 5 documents and in no more than 70% of
# them; rows are L2-normalised.
try:
    tfidf_settings = {
        "max_features": 20000,
        "ngram_range": (1, 3),
        "max_df": 0.70,
        "min_df": 5,
        "norm": 'l2',
    }
    vectorizer = TfidfVectorizer(**tfidf_settings)
    X = vectorizer.fit_transform(data["processed_text"])
    logger.info(f"TF-IDF vectorization completed. Feature matrix shape: {X.shape}.")
    log_memory_usage("TF-IDF Vectorization")
except Exception as e:
    # Leave `X` as None so downstream steps can detect the failure.
    handle_exception(e, "TF-IDF Vectorization")
    X = None

# Handle text length as a feature
# Append the min-max scaled token count of each document as one extra column
# to the TF-IDF matrix. On any failure the plain TF-IDF matrix is used.
try:
    from scipy.sparse import hstack, csr_matrix

    data['text_length'] = data['processed_text'].apply(lambda x: len(x.split()))
    scaler = MinMaxScaler()
    X_length_scaled = scaler.fit_transform(data['text_length'].values.reshape(-1, 1))

    # Request CSR explicitly: without `format`, the type hstack returns depends
    # on the SciPy version, while the sign check here and the row indexing in
    # the train/test split both need a CSR matrix.
    X_combined = hstack([X, csr_matrix(X_length_scaled)], format='csr')
    logger.info(f"Combined TF-IDF features with text length. Shape: {X_combined.shape}.")
    log_memory_usage("Handling Varying Text Lengths")

    # Verify no negative values (chi2 and MultinomialNB require non-negative
    # features). Only stored entries can be negative, so checking `.data`
    # covers the whole matrix.
    if (X_combined.data < 0).any():
        logger.error("Negative values found in X_combined.")
        raise ValueError("Negative values in X_combined after scaling.")
except Exception as e:
    handle_exception(e, "Handling Varying Text Lengths")
    X_combined = X

# Feature Selection
# Keep the features with the highest chi-squared score against the genre
# labels. On failure the unreduced matrix is used.
# NOTE(review): the selector is fitted on the full data set before the
# train/test split below, so test rows influence which features are kept.
try:
    # Clamp k to the number of available columns: asking for more features
    # than exist raises or warns depending on the scikit-learn version, and
    # an error here would silently disable selection via the except branch.
    n_available_features = X_combined.shape[1]
    selector = SelectKBest(chi2, k=min(10000, n_available_features))
    X_selected = selector.fit_transform(X_combined, y)
    logger.info(f"Feature selection completed. Selected feature matrix shape: {X_selected.shape}.")
    log_memory_usage("Feature Selection")
except Exception as e:
    handle_exception(e, "Feature Selection")
    X_selected = X_combined

# Stratified Train-Test Split
# Hold out 20% of the rows with a single multilabel-stratified shuffle split
# (fixed seed 42). On failure all four outputs are set to None.
try:
    if any(item is None for item in (X_selected, y)):
        raise ValueError("Feature matrix 'X_selected' or labels 'y' are not defined.")
    logger.info("Performing stratified train-test split using MultilabelStratifiedShuffleSplit...")
    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, test_index = next(msss.split(X_selected, y))
    X_train = X_selected[train_index]
    X_test = X_selected[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    logger.info("Train-Test split completed.")
    logger.info(f"Training set shape: X_train={X_train.shape}, y_train={y_train.shape}")
    logger.info(f"Testing set shape: X_test={X_test.shape}, y_test={y_test.shape}")
    log_memory_usage("Train-Test Split")
except Exception as e:
    handle_exception(e, 'Train-Test Split')
    X_train = X_test = y_train = y_test = None

# Multinomial Naive Bayes Training
# Fit one MultinomialNB (alpha=1.0) per genre via one-vs-rest. On failure
# `nb_model` is set to None so the evaluation step can skip itself.
try:
    logger.info("Starting training for Multinomial Naive Bayes...")
    nb_model = OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0))
    nb_model.fit(X_train, y_train)
    logger.info("Multinomial Naive Bayes training completed.")
    log_memory_usage("Multinomial Naive Bayes Training")
except Exception as e:
    handle_exception(e, "Multinomial Naive Bayes Training")
    nb_model = None

# Model Evaluation
# Predict on the held-out set, log Hamming loss, micro/macro F1 and the
# per-genre classification report, and print the headline metrics to the
# console. The closing log line reflects whether evaluation actually ran.
evaluation_succeeded = False
try:
    # Explicit None check: the training step stores None when fitting failed.
    if nb_model is not None:
        logger.info("Making predictions on the test set with Multinomial Naive Bayes...")
        y_pred = nb_model.predict(X_test)
        logger.info("Predictions completed.")
        log_memory_usage("Naive Bayes Predictions")

        # Compute Metrics
        h_loss = hamming_loss(y_test, y_pred)
        micro_f1 = f1_score(y_test, y_pred, average='micro')
        macro_f1 = f1_score(y_test, y_pred, average='macro')

        logger.info(f"Hamming Loss: {h_loss:.4f}")
        logger.info(f"Micro F1 Score: {micro_f1:.4f}")
        logger.info(f"Macro F1 Score: {macro_f1:.4f}")

        # `report` keeps the metrics as a dict; the text form goes to the log.
        report = classification_report(y_test, y_pred, target_names=mlb.classes_, output_dict=True)
        report_text = classification_report(y_test, y_pred, target_names=mlb.classes_)
        logger.info("Classification Report:")
        logger.info(f"{report_text}")

        console.print("\n[bold green]Multinomial Naive Bayes Model Performance Metrics:[/bold green]")
        console.print(f"Hamming Loss: [bold]{h_loss:.4f}[/bold]")
        console.print(f"Micro F1 Score: [bold]{micro_f1:.4f}[/bold]")
        console.print(f"Macro F1 Score: [bold]{macro_f1:.4f}[/bold]")
        evaluation_succeeded = True
    else:
        logger.error("Model is not trained. Skipping evaluation.")
except Exception as e:
    handle_exception(e, "Model Evaluation")

# Only claim success when the evaluation ran to the end; previously the
# success message was logged even after a skipped or failed evaluation.
if evaluation_succeeded:
    logger.info("Naive Bayes processing completed successfully.")
else:
    logger.error("Naive Bayes processing finished with errors; evaluation was not completed.")


[nltk_data] Downloading package wordnet to C:\Users\newye/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[nltk_data] Downloading package omw-1.4 to C:\Users\newye/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Preprocessing Text:  75%|█████████████████████████████████████████████▏              | 574/762 [07:53<01:45,  1.78it/s]