In [1]:
# ==============================
# 1. Setup and Installation
# ==============================

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Check CUDA and driver versions
!nvcc --version  # Check CUDA version
!nvidia-smi      # Check driver version

# Install RAPIDS and other required libraries
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Mounted at /content/drive
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Sat Nov  2 22:12:04 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|            

In [2]:
## After restarting, install remaining necessary libraries
# Run this cell after restarting the runtime
!pip install bertopic==0.16.3
!pip install octis
!pip install sentence-transformers
!pip install umap-learn==0.5.3  # Specify a compatible version
!pip install hdbscan
!pip install tqdm
!pip install pandas
!pip install gensim
!pip install wandb
!pip install umap
!pip install scipy
!pip install nltk

Collecting bertopic==0.16.3
  Downloading bertopic-0.16.3-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic==0.16.3)
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic==0.16.3)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic==0.16.3)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.3-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [38]:
# =====================================
# 1. Import Libraries
# =====================================

import os
import time
import logging
import pandas as pd
import numpy as np
import re
from datetime import datetime
import torch
import gc
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Import RAPIDS' UMAP and HDBSCAN
from cuml.manifold import UMAP  # GPU-accelerated UMAP
from cuml.cluster import HDBSCAN  # GPU-accelerated HDBSCAN
import cupy as cp  # For GPU arrays

# Import NLTK modules for text processing
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Import BERTopic's representation models
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

# Import json module
import json

# Import string module for punctuation handling
import string

# =====================================
# 2. Configure Logging
# =====================================

# Define logging configuration
LOG_FILENAME = '/content/drive/MyDrive/bertopic_training.log'

# Create a custom logger
logger = logging.getLogger('BERTopic_Training')
logger.setLevel(logging.DEBUG)  # Set to DEBUG to capture all levels of logs

# Do not hand records to the root logger as well: when the root logger has its own
# handler, every message is printed a second time in the root format
# ("INFO:BERTopic_Training:...").
logger.propagate = False

# logging.getLogger() returns the same object on every execution of this cell, so
# handlers attached by earlier runs are still present. Remove and close them, then
# attach a fresh pair; this keeps exactly one console and one file handler no matter
# how many times the cell is re-run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Create handlers
c_handler = logging.StreamHandler()
f_handler = logging.FileHandler(LOG_FILENAME, encoding='utf-8')
c_handler.setLevel(logging.INFO)  # Console handler set to INFO
f_handler.setLevel(logging.DEBUG)  # File handler set to DEBUG

# Create formatters and add them to handlers
c_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                             datefmt='%Y-%m-%d %H:%M:%S')
f_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                             datefmt='%Y-%m-%d %H:%M:%S')
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)

# =====================================
# 3. Define Paths and Load Stop Words
# =====================================

# Google Drive locations of the input resources used by this notebook
additional_stop_words_characters_names = '/content/drive/MyDrive/character_names.txt'  # extra stop words (character names)
dataset_path = '/content/drive/MyDrive/processed_novels_sentences_new.csv'  # sentence-level CSV passed to load_dataset

def preprocess_stopwords(stopwords_file_path):
    """
    Read a stop words file and normalise its entries into single-word stop words.

    Every line is lowercased, stripped of ASCII punctuation and split on
    whitespace, so a multi-word entry contributes each of its words separately.

    Args:
        stopwords_file_path (str): Path to a UTF-8 text file with one entry per line.

    Returns:
        set: The normalised stop words. If anything goes wrong while reading or
        processing the file, the error is logged and NLTK's English stop words
        are returned instead.
    """
    try:
        with open(stopwords_file_path, 'r', encoding='utf-8') as handle:
            entries = handle.read().splitlines()

        # Translation table that deletes every character in string.punctuation
        strip_punctuation = str.maketrans('', '', string.punctuation)

        # str.split() never yields empty strings, so no extra emptiness check is needed
        processed_stop_words = {
            piece
            for entry in entries
            for piece in entry.lower().translate(strip_punctuation).split()
        }

        logger.info(f"Processed stop words count: {len(processed_stop_words)}")
        return processed_stop_words

    except Exception as e:
        logger.error(f"Error processing stop words: {e}")
        # Fallback to NLTK's stop words
        return set(stopwords.words('english'))

# Build the full stop word set: custom entries (character names) plus NLTK's English list
logger.info("Loading and preprocessing additional stop words...")
custom_stop_words = preprocess_stopwords(additional_stop_words_characters_names)

# Union of NLTK's stop words and the custom ones
stop_words = set(stopwords.words('english')) | custom_stop_words
logger.info(f"Total stop words after preprocessing: {len(stop_words)}")

# =====================================
# 4. Load and Preprocess the Dataset
# =====================================

def load_dataset(path, stop_words, test_mode=False, sample_size=10000, chunksize=None, min_tokens=3):
    """
    Load the sentence CSV and turn it into a cleaned corpus.

    Each value of the 'Sentence' column is converted to a string, whitespace runs
    (including newlines) are collapsed to single spaces, and the text is stripped
    and lowercased. Sentences are then tokenized with NLTK's word_tokenize; only
    alphabetic tokens that are not in `stop_words` are kept and re-joined with
    spaces. Sentences left with fewer than `min_tokens` tokens are dropped.

    Args:
        path (str): Path to the dataset CSV file (must contain a 'Sentence' column).
        stop_words (set): Set of stop words to remove.
        test_mode (bool): If True, use a subset of the dataset for testing.
        sample_size (int): Number of sentences to collect if test_mode is True.
        chunksize (int, optional): Rows per chunk when reading in test mode
            (defaults to 1000). Ignored when test_mode is False.
        min_tokens (int): Minimum number of remaining tokens a sentence needs to
            be kept. Defaults to 3.

    Returns:
        list: The preprocessed sentences. An empty list is returned when the
        dataset is empty or when loading/preprocessing fails (the error is logged).

    Note:
        In test mode the rows are taken chunk by chunk from the start of the file
        until `sample_size` is reached; each chunk is only shuffled/sub-sampled
        with a fixed random_state, so the subset is not a uniform sample of the
        whole file.
    """
    logger.info("Loading and preprocessing dataset...")
    start_time = time.time()

    try:
        if test_mode:
            logger.info(f"Test mode enabled. Sampling {sample_size} sentences from the dataset.")
            if chunksize is None:
                chunksize = 1000  # Set a default chunksize
            sampled_chunks = []
            total_sampled = 0
            # The context manager closes the underlying file even when we stop early
            with pd.read_csv(path, chunksize=chunksize) as df_iter:
                for chunk in df_iter:
                    remaining = sample_size - total_sampled
                    if remaining <= 0:
                        break
                    n_samples = min(remaining, len(chunk))
                    sampled_chunks.append(chunk.sample(n=n_samples, random_state=42))
                    total_sampled += n_samples
            df = pd.concat(sampled_chunks) if sampled_chunks else pd.DataFrame()
            logger.info(f"Sampled {len(df)} sentences.")
        else:
            # Read the entire CSV file
            df = pd.read_csv(path)
            logger.info(f"Dataset loaded. Total sentences: {len(df)}")

        # Without rows the length statistics below cannot be computed
        # (np.min raises on an empty sequence), so report the real cause instead.
        if df.empty:
            logger.warning("Dataset contains no rows; nothing to preprocess.")
            return []

        # Normalise whitespace and case. r'\s+' already matches newlines, so a
        # separate newline-replacement pass is unnecessary.
        logger.debug("Removing newline characters and extra spaces, converting to lowercase...")
        sentences = (
            df['Sentence']
            .astype(str)
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
            .str.lower()
        )

        # Tokenize sentences and remove stop words
        logger.debug("Tokenizing sentences and removing stop words...")
        processed_docs = []
        sentence_lengths = []
        for sentence in sentences:
            # Keep only alphabetic tokens and remove stop words
            tokens = [
                token for token in word_tokenize(sentence)
                if token.isalpha() and token not in stop_words
            ]
            processed_docs.append(' '.join(tokens))
            # Alphabetic tokens contain no whitespace, so this equals the word
            # count of the joined sentence.
            sentence_lengths.append(len(tokens))

        # Additional Data Quality Checks
        logger.debug("Performing additional data quality checks...")
        average_length = np.mean(sentence_lengths)
        min_length = np.min(sentence_lengths)
        max_length = np.max(sentence_lengths)
        logger.info(f"Average sentence length: {average_length:.2f}")
        logger.info(f"Minimum sentence length: {min_length}")
        logger.info(f"Maximum sentence length: {max_length}")

        # Filter out sentences with too few words
        corpus_filtered = [
            doc for doc, n_tokens in zip(processed_docs, sentence_lengths)
            if n_tokens >= min_tokens
        ]
        logger.info(f"Filtered corpus size: {len(corpus_filtered)} out of {len(processed_docs)}")

        logger.info(f"Dataset loaded and preprocessed. Total sentences: {len(corpus_filtered)}")
        logger.info(f"Time taken for loading and preprocessing: {time.time() - start_time:.2f} seconds")
        return corpus_filtered

    except Exception as e:
        # logger.exception keeps the traceback so the failing step can be identified
        logger.exception(f"Failed to load and preprocess dataset: {e}")
        return []

# Run configuration
TEST_MODE = False  # Set to True to work on a subset of the dataset for quick testing
SAMPLE_SIZE = 10000  # Number of sentences to sample if TEST_MODE is True

# Load and preprocess the dataset (chunksize is set internally when test_mode=True)
logger.info("Starting data loading and preprocessing...")
corpus = load_dataset(dataset_path, stop_words, test_mode=TEST_MODE, sample_size=SAMPLE_SIZE)

# load_dataset returns an empty list on failure; stop here instead of training on nothing
if not corpus:
    logger.error("No data to train on. Exiting.")
    raise SystemExit
logger.info("Data loading and preprocessing completed.")

# =====================================
# 5. Create Parameters DataFrame
# =====================================

def create_dataframe():
    """
    Create the dataframe containing model parameters.

    Each row is one training configuration: the embedding model name, the
    'Iteration', 'Coherence' and 'Topic_Diversity' values recorded for it, and the
    BERTopic / HDBSCAN / UMAP / CountVectorizer hyperparameters to train with.
    The same embedding model may appear in several rows.

    Returns:
        pd.DataFrame: One row per configuration, columns in the order of `columns` below.
    """
    logger.info("Creating dataframe with model parameters...")

    columns = (
        'Embeddings_Model', 'Iteration', 'Coherence', 'Topic_Diversity',
        'bertopic__min_topic_size', 'bertopic__top_n_words',
        'hdbscan__min_cluster_size', 'hdbscan__min_samples',
        'umap__min_dist', 'umap__n_components', 'umap__n_neighbors',
        'vectorizer__min_df',
    )

    # One tuple per configuration; values follow the order of `columns`:
    # (model, iteration, coherence, diversity, min_topic_size, top_n_words,
    #  min_cluster_size, min_samples, min_dist, n_components, n_neighbors, min_df)
    rows = [
        ('all-MiniLM-L12-v2',          66, 0.577551, 0.45,     102, 30, 281, 72, 0.005022,  2,  7, 0.001504),
        ('paraphrase-mpnet-base-v2',   14, 0.469187, 0.8,       63, 22, 500, 72, 0.077818,  9, 11, 0.009372),
        ('all-MiniLM-L12-v2',          75, 0.543105, 0.466667, 142, 10, 473, 14, 0.004634,  5, 15, 0.001947),
        ('paraphrase-mpnet-base-v2',    0, 0.463245, 0.82,     127, 31, 494, 28, 0.058341, 10, 11, 0.007313),
        ('paraphrase-MiniLM-L6-v2',    19, 0.425237, 0.94,      64, 27, 143, 32, 0.085702,  9, 44, 0.005932),
        ('paraphrase-mpnet-base-v2',   13, 0.452912, 0.8,       14, 18, 497, 32, 0.086975,  8,  9, 0.009857),
        ('multi-qa-mpnet-base-cos-v1', 23, 0.430575, 0.797059,  28, 28, 492, 12, 0.095922,  9, 19, 0.008294),
        ('all-MiniLM-L12-v2',          67, 0.489419, 0.527273,  99, 24, 258, 37, 0.004852,  7, 42, 0.001174),
        ('multi-qa-mpnet-base-cos-v1', 28, 0.439447, 0.749474,  29, 14, 427, 11, 0.008103,  9, 18, 0.005862),
        ('multi-qa-mpnet-base-cos-v1', 11, 0.419208, 0.828571, 105, 24, 497, 13, 0.022149,  8, 14, 0.009229),
    ]

    # Build the frame from per-row dicts so column order and dtypes are inferred
    # exactly as from a list of dict literals.
    df = pd.DataFrame([dict(zip(columns, values)) for values in rows])
    logger.info(f"Dataframe created with {len(df)} embedding models.")
    return df

# Create the parameters dataframe (one row per model configuration to train)
params_df = create_dataframe()
# Log the first rows (DataFrame.head()) as a quick sanity check of the parameters
logger.info(f"Parameters DataFrame Sample:\n{params_df.head()}")

# =====================================
# 6. Load Embedding Models
# =====================================

def load_embedding_models(model_names):
    """
    Instantiate a SentenceTransformer for every requested model name.

    Models are placed on the GPU when torch reports CUDA as available, otherwise
    on the CPU. A model that fails to load is logged and left out of the result;
    loading continues with the remaining names.

    Args:
        model_names (iterable of str): Embedding model names to load.

    Returns:
        dict: Maps each successfully loaded model name to its SentenceTransformer.
    """
    # Use GPU if available
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    logger.info(f"Using device for embeddings: {device}")

    embedding_models = {}
    for model_name in model_names:
        logger.info(f"Loading embedding model: {model_name}")
        try:
            # Load the embedding model onto the selected device and register it
            embedding_models[model_name] = SentenceTransformer(model_name, device=device)
            logger.info(f"Model {model_name} loaded successfully.")
        except Exception as e:
            logger.error(f"Failed to load embedding model {model_name}: {e}")

    logger.info("All embedding models loaded.")
    return embedding_models

# Extract unique embedding model names from the dataframe
# (Series.unique() returns an array, so each model is loaded only once even
# though several parameter rows share the same embedding model)
embedding_model_names = params_df['Embeddings_Model'].unique()
logger.info(f"Unique embedding models to load: {embedding_model_names}")

# Load the embedding models; the result maps model name -> loaded model and
# omits any model that failed to load
embedding_models = load_embedding_models(embedding_model_names)

# =====================================
# 7. Train and Save BERTopic Models with Representations
# =====================================

def _build_representation_model(pos_configuration, top_n_words):
    """
    Build the multi-aspect representation mapping for one BERTopic model.

    Args:
        pos_configuration (str): 'a' (nouns), 'b' (nouns + adjectives) or
            'c' (nouns + verbs).
        top_n_words (int): top_n_words for the "Main" KeyBERTInspired representation.

    Returns:
        dict or None: Aspect name -> representation model (or chain of models),
        in the order Main, NOUNS, [ADJECTIVES | VERBS], KeyBERT_MMR.
        None when `pos_configuration` is not one of the supported values.
    """
    # Extra single-POS aspects added next to NOUNS for each configuration
    extra_pos_aspects = {
        'a': (),
        'b': (("ADJECTIVES", "ADJ"),),
        'c': (("VERBS", "VERB"),),
    }
    if pos_configuration not in extra_pos_aspects:
        return None

    representation_model = {
        # The main representation of a topic
        "Main": KeyBERTInspired(top_n_words=top_n_words),
        "NOUNS": PartOfSpeech(pos_patterns=[[{"POS": "NOUN"}]]),
    }
    for aspect_name, pos_tag in extra_pos_aspects[pos_configuration]:
        representation_model[aspect_name] = PartOfSpeech(pos_patterns=[[{"POS": pos_tag}]])
    # Chained representation: KeyBERT candidates re-ranked with MMR
    representation_model["KeyBERT_MMR"] = [
        KeyBERTInspired(top_n_words=30),
        MaximalMarginalRelevance(diversity=0.5),
    ]
    return representation_model


def _train_and_save_single_model(corpus, row, model_number, total_models, embedding_models,
                                 vectorizer_stop_words, pos_configuration, main_output_dir):
    """
    Train one BERTopic model from a parameter row and write its artifacts.

    Any failing step is logged and ends the processing of this model only
    (the function returns without raising for those handled failures).

    Args:
        corpus (list): List of preprocessed sentences.
        row (pd.Series): One row of the parameters DataFrame.
        model_number (int): 1-based position of the row, used in logs and file names.
        total_models (int): Total number of rows, used in logs.
        embedding_models (dict): Dictionary of embedding models.
        vectorizer_stop_words (list or str or None): Stop words in a form accepted
            by CountVectorizer.
        pos_configuration (str): POS configuration ('a', 'b', or 'c').
        main_output_dir (str): Directory under which per-embedding folders are created.
    """
    embedding_model_name = row['Embeddings_Model']
    iteration = row['Iteration']
    logger.info(f"\nStarting training for model {model_number}/{total_models} with embedding: {embedding_model_name}")

    # Retrieve the embedding model
    embedding_model = embedding_models.get(embedding_model_name)
    if embedding_model is None:
        logger.error(f"Embedding model {embedding_model_name} not found. Skipping model {model_number}.")
        return

    # Parameter validation and conversion. A missing column (KeyError) or a value
    # of the wrong type (TypeError) skips this model just like an unparsable value.
    try:
        umap_n_neighbors = int(row['umap__n_neighbors'])
        umap_n_components = int(row['umap__n_components'])
        umap_min_dist = float(row['umap__min_dist'])
        hdbscan_min_cluster_size = int(row['hdbscan__min_cluster_size'])
        hdbscan_min_samples = int(row['hdbscan__min_samples'])
        vectorizer_min_df = float(row['vectorizer__min_df'])
        bertopic_top_n_words = int(row['bertopic__top_n_words'])
        bertopic_min_topic_size = int(row['bertopic__min_topic_size'])
    except (KeyError, TypeError, ValueError) as e:
        logger.error(f"Parameter conversion error for model {model_number}: {e}")
        return

    # Initialize RAPIDS' UMAP model
    logger.debug("Initializing UMAP model...")
    try:
        umap_model = UMAP(
            n_neighbors=umap_n_neighbors,
            n_components=umap_n_components,
            min_dist=umap_min_dist,
            metric='cosine',
            random_state=42
        )
        logger.debug("UMAP model initialized.")
    except Exception as e:
        logger.error(f"Failed to initialize UMAP model for model {model_number}: {e}")
        return

    # Initialize RAPIDS' HDBSCAN model
    logger.debug("Initializing HDBSCAN model...")
    try:
        hdbscan_model = HDBSCAN(
            min_cluster_size=hdbscan_min_cluster_size,
            min_samples=hdbscan_min_samples,
            cluster_selection_method='eom',
            prediction_data=True,
            gen_min_span_tree=True
        )
        logger.debug("HDBSCAN model initialized.")
    except Exception as e:
        logger.error(f"Failed to initialize HDBSCAN model for model {model_number}: {e}")
        return

    # Initialize CountVectorizer (CPU-based) with custom stop words
    logger.debug("Initializing CountVectorizer...")
    try:
        vectorizer_model = CountVectorizer(
            stop_words=vectorizer_stop_words,  # list form of the preprocessed stop words
            min_df=vectorizer_min_df,
            ngram_range=(1, 1)  # Ensure single-word tokens
        )
        logger.debug("CountVectorizer initialized.")
    except Exception as e:
        logger.error(f"Failed to initialize CountVectorizer for model {model_number}: {e}")
        return

    # Define the representation model based on POS configuration
    logger.debug("Defining representation model based on POS configuration...")
    try:
        representation_model = _build_representation_model(pos_configuration, bertopic_top_n_words)
    except Exception as e:
        logger.error(f"Failed to define representation model for model {model_number}: {e}")
        return
    if representation_model is None:
        logger.error(f"Invalid POS configuration: {pos_configuration}. Skipping model {model_number}.")
        return
    logger.debug("Representation model defined successfully.")

    # Initialize BERTopic model with the embedding_model and multiple representations
    logger.debug("Initializing BERTopic model with multiple representations...")
    try:
        topic_model = BERTopic(
            embedding_model=embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            representation_model=representation_model,  # Include multiple representation models
            top_n_words=bertopic_top_n_words,
            min_topic_size=bertopic_min_topic_size,
            language='english',
            calculate_probabilities=True,
            verbose=False  # Set to False to reduce verbosity
        )
        logger.debug("BERTopic model initialized with multiple representations.")
    except Exception as e:
        logger.error(f"Failed to initialize BERTopic model for model {model_number}: {e}")
        return

    # Train BERTopic model
    logger.info("Training BERTopic model...")
    start_train_time = time.time()
    try:
        # Fit the model with the corpus only; the returned topics/probabilities are
        # not needed here because everything exported below is read from the model.
        topic_model.fit_transform(corpus)
        logger.info(f"Model {model_number} training completed in {time.time() - start_train_time:.2f} seconds.")
    except Exception as e:
        # Keep the traceback: training errors are the hardest ones to diagnose
        logger.exception(f"Error during model {model_number} training: {e}")
        return

    # Define output directories
    safe_embedding = embedding_model_name.replace('/', '_').replace('\\', '_')
    model_output_dir = os.path.join(main_output_dir, safe_embedding)
    os.makedirs(model_output_dir, exist_ok=True)
    logger.info(f"Model output directory created at: {model_output_dir}")

    # One timestamp shared by the model file and the topics export of this run
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_stem = f"bertopic_model_{model_number}_iter_{iteration}_{safe_embedding}_{timestamp}"

    # Save the model. A failure here is logged but does not prevent the topics of
    # the already-trained model from being exported below.
    logger.debug("Saving BERTopic model...")
    try:
        model_path = os.path.join(model_output_dir, f"{file_stem}.pkl")
        topic_model.save(model_path)
        logger.info(f"Model {model_number} saved at: {model_path}")
    except Exception as e:
        logger.error(f"Failed to save model {model_number}: {e}")

    # Save output topics in CSV and JSON files
    logger.info("Saving topics to CSV and JSON...")
    try:
        # Extract topics information
        topics_info = topic_model.get_topic_info()
        topics_dict = topic_model.get_topics()

        # Create output subdirectories for topics
        topics_output_dir = os.path.join(model_output_dir, f"topics_{model_number}_{timestamp}")
        os.makedirs(topics_output_dir, exist_ok=True)

        # Save topics to CSV using get_topic_info()
        csv_path = os.path.join(topics_output_dir, f"{file_stem}_topics_info.csv")
        topics_info.to_csv(csv_path, index=False)
        logger.info(f"Topics info saved to {csv_path}")

        # Save topics to JSON using get_topics(), excluding empty strings and logging occurrences
        topics_json = {}
        for topic_num, words in topics_dict.items():
            # Skip -1 topic which usually represents outliers
            if topic_num == -1:
                continue
            # Filter out empty strings and ensure words are valid
            filtered_words = [word for word, _ in words if word.strip()]
            empty_count = len(words) - len(filtered_words)
            if empty_count > 0:
                logger.warning(f"Topic {topic_num} has {empty_count} empty words.")
            topics_json[str(topic_num)] = filtered_words

        json_path = os.path.join(topics_output_dir, f"{file_stem}_topics.json")
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(topics_json, json_file, indent=4)
        logger.info(f"Topics JSON saved to {json_path}")
    except Exception as e:
        logger.error(f"Failed to save topics for model {model_number}: {e}")


def train_and_save_models(corpus, params_df, embedding_models, stop_words, pos_configuration='a'):
    """
    Train and save BERTopic models based on parameters.

    For every row of `params_df` a BERTopic model is built from cuML UMAP/HDBSCAN,
    a CountVectorizer and the POS-dependent representation models, fitted on
    `corpus`, and written (model pickle, topics CSV, topics JSON) below
    /content/drive/MyDrive/BERTTopic_Models/<embedding model name>/.
    A failure in one model is logged and the loop moves on to the next row.

    Args:
        corpus (list): List of preprocessed sentences.
        params_df (pd.DataFrame): DataFrame containing model parameters.
        embedding_models (dict): Dictionary of embedding models.
        stop_words (set): Set of stop words to use in CountVectorizer.
        pos_configuration (str): POS configuration to use ('a', 'b', or 'c').

    Configurations:
        a) 'a' - ONLY NOUNS
        b) 'b' - individual NOUNS AND individual ADJECTIVES
        c) 'c' - individual NOUNS AND individual VERBS
    """
    # Define the main output directory
    main_output_dir = "/content/drive/MyDrive/BERTTopic_Models"
    os.makedirs(main_output_dir, exist_ok=True)
    logger.info(f"Main output directory created at: {main_output_dir}")

    # scikit-learn validates CountVectorizer parameters when fitting and accepts
    # only 'english', a list, or None for stop_words; a set is rejected, which would
    # make every training run fail. Convert once (sorted for a stable order).
    if stop_words is None or isinstance(stop_words, str):
        vectorizer_stop_words = stop_words
    else:
        vectorizer_stop_words = sorted(stop_words)

    total_models = len(params_df)
    progress = tqdm(params_df.iterrows(), total=total_models, desc="Training Models")
    # enumerate() gives the 1-based model number regardless of the DataFrame index
    for model_number, (_, row) in enumerate(progress, start=1):
        try:
            _train_and_save_single_model(
                corpus=corpus,
                row=row,
                model_number=model_number,
                total_models=total_models,
                embedding_models=embedding_models,
                vectorizer_stop_words=vectorizer_stop_words,
                pos_configuration=pos_configuration,
                main_output_dir=main_output_dir,
            )
        finally:
            # Cleanup to free memory after every model, including skipped/failed
            # ones; the per-model objects went out of scope with the helper call.
            gc.collect()

    logger.info("All models trained and saved.")

# =====================================
# 8. Execute the Training Pipeline
# =====================================

def main():
    """
    Run the training pipeline on the objects prepared earlier in the notebook.

    Uses the module-level `corpus`, `params_df`, `embedding_models` and
    `stop_words` and passes them to train_and_save_models together with the
    POS configuration selected below.
    """
    # POS configuration for the topic representations:
    #   'a' - ONLY NOUNS
    #   'b' - individual NOUNS AND individual ADJECTIVES
    #   'c' - individual NOUNS AND individual VERBS
    POS_CONFIGURATION = 'a'  # Change to 'b' or 'c' as needed

    logger.info(f"Starting BERTopic model training with POS configuration: {POS_CONFIGURATION}")
    train_and_save_models(corpus, params_df, embedding_models, stop_words,
                          pos_configuration=POS_CONFIGURATION)
    logger.info("BERTopic model training and saving completed.")

# Run the pipeline when this code is executed as the main module
if __name__ == "__main__":
    main()

# =====================================
# 9. Verify Installed RAPIDS Libraries
# =====================================

# Log the version of each RAPIDS library. Note that these statements come after
# main() in this cell, so they run only once training has finished, and an
# import of a library that is not installed raises ImportError at that point.
import cuml
logger.info(f"cuML version: {cuml.__version__}")

import cugraph
logger.info(f"cuGraph version: {cugraph.__version__}")

import cuspatial
logger.info(f"cuSpatial version: {cuspatial.__version__}")

import cuxfilter
logger.info(f"cuxfilter version: {cuxfilter.__version__}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2024-11-02 20:21:26 - INFO - Loading and preprocessing additional stop words...
2024-11-02 20:21:26 - INFO - Loading and preprocessing additional stop words...
2024-11-02 20:21:26 - INFO - Loading and preprocessing additional stop words...
INFO:BERTopic_Training:Loading and preprocessing additional stop words...
2024-11-02 20:21:26 - INFO - Processed stop words count: 4923
2024-11-02 20:21:26 - INFO - Processed stop words count: 4923
2024-11-02 20:21:26 - INFO - Processed stop words count: 4923
INFO:BERTopic_Training:Processed stop words count: 4923
2024-11-02 20:21:26 - INFO - Total stop words after preprocessing: 5038
2024-11-02 20:21:26 - INFO - Total stop words after preprocessing: 5038
2024-11-02 20:21:26 - INFO - Total stop words after preprocessing: 503