# COPY TO RETRAIN ONLY ONE MODEL

In [None]:
# Install RAPIDS (if not already installed)
# Note: Run these commands in a separate cell before running the rest of the code.

# !wget -nc https://raw.githubusercontent.com/rapidsai/rapidsai/main/utils/colab/rapids-colab.sh
# !bash rapids-colab.sh stable

# Update environment variables
import sys
sys.path.append('/usr/local/lib/python3.10/site-packages')

# Import necessary libraries
import os
import time
import logging
import pandas as pd
import numpy as np
import re
from datetime import datetime
import torch
import gc
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Import cuML's UMAP and HDBSCAN
from cuml.manifold import UMAP  # GPU-accelerated UMAP
from cuml.cluster import HDBSCAN  # GPU-accelerated HDBSCAN
import cupy as cp  # For GPU arrays

def load_dataset(path, chunksize=None):
    """
    Load and preprocess the dataset.

    Args:
        path (str): Path to the dataset CSV file.
        chunksize (int, optional): If specified, read the CSV in chunks of this size.

    Returns:
        list: A list of preprocessed sentences.
    """
    print("Loading and preprocessing dataset...")
    start_time = time.time()

    if chunksize:
        # Read the CSV file in chunks if chunksize is specified
        df_iter = pd.read_csv(path, chunksize=chunksize)
        df = pd.concat([chunk for chunk in df_iter])
    else:
        # Read the entire CSV file
        df = pd.read_csv(path)

    # Preprocess the sentences
    df['Sentence'] = df['Sentence'].str.replace(r'\n+', ' ', regex=True)
    df['Sentence'] = df['Sentence'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower()

    # Convert the 'Sentence' column to a list
    corpus = df['Sentence'].tolist()
    print(f"Dataset loaded. Total sentences: {len(corpus)}")
    print(f"Time taken for loading and preprocessing: {time.time() - start_time:.2f} seconds")
    return corpus

def create_dataframe():
    """
    Create the dataframe containing the top 10 models based on the updated combined score.

    Returns:
        pd.DataFrame: The dataframe with model parameters.
    """
    print("Creating dataframe with model parameters...")
    # Define the data as a dictionary
    data = {
        'Embeddings_Model': [
            'all-MiniLM-L12-v2',
            'paraphrase-mpnet-base-v2',
            'all-MiniLM-L12-v2',
            'paraphrase-mpnet-base-v2',
            'paraphrase-MiniLM-L6-v2',
            'paraphrase-mpnet-base-v2',
            'multi-qa-mpnet-base-cos-v1',
            'all-MiniLM-L12-v2',
            'multi-qa-mpnet-base-cos-v1',
            'multi-qa-mpnet-base-cos-v1'
        ],
        'Iteration': [66, 14, 75, 0, 19, 13, 23, 67, 28, 11],
        'Coherence': [0.577551, 0.469187, 0.543105, 0.463245, 0.425237, 0.452912, 0.430575, 0.489419, 0.439447, 0.419208],
        'Topic_Diversity': [0.45, 0.8, 0.466667, 0.82, 0.94, 0.8, 0.797059, 0.527273, 0.749474, 0.828571],
        'bertopic__min_topic_size': [102, 63, 142, 127, 64, 14, 28, 99, 29, 105],
        'bertopic__top_n_words': [30, 22, 10, 31, 27, 18, 28, 24, 14, 24],
        'hdbscan__min_cluster_size': [281, 500, 473, 494, 143, 497, 492, 258, 427, 497],
        'hdbscan__min_samples': [72, 72, 14, 28, 32, 32, 12, 37, 11, 13],
        'umap__min_dist': [0.005022, 0.077818, 0.004634, 0.058341, 0.085702, 0.086975, 0.095922, 0.004852, 0.008103, 0.022149],
        'umap__n_components': [2, 9, 5, 10, 9, 8, 9, 7, 9, 8],
        'umap__n_neighbors': [7, 11, 15, 11, 44, 9, 19, 42, 18, 14],
        'vectorizer__min_df': [0.001504, 0.009372, 0.001947, 0.007313, 0.005932, 0.009857, 0.008294, 0.001174, 0.005862, 0.009229],
    }

    # Create a DataFrame from the data dictionary
    df = pd.DataFrame(data)
    print("Dataframe created with model parameters.")
    return df

def load_embedding_models(model_names):
    """
    Load all unique embedding models.

    Args:
        model_names (list): List of embedding model names to load.

    Returns:
        dict: A dictionary mapping model names to loaded embedding models.
    """
    embedding_models = {}
    # Use GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    for model_name in model_names:
        print(f"Loading embedding model: {model_name}")
        try:
            # Load the embedding model onto the specified device
            embedding_model = SentenceTransformer(model_name, device=device)
            embedding_models[model_name] = embedding_model
            print(f"Model {model_name} loaded successfully.")
        except Exception as e:
            print(f"Failed to load embedding model {model_name}: {e}")
    print("All embedding models loaded.")
    return embedding_models

def compute_embeddings(corpus, embedding_models):
    """
    Compute embeddings for each embedding model.

    Args:
        corpus (list): List of preprocessed sentences.
        embedding_models (dict): Dictionary of embedding models.

    Returns:
        dict: A dictionary mapping model names to their computed embeddings.
    """
    precalculated_embeddings = {}
    for model_name, embedding_model in embedding_models.items():
        print(f"Computing embeddings for model: {model_name}")
        start_time = time.time()
        try:
            # Compute embeddings using the embedding model
            embeddings = embedding_model.encode(corpus, show_progress_bar=True, batch_size=64)
            # Keep embeddings as NumPy arrays
            precalculated_embeddings[model_name] = embeddings
            print(f"Embeddings computed for model: {model_name} in {time.time() - start_time:.2f} seconds.")
        except Exception as e:
            print(f"Failed to compute embeddings for model {model_name}: {e}")
    print("All embeddings computed.")
    return precalculated_embeddings

def train_and_save_models(corpus, params_df, embedding_models, precalculated_embeddings):
    """
    Train and save BERTopic models based on parameters.

    Args:
        corpus (list): List of preprocessed sentences.
        params_df (pd.DataFrame): DataFrame containing model parameters.
        embedding_models (dict): Dictionary of embedding models.
        precalculated_embeddings (dict): Dictionary of embeddings.
    """
    for idx, row in tqdm(params_df.iterrows(), total=params_df.shape[0]):
        iteration = row['Iteration']
        embedding_model_name = row['Embeddings_Model']
        print(f"\nStarting training for model {idx+1}/{len(params_df)} with embedding: {embedding_model_name}")

        # Check if embeddings are available
        if embedding_model_name not in precalculated_embeddings:
            print(f"Embeddings for model {embedding_model_name} not available. Skipping this model.")
            continue

        # Retrieve embeddings
        embeddings = precalculated_embeddings[embedding_model_name]

        # Convert embeddings to CuPy array for cuML processing
        embeddings_cupy = cp.asarray(embeddings)

        # Parameter validation and conversion
        try:
            umap_n_neighbors = int(row['umap__n_neighbors'])
            umap_n_components = int(row['umap__n_components'])
            umap_min_dist = float(row['umap__min_dist'])
            hdbscan_min_cluster_size = int(row['hdbscan__min_cluster_size'])
            hdbscan_min_samples = int(row['hdbscan__min_samples'])
            vectorizer_min_df = float(row['vectorizer__min_df'])
            bertopic_top_n_words = int(row['bertopic__top_n_words'])
            bertopic_min_topic_size = int(row['bertopic__min_topic_size'])
        except ValueError as e:
            print(f"Parameter conversion error: {e}")
            continue

        # Initialize cuML's UMAP model
        print("Initializing UMAP model...")
        umap_model = UMAP(
            n_neighbors=umap_n_neighbors,
            n_components=umap_n_components,
            min_dist=umap_min_dist,
            metric='cosine',
            random_state=42
        )

        # Initialize cuML's HDBSCAN model
        print("Initializing HDBSCAN model...")
        hdbscan_model = HDBSCAN(
            min_cluster_size=hdbscan_min_cluster_size,
            min_samples=hdbscan_min_samples,
            cluster_selection_method='eom',
            prediction_data=True,
            gen_min_span_tree=True
        )

        # Initialize CountVectorizer (CPU-based)
        print("Initializing CountVectorizer...")
        vectorizer_model = CountVectorizer(
            stop_words='english',
            min_df=vectorizer_min_df
        )

        # Initialize BERTopic model
        print("Initializing BERTopic model...")
        topic_model = BERTopic(
            embedding_model=None,  # We provide embeddings directly
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            top_n_words=bertopic_top_n_words,
            min_topic_size=bertopic_min_topic_size,
            language='english',
            calculate_probabilities=True,
            verbose=True
        )

        # Train BERTopic model
        print("Training BERTopic model...")
        start_train_time = time.time()
        try:
            # Fit the model with embeddings and corpus
            topics, probs = topic_model.fit_transform(corpus, embeddings)
            print(f"Model training completed in {time.time() - start_train_time:.2f} seconds.")
        except Exception as e:
            print(f"Error during model training: {e}")
            continue

        # Save the model
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_filename = f"bertopic_model_{idx}_iter_{iteration}_{timestamp}.pkl"
        model_path = os.path.join("/content/drive/MyDrive/", model_filename)
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        try:
            topic_model.save(model_path)
            print(f"Model saved at: {model_path}")
        except Exception as e:
            print(f"Failed to save model: {e}")
            continue

        # Cleanup
        del topic_model
        gc.collect()

    print("All models trained and saved.")

def main():
    """
    Main function to orchestrate data loading, model training, and saving.
    """
    # Path to the dataset CSV file
    dataset_path = '/content/drive/MyDrive/processed_novels_sentences_new_romantic.gsheet'

    # Load and preprocess the dataset
    corpus = load_dataset(dataset_path)

    # Create the dataframe containing model parameters
    params_df = create_dataframe()

    # Extract unique embedding model names from the dataframe
    embedding_model_names = params_df['Embeddings_Model'].unique()
    print(f"Unique embedding models to load: {embedding_model_names}")

    # Load the embedding models
    embedding_models = load_embedding_models(embedding_model_names)

    # Compute embeddings for each embedding model
    precalculated_embeddings = compute_embeddings(corpus, embedding_models)

    # Train and save BERTopic models based on parameters and embeddings
    train_and_save_models(corpus, params_df, embedding_models, precalculated_embeddings)

if __name__ == "__main__":
    main()

Loading and preprocessing dataset...
Dataset loaded. Total sentences: 680822
Time taken for loading and preprocessing: 6.08 seconds
Creating dataframe with model parameters...
Dataframe created with model parameters.
Unique embedding models to load: ['all-MiniLM-L12-v2' 'paraphrase-mpnet-base-v2' 'paraphrase-MiniLM-L6-v2'
 'multi-qa-mpnet-base-cos-v1']
Using device: cuda
Loading embedding model: all-MiniLM-L12-v2
Model all-MiniLM-L12-v2 loaded successfully.
Loading embedding model: paraphrase-mpnet-base-v2
Model paraphrase-mpnet-base-v2 loaded successfully.
Loading embedding model: paraphrase-MiniLM-L6-v2
Model paraphrase-MiniLM-L6-v2 loaded successfully.
Loading embedding model: multi-qa-mpnet-base-cos-v1
Model multi-qa-mpnet-base-cos-v1 loaded successfully.
All embedding models loaded.
Computing embeddings for model: all-MiniLM-L12-v2


Batches:   0%|          | 0/10638 [00:00<?, ?it/s]

Embeddings computed for model: all-MiniLM-L12-v2 in 311.89 seconds.
Computing embeddings for model: paraphrase-mpnet-base-v2


Batches:   0%|          | 0/10638 [00:00<?, ?it/s]

Embeddings computed for model: paraphrase-mpnet-base-v2 in 1052.22 seconds.
Computing embeddings for model: paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/10638 [00:00<?, ?it/s]

Embeddings computed for model: paraphrase-MiniLM-L6-v2 in 172.57 seconds.
Computing embeddings for model: multi-qa-mpnet-base-cos-v1


Batches:   0%|          | 0/10638 [00:00<?, ?it/s]

Embeddings computed for model: multi-qa-mpnet-base-cos-v1 in 1051.52 seconds.
All embeddings computed.


  0%|          | 0/10 [00:00<?, ?it/s]


Starting training for model 1/10 with embedding: all-MiniLM-L12-v2
Initializing UMAP model...
Initializing HDBSCAN model...
Initializing CountVectorizer...
Initializing BERTopic model...
Training BERTopic model...


2024-11-01 23:07:42,116 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-01 23:12:09,409 - BERTopic - Dimensionality - Completed ✓
2024-11-01 23:12:09,416 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-01 23:14:12,498 - BERTopic - Cluster - Completed ✓
2024-11-01 23:14:12,601 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-01 23:14:19,538 - BERTopic - Representation - Completed ✓


Model training completed in 397.93 seconds.
Model saved at: /content/drive/MyDrive/bertopic_model_0_iter_66_20241101_231419.pkl


 10%|█         | 1/10 [06:43<1:00:30, 403.42s/it]


Starting training for model 2/10 with embedding: paraphrase-mpnet-base-v2
Initializing UMAP model...
Initializing HDBSCAN model...
Initializing CountVectorizer...
Initializing BERTopic model...
Training BERTopic model...


2024-11-01 23:14:26,433 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-01 23:22:09,869 - BERTopic - Dimensionality - Completed ✓
2024-11-01 23:22:09,895 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-01 23:24:18,739 - BERTopic - Cluster - Completed ✓
2024-11-01 23:24:18,840 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-01 23:24:26,162 - BERTopic - Representation - Completed ✓


Model training completed in 600.14 seconds.


 20%|██        | 2/10 [16:54<1:10:06, 525.86s/it]

Model saved at: /content/drive/MyDrive/bertopic_model_1_iter_14_20241101_232426.pkl

Starting training for model 3/10 with embedding: all-MiniLM-L12-v2
Initializing UMAP model...
Initializing HDBSCAN model...
Initializing CountVectorizer...
Initializing BERTopic model...
Training BERTopic model...


2024-11-01 23:24:36,745 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-01 23:29:04,222 - BERTopic - Dimensionality - Completed ✓
2024-11-01 23:29:04,237 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-01 23:31:08,316 - BERTopic - Cluster - Completed ✓
2024-11-01 23:31:08,415 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-01 23:31:15,541 - BERTopic - Representation - Completed ✓


Model training completed in 399.22 seconds.


 30%|███       | 3/10 [23:41<54:59, 471.33s/it]  

Model saved at: /content/drive/MyDrive/bertopic_model_2_iter_75_20241101_233115.pkl

Starting training for model 4/10 with embedding: paraphrase-mpnet-base-v2
Initializing UMAP model...
Initializing HDBSCAN model...
Initializing CountVectorizer...
Initializing BERTopic model...
Training BERTopic model...


2024-11-01 23:31:23,449 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-01 23:39:07,819 - BERTopic - Dimensionality - Completed ✓
2024-11-01 23:39:07,848 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-01 23:41:10,287 - BERTopic - Cluster - Completed ✓
2024-11-01 23:41:10,393 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-01 23:41:17,902 - BERTopic - Representation - Completed ✓


Model training completed in 597.62 seconds.


 40%|████      | 4/10 [33:52<52:39, 526.64s/it]

Model saved at: /content/drive/MyDrive/bertopic_model_3_iter_0_20241101_234120.pkl

Starting training for model 5/10 with embedding: paraphrase-MiniLM-L6-v2
Initializing UMAP model...
Initializing HDBSCAN model...
Initializing CountVectorizer...
Initializing BERTopic model...
Training BERTopic model...


2024-11-01 23:41:34,592 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-01 23:46:20,808 - BERTopic - Dimensionality - Completed ✓
2024-11-01 23:46:20,834 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-01 23:48:25,097 - BERTopic - Cluster - Completed ✓
2024-11-01 23:48:25,197 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-01 23:48:32,481 - BERTopic - Representation - Completed ✓


Model training completed in 418.61 seconds.


 50%|█████     | 5/10 [41:01<40:56, 491.31s/it]

Model saved at: /content/drive/MyDrive/bertopic_model_4_iter_19_20241101_234832.pkl

Starting training for model 6/10 with embedding: paraphrase-mpnet-base-v2
Initializing UMAP model...
Initializing HDBSCAN model...
Initializing CountVectorizer...
Initializing BERTopic model...
Training BERTopic model...


2024-11-01 23:48:43,529 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-01 23:56:23,572 - BERTopic - Dimensionality - Completed ✓
2024-11-01 23:56:23,593 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-01 23:58:27,494 - BERTopic - Cluster - Completed ✓
2024-11-01 23:58:27,602 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-01 23:58:34,868 - BERTopic - Representation - Completed ✓


Model training completed in 594.54 seconds.


 60%|██████    | 6/10 [51:06<35:19, 529.91s/it]

Model saved at: /content/drive/MyDrive/bertopic_model_5_iter_13_20241101_235837.pkl

Starting training for model 7/10 with embedding: multi-qa-mpnet-base-cos-v1
Initializing UMAP model...
Initializing HDBSCAN model...
Initializing CountVectorizer...
Initializing BERTopic model...
Training BERTopic model...


2024-11-01 23:58:48,349 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-02 00:06:33,204 - BERTopic - Dimensionality - Completed ✓
2024-11-02 00:06:33,229 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-02 00:08:32,331 - BERTopic - Cluster - Completed ✓
2024-11-02 00:08:32,433 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-02 00:08:40,004 - BERTopic - Representation - Completed ✓


Model training completed in 594.29 seconds.


 70%|███████   | 7/10 [1:01:11<27:43, 554.62s/it]

Model saved at: /content/drive/MyDrive/bertopic_model_6_iter_23_20241102_000842.pkl

Starting training for model 8/10 with embedding: all-MiniLM-L12-v2
Initializing UMAP model...
Initializing HDBSCAN model...
Initializing CountVectorizer...
Initializing BERTopic model...
Training BERTopic model...


2024-11-02 00:08:53,584 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-02 00:13:32,237 - BERTopic - Dimensionality - Completed ✓
2024-11-02 00:13:32,257 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-02 00:15:39,695 - BERTopic - Cluster - Completed ✓
2024-11-02 00:15:39,797 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-02 00:15:46,925 - BERTopic - Representation - Completed ✓


Model training completed in 413.85 seconds.


 80%|████████  | 8/10 [1:08:12<17:03, 511.85s/it]

Model saved at: /content/drive/MyDrive/bertopic_model_7_iter_67_20241102_001547.pkl

Starting training for model 9/10 with embedding: multi-qa-mpnet-base-cos-v1
Initializing UMAP model...
Initializing HDBSCAN model...
Initializing CountVectorizer...
Initializing BERTopic model...
Training BERTopic model...


2024-11-02 00:15:54,135 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-02 00:23:38,593 - BERTopic - Dimensionality - Completed ✓
2024-11-02 00:23:38,619 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-02 00:25:38,021 - BERTopic - Cluster - Completed ✓
2024-11-02 00:25:38,133 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-02 00:25:45,980 - BERTopic - Representation - Completed ✓


Model training completed in 594.15 seconds.


 90%|█████████ | 9/10 [1:18:17<09:01, 541.18s/it]

Model saved at: /content/drive/MyDrive/bertopic_model_8_iter_28_20241102_002548.pkl

Starting training for model 10/10 with embedding: multi-qa-mpnet-base-cos-v1
Initializing UMAP model...
Initializing HDBSCAN model...
Initializing CountVectorizer...
Initializing BERTopic model...
Training BERTopic model...


2024-11-02 00:25:59,784 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-02 00:33:40,110 - BERTopic - Dimensionality - Completed ✓
2024-11-02 00:33:40,131 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-02 00:35:40,219 - BERTopic - Cluster - Completed ✓
2024-11-02 00:35:40,323 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-02 00:35:48,109 - BERTopic - Representation - Completed ✓


Model training completed in 590.14 seconds.


100%|██████████| 10/10 [1:28:18<00:00, 529.82s/it]

Model saved at: /content/drive/MyDrive/bertopic_model_9_iter_11_20241102_003549.pkl
All models trained and saved.





# COPY TO RETRAIN WITH POS

In [11]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# ==============================
# 1. Setup and Installation
# ==============================

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Check CUDA and driver versions
!nvcc --version  # Check CUDA version
!nvidia-smi      # Check driver version

# Install RAPIDS and other required libraries
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Fri Oct 31 21:16:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |           

In [13]:
## After restarting, install remaining necessary libraries
# Run this cell after restarting the runtime
!pip install bertopic==0.16.3
!pip install octis
!pip install sentence-transformers
!pip install umap-learn==0.5.3  # Specify a compatible version
!pip install hdbscan
!pip install tqdm
!pip install pandas
!pip install gensim
!pip install wandb
!pip install umap
!pip install scipy
!pip install nltk

Collecting bertopic==0.16.3
  Downloading bertopic-0.16.3-py3-none-any.whl.metadata (23 kB)
Downloading bertopic-0.16.3-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.16.3
Collecting octis
  Using cached octis-1.14.0-py2.py3-none-any.whl.metadata (27 kB)
Collecting gensim<5.0,>=4.2.0 (from octis)
  Using cached gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting scikit-learn==1.1.0 (from octis)
  Using cached scikit-learn-1.1.0.tar.gz (6.8 MB)
  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a 

In [14]:
!pip list | grep umap

umap                                     0.1.1
umap-learn                               0.5.3


In [15]:
!pip uninstall -y umap

Found existing installation: umap 0.1.1
Uninstalling umap-0.1.1:
  Successfully uninstalled umap-0.1.1


In [16]:
!find . -type d -name "__pycache__" -exec rm -r {} +

In [17]:
!pip install --upgrade bertopic umap-learn

Collecting bertopic
  Using cached bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.9.post2-py3-none-any.whl (90 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: umap-learn, bertopic
  Attempting uninstall: umap-learn
    Found existing installation: umap-learn 0.5.3
    Uninstalling umap-learn-0.5.3:
      Successfully uninstalled umap-learn-0.5.3
  Attempting uninstall: bertopic
    Found existing installation: bertopic 0.16.3
    Uninstalling bertopic-0.16.3:
      Successfully uninstalled bertopic-0.16.3
Successfully installed bertopic-0.17.3 umap-learn-0.5.9.post2


In [34]:
# path: scripts/train_bertopic_grid.py
"""
BERTopic grid training:
- c_npmi coherence + per-topic breakdown
- GPU/CPU fallback for UMAP/HDBSCAN
- Embedding caching per model
- Fixed PartOfSpeech construction for older/newer BERTopic (no embedding_model arg)
"""

from __future__ import annotations

import gc
import json
import logging
import math
import os
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from tqdm import tqdm

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, PartOfSpeech

# Optional coherence (auto-disabled if missing)
try:
    from gensim.corpora import Dictionary
    from gensim.models.coherencemodel import CoherenceModel
    HAS_GENSIM = True
except Exception:
    HAS_GENSIM = False

import spacy
from spacy.cli import download as spacy_download


# -----------------------------
# 1) Configuration
# -----------------------------
@dataclass
class Config:
    dataset_csv: Path
    text_column: str = "Sentence"
    output_dir: Path = Path("./bertopic_runs")
    custom_stopwords_file: Optional[Path] = None
    test_mode: bool = False
    sample_size: int = 10000
    min_doc_tokens: int = 3
    seed: int = 42

    # scoring weights
    w_diversity: float = 0.5
    w_coherence: float = 0.5  # applied to c_npmi if available
    penalty_outliers: float = 0.25
    target_topics: Optional[Tuple[int, int]] = None

    embeddings_dir: Path = Path("./bertopic_runs/embeddings")
    pos_configs: Tuple[str, ...] = ("a", "b", "c")
    param_cols: Tuple[str, ...] = (
        "Embeddings_Model",
        "Iteration",
        "bertopic__min_topic_size",
        "bertopic__top_n_words",
        "hdbscan__min_cluster_size",
        "hdbscan__min_samples",
        "umap__min_dist",
        "umap__n_components",
        "umap__n_neighbors",
        "vectorizer__min_df",
    )


# -----------------------------
# 2) Logging
# -----------------------------
def setup_logging(out_dir: Path) -> logging.Logger:
    out_dir.mkdir(parents=True, exist_ok=True)
    log_file = out_dir / "train.log"

    logger = logging.getLogger("bertopic_grid")
    logger.setLevel(logging.INFO)

    if logger.handlers:
        for h in list(logger.handlers):
            logger.removeHandler(h)

    fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    fh = logging.FileHandler(log_file, encoding="utf-8")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(fmt)
    logger.addHandler(fh)

    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(fmt)
    logger.addHandler(ch)

    return logger


# -----------------------------
# 3) CPU/GPU fallback
# -----------------------------
def make_umap_hdbscan(
    n_neighbors: int,
    n_components: int,
    min_dist: float,
    hdb_min_cluster_size: int,
    hdb_min_samples: int,
    seed: int,
    logger: logging.Logger,
):
    try:
        from cuml.manifold import UMAP as GPU_UMAP  # type: ignore
        from cuml.cluster import HDBSCAN as GPU_HDBSCAN  # type: ignore

        logger.info("Using GPU (cuML) UMAP + HDBSCAN")
        umap_model = GPU_UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=min_dist,
            metric="cosine",
            random_state=seed,
        )
        hdbscan_model = GPU_HDBSCAN(
            min_cluster_size=hdb_min_cluster_size,
            min_samples=hdb_min_samples,
            cluster_selection_method="eom",
            prediction_data=True,
            gen_min_span_tree=True,
        )
        return umap_model, hdbscan_model, True
    except Exception:
        from umap import UMAP as CPU_UMAP  # type: ignore
        import hdbscan as CPU_HDBSCAN  # type: ignore

        logger.info("Using CPU UMAP + HDBSCAN (fallback)")
        umap_model = CPU_UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=min_dist,
            metric="cosine",
            random_state=seed,
        )
        hdbscan_model = CPU_HDBSCAN.HDBSCAN(
            min_cluster_size=hdb_min_cluster_size,
            min_samples=hdb_min_samples,
            cluster_selection_method="eom",
            prediction_data=True,
            gen_min_span_tree=True,
        )
        return umap_model, hdbscan_model, False


# -----------------------------
# 4) Stopwords
# -----------------------------
def load_stopwords(custom_file: Optional[Path]) -> set:
    stops = set(w.lower() for w in ENGLISH_STOP_WORDS)
    if custom_file and custom_file.exists():
        punct = re.compile(r"[^\w\s]")
        with custom_file.open("r", encoding="utf-8") as f:
            for line in f:
                line = punct.sub("", line.strip().lower())
                for token in line.split():
                    if token:
                        stops.add(token)
    return stops


# -----------------------------
# 5) Data loading & cleaning
# -----------------------------
def simple_clean(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    return " ".join(tok for tok in text.split() if tok)


def load_corpus(cfg: Config, logger: logging.Logger, stopwords: set) -> List[str]:
    df = pd.read_csv(cfg.dataset_csv)
    if cfg.text_column not in df.columns:
        raise ValueError(f"Missing column '{cfg.text_column}' in dataset")

    docs = [simple_clean(str(x)) for x in df[cfg.text_column].tolist()]
    docs = [" ".join(tok for tok in d.split() if tok not in stopwords) for d in docs]
    docs = [d for d in docs if len(d.split()) >= cfg.min_doc_tokens]

    if cfg.test_mode and len(docs) > cfg.sample_size:
        rng = np.random.default_rng(cfg.seed)
        idx = rng.choice(len(docs), size=cfg.sample_size, replace=False)
        docs = [docs[i] for i in idx]

    logger.info(f"Loaded {len(docs):,} cleaned documents")
    return docs


# -----------------------------
# 6) spaCy (for POS repr)
# -----------------------------
def ensure_spacy(logger: logging.Logger):
    try:
        _ = spacy.load("en_core_web_sm")
    except OSError:
        logger.info("Downloading spaCy model 'en_core_web_sm'...")
        spacy_download("en_core_web_sm")


# -----------------------------
# 7) Embeddings cache
# -----------------------------
def get_embeddings(
    model_name: str, docs: List[str], out_dir: Path, device: str, logger: logging.Logger
) -> np.ndarray:
    out_dir.mkdir(parents=True, exist_ok=True)
    key = model_name.replace("/", "_")
    fpath = out_dir / f"{key}.npy"

    if fpath.exists():
        logger.info(f"Loading cached embeddings: {fpath}")
        return np.load(fpath)

    logger.info(f"Encoding embeddings: {model_name} on {device}")
    model = SentenceTransformer(model_name, device=device)
    embs = model.encode(docs, show_progress_bar=True, batch_size=64)
    np.save(fpath, embs)
    logger.info(f"Saved embeddings: {fpath}")
    return embs


# -----------------------------
# 8) Representation models (POS) — fixed
# -----------------------------
def build_representation(pos_cfg: str, logger: logging.Logger):
    """Return a dict of representation models. No embedding_model passed to PartOfSpeech (compat)."""
    main = KeyBERTInspired(top_n_words=30)

    # Older/newer BERTopic: PartOfSpeech takes only pos_patterns (+ optional spacy pipeline args)
    try:
        pos_map = {
            "a": {"NOUNS": PartOfSpeech(pos_patterns=[[{"POS": "NOUN"}]])},
            "b": {
                "NOUNS": PartOfSpeech(pos_patterns=[[{"POS": "NOUN"}]]),
                "ADJECTIVES": PartOfSpeech(pos_patterns=[[{"POS": "ADJ"}]]),
            },
            "c": {
                "NOUNS": PartOfSpeech(pos_patterns=[[{"POS": "NOUN"}]]),
                "VERBS": PartOfSpeech(pos_patterns=[[{"POS": "VERB"}]]),
            },
        }
        if pos_cfg not in pos_map:
            raise ValueError(f"Invalid POS configuration: {pos_cfg}")
        rep = {"Main": main}
        rep.update(pos_map[pos_cfg])
        return rep
    except Exception as e:
        # Why: ensure training continues even if POS component mismatches current BERTopic version
        logger.warning(f"PartOfSpeech unavailable ({e}); falling back to KeyBERTInspired only.")
        return {"Main": main}


# -----------------------------
# 9) Metrics: diversity + c_npmi
# -----------------------------
def topic_diversity(topics_dict: Dict[int, List[Tuple[str, float]]], top_k: int = 10) -> float:
    words: List[str] = []
    for k, items in topics_dict.items():
        if k == -1:
            continue
        words.extend([w for (w, _) in items[:top_k]])
    if not words:
        return 0.0
    return len(set(words)) / float(len(words))


def compute_coherence_npmi(
    docs: List[str],
    topics_dict: Dict[int, List[Tuple[str, float]]],
    top_k: int = 10,
) -> Tuple[Optional[float], Dict[int, float]]:
    if not HAS_GENSIM:
        return None, {}

    tokenized = [d.split() for d in docs]
    topic_ids = sorted(t for t in topics_dict.keys() if t != -1)
    topic_words = [[w for (w, _) in topics_dict[t][:top_k]] for t in topic_ids]
    if not topic_words:
        return None, {}

    dictionary = Dictionary(tokenized)
    corpus = [dictionary.doc2bow(toks) for toks in tokenized]

    cm = CoherenceModel(
        topics=topic_words,
        texts=tokenized,
        corpus=corpus,
        dictionary=dictionary,
        coherence="c_npmi",
    )
    overall = float(cm.get_coherence())
    per_topic_vals = cm.get_coherence_per_topic()
    per_topic = {tid: float(val) for tid, val in zip(topic_ids, per_topic_vals)}
    return overall, per_topic


# -----------------------------
# 10) Training single run
# -----------------------------
def train_single(
    *,
    docs: List[str],
    embs: np.ndarray,
    params_row: pd.Series,
    pos_cfg: str,
    cfg: Config,
    logger: logging.Logger,
    device: str,
) -> Dict:
    vectorizer = CountVectorizer(stop_words="english", min_df=float(params_row["vectorizer__min_df"]))

    umap_model, hdbscan_model, using_gpu = make_umap_hdbscan(
        n_neighbors=int(params_row["umap__n_neighbors"]),
        n_components=int(params_row["umap__n_components"]),
        min_dist=float(params_row["umap__min_dist"]),
        hdb_min_cluster_size=int(params_row["hdbscan__min_cluster_size"]),
        hdb_min_samples=int(params_row["hdbscan__min_samples"]),
        seed=cfg.seed,
        logger=logger,
    )

    # No SentenceTransformer object for representations (not needed)
    rep = build_representation(pos_cfg, logger)

    topic_model = BERTopic(
        embedding_model=None,  # using precomputed embeddings
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer,
        representation_model=rep,
        top_n_words=int(params_row["bertopic__top_n_words"]),
        min_topic_size=int(params_row["bertopic__min_topic_size"]),
        language="english",
        calculate_probabilities=True,
        verbose=False,
    )

    t0 = time.time()
    topics, probs = topic_model.fit_transform(docs, embs)
    fit_secs = time.time() - t0

    topics_info = topic_model.get_topic_info()
    topics_dict = topic_model.get_topics()

    n_outliers = int((np.array(topics) == -1).sum())
    outlier_ratio = n_outliers / max(1, len(topics))
    n_topics = int((topics_info["Topic"] != -1).sum())
    diversity = topic_diversity(
        topics_dict, top_k=min(10, int(params_row["bertopic__top_n_words"]))
    )

    coh_overall, coh_per_topic = compute_coherence_npmi(docs, topics_dict, top_k=10)

    safe_embed = str(params_row["Embeddings_Model"]).replace("/", "_")
    run_dir = (
        cfg.output_dir
        / f"{safe_embed}"
        / f"iter_{int(params_row['Iteration'])}"
        / f"POS_{pos_cfg}"
    )
    run_dir.mkdir(parents=True, exist_ok=True)

    topic_model.save(run_dir / "bertopic_model.pkl")

    topics_info = topics_info.copy()
    topics_info["c_npmi"] = topics_info["Topic"].map(coh_per_topic)
    topics_info.to_csv(run_dir / "topics_info.csv", index=False)

    if coh_per_topic:
        pd.DataFrame(
            [{"Topic": k, "c_npmi": v} for k, v in sorted(coh_per_topic.items())]
        ).to_csv(run_dir / "per_topic_coherence.csv", index=False)

    topics_json = {
        str(k): [{w: float(p)} for (w, p) in v[: int(params_row["bertopic__top_n_words"])]]
        for k, v in topics_dict.items()
        if k != -1 and v
    }
    with (run_dir / "topics.json").open("w", encoding="utf-8") as f:
        json.dump(topics_json, f, indent=2)

    # Visuals (best-effort)
    try:
        topic_model.visualize_barchart(top_n_topics=15).write_html(str(run_dir / "barchart.html"))
    except Exception:
        pass
    try:
        topic_model.visualize_topics().write_html(str(run_dir / "topics.html"))
    except Exception:
        pass
    try:
        topic_model.visualize_hierarchy().write_html(str(run_dir / "hierarchy.html"))
    except Exception:
        pass

    preview_rows = []
    for _, r in topics_info[topics_info["Topic"] != -1].head(15).iterrows():
        t_id = int(r["Topic"])
        top_words = ", ".join([w for w, _ in topics_dict[t_id]][:10])
        preview_rows.append(
            {"Topic": t_id, "Count": int(r["Count"]), "C_nPMI": coh_per_topic.get(t_id, np.nan), "TopWords": top_words}
        )
    pd.DataFrame(preview_rows).to_csv(run_dir / "top_topics_preview.csv", index=False)

    del topic_model, probs
    gc.collect()

    return {
        "run_dir": str(run_dir),
        "using_gpu": using_gpu,
        "n_topics": n_topics,
        "outlier_ratio": outlier_ratio,
        "topic_diversity": diversity,
        "coherence_c_npmi": coh_overall,
        "fit_seconds": fit_secs,
    }


# -----------------------------
# 11) Scoring & ranking
# -----------------------------
def rank_runs(summary_df: pd.DataFrame, cfg: Config, logger: logging.Logger) -> pd.DataFrame:
    df = summary_df.copy()

    def minmax(col):
        vals = df[col].astype(float)
        lo, hi = vals.min(), vals.max()
        if math.isclose(lo, hi):
            return pd.Series([0.5] * len(vals), index=vals.index)
        return (vals - lo) / (hi - lo)

    df["norm_diversity"] = minmax("topic_diversity")

    if "coherence_c_npmi" in df.columns and df["coherence_c_npmi"].notna().any():
        df["coherence_c_npmi"] = pd.to_numeric(df["coherence_c_npmi"], errors="coerce")
        df["norm_coherence"] = minmax("coherence_c_npmi")
        w_coh = cfg.w_coherence if HAS_GENSIM else 0.0
    else:
        df["norm_coherence"] = 0.0
        w_coh = 0.0

    df["penalty"] = cfg.penalty_outliers * df["outlier_ratio"]

    if cfg.target_topics:
        lo, hi = cfg.target_topics
        center = (lo + hi) / 2.0
        span = max(1.0, (hi - lo) / 2.0)
        df["topics_window_penalty"] = np.clip(np.abs(df["n_topics"] - center) / span, 0, 1) * 0.2
    else:
        df["topics_window_penalty"] = 0.0

    df["score"] = (
        cfg.w_diversity * df["norm_diversity"]
        + w_coh * df["norm_coherence"]
        - df["penalty"]
        - df["topics_window_penalty"]
    )

    df = df.sort_values("score", ascending=False).reset_index(drop=True)
    return df


# -----------------------------
# 12) Main
# -----------------------------
def main():
    cfg = Config(
        dataset_csv=Path("/content/drive/MyDrive/Billionaire_CSV_Processed_Novels_Chapters_Sentences - processed_novels_sentences_new.csv"),
        custom_stopwords_file=Path("/content/drive/MyDrive/Billionaire_Character_Names_Extracted.txt"),
        output_dir=Path("/content/drive/MyDrive/Billionaire_BERTTopic_Models_POS_simplified"),
        test_mode=False,
        sample_size=1000,
        target_topics=None,
    )

    def create_params_df() -> pd.DataFrame:
        models = [
            {"Embeddings_Model": "paraphrase-mpnet-base-v2","Iteration": 1,"bertopic__min_topic_size": 57,"bertopic__top_n_words": 37,"hdbscan__min_cluster_size": 132,"hdbscan__min_samples": 57,"umap__min_dist": 0.053015,"umap__n_components": 4,"umap__n_neighbors": 39,"vectorizer__min_df": 0.004806},
            {"Embeddings_Model": "paraphrase-MiniLM-L6-v2","Iteration": 1,"bertopic__min_topic_size": 57,"bertopic__top_n_words": 37,"hdbscan__min_cluster_size": 132,"hdbscan__min_samples": 57,"umap__min_dist": 0.053015,"umap__n_components": 4,"umap__n_neighbors": 39,"vectorizer__min_df": 0.004806},
            {"Embeddings_Model": "all-MiniLM-L12-v2","Iteration": 101,"bertopic__min_topic_size": 127,"bertopic__top_n_words": 26,"hdbscan__min_cluster_size": 128,"hdbscan__min_samples": 68,"umap__min_dist": 0.066224,"umap__n_components": 2,"umap__n_neighbors": 20,"vectorizer__min_df": 0.002257},
            {"Embeddings_Model": "all-MiniLM-L12-v2","Iteration": 66,"bertopic__min_topic_size": 228,"bertopic__top_n_words": 27,"hdbscan__min_cluster_size": 105,"hdbscan__min_samples": 22,"umap__min_dist": 0.011738,"umap__n_components": 2,"umap__n_neighbors": 41,"vectorizer__min_df": 0.006573},
            {"Embeddings_Model": "paraphrase-MiniLM-L6-v2","Iteration": 112,"bertopic__min_topic_size": 131,"bertopic__top_n_words": 22,"hdbscan__min_cluster_size": 86,"hdbscan__min_samples": 73,"umap__min_dist": 0.083975,"umap__n_components": 2,"umap__n_neighbors": 3,"vectorizer__min_df": 0.001335},
            {"Embeddings_Model": "paraphrase-MiniLM-L6-v2","Iteration": 59,"bertopic__min_topic_size": 26,"bertopic__top_n_words": 38,"hdbscan__min_cluster_size": 120,"hdbscan__min_samples": 95,"umap__min_dist": 0.052454,"umap__n_components": 2,"umap__n_neighbors": 2,"vectorizer__min_df": 0.007219},
            {"Embeddings_Model": "paraphrase-mpnet-base-v2","Iteration": 38,"bertopic__min_topic_size": 187,"bertopic__top_n_words": 35,"hdbscan__min_cluster_size": 60,"hdbscan__min_samples": 88,"umap__min_dist": 0.011252,"umap__n_components": 4,"umap__n_neighbors": 42,"vectorizer__min_df": 0.009032},
            {"Embeddings_Model": "paraphrase-MiniLM-L6-v2","Iteration": 13,"bertopic__min_topic_size": 59,"bertopic__top_n_words": 16,"hdbscan__min_cluster_size": 54,"hdbscan__min_samples": 74,"umap__min_dist": 0.020023,"umap__n_components": 4,"umap__n_neighbors": 11,"vectorizer__min_df": 0.002484},
            {"Embeddings_Model": "all-MiniLM-L12-v2","Iteration": 60,"bertopic__min_topic_size": 77,"bertopic__top_n_words": 13,"hdbscan__min_cluster_size": 144,"hdbscan__min_samples": 51,"umap__min_dist": 0.069807,"umap__n_components": 4,"umap__n_neighbors": 2,"vectorizer__min_df": 0.002135},
            {"Embeddings_Model": "all-MiniLM-L12-v2","Iteration": 52,"bertopic__min_topic_size": 245,"bertopic__top_n_words": 38,"hdbscan__min_cluster_size": 98,"hdbscan__min_samples": 30,"umap__min_dist": 0.044896,"umap__n_components": 9,"umap__n_neighbors": 28,"vectorizer__min_df": 0.005585},
        ]
        return pd.DataFrame(models)

    logger = setup_logging(cfg.output_dir)
    logger.info("Starting BERTopic grid with c_npmi coherence")

    if not HAS_GENSIM:
        logger.info("Gensim not found: c_npmi coherence will be skipped and weight set to 0")
        cfg.w_coherence = 0.0

    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)

    ensure_spacy(logger)
    stops = load_stopwords(cfg.custom_stopwords_file)
    docs = load_corpus(cfg, logger, stops)

    params_df = create_params_df()
    for col in cfg.param_cols:
        if col not in params_df.columns:
            raise ValueError(f"Params DF missing column '{col}'")

    emb_models = params_df["Embeddings_Model"].unique().tolist()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Embedding device: {device}")

    embed_cache: Dict[str, np.ndarray] = {}
    for name in emb_models:
        embs = get_embeddings(name, docs, cfg.embeddings_dir, device, logger)
        embed_cache[name] = embs

    summary_rows: List[Dict] = []
    for _, row in tqdm(params_df.iterrows(), total=len(params_df), desc="Training grid"):
        emb_name = str(row["Embeddings_Model"])
        embs = embed_cache[emb_name]

        for pos_cfg in cfg.pos_configs:
            try:
                result = train_single(
                    docs=docs,
                    embs=embs,
                    params_row=row,
                    pos_cfg=pos_cfg,
                    cfg=cfg,
                    logger=logger,
                    device=device,
                )
            except Exception as e:
                logger.exception(f"Run failed for {emb_name} iter={row['Iteration']} POS={pos_cfg}: {e}")
                continue

            summary_rows.append(
                {
                    **{k: row[k] for k in cfg.param_cols},
                    "POS_Config": pos_cfg,
                    **result,
                }
            )
            gc.collect()

    if not summary_rows:
        logger.error("No successful runs.")
        return

    summary_df = pd.DataFrame(summary_rows)
    summary_path = cfg.output_dir / "summary_raw.csv"
    summary_df.to_csv(summary_path, index=False)
    logger.info(f"Wrote raw summary: {summary_path}")

    ranked = rank_runs(summary_df, cfg, logger)
    ranked_path = cfg.output_dir / "summary_ranked.csv"
    ranked.to_csv(ranked_path, index=False)
    logger.info(f"Wrote ranked summary: {ranked_path}")

    top5 = ranked.head(5)[
        [
            "Embeddings_Model",
            "Iteration",
            "POS_Config",
            "n_topics",
            "outlier_ratio",
            "topic_diversity",
            "coherence_c_npmi",
            "score",
            "run_dir",
        ]
    ]
    top5.to_csv(cfg.output_dir / "leaderboard_top5.csv", index=False)

    best = ranked.iloc[0].to_dict()
    with (cfg.output_dir / "best_overall.txt").open("w", encoding="utf-8") as f:
        json.dump(best, f, indent=2)

    logger.info("Done. Best run:")
    logger.info(json.dumps(best, indent=2))


if __name__ == "__main__":
    main()


2025-10-31 22:47:53,395 - INFO - Starting BERTopic grid with c_npmi coherence
INFO:bertopic_grid:Starting BERTopic grid with c_npmi coherence
2025-10-31 22:48:03,553 - INFO - Loaded 391,594 cleaned documents
INFO:bertopic_grid:Loaded 391,594 cleaned documents
2025-10-31 22:48:03,589 - INFO - Embedding device: cuda
INFO:bertopic_grid:Embedding device: cuda
2025-10-31 22:48:03,590 - INFO - Loading cached embeddings: bertopic_runs/embeddings/paraphrase-mpnet-base-v2.npy
INFO:bertopic_grid:Loading cached embeddings: bertopic_runs/embeddings/paraphrase-mpnet-base-v2.npy
2025-10-31 22:48:03,980 - INFO - Loading cached embeddings: bertopic_runs/embeddings/paraphrase-MiniLM-L6-v2.npy
INFO:bertopic_grid:Loading cached embeddings: bertopic_runs/embeddings/paraphrase-MiniLM-L6-v2.npy
2025-10-31 22:48:04,170 - INFO - Loading cached embeddings: bertopic_runs/embeddings/all-MiniLM-L12-v2.npy
INFO:bertopic_grid:Loading cached embeddings: bertopic_runs/embeddings/all-MiniLM-L12-v2.npy
Training grid:  

[2025-10-31 22:48:04.375] [CUML] [info] build_algo set to brute_force_knn because random_state is given


2025-10-31 22:51:33,904 - ERROR - Run failed for paraphrase-mpnet-base-v2 iter=1 POS=a: 'NoneType' object has no attribute 'embed_documents'
Traceback (most recent call last):
  File "/tmp/ipython-input-1103535406.py", line 546, in main
    result = train_single(
             ^^^^^^^^^^^^^
  File "/tmp/ipython-input-1103535406.py", line 358, in train_single
    topics, probs = topic_model.fit_transform(docs, embs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/bertopic/_bertopic.py", line 515, in fit_transform
    self._extract_topics(
  File "/usr/local/lib/python3.12/dist-packages/bertopic/_bertopic.py", line 4049, in _extract_topics
    self.topic_representations_ = self._extract_words_per_topic(
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/bertopic/_bertopic.py", line 4376, in _extract_words_per_topic
    topics = main_model.extract_topics(self, documents, 

[2025-10-31 22:51:33.934] [CUML] [info] build_algo set to brute_force_knn because random_state is given


2025-10-31 22:55:07,605 - ERROR - Run failed for paraphrase-mpnet-base-v2 iter=1 POS=b: 'NoneType' object has no attribute 'embed_documents'
Traceback (most recent call last):
  File "/tmp/ipython-input-1103535406.py", line 546, in main
    result = train_single(
             ^^^^^^^^^^^^^
  File "/tmp/ipython-input-1103535406.py", line 358, in train_single
    topics, probs = topic_model.fit_transform(docs, embs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/bertopic/_bertopic.py", line 515, in fit_transform
    self._extract_topics(
  File "/usr/local/lib/python3.12/dist-packages/bertopic/_bertopic.py", line 4049, in _extract_topics
    self.topic_representations_ = self._extract_words_per_topic(
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/bertopic/_bertopic.py", line 4376, in _extract_words_per_topic
    topics = main_model.extract_topics(self, documents, 

[2025-10-31 22:55:07.636] [CUML] [info] build_algo set to brute_force_knn because random_state is given


Training grid:   0%|          | 0/10 [08:30<?, ?it/s]


KeyboardInterrupt: 