In [None]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

fatal: destination path 'rapidsai-csp-utils' already exists and is not an empty directory.
Installing RAPIDS remaining 24.10.* libraries
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com

        ***********************************************************************
        The pip install of RAPIDS is complete.
        
        Please do not run any further installation from the conda based installation methods, as they may cause issues!
        
        Please ensure that you're pulling from the git repo to remain updated with the latest working install scripts.

        Troubleshooting:
            - If there is an installation failure, please check back on RAPIDSAI owned templates/notebooks to see how to update your personal files. 
            - If an installation failure persists when using the latest script, please make an issue on https://github.com/rapidsai-community/rapidsai-csp-utils
        ****************************************************************

In [None]:
# Install other required libraries
# bertopic and scikit-learn are pinned to exact versions; sentence-transformers, gensim
# and nltk are unpinned, so pip installs whatever version it resolves when the cell runs.
# NOTE(review): consider pinning the remaining packages too so that reruns are reproducible.
!pip install bertopic==0.16.3
!pip install sentence-transformers
!pip install gensim
!pip install nltk
!pip install scikit-learn==1.0.2

# Import necessary libraries
import cudf
import cuml
import cupy as cp
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN

import nltk
# Fetch the NLTK resources this cell relies on:
#   - 'stopwords' backs stopwords.words('english') used when building stop_words.
#   - 'punkt' is presumably the tokenizer data behind word_tokenize (verify for the installed NLTK version).
# NOTE(review): 'averaged_perceptron_tagger' is not referenced by the code visible in this cell — confirm it is still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

import os
import re
import pandas as pd
import numpy as np
import json
import warnings

# Import BERTopic and its components
import bertopic  # Import the entire bertopic module
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import CoherenceModel

# Verify BERTopic version
print(f"BERTopic version: {bertopic.__version__}")

# Mount Google Drive so that the /content/drive/MyDrive paths below resolve
from google.colab import drive
drive.mount('/content/drive')

# Locations on the mounted drive: sentence dataset, extra stop words (character names),
# and the directory holding the trained BERTopic models
dir_with_trained_models = '/content/drive/MyDrive/models_best_ten'
additional_stop_words_characters_names = '/content/drive/MyDrive/character_names.txt'
dataset_path = '/content/drive/MyDrive/processed_novels_sentences_new.csv'

# 1. Load additional stop words (character names) and standard English stop words
print("Loading additional stop words and standard English stop words...")
with open(additional_stop_words_characters_names, 'r', encoding='utf-8') as file:
    # Sentences are lowercased before tokenization and tokens are matched against
    # stop_words verbatim, so each custom entry is trimmed and lowercased the same way;
    # otherwise a capitalised name or one with stray whitespace would never match.
    # Blank lines are dropped.
    custom_stop_words = [
        entry.strip().lower()
        for entry in file.read().splitlines()
        if entry.strip()
    ]
    # NOTE(review): tokens are single words, so a multi-word entry (e.g. a full name)
    # still cannot match — confirm the file holds one word per line.
stop_words = set(stopwords.words('english'))
stop_words.update(custom_stop_words)
print(f"Stop words loaded. Total stop words: {len(stop_words)}")

# 2. Read the sentence dataset and normalise the 'Sentence' column
print("Loading dataset...")
df = pd.read_csv(dataset_path)
print("Dataset loaded.")

print("Preprocessing sentences...")


def _normalise_sentence(text):
    """Replace newline runs with a space, collapse whitespace, trim, and lowercase."""
    without_newlines = re.sub(r'\n+', ' ', text)
    return re.sub(r'\s+', ' ', without_newlines).strip().lower()


# Values are cast to str first, so every row (including missing ones) is normalised.
df['Sentence'] = df['Sentence'].astype(str).map(_normalise_sentence)
print("Sentences preprocessed.")

# Plain Python list of the normalised sentences, in dataset order
dataset_as_list_of_strings = list(df['Sentence'])
print(f"Total sentences in dataset: {len(df)}")

# 3. Tokenize every sentence, keeping only alphabetic tokens that are not stop words
print("Tokenizing sentences and removing stop words...")
processed_docs = [
    [word for word in word_tokenize(text) if word.isalpha() and word not in stop_words]
    for text in dataset_as_list_of_strings
]
print("Tokenization and stop word removal completed.")

# 4. Build the gensim dictionary and the bag-of-words corpus from the tokenized documents
print("Creating dictionary and corpus for coherence model...")
dictionary = corpora.Dictionary(processed_docs)
# One bag-of-words vector per tokenized document, in document order
corpus = list(map(dictionary.doc2bow, processed_docs))
print("Dictionary and corpus created.")

# 5. Describe the saved models: the lists below are parallel, one entry per model,
#    and are combined into a single DataFrame that maps each model to its parameters.

# Saved BERTopic model files
model_files = [
    'bertopic_model_0_iter_66_20241101_231419.pkl',
    'bertopic_model_1_iter_14_20241101_232426.pkl',
    'bertopic_model_2_iter_75_20241101_233115.pkl',
    'bertopic_model_3_iter_0_20241101_234120.pkl',
    'bertopic_model_4_iter_19_20241101_234832.pkl',
    'bertopic_model_5_iter_13_20241101_235837.pkl',
    'bertopic_model_6_iter_23_20241102_000842.pkl',
    'bertopic_model_7_iter_67_20241102_001547.pkl',
    'bertopic_model_8_iter_28_20241102_002548.pkl',
    'bertopic_model_9_iter_11_20241102_003549.pkl',
]

# Sentence-embedding model paired with each file above
embedding_models = [
    'all-MiniLM-L12-v2',
    'paraphrase-mpnet-base-v2',
    'all-MiniLM-L12-v2',
    'paraphrase-mpnet-base-v2',
    'paraphrase-MiniLM-L6-v2',
    'paraphrase-mpnet-base-v2',
    'multi-qa-mpnet-base-cos-v1',
    'all-MiniLM-L12-v2',
    'multi-qa-mpnet-base-cos-v1',
    'multi-qa-mpnet-base-cos-v1',
]

# Iteration number of each model (matches the "iter_N" part of the file name)
iterations = [66, 14, 75, 0, 19, 13, 23, 67, 28, 11]

# 'bertopic__top_n_words' for each model (10 everywhere to avoid mismatches)
bertopic_top_n_words = [10 for _ in range(10)]

# UMAP parameters
umap_n_neighbors = [7, 11, 15, 11, 44, 9, 19, 42, 18, 14]
umap_n_components = [2, 9, 5, 10, 9, 8, 9, 7, 9, 8]
umap_min_dist = [
    0.005022, 0.077818, 0.004634, 0.058341, 0.085702,
    0.086975, 0.095922, 0.004852, 0.008103, 0.022149,
]

# HDBSCAN parameters
hdbscan_min_cluster_size = [281, 500, 473, 494, 143, 497, 492, 258, 427, 497]
hdbscan_min_samples = [72, 72, 14, 28, 32, 32, 12, 37, 11, 13]

# Column name -> per-model values
data = dict([
    ('model_name', model_files),
    ('embedding_model', embedding_models),
    ('iteration', iterations),
    ('bertopic_top_n_words', bertopic_top_n_words),
    ('umap__n_neighbors', umap_n_neighbors),
    ('umap__n_components', umap_n_components),
    ('umap__min_dist', umap_min_dist),
    ('hdbscan__min_cluster_size', hdbscan_min_cluster_size),
    ('hdbscan__min_samples', hdbscan_min_samples),
])

# Every column must describe the same number of models
column_lengths = {len(values) for values in data.values()}
assert len(column_lengths) == 1, "List lengths do not match."

model_embedding_df = pd.DataFrame(data)
print("Model and Embedding DataFrame created:\n", model_embedding_df)

# 6. Initialize a DataFrame to store metrics for all models
all_models_metrics = pd.DataFrame()

# POS filtering protocols: protocol name -> POS tags. Each tag becomes its own
# single-token pattern for BERTopic's PartOfSpeech representation.
pos_protocols = {
    'nouns': ["NOUN"],
    'nouns_verbs': ["NOUN", "VERB"],
    'nouns_adjectives': ["NOUN", "ADJ"]
}

# List of representations to process (order in which metrics are computed)
representations = ['keybert_inspired', 'mmr'] + list(pos_protocols.keys())


def _alphabetic_words(topic_words, limit=None):
    """Return the words of one topic that consist only of letters.

    Args:
        topic_words: Sequence of (word, score) pairs for a single topic, as found
            in the values of the dict returned by topic_model.get_topics().
        limit: If given, only the first `limit` pairs are considered.

    Returns:
        The words that pass str.isalpha(), in their original order. Empty strings
        and tokens containing non-alphabetic characters are dropped.
    """
    if limit is not None:
        topic_words = topic_words[:limit]
    return [word for word, _ in topic_words if word.isalpha()]


def _save_topics(topics, protocol, model_filename, output_dir):
    """Write one representation's topics as CSV and JSON files.

    Files are named "<model stem>_<protocol>.csv" / ".json" inside `output_dir`,
    where the model stem is `model_filename` up to its first '.'.
    Topics without any alphabetic word are skipped.

    Args:
        topics: Dict mapping topic id -> sequence of (word, score) pairs.
        protocol: Representation name, stored in the 'protocol' column and file name.
        model_filename: File name of the model the topics belong to.
        output_dir: Existing directory that receives the two files.
    """
    model_stem = model_filename.split('.')[0]
    csv_rows = []
    json_data = {}
    for topic_id, topic_words in topics.items():
        words = _alphabetic_words(topic_words)
        if not words:
            continue  # Skip topics with no valid words
        csv_rows.append({
            "model_name": model_filename,
            "protocol": protocol,
            "topic_id": topic_id,
            "topic_words": ", ".join(words)
        })
        json_data[str(topic_id)] = words

    csv_output_path = os.path.join(output_dir, f"{model_stem}_{protocol}.csv")
    pd.DataFrame(csv_rows).to_csv(csv_output_path, index=False)

    json_output_path = os.path.join(output_dir, f"{model_stem}_{protocol}.json")
    with open(json_output_path, "w", encoding="utf-8") as json_file:
        json.dump(json_data, json_file, indent=4)


def _apply_representation(topic_model, docs, representation_model, description, model_filename):
    """Update the model's topics with `representation_model` and return them.

    Args:
        topic_model: Loaded BERTopic model to update in place.
        docs: The documents passed to update_topics().
        representation_model: Representation model handed to update_topics().
        description: Human-readable name used in the error message.
        model_filename: Model file name used in the error message.

    Returns:
        The dict from topic_model.get_topics(), or None if the update raised
        (the error is printed rather than propagated).
    """
    try:
        topic_model.update_topics(docs, representation_model=representation_model)
        return topic_model.get_topics()
    except Exception as e:
        print(f"Error applying {description} for model '{model_filename}': {e}")
        return None


# 7. Process each model
print("Starting post-processing for each model...")
for index, row in model_embedding_df.iterrows():
    model_filename = row['model_name']
    embedding_model_name = row['embedding_model']
    iteration = row['iteration']
    # Cast to a plain int, like the UMAP/HDBSCAN parameters below, before it is used
    # for slicing and handed to the representation models.
    top_n_words = int(row['bertopic_top_n_words'])
    model_stem = model_filename.split('.')[0]
    print(f"\nProcessing model: {model_filename}")
    model_path = os.path.join(dir_with_trained_models, model_filename)

    print(f"Using embedding model '{embedding_model_name}' for model '{model_filename}'")

    # Load the embedding model
    try:
        print("Loading embedding model...")
        embedding_model = SentenceTransformer(embedding_model_name)
        print("Embedding model loaded.")
    except Exception as e:
        print(f"Error loading embedding model '{embedding_model_name}': {e}")
        continue  # Skip to the next model

    # Load the BERTopic model with the embedding model
    # NOTE(review): the model files are .pkl, so loading presumably unpickles them —
    # only load files from a trusted source.
    try:
        print("Loading BERTopic model...")
        topic_model = BERTopic.load(model_path, embedding_model=embedding_model)
        print("BERTopic model loaded.")

        # Recreate UMAP and HDBSCAN models with this model's recorded parameters
        umap_params = {
            'n_neighbors': int(row['umap__n_neighbors']),
            'n_components': int(row['umap__n_components']),
            'min_dist': float(row['umap__min_dist']),
            'metric': 'cosine',
            'random_state': 42
        }
        hdbscan_params = {
            'min_cluster_size': int(row['hdbscan__min_cluster_size']),
            'min_samples': int(row['hdbscan__min_samples']),
            'cluster_selection_method': 'eom',
            'prediction_data': True,
            'gen_min_span_tree': True
        }
        umap_model = UMAP(**umap_params)
        hdbscan_model = HDBSCAN(**hdbscan_params)
        topic_model.umap_model = umap_model
        topic_model.hdbscan_model = hdbscan_model
    except Exception as e:
        print(f"Error loading BERTopic model '{model_filename}': {e}")
        continue  # Skip to the next model

    # Proceed with updating topics
    try:
        print("Updating topics in the topic model...")
        topic_model.update_topics(dataset_as_list_of_strings)
        print("Topics updated.")
    except Exception as e:
        print(f"Error updating topics for model '{model_filename}': {e}")
        continue  # Skip to the next model

    # Directory to save processed topics and metrics for this model.
    # It is created up front, independent of whether any single representation succeeds,
    # so every later save targets this model's own directory (never an undefined name
    # or the directory left over from the previous model).
    output_dir = f"/content/drive/MyDrive/processed_topics_{model_stem}"
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created directory for processed topics: {output_dir}")

    # Initialize a list to store metrics for this model
    model_metrics = []
    # Topics obtained for each representation; an empty dict means "none available"
    topics_dict = {}

    # Apply KeyBERT-Inspired representation
    print("Applying KeyBERT-Inspired representation...")
    keybert_repr = KeyBERTInspired(top_n_words=top_n_words)
    keybert_topics = _apply_representation(
        topic_model, dataset_as_list_of_strings, keybert_repr,
        "KeyBERT-Inspired representation", model_filename) or {}
    topics_dict['keybert_inspired'] = keybert_topics

    # Save KeyBERT-Inspired topics if available
    if keybert_topics:
        repr_name = 'keybert_inspired'
        print(f"Saving {repr_name} processed topics...")
        _save_topics(keybert_topics, repr_name, model_filename, output_dir)
        print(f"{repr_name} processed topics saved.")
    else:
        print(f"No KeyBERT-Inspired topics available for model '{model_filename}'. Skipping saving.")

    # Apply Maximal Marginal Relevance representation
    print("Applying Maximal Marginal Relevance representation...")
    mmr_repr = MaximalMarginalRelevance(top_n_words=top_n_words)
    mmr_topics = _apply_representation(
        topic_model, dataset_as_list_of_strings, mmr_repr,
        "Maximal Marginal Relevance representation", model_filename) or {}
    topics_dict['mmr'] = mmr_topics

    # Save MMR processed topics if available
    if mmr_topics:
        repr_name = 'mmr'
        print(f"Saving {repr_name} processed topics...")
        _save_topics(mmr_topics, repr_name, model_filename, output_dir)
        print(f"{repr_name} processed topics saved.")
    else:
        print(f"No MMR topics available for model '{model_filename}'. Skipping saving.")

    # Apply POS filters and save for each protocol
    for protocol_name, pos_tags in pos_protocols.items():
        print(f"Applying POS filtering protocol: {protocol_name}")
        # Create pos_patterns from pos_tags: one single-token pattern per tag
        pos_patterns = [[{"POS": tag}] for tag in pos_tags]
        # Create a PartOfSpeech representation model with specified POS patterns
        pos_repr = PartOfSpeech(top_n_words=top_n_words, pos_patterns=pos_patterns)
        pos_topics = _apply_representation(
            topic_model, dataset_as_list_of_strings, pos_repr,
            f"PartOfSpeech representation '{protocol_name}'", model_filename)
        if pos_topics is None:
            topics_dict[protocol_name] = {}
            continue  # Skip to the next representation
        topics_dict[protocol_name] = pos_topics

        # Save POS filtered topics if available
        if pos_topics:
            _save_topics(pos_topics, protocol_name, model_filename, output_dir)
            print(f"POS filtered topics saved for protocol: {protocol_name}")
        else:
            print(f"No POS filtered topics available for protocol '{protocol_name}' in model '{model_filename}'. Skipping saving.")

    print(f"Post-processing completed and saved for model: {model_filename}")

    # Calculate coherence and diversity metrics
    print(f"\nCalculating coherence and diversity for model: {model_filename}")
    try:
        for repr_name in representations:
            print(f"Calculating metrics for representation: {repr_name}")
            # Get the topics for the current representation
            topics = topics_dict.get(repr_name, {})

            if not topics:
                print(f"No topics found for representation: {repr_name}")
                continue  # Skip if no topics are available

            topic_words_list = []   # Valid word lists, one per topic, for the overall score
            per_topic_metrics = []  # List to store per-topic metrics

            for topic_id, words in topics.items():
                # Get the top N words for the topic and filter out invalid words
                topic_words = _alphabetic_words(words, limit=top_n_words)
                if not topic_words:
                    continue  # Skip topics with no valid words
                topic_words_list.append(topic_words)

                # Calculate per-topic coherence using Gensim's CoherenceModel with 'c_v'.
                # NOTE(review): this builds one CoherenceModel over processed_docs per topic,
                # which is costly; get_coherence_per_topic() on the overall model below would
                # avoid that, but may not give numerically identical per-topic scores —
                # confirm before switching.
                coherence_model_topic = CoherenceModel(topics=[topic_words],
                                                       texts=processed_docs,
                                                       dictionary=dictionary,
                                                       coherence='c_v')
                coherence_score_topic = coherence_model_topic.get_coherence()

                # Per-topic diversity: unique words over total words in the topic
                # (topic_words is non-empty here, so the division is safe)
                diversity_score_topic = len(set(topic_words)) / len(topic_words)

                # Save per-topic metrics
                per_topic_metrics.append({
                    'model_name': model_filename,
                    'protocol': repr_name,
                    'topic_id': topic_id,
                    'coherence': coherence_score_topic,
                    'diversity': diversity_score_topic
                })

            if not topic_words_list:
                print(f"No valid topics found for representation: {repr_name}")
                continue  # Skip if no valid topics are available

            # Calculate overall coherence using Gensim's CoherenceModel with 'c_v'
            coherence_model = CoherenceModel(topics=topic_words_list,
                                             texts=processed_docs,
                                             dictionary=dictionary,
                                             coherence='c_v')
            coherence_score = coherence_model.get_coherence()

            # Overall diversity: unique words across all topics over the total word count
            # (every list in topic_words_list is non-empty, so total_words > 0)
            unique_words = set()
            total_words = 0
            for words in topic_words_list:
                unique_words.update(words)
                total_words += len(words)
            diversity_score = len(unique_words) / total_words

            # Save overall metrics
            metrics_data = {
                'model_name': model_filename,
                'protocol': repr_name,
                'coherence': coherence_score,
                'diversity': diversity_score
            }
            model_metrics.append(metrics_data)
            print(f"Overall Coherence: {coherence_score}, Overall Diversity: {diversity_score}")

            # Save per-topic metrics to a CSV file
            per_topic_metrics_df = pd.DataFrame(per_topic_metrics)
            per_topic_metrics_path = os.path.join(output_dir, f"{model_stem}_{repr_name}_per_topic_metrics.csv")
            per_topic_metrics_df.to_csv(per_topic_metrics_path, index=False)
            print(f"Per-topic metrics saved for representation: {repr_name}")

    except Exception as e:
        print(f"Error calculating metrics for model '{model_filename}': {e}")
        continue

    # Add model metrics to the overall DataFrame
    if model_metrics:
        model_metrics_df = pd.DataFrame(model_metrics)
        all_models_metrics = pd.concat([all_models_metrics, model_metrics_df], ignore_index=True)

        # Save metrics for the model
        metrics_output_path = os.path.join(output_dir, f"{model_stem}_metrics.csv")
        model_metrics_df.to_csv(metrics_output_path, index=False)
        print(f"Metrics saved for model: {model_filename}")
    else:
        print(f"No metrics calculated for model: {model_filename}")




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


BERTopic version: 0.16.3
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading additional stop words and standard English stop words...
Stop words loaded. Total stop words: 7704
Loading dataset...
Dataset loaded.
Preprocessing sentences...
Sentences preprocessed.
Total sentences in dataset: 680822
Tokenizing sentences and removing stop words...
Tokenization and stop word removal completed.
Creating dictionary and corpus for coherence model...
Dictionary and corpus created.
Model and Embedding DataFrame created:
                                      model_name             embedding_model  \
0  bertopic_model_0_iter_66_20241101_231419.pkl           all-MiniLM-L12-v2   
1  bertopic_model_1_iter_14_20241101_232426.pkl    paraphrase-mpnet-base-v2   
2  bertopic_model_2_iter_75_20241101_233115.pkl           all-MiniLM-L12-v2   
3   bertopic_model_3_iter_0_20241101_234120.pkl    paraphrase-mpnet-base-v2   
4  b

  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


Overall Coherence: 0.4856402947301775, Overall Diversity: 0.6509803921568628
Per-topic metrics saved for representation: keybert_inspired
Calculating metrics for representation: mmr
No topics found for representation: mmr
Calculating metrics for representation: nouns


