In [None]:
# Mount Google Drive
# The mount point '/content/drive' is the prefix of every path used by the
# later cells (topic JSON folder, sentence CSV, metrics CSV output), so this
# cell has to run before any of them.
# NOTE(review): `google.colab` is expected to be importable only inside a
# Colab runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install required libraries
!pip install gensim
!pip install nltk
!pip install tqdm
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn

# Import libraries
import os
import json
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import nltk
from nltk.corpus import stopwords
from gensim import downloader as gensim_downloader
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Define the path to the topics folder in Google Drive
topics_folder_path = '/content/drive/MyDrive/topics_for_metrics'

# Expected filename structure:
#   <model_name>_<YYYYMMDD>_<HHMMSS>_topics.json
# Example:
#   bertopic_model_10_iter_11_multi-qa-mpnet-base-cos-v1_20241102_213544_topics.json
#   model_name -> 'bertopic_model_10_iter_11_multi-qa-mpnet-base-cos-v1'
#   run_index  -> '20241102_213544'
json_suffix = '.json'
topics_suffix = '_topics'

# List to store all models and their topics
models_topics = []

# Traverse through the topic files.
# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and the row order of everything derived from it) reproducible.
for filename in sorted(os.listdir(topics_folder_path)):
    if not filename.endswith(json_suffix):
        continue

    # Strip the extension by position (str.replace would also remove a
    # '.json' occurring in the middle of the name), then the '_topics' marker.
    stem = filename[:-len(json_suffix)]
    if stem.endswith(topics_suffix):
        stem = stem[:-len(topics_suffix)]

    # The timestamp itself contains an underscore, so the LAST TWO fields
    # belong to the run index. Splitting off only one field would leave the
    # date attached to the model name and keep just the time as run index.
    name_parts = stem.rsplit('_', 2)
    if (
        len(name_parts) != 3
        or not name_parts[0]
        or not name_parts[1].isdigit()
        or not name_parts[2].isdigit()
    ):
        print(f"Filename '{filename}' does not match the expected format.")
        continue  # Skip this file

    model_name = name_parts[0]                      # 'bertopic_model_10_iter_11_multi-qa-mpnet-base-cos-v1'
    run_index = f"{name_parts[1]}_{name_parts[2]}"  # '20241102_213544'

    # Load the topics (explicit encoding so the result does not depend on the
    # platform default)
    with open(os.path.join(topics_folder_path, filename), 'r', encoding='utf-8') as file:
        topics = json.load(file)

    # Store the data
    models_topics.append({
        'model_name': model_name,
        'run_index': run_index,
        'topics': topics,
        'filename': filename
    })

print(f"Total models loaded: {len(models_topics)}")

Total models loaded: 10


In [None]:
# Define the path to your dataset
dataset_path = '/content/drive/MyDrive/processed_novels_sentences_new.csv'

# Load the raw dataset
df = pd.read_csv(dataset_path)

# Preprocess the sentences.
# - Missing values become empty strings BEFORE the string cast: astype(str)
#   on its own turns NaN into the literal text 'nan', which would then be
#   tokenized and counted as a real word.
# - One \s+ pass collapses every whitespace run (newlines included) into a
#   single space, so a separate newline substitution is not needed.
whitespace_run = re.compile(r'\s+')
df['Sentence'] = (
    df['Sentence']
    .fillna('')
    .astype(str)
    .map(lambda sentence: whitespace_run.sub(' ', sentence).strip().lower())
)

# List of sentence strings
dataset_as_list_of_strings = df['Sentence'].tolist()

print(f"Total sentences in dataset: {df.shape[0]}")

# Download NLTK resources.
# Recent NLTK releases load the tokenizer data from 'punkt_tab' rather than
# 'punkt'; both are requested so word_tokenize works with either release.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Tokenize the sentences and remove stopwords in a single pass, so only one
# full-size list of token lists is ever held in memory.
stop_words = set(stopwords.words('english'))
texts = [
    [word for word in nltk.word_tokenize(sentence) if word not in stop_words]
    for sentence in dataset_as_list_of_strings
]

# Create Gensim dictionary and corpus
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

Total sentences in dataset: 680822


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load pre-trained word embeddings (e.g., GloVe)
# The model is requested by name through gensim's downloader. The resulting
# `word_vectors` object is what compute_diversity relies on for membership
# tests (`word in word_vectors`) and vector lookups (`word_vectors[word]`).
print("Loading word embeddings...")
word_vectors = gensim_downloader.load('glove-wiki-gigaword-100')  # You can choose different models
print("Word embeddings loaded.")

Loading word embeddings...
Word embeddings loaded.


In [None]:
def compute_coherence(topic, texts, dictionary, coherence_type='c_v'):
    """
    Return the coherence of one topic.

    A new gensim CoherenceModel is created for every call, with `topic` as its
    only topic, and that model's aggregate coherence value is returned.

    Args:
        topic: the words that make up the topic.
        texts: tokenized reference documents handed to the model.
        dictionary: gensim Dictionary handed to the model.
        coherence_type: name of the coherence measure (default 'c_v').
    """
    model_kwargs = dict(
        topics=[topic],  # the model expects a list of topics, so wrap the single one
        texts=texts,
        dictionary=dictionary,
        coherence=coherence_type,
    )
    return CoherenceModel(**model_kwargs).get_coherence()

def compute_diversity(topic, word_vectors):
    """
    Score how dissimilar a topic's words are to one another in embedding space.

    Words that are not present in `word_vectors` are ignored. The score is one
    minus the mean cosine similarity over every unordered pair of the remaining
    words, so a higher value means the words are less alike.

    Returns NaN when fewer than two of the topic's words have a vector.
    """
    known_vectors = [word_vectors[token] for token in topic if token in word_vectors]
    n_known = len(known_vectors)
    if n_known < 2:
        return np.nan  # no pair exists to compare

    similarities = cosine_similarity(known_vectors)
    # k=1 selects the strict upper triangle: each pair exactly once, with the
    # diagonal (self-similarity) left out.
    pair_rows, pair_cols = np.triu_indices(n_known, k=1)
    return 1 - np.mean(similarities[pair_rows, pair_cols])

In [None]:
# One record (dict) per evaluated topic, across every loaded model
metrics_data = []

for model in tqdm(models_topics, desc="Processing models"):
    model_name, run_index = model['model_name'], model['run_index']
    topic_dict = model['topics']

    # Topic keys are compared as integers, so the topics are visited in
    # numeric order rather than string order.
    sorted_topic_keys = sorted(topic_dict, key=int)
    topics = [topic_dict[key] for key in sorted_topic_keys]

    # compute_coherence builds a fresh CoherenceModel on every call, so this
    # inner loop dominates the runtime (about 8 s per topic in the recorded run).
    topic_progress = tqdm(topics, desc=f"Processing topics for {model_name}", leave=False)
    for idx, topic in enumerate(topic_progress):
        # topic_index is the position in the sorted order, not the original key
        metrics_data.append({
            'model_name': model_name,
            'run_index': run_index,
            'topic_index': idx,
            'topic': topic,
            'coherence': compute_coherence(topic, texts, dictionary),
            'diversity': compute_diversity(topic, word_vectors)
        })

# Build the table in one go, then discard topics for which either metric is NaN
df_metrics = (
    pd.DataFrame(metrics_data)
    .dropna(subset=['coherence', 'diversity'])
    .reset_index(drop=True)
)

print(df_metrics.head())

Processing models:   0%|          | 0/10 [00:00<?, ?it/s]
Processing topics for bertopic_model_1_iter_66_all-MiniLM-L12-v2_20241102:   0%|          | 0/103 [00:00<?, ?it/s][A
Processing topics for bertopic_model_1_iter_66_all-MiniLM-L12-v2_20241102:   1%|          | 1/103 [00:08<14:09,  8.33s/it][A
Processing topics for bertopic_model_1_iter_66_all-MiniLM-L12-v2_20241102:   2%|▏         | 2/103 [00:16<14:08,  8.40s/it][A
Processing topics for bertopic_model_1_iter_66_all-MiniLM-L12-v2_20241102:   3%|▎         | 3/103 [00:24<13:51,  8.32s/it][A
Processing topics for bertopic_model_1_iter_66_all-MiniLM-L12-v2_20241102:   4%|▍         | 4/103 [00:33<13:41,  8.30s/it][A
Processing topics for bertopic_model_1_iter_66_all-MiniLM-L12-v2_20241102:   5%|▍         | 5/103 [00:41<13:31,  8.29s/it][A
Processing topics for bertopic_model_1_iter_66_all-MiniLM-L12-v2_20241102:   6%|▌         | 6/103 [00:49<13:13,  8.18s/it][A
Processing topics for bertopic_model_1_iter_66_all-MiniLM-L12-v2_202

                                          model_name run_index  topic_index  \
0  bertopic_model_1_iter_66_all-MiniLM-L12-v2_202...    202800            0   
1  bertopic_model_1_iter_66_all-MiniLM-L12-v2_202...    202800            1   
2  bertopic_model_1_iter_66_all-MiniLM-L12-v2_202...    202800            2   
3  bertopic_model_1_iter_66_all-MiniLM-L12-v2_202...    202800            3   
4  bertopic_model_1_iter_66_all-MiniLM-L12-v2_202...    202800            4   

                                               topic  coherence  diversity  
0  [think, say, thinking, says, wants, guess, say...   0.371887   0.337968  
1  [hand, finger, grip, wrists, grasp, fist, wris...   0.446623   0.677789  
2  [smile, smiles, smiled, grins, grin, smirk, gr...   0.347536   0.679073  
3  [door, doors, doorway, doorman, doorframe, doo...   0.336006   0.653144  
4  [hips, hip, knees, thighs, thigh, leg, legs, k...   0.539015   0.612843  





In [None]:
# Save the DataFrame to Google Drive
# index=False keeps the DataFrame's row index out of the file.
# NOTE(review): the 'topic' column holds lists of words; CSV can only store
# them as text, so they need to be parsed back (e.g. with ast.literal_eval)
# when this file is reloaded — confirm against whatever consumes the CSV.
df_metrics.to_csv('/content/drive/MyDrive/BERTopic_metrics.csv', index=False)

print("df_metrics has been successfully saved to your Google Drive at 'MyDrive/BERTopic_metrics.csv'.")

df_metrics has been successfully saved to your Google Drive at 'MyDrive/BERTopic_metrics.csv'.
