# Assigning Topics to Docs in Corpus

In [1]:
# Check CUDA and driver versions
!nvcc --version  # Check CUDA version
!nvidia-smi      # Check driver version

# Install RAPIDS and other required libraries
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Thu Nov  7 13:36:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                      

In [1]:
# Install Remaining Necessary Libraries (After Restarting Runtime)

# Install specific versions of required libraries
!pip install bertopic==0.16.3
!pip install octis
!pip install sentence-transformers
!pip install umap-learn==0.5.3
!pip install hdbscan
!pip install tqdm
!pip install pandas
!pip install gensim
!pip install wandb
!pip install umap
!pip install scipy
!pip install nltk

Collecting bertopic==0.16.3
  Using cached bertopic-0.16.3-py3-none-any.whl.metadata (23 kB)
Using cached bertopic-0.16.3-py3-none-any.whl (143 kB)
Installing collected packages: bertopic
  Attempting uninstall: bertopic
    Found existing installation: bertopic 0.16.4
    Uninstalling bertopic-0.16.4:
      Successfully uninstalled bertopic-0.16.4
Successfully installed bertopic-0.16.3
Collecting umap-learn==0.5.3
  Using cached umap_learn-0.5.3-py3-none-any.whl
Installing collected packages: umap-learn
  Attempting uninstall: umap-learn
    Found existing installation: umap-learn 0.5.7
    Uninstalling umap-learn-0.5.7:
      Successfully uninstalled umap-learn-0.5.7
Successfully installed umap-learn-0.5.3
Collecting umap
  Using cached umap-0.1.1-py3-none-any.whl
Installing collected packages: umap
Successfully installed umap-0.1.1


In [2]:
# Clean Up and Update Libraries

!pip list | grep umap
!pip uninstall -y umap
!find . -type d -name "__pycache__" -exec rm -r {} +
!pip install --upgrade bertopic umap-learn

# Import BERTopic after installation
from bertopic import BERTopic

umap                               0.1.1
umap-learn                         0.5.3
Found existing installation: umap 0.1.1
Uninstalling umap-0.1.1:
  Successfully uninstalled umap-0.1.1
Collecting bertopic
  Using cached bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting umap-learn
  Using cached umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Using cached bertopic-0.16.4-py3-none-any.whl (143 kB)
Using cached umap_learn-0.5.7-py3-none-any.whl (88 kB)
Installing collected packages: umap-learn, bertopic
  Attempting uninstall: umap-learn
    Found existing installation: umap-learn 0.5.3
    Uninstalling umap-learn-0.5.3:
      Successfully uninstalled umap-learn-0.5.3
  Attempting uninstall: bertopic
    Found existing installation: bertopic 0.16.3
    Uninstalling bertopic-0.16.3:
      Successfully uninstalled bertopic-0.16.3
Successfully installed bertopic-0.16.4 umap-learn-0.5.7


In [3]:
# Import Necessary Libraries for Analysis and Visualization

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import time
import os

In [4]:
# Mount Google Drive

# Make the Drive contents reachable under /content/drive so the dataset and
# model paths used in the following cells resolve.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load Dataset

# Location of the sentence-level CSV on the mounted Drive
dataset_path = '/content/drive/MyDrive/processed_novels_sentences_new.csv'

# Read the CSV and normalise its header in one expression: surrounding
# whitespace is trimmed and inner spaces become underscores.
df_sentences = pd.read_csv(dataset_path).rename(
    columns=lambda column_name: column_name.strip().replace(' ', '_')
)
print("Cleaned column names:", list(df_sentences.columns))

Cleaned column names: ['Author', 'Book_Title', 'Chapter', 'Sentence']


In [6]:
# Load Pre-trained BERTopic Model

# Define model path
pretrained_model_path = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/bertopic_model_5_iter_19_paraphrase-MiniLM-L6-v2_20241102_205523.pkl'

# Attempt to load the BERTopic model.
# NOTE(review): the file has a .pkl extension, so loading it presumably
# unpickles arbitrary objects -- only load model files from a trusted source.
try:
    topic_model = BERTopic.load(pretrained_model_path)
except Exception as e:
    # Report the problem, then re-raise: every later cell needs `topic_model`,
    # so swallowing the error here would only surface as a confusing
    # NameError further down the notebook.
    print(f"Error loading model: {e}")
    raise
else:
    print("Model loaded successfully.")

Model loaded successfully.


In [7]:
# Assign Topics to Documents in Batches

# Number of sentences passed to the model per transform() call
batch_size = 1000
sentences_list = df_sentences['Sentence'].tolist()
document_topics = []
document_probabilities = []

print("\nStarting topic assignment...")
# One start offset per batch; its length is the number of batches.
batch_starts = range(0, len(sentences_list), batch_size)
total_batches = len(batch_starts)

# Process sentences in batches; tqdm advances once per batch.
for start_index in tqdm(batch_starts, total=total_batches, desc='Processing documents'):
    batch_sentences = sentences_list[start_index:start_index + batch_size]
    batch_topics, batch_probs = topic_model.transform(batch_sentences)

    document_topics.extend(batch_topics)
    if batch_probs is None:
        # transform() may return None for the probabilities (depends on how
        # the model was configured); keep one placeholder per sentence so the
        # list stays aligned with the DataFrame rows instead of raising
        # TypeError from list.extend(None).
        document_probabilities.extend([None] * len(batch_sentences))
    else:
        document_probabilities.extend(batch_probs)

# Add topics and probabilities to DataFrame (positional, one entry per row;
# pandas raises ValueError if the lengths do not match the frame).
df_sentences['topic'] = document_topics
df_sentences['topic_probability'] = document_probabilities
print("Topics assigned and added to DataFrame.\n")


Starting topic assignment...


Processing documents: 100%|██████████| 681/681 [05:21<00:00,  2.12it/s]


Topics assigned and added to DataFrame.



In [8]:
# Load Topic Labels CSV

# Reference timestamp; later cells also measure elapsed time against it
script_start_time = time.time()

# CSV holding one label per topic id
topic_labels_path = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/topics_5_20241102_205523/labels_bertopic_model_5_iter_19_paraphrase-MiniLM-L6-v2_20241102_205523_topics_info.csv'
print(f"Loading labels from {topic_labels_path}...")

# Read the labels and cast the Topic column to int in a single chained step
df_topic_labels = pd.read_csv(topic_labels_path).astype({'Topic': int})

print(f"Loaded {len(df_topic_labels)} labels. Time elapsed: {time.time() - script_start_time:.2f} seconds\n")

Loading labels from /content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/topics_5_20241102_205523/labels_bertopic_model_5_iter_19_paraphrase-MiniLM-L6-v2_20241102_205523_topics_info.csv...
Loaded 217 labels. Time elapsed: 0.01 seconds



In [9]:
# Merge Labels with Main DataFrame

# Track time for merge operation
merge_start_time = time.time()
print("Merging labels with the main DataFrame...")

# Merge labels into the main DataFrame based on topic.
df_sentences = (
    df_sentences
    # This cell overwrites `df_sentences`, so on a re-run the frame already
    # carries 'Topic' and 'Label' from the previous merge; drop them first,
    # otherwise pandas emits Topic_x/Topic_y and Label_x/Label_y duplicates.
    # On the first run neither column exists and this is a no-op.
    .drop(columns=['Topic', 'Label'], errors='ignore')
    .merge(
        df_topic_labels[['Topic', 'Label']],
        left_on='topic',
        right_on='Topic',
        how='left',
        # Fail loudly if the labels file lists a topic id more than once;
        # a duplicated key would silently multiply sentence rows.
        validate='many_to_one',
    )
)
print(f"Labels merged into main DataFrame. Time for merge: {time.time() - merge_start_time:.2f} seconds")

Merging labels with the main DataFrame...
Labels merged into main DataFrame. Time for merge: 0.20 seconds


In [10]:
# Save Updated DataFrame with Labels to CSV

# Destination of the labelled corpus on the mounted Drive
output_path_with_labels = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/processed_novels_with_labels.csv'

# Start timing just before the write is announced and performed
save_start_time = time.time()
print(f"Saving updated DataFrame to {output_path_with_labels}...")

# Write every column; the row index is left out of the file
df_sentences.to_csv(output_path_with_labels, index=False)
print(f"DataFrame saved. Time for saving: {time.time() - save_start_time:.2f} seconds")

# Wall-clock time since `script_start_time` was set in the label-loading cell
total_script_time = time.time() - script_start_time
print(f"Total time for loading, merging, and saving: {total_script_time:.2f} seconds")

Saving updated DataFrame to /content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/processed_novels_with_labels.csv...
DataFrame saved. Time for saving: 714.02 seconds
Total time for loading, merging, and saving: 716.78 seconds


In [11]:
# Display the first few lines of the saved CSV file.
# nrows=5 stops parsing after the five rows that .head() would have shown,
# instead of re-reading the entire (very large) file just for a preview.
# Column dtypes are inferred from those five rows only.
print(pd.read_csv(output_path_with_labels, nrows=5))

     Author               Book_Title  Chapter  \
0  Ann_Cole  Mr. Mysterious In Black        1   
1  Ann_Cole  Mr. Mysterious In Black        1   
2  Ann_Cole  Mr. Mysterious In Black        1   
3  Ann_Cole  Mr. Mysterious In Black        1   
4  Ann_Cole  Mr. Mysterious In Black        1   

                                            Sentence  topic  \
0                            Prologue H e was tired.    210   
1                                         Dog-tired.    210   
2  Amped up by pleasure mere minutes ago, his hea...     20   
3  Unfortunately, he was yanked back from the bec...     23   
4  Languid, he opened his eyes to the annoying re...     -1   

                                   topic_probability  Topic  \
0  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...    210   
1  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...    210   
2  [0.0017705  0.00184171 0.00216719 0.00173465 0...     20   
3  [0.00237123 0.00262983 0.00200518 0.00196397 0...     23   
4  [0.     

In [12]:
# Filter Out Noisy Topics

# Topic ids treated as noise and removed from the analysis set
noisy_topics_to_exclude = [-1, 76, 117, 165, 190]

# Keep only rows whose topic is not in the exclusion list; .copy() detaches
# the result from df_sentences.
df_filtered_sentences = df_sentences.loc[
    lambda frame: ~frame['topic'].isin(noisy_topics_to_exclude)
].copy()
print("Noisy topics filtered.")

Noisy topics filtered.
