In [5]:
# Check CUDA and driver versions
!nvcc --version  # Check CUDA version
!nvidia-smi      # Check driver version

# Install RAPIDS and other required libraries
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Tue Nov  5 15:04:28 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0              29W /  70W |    575MiB / 15360MiB |      0%      Default |
|                                      

In [1]:
## After restarting, install remaining necessary libraries
# Run this cell after restarting the runtime
!pip install bertopic==0.16.3
!pip install octis
!pip install sentence-transformers
!pip install umap-learn==0.5.3  # Specify a compatible version
!pip install hdbscan
!pip install tqdm
!pip install pandas
!pip install gensim
!pip install wandb
!pip install umap
!pip install scipy
!pip install nltk

Collecting bertopic==0.16.3
  Downloading bertopic-0.16.3-py3-none-any.whl.metadata (23 kB)
Downloading bertopic-0.16.3-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
  Attempting uninstall: bertopic
    Found existing installation: bertopic 0.16.4
    Uninstalling bertopic-0.16.4:
      Successfully uninstalled bertopic-0.16.4
Successfully installed bertopic-0.16.3
Collecting octis
  Downloading octis-1.14.0-py2.py3-none-any.whl.metadata (27 kB)
Collecting scikit-learn==1.1.0 (from octis)
  Downloading scikit_learn-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting scikit-optimize>=0.8.1 (from octis)
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting libsvm (from octis)
  Downloading libsvm-3.23.0.4.tar.gz (170 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Diagnostic: list every installed distribution whose name contains "umap"
!pip list | grep umap

umap                               0.1.1
umap-learn                         0.5.3


In [3]:
# Remove the distribution named `umap` (listed separately from `umap-learn`);
# -y answers the confirmation prompt automatically
!pip uninstall -y umap

Found existing installation: umap 0.1.1
Uninstalling umap-0.1.1:
  Successfully uninstalled umap-0.1.1


In [4]:
# Recursively delete every `__pycache__` directory below the current working directory
!find . -type d -name "__pycache__" -exec rm -r {} +

In [5]:
!pip install --upgrade bertopic umap-learn

Collecting bertopic
  Using cached bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting umap-learn
  Using cached umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Using cached bertopic-0.16.4-py3-none-any.whl (143 kB)
Using cached umap_learn-0.5.7-py3-none-any.whl (88 kB)
Installing collected packages: umap-learn, bertopic
  Attempting uninstall: umap-learn
    Found existing installation: umap-learn 0.5.3
    Uninstalling umap-learn-0.5.3:
      Successfully uninstalled umap-learn-0.5.3
  Attempting uninstall: bertopic
    Found existing installation: bertopic 0.16.3
    Uninstalling bertopic-0.16.3:
      Successfully uninstalled bertopic-0.16.3
Successfully installed bertopic-0.16.4 umap-learn-0.5.7


In [6]:
from bertopic import BERTopic

In [7]:
# Install necessary packages (if not already installed)
!pip install bertopic

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [8]:
# Mount Google Drive
# The mount point /content/drive is the prefix of the dataset and model
# paths used elsewhere in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Path to your dataset CSV file (located on the mounted Google Drive)
dataset_path = '/content/drive/MyDrive/processed_novels_sentences_new.csv'

# Read the dataset into a DataFrame.
# Other cells in this notebook read the 'Author', 'Book Title' (renamed to
# 'Book_Title'), 'Chapter' and 'Sentence' columns of this frame.
df = pd.read_csv(dataset_path)

# Display the first few rows to verify
df.head()

Unnamed: 0,Author,Book Title,Chapter,Sentence
0,Ann_Cole,Mr. Mysterious In Black,1,Prologue H e was tired.
1,Ann_Cole,Mr. Mysterious In Black,1,Dog-tired.
2,Ann_Cole,Mr. Mysterious In Black,1,"Amped up by pleasure mere minutes ago, his hea..."
3,Ann_Cole,Mr. Mysterious In Black,1,"Unfortunately, he was yanked back from the bec..."
4,Ann_Cole,Mr. Mysterious In Black,1,"Languid, he opened his eyes to the annoying re..."


In [10]:
# Paths to your pre-trained BERTopic models
# (the directory names suggest one model per sentence-embedding backbone:
# paraphrase-mpnet-base-v2 and paraphrase-MiniLM-L6-v2)
model1_path = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-mpnet-base-v2/bertopic_model_4_iter_0_paraphrase-mpnet-base-v2_20241102_205031.pkl'
model2_path = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/bertopic_model_5_iter_19_paraphrase-MiniLM-L6-v2_20241102_205523.pkl'

# Load the models
# NOTE(review): the files are .pkl, so loading presumably unpickles them --
# only load models from a trusted source, and confirm that the installed
# bertopic / umap-learn versions are compatible with the ones used when saving.
topic_model1 = BERTopic.load(model1_path)
topic_model2 = BERTopic.load(model2_path)

In [15]:
import numpy as np
from tqdm import tqdm
import time

In [16]:
# Define the batch size
batch_size = 1000  # Adjust based on your system's memory capacity

In [19]:
# --- Topic assignment with Model 1, processed batch by batch ---
print("Starting topic assignment using Model 1 with progress bar...")
start_time_model1 = time.time()

# Every sentence of the dataset, in DataFrame row order
sentences = df['Sentence'].tolist()

# Accumulators for the per-sentence topics and probabilities
topics1_list = []
probs1_list = []

# Number of batches needed to cover all sentences (the last one may be shorter)
num_batches = int(np.ceil(len(sentences) / batch_size))

# Iterate over the start offset of each batch; tqdm advances once per batch
batch_offsets = range(0, len(sentences), batch_size)
for offset in tqdm(batch_offsets, total=num_batches, desc='Processing Model 1'):
    chunk = sentences[offset:offset + batch_size]
    chunk_topics, chunk_probs = topic_model1.transform(chunk)
    topics1_list.extend(chunk_topics)
    probs1_list.extend(chunk_probs)

elapsed_time_model1 = time.time() - start_time_model1
print(f"\nCompleted topic assignment using Model 1 in {elapsed_time_model1:.2f} seconds.")

# Store the assigned topic of every sentence as a new DataFrame column
df['topic_model1'] = topics1_list
print("Topics assigned using Model 1 and added to DataFrame.\n")

Starting topic assignment using Model 1 with progress bar...


Processing Model 1: 100%|██████████| 681/681 [20:32<00:00,  1.81s/it]



Completed topic assignment using Model 1 in 1232.69 seconds.
Topics assigned using Model 1 and added to DataFrame.



In [18]:
print("Starting topic assignment using Model 2 with progress bar...")
start_time_model2 = time.time()

# Prepare an empty list to store topics and probabilities
topics2_list = []
probs2_list = []

# Build the inputs in this cell instead of relying on `sentences` and
# `num_batches` left behind by the Model 1 cell, so this cell also works when
# it is executed on its own or before the Model 1 cell.
sentences = df['Sentence'].tolist()
num_batches = int(np.ceil(len(sentences) / batch_size))

# Initialize tqdm progress bar
with tqdm(total=num_batches, desc='Processing Model 2') as pbar:
    # Process sentences in batches
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i+batch_size]
        batch_topics, batch_probs = topic_model2.transform(batch_sentences)
        topics2_list.extend(batch_topics)
        probs2_list.extend(batch_probs)
        pbar.update(1)  # Update progress bar

elapsed_time_model2 = time.time() - start_time_model2
print(f"\nCompleted topic assignment using Model 2 in {elapsed_time_model2:.2f} seconds.")

# Add topics to the DataFrame
df['topic_model2'] = topics2_list
print("Topics assigned using Model 2 and added to DataFrame.")

Starting topic assignment using Model 2 with progress bar...


Processing Model 2: 100%|██████████| 681/681 [05:16<00:00,  2.15it/s]



Completed topic assignment using Model 2 in 316.47 seconds.
Topics assigned using Model 2 and added to DataFrame.


In [23]:
# Normalise the column labels in one pass: trim surrounding whitespace, then
# turn the remaining (interior) spaces into underscores,
# e.g. 'Book Title' becomes 'Book_Title'.
df.columns = df.columns.str.strip().str.replace(' ', '_')
print("Cleaned column names:")
print(df.columns.tolist())

Cleaned column names:
['Author', 'Book_Title', 'Chapter', 'Sentence', 'topic_model1', 'topic_model2']


In [25]:
# Long format: number of sentences for every (book, Model 1 topic) pair
topic_counts1 = (
    df.groupby(['Book_Title', 'topic_model1'])
    .size()
    .reset_index(name='count')
)

# Wide format: one row per book, one column per topic;
# pairs that never occur are filled with 0
topic_distribution1 = (
    topic_counts1
    .pivot(index='Book_Title', columns='topic_model1', values='count')
    .fillna(0)
)

# Show the first books of the table
print("Topic distribution for Model 1:")
topic_distribution1.head()

Topic distribution for Model 1:


topic_model1,-1,0,1,2,3,4,5,6,7,8,...,54,55,56,57,58,59,60,61,62,63
Book_Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Long Time Coming,6898.0,132.0,201.0,165.0,101.0,21.0,122.0,56.0,80.0,109.0,...,13.0,18.0,0.0,1.0,48.0,6.0,2.0,2.0,2.0,4.0
A Not So Meet Cute,6129.0,347.0,232.0,285.0,102.0,66.0,94.0,40.0,90.0,76.0,...,6.0,20.0,8.0,3.0,42.0,0.0,5.0,8.0,2.0,7.0
Ache for You,5126.0,129.0,217.0,77.0,99.0,19.0,107.0,44.0,112.0,69.0,...,11.0,12.0,9.0,11.0,39.0,9.0,1.0,3.0,23.0,0.0
An Accidental Date with a Billionaire,2240.0,106.0,118.0,52.0,19.0,18.0,44.0,26.0,47.0,27.0,...,6.0,2.0,3.0,5.0,9.0,0.0,1.0,0.0,4.0,2.0
Bad Boss,3876.0,165.0,275.0,52.0,74.0,64.0,69.0,31.0,89.0,18.0,...,3.0,13.0,0.0,2.0,32.0,3.0,5.0,11.0,2.0,7.0


In [26]:
# Long format: number of sentences for every (book, Model 2 topic) pair
topic_counts2 = (
    df.groupby(['Book_Title', 'topic_model2'])
    .size()
    .reset_index(name='count')
)

# Wide format: one row per book, one column per topic;
# pairs that never occur are filled with 0
topic_distribution2 = (
    topic_counts2
    .pivot(index='Book_Title', columns='topic_model2', values='count')
    .fillna(0)
)

# Show the first books of the table
print("Topic distribution for Model 2:")
topic_distribution2.head()

Topic distribution for Model 2:


topic_model2,-1,0,1,2,3,4,5,6,7,8,...,206,207,208,209,210,211,212,213,214,215
Book_Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Long Time Coming,5411.0,224.0,160.0,106.0,32.0,311.0,64.0,263.0,1453.0,56.0,...,2.0,0.0,2.0,0.0,2.0,1.0,4.0,3.0,1.0,3.0
A Not So Meet Cute,5432.0,206.0,222.0,87.0,44.0,243.0,84.0,108.0,416.0,49.0,...,3.0,1.0,6.0,2.0,1.0,0.0,2.0,10.0,1.0,5.0
Ache for You,4369.0,191.0,74.0,79.0,41.0,175.0,84.0,95.0,345.0,42.0,...,5.0,1.0,3.0,0.0,2.0,0.0,5.0,1.0,3.0,0.0
An Accidental Date with a Billionaire,1963.0,104.0,50.0,22.0,22.0,142.0,20.0,22.0,111.0,21.0,...,0.0,0.0,1.0,4.0,0.0,1.0,0.0,1.0,2.0,1.0
Bad Boss,3505.0,251.0,44.0,36.0,35.0,128.0,47.0,14.0,175.0,27.0,...,0.0,1.0,2.0,4.0,0.0,0.0,1.0,2.0,0.0,1.0


In [34]:
# Step 5: Aggregate Topics by Book
# For each model: count sentences per (book, topic) pair, then reshape so that
# every book is a row and every topic a column (missing pairs become 0).

### Model 1

print("\nAggregating topics by Book_Title for Model 1...")
topic_counts1 = (
    df.groupby(['Book_Title', 'topic_model1'])
    .size()
    .reset_index(name='count')
)
topic_distribution1 = (
    topic_counts1
    .pivot(index='Book_Title', columns='topic_model1', values='count')
    .fillna(0)
)

print("Topic distribution for Model 1:")
print(topic_distribution1.head())

### Model 2

print("\nAggregating topics by Book_Title for Model 2...")
topic_counts2 = (
    df.groupby(['Book_Title', 'topic_model2'])
    .size()
    .reset_index(name='count')
)
topic_distribution2 = (
    topic_counts2
    .pivot(index='Book_Title', columns='topic_model2', values='count')
    .fillna(0)
)

print("Topic distribution for Model 2:")
print(topic_distribution2.head())

# Step 6: Compute Percentage Distributions
# Each row is divided by its own total, so the topics of a book sum to 100.

### Model 1

print("\nComputing percentage distributions for Model 1...")
book_totals1 = topic_distribution1.sum(axis=1)
topic_distribution_percent1 = topic_distribution1.div(book_totals1, axis=0) * 100

print("Percentage topic distribution for Model 1:")
print(topic_distribution_percent1.head())

### Model 2

print("\nComputing percentage distributions for Model 2...")
book_totals2 = topic_distribution2.sum(axis=1)
topic_distribution_percent2 = topic_distribution2.div(book_totals2, axis=0) * 100

print("Percentage topic distribution for Model 2:")
print(topic_distribution_percent2.head())

# Step 7: Load Topics Info CSVs and Merge with DataFrame
# The topics-info table is left-joined on the topic id, so every sentence row
# is kept and receives the columns of its assigned topic.

### Model 1

print("\nLoading topics info CSV for Model 1...")
topics_info1_path = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-mpnet-base-v2/topics_4_20241102_205031/bertopic_model_4_iter_0_paraphrase-mpnet-base-v2_20241102_205031_topics_info.csv'
topics_info1 = pd.read_csv(topics_info1_path)

df_model1 = df.merge(
    topics_info1,
    left_on='topic_model1',
    right_on='Topic',
    how='left',
    suffixes=('', '_model1'),
)

print("Merged DataFrame for Model 1:")
print(df_model1.head())

### Model 2

print("\nLoading topics info CSV for Model 2...")
topics_info2_path = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/topics_5_20241102_205523/bertopic_model_5_iter_19_paraphrase-MiniLM-L6-v2_20241102_205523_topics_info.csv'
topics_info2 = pd.read_csv(topics_info2_path)

df_model2 = df.merge(
    topics_info2,
    left_on='topic_model2',
    right_on='Topic',
    how='left',
    suffixes=('', '_model2'),
)

print("Merged DataFrame for Model 2:")
print(df_model2.head())

# Step 9: Analyze Topic Distribution Per Author

### Model 1

print("\nAnalyzing topic distribution per author for Model 1...")
# Sentence counts per (author, topic) pair, long format
author_topic_counts1 = (
    df.groupby(['Author', 'topic_model1'])
    .size()
    .reset_index(name='count')
)

# One row per author, one column per topic
author_topic_distribution1 = (
    author_topic_counts1
    .pivot(index='Author', columns='topic_model1', values='count')
    .fillna(0)
)

# Share of each topic within an author's sentences, in percent
author_totals1 = author_topic_distribution1.sum(axis=1)
author_topic_distribution_percent1 = author_topic_distribution1.div(author_totals1, axis=0) * 100

print("Percentage topic distribution per author for Model 1:")
print(author_topic_distribution_percent1.head())


# Step 10: Additional Analysis

### Analyzing Topics Over Chapters

print("\nAnalyzing topics over chapters for a specific book...")
# Coerce 'Chapter' to numbers; values that cannot be parsed become NaN.
# Note that this overwrites the column of the shared DataFrame.
df['Chapter'] = pd.to_numeric(df['Chapter'], errors='coerce')

# Keep only the rows that have a usable chapter number
df_chapter = df.dropna(subset=['Chapter'])

# Sentence counts per (book, chapter, topic) triple
chapter_topic_counts = (
    df_chapter.groupby(['Book_Title', 'Chapter', 'topic_model1'])
    .size()
    .reset_index(name='count')
)

# Rows are (book, chapter) pairs, columns are topics
chapter_topic_distribution = chapter_topic_counts.pivot_table(
    index=['Book_Title', 'Chapter'],
    columns='topic_model1',
    values='count',
    fill_value=0,
)

# Pick one book and extract its chapter-by-topic table
book_name = 'Mr. Mysterious In Black'  # Replace with your book title
books_with_chapters = chapter_topic_distribution.index.get_level_values(0)
if book_name not in books_with_chapters:
    print(f"Book '{book_name}' not found in the dataset.")
else:
    book_chapter_topics = chapter_topic_distribution.loc[book_name]


Aggregating topics by Book_Title for Model 1...
Topic distribution for Model 1:
topic_model1                              -1      0      1      2      3   \
Book_Title                                                                  
A Long Time Coming                     6898.0  132.0  201.0  165.0  101.0   
A Not So Meet Cute                     6129.0  347.0  232.0  285.0  102.0   
Ache for You                           5126.0  129.0  217.0   77.0   99.0   
An Accidental Date with a Billionaire  2240.0  106.0  118.0   52.0   19.0   
Bad Boss                               3876.0  165.0  275.0   52.0   74.0   

topic_model1                             4      5     6      7      8   ...  \
Book_Title                                                              ...   
A Long Time Coming                     21.0  122.0  56.0   80.0  109.0  ...   
A Not So Meet Cute                     66.0   94.0  40.0   90.0   76.0  ...   
Ache for You                           19.0  107.0  44.0  112.0

In [37]:
# Import necessary modules
import pandas as pd
import os


def _count_topics(frame, group_col, topic_col):
    """Count rows of ``frame`` per (group_col, topic_col) pair.

    Returns:
        A tuple ``(long_counts, wide_counts)``. ``long_counts`` has one row per
        observed pair plus a ``count`` column. ``wide_counts`` has one row per
        ``group_col`` value and one column per ``topic_col`` value, with 0 for
        pairs that never occur.
    """
    long_counts = frame.groupby([group_col, topic_col]).size().reset_index(name='count')
    wide_counts = long_counts.pivot(index=group_col, columns=topic_col, values='count').fillna(0)
    return long_counts, wide_counts


def _row_percentages(counts):
    """Rescale every row of ``counts`` so that its values sum to 100."""
    return counts.div(counts.sum(axis=1), axis=0) * 100


def _merge_topic_info(frame, topics_info, topic_col, suffix):
    """Left-join the topics-info table onto ``frame`` via its topic-id column.

    ``validate='many_to_one'`` makes pandas raise ``MergeError`` when the
    ``Topic`` key of ``topics_info`` is not unique, instead of silently
    duplicating sentence rows in the result.
    """
    return frame.merge(
        topics_info,
        left_on=topic_col,
        right_on='Topic',
        how='left',
        suffixes=('', suffix),
        validate='many_to_one',
    )


# Paths to topics info CSV
topics_info1_path = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-mpnet-base-v2/topics_4_20241102_205031/bertopic_model_4_iter_0_paraphrase-mpnet-base-v2_20241102_205031_topics_info.csv'
topics_info2_path = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/topics_5_20241102_205523/bertopic_model_5_iter_19_paraphrase-MiniLM-L6-v2_20241102_205523_topics_info.csv'

# Extract directories; every CSV produced below is written next to the
# topics-info file of the model it belongs to
topics_info1_dir = os.path.dirname(topics_info1_path)
topics_info2_dir = os.path.dirname(topics_info2_path)

print(f"Model 1 topics info directory: {topics_info1_dir}")
print(f"Model 2 topics info directory: {topics_info2_dir}")

# Step 5: Aggregate Topics by Book

### Model 1

print("\nAggregating topics by Book_Title for Model 1...")
topic_counts1, topic_distribution1 = _count_topics(df, 'Book_Title', 'topic_model1')

# Display the topic distribution
print("Topic distribution for Model 1:")
print(topic_distribution1.head())

# Save topic_distribution1 to CSV in topics_info1_dir
topic_distribution1.to_csv(os.path.join(topics_info1_dir, 'topic_distribution_model1.csv'))
print("Topic distribution for Model 1 saved to CSV in topics info directory.")

### Model 2

print("\nAggregating topics by Book_Title for Model 2...")
topic_counts2, topic_distribution2 = _count_topics(df, 'Book_Title', 'topic_model2')

# Display the topic distribution
print("Topic distribution for Model 2:")
print(topic_distribution2.head())

# Save topic_distribution2 to CSV in topics_info2_dir
topic_distribution2.to_csv(os.path.join(topics_info2_dir, 'topic_distribution_model2.csv'))
print("Topic distribution for Model 2 saved to CSV in topics info directory.")

# Step 6: Compute Percentage Distributions

### Model 1

print("\nComputing percentage distributions for Model 1...")
# Percentage of each topic within a book
topic_distribution_percent1 = _row_percentages(topic_distribution1)

# Display the percentage distribution
print("Percentage topic distribution for Model 1:")
print(topic_distribution_percent1.head())

# Save topic_distribution_percent1 to CSV in topics_info1_dir
topic_distribution_percent1.to_csv(os.path.join(topics_info1_dir, 'topic_distribution_percent_model1.csv'))
print("Percentage topic distribution for Model 1 saved to CSV in topics info directory.")

### Model 2

print("\nComputing percentage distributions for Model 2...")
# Percentage of each topic within a book
topic_distribution_percent2 = _row_percentages(topic_distribution2)

# Display the percentage distribution
print("Percentage topic distribution for Model 2:")
print(topic_distribution_percent2.head())

# Save topic_distribution_percent2 to CSV in topics_info2_dir
topic_distribution_percent2.to_csv(os.path.join(topics_info2_dir, 'topic_distribution_percent_model2.csv'))
print("Percentage topic distribution for Model 2 saved to CSV in topics info directory.")

# Step 7: Load Topics Info CSVs and Merge with DataFrame

### Model 1

print("\nLoading topics info CSV for Model 1...")
# Load the topics info CSV
topics_info1 = pd.read_csv(topics_info1_path)

# Merge with the main DataFrame (every sentence row is kept)
df_model1 = _merge_topic_info(df, topics_info1, 'topic_model1', '_model1')

# Display the merged DataFrame
print("Merged DataFrame for Model 1:")
print(df_model1.head())

# Save df_model1 to CSV in topics_info1_dir
df_model1.to_csv(os.path.join(topics_info1_dir, 'df_model1.csv'), index=False)
print("Merged DataFrame for Model 1 saved to CSV in topics info directory.")

### Model 2

print("\nLoading topics info CSV for Model 2...")
# Load the topics info CSV
topics_info2 = pd.read_csv(topics_info2_path)

# Merge with the main DataFrame (every sentence row is kept)
df_model2 = _merge_topic_info(df, topics_info2, 'topic_model2', '_model2')

# Display the merged DataFrame
print("Merged DataFrame for Model 2:")
print(df_model2.head())

# Save df_model2 to CSV in topics_info2_dir
df_model2.to_csv(os.path.join(topics_info2_dir, 'df_model2.csv'), index=False)
print("Merged DataFrame for Model 2 saved to CSV in topics info directory.")

# Step 9: Analyze Topic Distribution Per Author

### Model 1

print("\nAnalyzing topic distribution per author for Model 1...")
# Counts per (author, topic) pair, then one row per author / one column per topic
author_topic_counts1, author_topic_distribution1 = _count_topics(df, 'Author', 'topic_model1')

# Compute percentages
author_topic_distribution_percent1 = _row_percentages(author_topic_distribution1)

# Display the percentage distribution per author
print("Percentage topic distribution per author for Model 1:")
print(author_topic_distribution_percent1.head())

# Save author_topic_distribution_percent1 to CSV in topics_info1_dir
author_topic_distribution_percent1.to_csv(os.path.join(topics_info1_dir, 'author_topic_distribution_percent_model1.csv'))
print("Percentage topic distribution per author for Model 1 saved to CSV in topics info directory.")

# Step 10: Additional Analysis

### Analyzing Topics Over Chapters

print("\nAnalyzing topics over chapters for a specific book...")
# Ensure 'Chapter' is numeric; unparseable values become NaN.
# Note that this overwrites the column of the shared DataFrame.
df['Chapter'] = pd.to_numeric(df['Chapter'], errors='coerce')

# Drop rows with NaN chapters (if any)
df_chapter = df.dropna(subset=['Chapter'])

# Group by Book_Title, Chapter, and topic_model1
chapter_topic_counts = df_chapter.groupby(['Book_Title', 'Chapter', 'topic_model1']).size().reset_index(name='count')

# Pivot to get topics as columns; rows are (book, chapter) pairs
chapter_topic_distribution = chapter_topic_counts.pivot_table(index=['Book_Title', 'Chapter'], columns='topic_model1', values='count', fill_value=0)

# Save chapter_topic_distribution to CSV in topics_info1_dir
chapter_topic_distribution.to_csv(os.path.join(topics_info1_dir, 'chapter_topic_distribution_model1.csv'))
print("Chapter topic distribution saved to CSV in topics info directory.")

# Select a specific book for analysis
book_name = 'Mr. Mysterious In Black'  # Replace with your book title
if book_name in chapter_topic_distribution.index.get_level_values(0):
    book_chapter_topics = chapter_topic_distribution.loc[book_name]

    # A title containing a path separator would otherwise be interpreted as a
    # sub-directory; the title used above contains none, so its file name is
    # unchanged.
    safe_book_name = book_name.replace('/', '_')

    # Save book_chapter_topics to CSV in topics_info1_dir
    book_chapter_topics.to_csv(os.path.join(topics_info1_dir, f'{safe_book_name}_chapter_topics_model1.csv'))
    print(f"Chapter topic distribution for '{book_name}' saved to CSV in topics info directory.")
else:
    print(f"Book '{book_name}' not found in the dataset.")

Model 1 topics info directory: /content/drive/MyDrive/BERTTopic_Models/paraphrase-mpnet-base-v2/topics_4_20241102_205031
Model 2 topics info directory: /content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/topics_5_20241102_205523

Aggregating topics by Book_Title for Model 1...
Topic distribution for Model 1:
topic_model1                              -1      0      1      2      3   \
Book_Title                                                                  
A Long Time Coming                     6898.0  132.0  201.0  165.0  101.0   
A Not So Meet Cute                     6129.0  347.0  232.0  285.0  102.0   
Ache for You                           5126.0  129.0  217.0   77.0   99.0   
An Accidental Date with a Billionaire  2240.0  106.0  118.0   52.0   19.0   
Bad Boss                               3876.0  165.0  275.0   52.0   74.0   

topic_model1                             4      5     6      7      8   ...  \
Book_Title                                                 