In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Clustering

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn for clustering and preprocessing
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# For visualization
%matplotlib inline
sns.set(style="whitegrid")

In [None]:
# Define the dataset path
dataset_filepath = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/all_books_all_topics.csv'

In [None]:
# Load the dataset
df = pd.read_csv(dataset_filepath)

In [None]:
# Display the first few rows
print("First 5 rows of the dataset:")
display(df.head())

First 5 rows of the dataset:


Unnamed: 0,Book Title,Topic,Topic_Label,Probability
0,A Long Time Coming,Topic_0,Expressions of Joy and Amusement,79.474938
1,A Not So Meet Cute,Topic_0,Expressions of Joy and Amusement,71.655854
2,Ache for You,Topic_0,Expressions of Joy and Amusement,56.319151
3,An Accidental Date with a Billionaire,Topic_0,Expressions of Joy and Amusement,26.288002
4,Bad Boss,Topic_0,Expressions of Joy and Amusement,43.874309


In [None]:
# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())


Missing values in each column:
Book Title     0
Topic          0
Topic_Label    0
Probability    0
dtype: int64


In [None]:
# Summary statistics
print("\nSummary statistics:")
display(df.describe())

# Verify the number of unique topics
unique_topics = df['Topic'].nunique()
print(f"Number of unique topics in the dataset: {unique_topics}")


Summary statistics:


Unnamed: 0,Probability
count,22575.0
mean,29.854139
std,16.13613
min,0.700859
25%,18.110644
50%,27.829047
75%,39.04845
max,114.721734


Number of unique topics in the dataset: 215


In [None]:
# Create a mapping from 'Topic' string to 'Topic_Label'
topic_labels_df = df[['Topic', 'Topic_Label']].drop_duplicates().sort_values('Topic')

# Create the mapping dictionary with 'Topic' strings as keys
topic_labels = topic_labels_df.set_index('Topic')['Topic_Label'].to_dict()

# Print the first 10 topic mappings to verify
print("Topic number to Topic_Label mapping (first 10 topics):")
for topic, label in list(topic_labels.items())[:10]:
    print(f"{topic}: {label}")

Topic number to Topic_Label mapping (first 10 topics):
Topic_0: Expressions of Joy and Amusement
Topic_1: Dining and Meals
Topic_10: Gazes and Eye Contact
Topic_100: Clocks and Timekeeping
Topic_101: Jealousy and Envy
Topic_102: Pleasure and Seduction
Topic_103: Education and School
Topic_104: Ties and Formal Wear
Topic_105: Kneeling and Lower Body Actions
Topic_106: Tone and Communication


In [None]:
# Verify that all topics in df_pivot are present in topic_labels
# (We'll define df_pivot in the next cell, but to prepare, extract all unique topics)
unique_topics_in_df = df['Topic'].unique()

# Identify any topics not present in topic_labels
missing_in_mapping = set(unique_topics_in_df) - set(topic_labels.keys())

if missing_in_mapping:
    print(f"Missing topics in mapping: {missing_in_mapping}")
    # Optionally, add placeholder labels for missing topics
    for missing_topic in missing_in_mapping:
        topic_labels[missing_topic] = "Unknown Topic"
    print(f"Added placeholder labels for missing topics: {missing_in_mapping}")
else:
    print("All topics are present in the mapping.")

All topics are present in the mapping.


In [None]:
# Pivot the DataFrame to have books as rows and topics as columns
df_pivot = df.pivot_table(
    index='Book Title',          # Corrected column name
    columns='Topic',
    values='Probability',
    fill_value=0
)

# Display the pivoted DataFrame
print("Pivoted DataFrame (df_pivot):")
display(df_pivot.head())

# Display the shape of the pivoted DataFrame
print(f"{df_pivot.shape[0]} rows × {df_pivot.shape[1]} columns")

Pivoted DataFrame (df_pivot):


Topic,Topic_0,Topic_1,Topic_10,Topic_100,Topic_101,Topic_102,Topic_103,Topic_104,Topic_105,Topic_106,...,Topic_90,Topic_91,Topic_92,Topic_93,Topic_94,Topic_95,Topic_96,Topic_97,Topic_98,Topic_99
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Long Time Coming,79.474938,56.317327,69.781076,28.041094,62.643593,86.409227,41.159529,44.710135,44.032778,65.074435,...,37.320165,28.184105,19.073854,50.570807,49.238223,45.548591,40.048028,50.409369,68.443,75.107108
A Not So Meet Cute,71.655854,51.052766,63.149428,22.492508,56.784662,77.896244,37.8091,40.955683,39.315136,58.919767,...,30.414757,27.199027,19.443791,44.797097,44.021191,40.037086,34.680983,45.963807,58.682914,62.849054
Ache for You,56.319151,38.191399,51.023719,19.264886,45.389982,58.180555,25.342493,35.025604,32.546548,47.054189,...,26.805348,24.507383,18.228584,34.295827,36.22998,34.181295,27.437485,35.395689,43.722991,48.661635
An Accidental Date with a Billionaire,26.288002,18.064906,23.180115,8.50299,21.700494,26.564325,12.985737,15.524906,14.501216,20.750922,...,9.669584,9.670462,6.813146,15.031829,16.342032,14.682329,11.619466,14.573176,21.5452,21.79973
Bad Boss,43.874309,29.619211,40.246871,17.512199,33.221254,45.192322,24.778549,26.592173,24.266978,34.811752,...,18.65364,18.565761,13.603523,28.130165,28.270488,24.141426,21.893897,25.668722,38.207716,37.302531


105 rows × 215 columns


In [None]:
# Function to get top N topics per cluster
def get_top_topics(cluster_data, topic_labels, top_n=30):
    """
    Returns a DataFrame of top N topics based on mean probability within the cluster.
    """
    # Calculate mean probability per topic within the cluster
    topic_means = cluster_data.mean().sort_values(ascending=False).head(top_n)
    topic_std = cluster_data.std().loc[topic_means.index]
    top_topics_df = pd.DataFrame({
        'Topic': topic_means.index,
        'Topic_Label': topic_means.index.map(topic_labels),
        'Mean_Probability': topic_means.values,
        'Std_Deviation': topic_std.values
    })
    return top_topics_df

# Function to get all topics per cluster
def get_all_topics(cluster_data, topic_labels):
    """
    Returns a DataFrame of all topics with their mean probabilities and standard deviations within the cluster.
    """
    topic_means = cluster_data.mean().sort_values(ascending=False)
    topic_std = cluster_data.std().loc[topic_means.index]
    all_topics_df = pd.DataFrame({
        'Topic': topic_means.index,
        'Topic_Label': topic_means.index.map(topic_labels),
        'Mean_Probability': topic_means.values,
        'Std_Deviation': topic_std.values
    })
    return all_topics_df

# Function to get top N books per cluster
def get_top_books(cluster_data, top_n=5):
    """
    Returns a list of top N book titles based on the sum of their topic probabilities within the cluster.
    """
    # Sum probabilities across all topics for each book to determine top books
    top_books = cluster_data.sum(axis=1).sort_values(ascending=False).head(top_n).index.tolist()
    return top_books

# Function to get all books sorted by sum of topic probabilities
def get_all_books(cluster_data):
    """
    Returns a DataFrame of all books sorted by the sum of their topic probabilities within the cluster.
    """
    book_sums = cluster_data.sum(axis=1).sort_values(ascending=False)
    all_books_df = pd.DataFrame({
        'Book Title': book_sums.index,
        'Total_Probability': book_sums.values
    })
    return all_books_df

In [None]:
# Verification of helper functions with sample data
print("\nVerifying helper functions with sample data...")

# Create a sample cluster_data by selecting a subset of df_pivot
sample_cluster_data = df_pivot.iloc[:10]  # taking first 10 books as sample

# Get top 30 topics for sample data
sample_top_topics = get_top_topics(sample_cluster_data, topic_labels, top_n=30)
print("\nSample Top 30 Topics:")
display(sample_top_topics)

# Get all topics for sample data
sample_all_topics = get_all_topics(sample_cluster_data, topic_labels)
print("\nSample All Topics:")
display(sample_all_topics)

# Get top 5 books for sample data
sample_top_books = get_top_books(sample_cluster_data, top_n=5)
print("\nSample Top 5 Books:")
for i, book in enumerate(sample_top_books, 1):
    print(f"{i}. {book}")

# Get all books sorted by total probability
sample_all_books = get_all_books(sample_cluster_data)
print("\nSample All Books Sorted by Total Probability:")
display(sample_all_books.head())


Verifying helper functions with sample data...

Sample Top 30 Topics:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
0,Topic_72,Love and Affection,59.046471,16.735448
1,Topic_70,Leaving and Letting Go,57.38012,17.692158
2,Topic_201,Happiness and Gratitude,56.663655,17.409058
3,Topic_102,Pleasure and Seduction,56.006795,17.780864
4,Topic_175,Moments and Realizations,54.540264,16.647522
5,Topic_129,Breaking and Letting Go,53.401722,16.719943
6,Topic_0,Expressions of Joy and Amusement,53.294888,15.682545
7,Topic_117,Possession and Desire,53.048371,19.880187
8,Topic_36,Nods and Agreement,52.742087,15.089946
9,Topic_12,Nightlife and Events,51.614334,15.887609



Sample All Topics:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
0,Topic_72,Love and Affection,59.046471,16.735448
1,Topic_70,Leaving and Letting Go,57.380120,17.692158
2,Topic_201,Happiness and Gratitude,56.663655,17.409058
3,Topic_102,Pleasure and Seduction,56.006795,17.780864
4,Topic_175,Moments and Realizations,54.540264,16.647522
...,...,...,...,...
210,Topic_92,Mirrors and Reflections,15.044534,4.108528
211,Topic_114,Screens and Viewing,13.472833,4.658705
212,Topic_151,Buttons and Devices,12.359829,4.201884
213,Topic_186,Laptops and Technology,8.218655,2.578650



Sample Top 5 Books:
1. A Long Time Coming
2. A Not So Meet Cute
3. Banking the Billionaire
4. Ache for You
5. Between Commitment and Betrayal

Sample All Books Sorted by Total Probability:


Unnamed: 0,Book Title,Total_Probability
0,A Long Time Coming,10636.42079
1,A Not So Meet Cute,9423.447186
2,Banking the Billionaire,8288.035902
3,Ache for You,7527.456728
4,Between Commitment and Betrayal,7343.20673


In [None]:
# Ensure that there are no NaN values in 'Topic_Label' columns
# Select all unique topics from df_pivot
pivot_topics = df_pivot.columns.tolist()

# Check for any topics without a corresponding label
topics_without_labels = [topic for topic in pivot_topics if topic not in topic_labels]

if topics_without_labels:
    print(f"Topics without labels: {topics_without_labels}")
    # Optionally, add placeholder labels
    for topic in topics_without_labels:
        topic_labels[topic] = "Unknown Topic"
    print(f"Added placeholder labels for missing topics: {topics_without_labels}")
else:
    print("All topics have corresponding labels.")

All topics have corresponding labels.


In [None]:
# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

if duplicates > 0:
    df = df.drop_duplicates()
    print(f"Dropped {duplicates} duplicate rows.")
else:
    print("No duplicate rows found.")

Number of duplicate rows: 0
No duplicate rows found.


In [None]:
# Downcast numerical columns to save memory
df['Probability'] = pd.to_numeric(df['Probability'], downcast='float')

# Convert 'Book Title' and 'Topic' columns to categorical types if there are many repeated values
df['Book Title'] = df['Book Title'].astype('category')
df['Topic'] = df['Topic'].astype('category')

print("Data types after optimization:")
print(df.dtypes)

Data types after optimization:
Book Title     category
Topic          category
Topic_Label      object
Probability     float32
dtype: object


In [None]:
# Inspect the shape of df_pivot
print(f"Shape of df_pivot: {df_pivot.shape}")

# Inspect column names
print("\nColumns in df_pivot:")
print(df_pivot.columns.tolist())

# Inspect a sample of df_pivot
print("\nSample data from df_pivot:")
display(df_pivot.head())

Shape of df_pivot: (105, 215)

Columns in df_pivot:
['Topic_0', 'Topic_1', 'Topic_10', 'Topic_100', 'Topic_101', 'Topic_102', 'Topic_103', 'Topic_104', 'Topic_105', 'Topic_106', 'Topic_107', 'Topic_108', 'Topic_109', 'Topic_11', 'Topic_110', 'Topic_111', 'Topic_112', 'Topic_113', 'Topic_114', 'Topic_115', 'Topic_116', 'Topic_117', 'Topic_118', 'Topic_119', 'Topic_12', 'Topic_120', 'Topic_121', 'Topic_122', 'Topic_123', 'Topic_124', 'Topic_125', 'Topic_126', 'Topic_127', 'Topic_128', 'Topic_129', 'Topic_13', 'Topic_130', 'Topic_131', 'Topic_132', 'Topic_133', 'Topic_134', 'Topic_135', 'Topic_136', 'Topic_137', 'Topic_138', 'Topic_139', 'Topic_14', 'Topic_140', 'Topic_141', 'Topic_142', 'Topic_143', 'Topic_144', 'Topic_145', 'Topic_146', 'Topic_147', 'Topic_148', 'Topic_149', 'Topic_15', 'Topic_150', 'Topic_151', 'Topic_152', 'Topic_153', 'Topic_154', 'Topic_155', 'Topic_156', 'Topic_157', 'Topic_158', 'Topic_159', 'Topic_16', 'Topic_160', 'Topic_161', 'Topic_162', 'Topic_163', 'Topic_16

Topic,Topic_0,Topic_1,Topic_10,Topic_100,Topic_101,Topic_102,Topic_103,Topic_104,Topic_105,Topic_106,...,Topic_90,Topic_91,Topic_92,Topic_93,Topic_94,Topic_95,Topic_96,Topic_97,Topic_98,Topic_99
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Long Time Coming,79.474938,56.317327,69.781076,28.041094,62.643593,86.409227,41.159529,44.710135,44.032778,65.074435,...,37.320165,28.184105,19.073854,50.570807,49.238223,45.548591,40.048028,50.409369,68.443,75.107108
A Not So Meet Cute,71.655854,51.052766,63.149428,22.492508,56.784662,77.896244,37.8091,40.955683,39.315136,58.919767,...,30.414757,27.199027,19.443791,44.797097,44.021191,40.037086,34.680983,45.963807,58.682914,62.849054
Ache for You,56.319151,38.191399,51.023719,19.264886,45.389982,58.180555,25.342493,35.025604,32.546548,47.054189,...,26.805348,24.507383,18.228584,34.295827,36.22998,34.181295,27.437485,35.395689,43.722991,48.661635
An Accidental Date with a Billionaire,26.288002,18.064906,23.180115,8.50299,21.700494,26.564325,12.985737,15.524906,14.501216,20.750922,...,9.669584,9.670462,6.813146,15.031829,16.342032,14.682329,11.619466,14.573176,21.5452,21.79973
Bad Boss,43.874309,29.619211,40.246871,17.512199,33.221254,45.192322,24.778549,26.592173,24.266978,34.811752,...,18.65364,18.565761,13.603523,28.130165,28.270488,24.141426,21.893897,25.668722,38.207716,37.302531


In [None]:
# Convert df_pivot to a sparse DataFrame if it contains many zeros
df_pivot_sparse = df_pivot.astype(pd.SparseDtype("float", 0))

# Display information about the sparse DataFrame
print("Sparse DataFrame Info:")
df_pivot_sparse.info()

Sparse DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 105 entries, A Long Time Coming to Unmasking the Billionaire
Columns: 215 entries, Topic_0 to Topic_99
dtypes: Sparse[float64, 0](215)
memory usage: 265.4+ KB


In [None]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Select topic columns (all columns in df_pivot)
topic_columns = df_pivot.columns.tolist()

# Fit and transform the data
df_scaled = scaler.fit_transform(df_pivot)

# Convert the scaled data back to a DataFrame
df_scaled = pd.DataFrame(df_scaled, index=df_pivot.index, columns=topic_columns)

print("Data scaling completed. Here's a sample of the scaled data:")
display(df_scaled.head())

Data scaling completed. Here's a sample of the scaled data:


Unnamed: 0_level_0,Topic_0,Topic_1,Topic_10,Topic_100,Topic_101,Topic_102,Topic_103,Topic_104,Topic_105,Topic_106,...,Topic_90,Topic_91,Topic_92,Topic_93,Topic_94,Topic_95,Topic_96,Topic_97,Topic_98,Topic_99
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Long Time Coming,1.515081,1.642712,1.454704,1.516337,1.492844,1.66569,1.557965,1.324266,1.424637,1.523449,...,1.631003,1.030558,0.801507,1.634371,1.34586,1.408287,1.576822,1.661021,1.679045,1.820568
A Not So Meet Cute,1.136348,1.268476,1.09031,0.759248,1.130412,1.267855,1.233424,1.016844,1.031194,1.168156,...,0.940151,0.911657,0.861984,1.178393,0.956806,0.967622,1.058646,1.316378,1.099651,1.149389
Ache for You,0.393485,0.354212,0.42403,0.318848,0.42554,0.346485,0.025838,0.531278,0.466706,0.483187,...,0.579048,0.586771,0.663324,0.349056,0.375786,0.49943,0.359302,0.497081,0.211576,0.372571
An Accidental Date with a Billionaire,-1.061133,-1.0765,-1.105912,-1.149582,-1.039885,-1.13103,-1.171107,-1.065478,-1.038242,-1.035232,...,-1.1353,-1.204073,-1.202854,-1.172317,-1.107333,-1.059584,-1.167893,-1.117191,-1.104979,-1.098227
Bad Boss,-0.209305,-0.25515,-0.168133,0.079699,-0.327214,-0.260491,-0.028789,-0.159268,-0.223795,-0.223537,...,-0.23649,-0.130394,-0.092774,-0.137877,-0.217783,-0.303294,-0.17592,-0.257005,-0.115831,-0.249386


In [None]:
# Define the output directory
output_dir = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs'

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

print(f"Output directory set to: {output_dir}")

Output directory set to: /content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs


In [None]:
# Define the number of clusters to analyze
cluster_range = [3, 4, 5]

for k in cluster_range:
    print(f"\nPerforming K-Means clustering with k={k}...")

    # Initialize and fit KMeans
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(df_scaled)

    # Assign cluster labels to df_pivot
    df_pivot[f'Cluster_{k}'] = cluster_labels

    # Create a mapping from Book Title to Cluster
    cluster_map = df_pivot[f'Cluster_{k}'].to_dict()

    # Assign clusters to the original df based on Book Title
    df[f'Cluster_{k}'] = df['Book Title'].map(cluster_map)

    # Verify the mapping by displaying a sample
    print(f"Assigned clusters to the original DataFrame for k={k}.")
    print(f"Sample cluster assignments for k={k}:")
    display(df[['Book Title', f'Cluster_{k}']].drop_duplicates().head())

    # Prepare to store results
    top_topics_all_clusters = {}
    all_topics_all_clusters = {}
    top_books_all_clusters = {}
    all_books_all_clusters = {}
    cluster_profiles = {}
    thematic_spread = {}

    # Analyze each cluster
    for cluster in range(k):
        print(f"\nAnalyzing Cluster {cluster} for k={k}...")

        # Filter data for the current cluster from df_pivot
        cluster_books = df_pivot[df_pivot[f'Cluster_{k}'] == cluster]

        # Define topic columns as columns starting with 'Topic_'
        topic_columns_cluster = [col for col in cluster_books.columns if col.startswith('Topic_')]

        # Check if topic_columns_cluster are present
        if not topic_columns_cluster:
            print(f"Warning: No topic columns found for Cluster {cluster} in k={k}.")
            continue  # Skip to next cluster

        # Select only topic columns
        cluster_books_topic = cluster_books[topic_columns_cluster]

        # Get top 30 topics
        top_topics_df = get_top_topics(cluster_books_topic, topic_labels, top_n=30)
        top_topics_all_clusters[cluster] = top_topics_df

        # Get all topics
        all_topics_df = get_all_topics(cluster_books_topic, topic_labels)
        all_topics_all_clusters[cluster] = all_topics_df

        # Get top 5 books
        top_books = get_top_books(cluster_books_topic, top_n=5)
        top_books_all_clusters[cluster] = top_books

        # Get all books sorted by total probability
        all_books_df = get_all_books(cluster_books_topic)
        all_books_all_clusters[cluster] = all_books_df

        # Cluster Size and Percentage
        cluster_size = cluster_books.shape[0]
        cluster_percentage = (cluster_size / df_pivot.shape[0]) * 100

        # Cluster Profiles with Statistical Measures
        cluster_profiles[cluster] = {
            'Cluster_Size': cluster_size,
            'Cluster_Percentage': cluster_percentage,
            'Top_Topics': top_topics_df
        }

        # Thematic Spread
        # For dominant themes (top 30 topics), calculate min, max, median probabilities
        dominant_topics = top_topics_df['Topic'].tolist()
        # Ensure that dominant_topics exist in cluster_books_topic
        existing_dominant_topics = [topic for topic in dominant_topics if topic in cluster_books_topic.columns]
        if not existing_dominant_topics:
            print(f"Warning: No dominant topics found for Cluster {cluster} in k={k}.")
            continue
        dominant_data = cluster_books_topic[existing_dominant_topics]

        # Check if dominant_data is empty
        if dominant_data.empty:
            print(f"Warning: dominant_data is empty for Cluster {cluster} in k={k}.")
            continue

        # Debugging: Print head of dominant_data
        print(f"Head of dominant_data for Cluster {cluster} in k={k}:")
        display(dominant_data.head())

        # Calculate Probability Range and Std Deviation using separate aggregations
        prob_min = dominant_data.min().to_dict()
        prob_max = dominant_data.max().to_dict()
        prob_median = dominant_data.median().to_dict()

        prob_range = {
            'min': prob_min,
            'max': prob_max,
            'median': prob_median
        }

        std_deviation = dominant_data.std().to_dict()

        # Debugging: Print probability_range structure
        print(f"Probability_Range for Cluster {cluster} in k={k}:")
        print(prob_range)

        thematic_spread[cluster] = {
            'Probability_Range': prob_range,
            'Std_Deviation_per_Topic': std_deviation
        }

    # Save Top 30 Topics and All Topics per Cluster to CSV
    for cluster in range(k):
        if cluster not in top_topics_all_clusters or cluster not in all_topics_all_clusters:
            print(f"Skipping saving topics for Cluster {cluster} in k={k} due to missing data.")
            continue

        # Top 30 Topics
        top_topics_df = top_topics_all_clusters[cluster]
        top_topics_output = os.path.join(output_dir, f'top_30_topics_k{k}_cluster{cluster}.csv')
        top_topics_df.to_csv(top_topics_output, index=False)
        print(f"Saved top 30 topics for Cluster {cluster} to '{top_topics_output}'.")

        # All Topics
        all_topics_df = all_topics_all_clusters[cluster]
        all_topics_output = os.path.join(output_dir, f'all_topics_k{k}_cluster{cluster}.csv')
        all_topics_df.to_csv(all_topics_output, index=False)
        print(f"Saved all topics for Cluster {cluster} to '{all_topics_output}'.")

    # Save Top 5 Books and All Books per Cluster to CSV
    for cluster in range(k):
        if cluster not in top_books_all_clusters or cluster not in all_books_all_clusters:
            print(f"Skipping saving books for Cluster {cluster} in k={k} due to missing data.")
            continue

        # Top 5 Books
        top_books = top_books_all_clusters[cluster]
        top_books_df = pd.DataFrame({'Book Title': top_books})
        top_books_output = os.path.join(output_dir, f'top_5_books_k{k}_cluster{cluster}.csv')
        top_books_df.to_csv(top_books_output, index=False)
        print(f"Saved top 5 books for Cluster {cluster} to '{top_books_output}'.")

        # All Books
        all_books_df = all_books_all_clusters[cluster]
        all_books_output = os.path.join(output_dir, f'all_books_k{k}_cluster{cluster}.csv')
        all_books_df.to_csv(all_books_output, index=False)
        print(f"Saved all books for Cluster {cluster} to '{all_books_output}'.")

    # Save Cluster Profiles with Statistical Measures
    profiles_data = []
    for cluster, profile in cluster_profiles.items():
        size = profile['Cluster_Size']
        percentage = profile['Cluster_Percentage']
        for _, row in profile['Top_Topics'].iterrows():
            profiles_data.append({
                'Cluster': cluster,
                'Topic': row['Topic'],
                'Topic_Label': row['Topic_Label'],
                'Mean_Probability': row['Mean_Probability'],
                'Std_Deviation': row['Std_Deviation'],
                'Cluster_Size': size,
                'Cluster_Percentage': percentage
            })
    profiles_df = pd.DataFrame(profiles_data)
    profiles_output = os.path.join(output_dir, f'cluster_profiles_k{k}.csv')
    profiles_df.to_csv(profiles_output, index=False)
    print(f"Saved cluster profiles for k={k} to '{profiles_output}'.")

    # Save Thematic Spread
    thematic_data = []
    for cluster, spread in thematic_spread.items():
        prob_range = spread.get('Probability_Range', {})
        std_dev = spread.get('Std_Deviation_per_Topic', {})

        # Ensure 'min', 'max', 'median' keys exist
        for key in ['min', 'max', 'median']:
            if key not in prob_range:
                prob_range[key] = {}
                print(f"Warning: '{key}' key missing in Probability_Range for Cluster {cluster}. Assigning NaN.")

        for topic, std in std_dev.items():
            min_prob = prob_range['min'].get(topic, float('nan'))
            max_prob = prob_range['max'].get(topic, float('nan'))
            median_prob = prob_range['median'].get(topic, float('nan'))

            thematic_data.append({
                'Cluster': cluster,
                'Topic': topic,
                'Min_Probability': min_prob,
                'Max_Probability': max_prob,
                'Median_Probability': median_prob,
                'Std_Deviation': std
            })

    thematic_df = pd.DataFrame(thematic_data)
    thematic_output = os.path.join(output_dir, f'thematic_spread_k{k}.csv')
    thematic_df.to_csv(thematic_output, index=False)
    print(f"Saved thematic spread for k={k} to '{thematic_output}'.")

    # Save Cluster Assignments
    cluster_assignments_output = os.path.join(output_dir, f'cluster_assignments_k{k}.csv')
    cluster_assignments = df_pivot[[f'Cluster_{k}']].rename(columns={f'Cluster_{k}': 'Cluster'})
    cluster_assignments.to_csv(cluster_assignments_output)
    print(f"Saved cluster assignments for k={k} to '{cluster_assignments_output}'.")

    print(f"Clustering with k={k} completed and results saved.\n")


Performing K-Means clustering with k=3...
Assigned clusters to the original DataFrame for k=3.
Sample cluster assignments for k=3:


Unnamed: 0,Book Title,Cluster_3
0,A Long Time Coming,1
1,A Not So Meet Cute,1
2,Ache for You,0
3,An Accidental Date with a Billionaire,2
4,Bad Boss,0



Analyzing Cluster 0 for k=3...
Head of dominant_data for Cluster 0 in k=3:


Topic,Topic_72,Topic_70,Topic_201,Topic_102,Topic_175,Topic_129,Topic_0,Topic_117,Topic_36,Topic_12,...,Topic_60,Topic_99,Topic_31,Topic_50,Topic_11,Topic_64,Topic_179,Topic_49,Topic_98,Topic_115
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ache for You,57.628415,56.880138,55.068065,58.180555,54.437852,55.985111,56.319151,56.039241,51.501959,51.596241,...,47.917208,48.661635,49.415486,46.686634,49.823008,49.887579,46.727125,45.559387,43.722991,45.005019
Bad Boss,48.937849,47.072433,48.225659,45.192322,47.101467,43.017024,43.874309,43.134256,44.232852,42.895156,...,36.850456,37.302531,37.725331,37.322676,36.370452,37.592128,37.788813,37.366143,38.207716,35.421537
Banking the Billionaire,67.338625,65.578731,66.372643,65.588555,63.678955,60.742803,63.634949,64.081367,61.743543,61.22785,...,53.324409,55.598779,51.983287,54.627552,54.169766,53.932228,52.671945,53.124814,52.61848,49.379849
Beauty and the Billionaire,58.845881,52.948413,53.467553,53.812438,51.278201,49.456941,52.195326,45.897764,50.635772,49.541668,...,45.127404,42.31575,44.239503,43.384968,42.107857,42.761932,42.097425,45.074004,40.805066,38.788087
Bedding the Billionaire,53.682072,50.571483,50.854971,49.483791,47.392565,46.359166,48.162375,41.062011,49.871963,46.706812,...,42.762807,39.776318,40.737828,41.861483,38.493562,40.96202,39.469905,42.749482,38.119258,37.274192


Probability_Range for Cluster 0 in k=3:
{'min': {'Topic_72': 42.24676432563, 'Topic_70': 42.07928208484, 'Topic_201': 41.23106852018, 'Topic_102': 41.48197991863, 'Topic_175': 39.7483221089, 'Topic_129': 39.0248389989, 'Topic_0': 39.61140213772, 'Topic_117': 37.59152399045, 'Topic_36': 38.2636188118, 'Topic_12': 38.53207717847, 'Topic_14': 37.7853262093, 'Topic_167': 34.32953572492, 'Topic_178': 35.66898896265, 'Topic_51': 33.93020488013, 'Topic_157': 34.71102750189, 'Topic_83': 34.84466220858, 'Topic_10': 35.444119069, 'Topic_84': 33.79986441381, 'Topic_187': 33.6999010024, 'Topic_119': 33.65291764828, 'Topic_60': 31.8313308354, 'Topic_99': 34.1156842785, 'Topic_31': 33.8241092529, 'Topic_50': 32.95822593158, 'Topic_11': 33.76902511737, 'Topic_64': 33.6123365737, 'Topic_179': 32.91236002747, 'Topic_49': 31.54990131354, 'Topic_98': 32.4656066582, 'Topic_115': 31.04694880438}, 'max': {'Topic_72': 75.1360006544, 'Topic_70': 70.92149856687, 'Topic_201': 68.24951752723, 'Topic_102': 70.952

Topic,Topic_70,Topic_72,Topic_102,Topic_117,Topic_201,Topic_129,Topic_175,Topic_0,Topic_14,Topic_36,...,Topic_60,Topic_11,Topic_64,Topic_187,Topic_31,Topic_179,Topic_50,Topic_115,Topic_133,Topic_98
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Long Time Coming,89.64046,87.735891,86.409227,89.233251,88.470986,84.696569,83.728109,79.474938,82.583982,79.086398,...,67.879283,69.666778,68.470536,68.080449,66.606954,69.844535,68.434361,73.370752,67.352189,68.443
A Not So Meet Cute,77.12307,78.115778,77.896244,76.576213,76.227428,71.01608,73.723201,71.655854,73.637646,70.049858,...,61.721981,62.815676,60.8255,59.656805,60.841712,61.920565,59.190119,61.689096,60.186075,58.682914
Blue Eyed Devil,78.674587,78.029453,72.43025,71.957314,75.679984,73.388228,72.322244,68.252744,69.82918,70.960031,...,63.130958,60.540235,59.407181,60.814763,59.943694,61.663991,61.63623,65.554881,58.821186,57.637508
Brooklynaire,71.31031,66.769018,67.621515,67.205259,69.729444,65.614438,67.362755,64.349374,65.35521,64.024782,...,55.1055,56.610357,54.617966,56.554913,51.336611,55.127921,53.704689,56.123608,51.833137,61.246128
Brutal Billionaire,78.472603,73.254801,73.063784,70.723266,73.669379,70.83358,72.615748,64.690243,69.988083,68.302479,...,59.123202,57.931885,56.878062,57.758085,57.349795,58.528871,57.247819,63.047993,55.962996,59.243987


Probability_Range for Cluster 1 in k=3:
{'min': {'Topic_70': 70.57257841241, 'Topic_72': 66.7690179022, 'Topic_102': 67.62151481264, 'Topic_117': 66.06062545573, 'Topic_201': 67.21159854517, 'Topic_129': 65.61443780191, 'Topic_175': 67.16201947271, 'Topic_0': 62.60177242983, 'Topic_14': 61.37780578657, 'Topic_36': 62.85726853198, 'Topic_12': 63.2735934156, 'Topic_178': 60.0637018391, 'Topic_84': 58.09270809869, 'Topic_167': 58.17296481693, 'Topic_51': 56.84301729863, 'Topic_157': 56.22084114617, 'Topic_10': 57.71011387603, 'Topic_83': 56.47294895954, 'Topic_119': 56.18500456908, 'Topic_99': 54.45530067876, 'Topic_60': 55.10550015266, 'Topic_11': 55.21731794447, 'Topic_64': 54.61796584246, 'Topic_187': 56.55491323126, 'Topic_31': 51.33661134906, 'Topic_179': 53.64854325332, 'Topic_50': 53.70468929226, 'Topic_115': 54.98460918436, 'Topic_133': 51.8331374776, 'Topic_98': 52.68929581618}, 'max': {'Topic_70': 107.86176037538, 'Topic_72': 103.90453133131, 'Topic_102': 114.11270234707, 'Topic

Topic,Topic_72,Topic_70,Topic_201,Topic_102,Topic_175,Topic_129,Topic_0,Topic_36,Topic_12,Topic_167,...,Topic_31,Topic_119,Topic_50,Topic_64,Topic_99,Topic_49,Topic_179,Topic_11,Topic_98,Topic_195
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
An Accidental Date with a Billionaire,29.594729,28.538562,27.594561,26.564325,26.399104,26.284801,26.288002,26.769036,25.012327,25.029914,...,21.884366,23.375049,23.251143,21.88468,21.79973,23.532903,21.947035,21.227367,21.5452,22.954028
Beauty and the Boss,43.814597,41.068694,40.376643,38.453629,38.423683,38.130472,38.155054,38.336382,35.52738,35.781649,...,32.174512,34.181909,33.843265,31.572267,31.50806,34.550111,31.722651,30.21312,29.844581,32.981781
Billionaire Beast,26.057682,24.261895,23.339244,23.631001,22.312019,22.408305,20.933641,21.260771,21.550219,21.661014,...,18.345376,18.904103,18.303481,17.500922,17.583982,18.623624,17.115754,17.174887,17.106663,18.05895
Billionaire Hero,34.5428,32.011485,29.96648,28.745322,29.149155,29.295114,26.024816,26.364931,26.688924,27.032614,...,23.728996,23.498518,22.728929,21.817518,23.056892,24.218755,21.548647,21.180575,22.087032,25.023755
Billionaire Protector,24.907794,23.736931,22.238328,21.435341,21.301015,21.987804,20.002142,20.515792,20.407266,20.907426,...,17.292545,17.655715,17.713676,16.966282,17.326291,17.703179,16.320352,16.571359,16.918548,18.673938


Probability_Range for Cluster 2 in k=3:
{'min': {'Topic_72': 6.6902022193, 'Topic_70': 5.96029475794, 'Topic_201': 6.0391171624, 'Topic_102': 7.2027280112, 'Topic_175': 5.8973335354, 'Topic_129': 5.875989118, 'Topic_0': 6.5209316176, 'Topic_36': 5.6227633015, 'Topic_12': 5.9276324582, 'Topic_167': 6.5901355198, 'Topic_117': 6.3728193862, 'Topic_14': 5.7331448988, 'Topic_157': 5.64153411886, 'Topic_51': 5.4680915485, 'Topic_83': 5.32849736646, 'Topic_10': 5.9203868564, 'Topic_178': 5.18784214015, 'Topic_60': 5.47602294465, 'Topic_84': 5.02738840396, 'Topic_187': 5.5543064092, 'Topic_31': 5.903578033, 'Topic_119': 5.5595529533, 'Topic_50': 5.12978856455, 'Topic_64': 5.65503210336, 'Topic_99': 5.20229171795, 'Topic_49': 5.2470400084, 'Topic_179': 5.2392391498, 'Topic_11': 5.6356649983, 'Topic_98': 4.77266021001, 'Topic_195': 4.3520545152}, 'max': {'Topic_72': 43.8145966126, 'Topic_70': 41.06869378106, 'Topic_201': 40.37664262128, 'Topic_102': 39.6649273278, 'Topic_175': 38.42368267946, 'T

Unnamed: 0,Book Title,Cluster_4
0,A Long Time Coming,1
1,A Not So Meet Cute,1
2,Ache for You,1
3,An Accidental Date with a Billionaire,2
4,Bad Boss,0



Analyzing Cluster 0 for k=4...
Head of dominant_data for Cluster 0 in k=4:


Topic,Topic_72,Topic_70,Topic_201,Topic_102,Topic_175,Topic_0,Topic_129,Topic_36,Topic_117,Topic_12,...,Topic_119,Topic_31,Topic_50,Topic_99,Topic_49,Topic_64,Topic_179,Topic_11,Topic_98,Topic_195
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bad Boss,48.937849,47.072433,48.225659,45.192322,47.101467,43.874309,43.017024,44.232852,43.134256,42.895156,...,38.183474,37.725331,37.322676,37.302531,37.366143,37.592128,37.788813,36.370452,38.207716,36.539431
Beauty and the Billionaire,58.845881,52.948413,53.467553,53.812438,51.278201,52.195326,49.456941,50.635772,45.897764,49.541668,...,45.500945,44.239503,43.384968,42.31575,45.074004,42.761932,42.097425,42.107857,40.805066,43.276966
Beauty and the Boss,43.814597,41.068694,40.376643,38.453629,38.423683,38.155054,38.130472,38.336382,33.936989,35.52738,...,34.181909,32.174512,33.843265,31.50806,34.550111,31.572267,31.722651,30.21312,29.844581,32.981781
Bedding the Billionaire,53.682072,50.571483,50.854971,49.483791,47.392565,48.162375,46.359166,49.871963,41.062011,46.706812,...,43.452,40.737828,41.861483,39.776318,42.749482,40.96202,39.469905,38.493562,38.119258,39.992137
Between Commitment and Betrayal,64.770876,64.379214,59.978047,58.486863,59.239506,53.18892,58.328255,55.193112,57.262236,54.731581,...,48.855624,47.58213,48.647688,49.019952,48.297869,46.280517,46.980806,45.241027,47.883278,48.838382


Probability_Range for Cluster 0 in k=4:
{'min': {'Topic_72': 37.93854449747, 'Topic_70': 37.21857330171, 'Topic_201': 36.44396642783, 'Topic_102': 38.19861478184, 'Topic_175': 34.63358944251, 'Topic_0': 35.73837525487, 'Topic_129': 34.6520587881, 'Topic_36': 32.92708426483, 'Topic_117': 31.07165464403, 'Topic_12': 33.85332126826, 'Topic_167': 33.93345744298, 'Topic_14': 32.77166196871, 'Topic_83': 29.45986569732, 'Topic_51': 29.69840842424, 'Topic_157': 31.19026402325, 'Topic_178': 30.79286937396, 'Topic_10': 31.55904860263, 'Topic_84': 30.77333387722, 'Topic_187': 28.79609784827, 'Topic_60': 30.9863467734, 'Topic_119': 29.11515042748, 'Topic_31': 30.36688983304, 'Topic_50': 28.7633580618, 'Topic_99': 29.1302079541, 'Topic_49': 27.20028728347, 'Topic_64': 30.39411235869, 'Topic_179': 29.47615579525, 'Topic_11': 30.07906330663, 'Topic_98': 27.92008874232, 'Topic_195': 26.15116233033}, 'max': {'Topic_72': 64.77087604017, 'Topic_70': 64.37921380084, 'Topic_201': 60.78699086186, 'Topic_102

Topic,Topic_72,Topic_70,Topic_201,Topic_102,Topic_175,Topic_117,Topic_129,Topic_0,Topic_36,Topic_12,...,Topic_187,Topic_60,Topic_115,Topic_50,Topic_31,Topic_11,Topic_179,Topic_64,Topic_98,Topic_49
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Long Time Coming,87.735891,89.64046,88.470986,86.409227,83.728109,89.233251,84.696569,79.474938,79.086398,79.725405,...,68.080449,67.879283,73.370752,68.434361,66.606954,69.666778,69.844535,68.470536,68.443,62.647164
A Not So Meet Cute,78.115778,77.12307,76.227428,77.896244,73.723201,76.576213,71.01608,71.655854,70.049858,69.178924,...,59.656805,61.721981,61.689096,59.190119,60.841712,62.815676,61.920565,60.8255,58.682914,55.424988
Ache for You,57.628415,56.880138,55.068065,58.180555,54.437852,56.039241,55.985111,56.319151,51.501959,51.596241,...,47.71764,47.917208,45.005019,46.686634,49.415486,49.823008,46.727125,49.887579,43.722991,45.559387
Banking the Billionaire,67.338625,65.578731,66.372643,65.588555,63.678955,64.081367,60.742803,63.634949,61.743543,61.22785,...,55.922145,53.324409,49.379849,54.627552,51.983287,54.169766,52.671945,53.932228,52.61848,53.124814
Between Love and Loathing,65.652357,63.42817,61.95684,59.624659,58.216367,58.457895,57.747023,54.926813,56.894002,56.389985,...,48.849272,47.548566,50.208413,49.756389,48.217503,45.199277,48.12884,45.947559,47.002614,48.565946


Probability_Range for Cluster 1 in k=4:
{'min': {'Topic_72': 57.62841471996, 'Topic_70': 56.88013805826, 'Topic_201': 55.06806491061, 'Topic_102': 57.16611531998, 'Topic_175': 54.43785224665, 'Topic_117': 52.88440404446, 'Topic_129': 54.41957654975, 'Topic_0': 54.24428939004, 'Topic_36': 51.50195862866, 'Topic_12': 51.59624060319, 'Topic_14': 52.61392990657, 'Topic_178': 50.84123934523, 'Topic_167': 48.8467841298, 'Topic_84': 48.00235053641, 'Topic_51': 48.80766951051, 'Topic_157': 48.04592127484, 'Topic_83': 46.64510062069, 'Topic_10': 47.3276098197, 'Topic_99': 48.3390811966, 'Topic_119': 47.21108950501, 'Topic_187': 46.71573314221, 'Topic_60': 47.44689603025, 'Topic_115': 44.8546209945, 'Topic_50': 43.68561271501, 'Topic_31': 45.69471079819, 'Topic_11': 45.19927659345, 'Topic_179': 46.41696335378, 'Topic_64': 45.94755931011, 'Topic_98': 43.72299091561, 'Topic_49': 43.05280423516}, 'max': {'Topic_72': 87.73589120917, 'Topic_70': 89.64045975956, 'Topic_201': 88.4709859996, 'Topic_102'

Topic,Topic_72,Topic_70,Topic_201,Topic_102,Topic_175,Topic_129,Topic_36,Topic_0,Topic_12,Topic_167,...,Topic_31,Topic_119,Topic_50,Topic_99,Topic_49,Topic_64,Topic_98,Topic_179,Topic_195,Topic_11
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
An Accidental Date with a Billionaire,29.594729,28.538562,27.594561,26.564325,26.399104,26.284801,26.769036,26.288002,25.012327,25.029914,...,21.884366,23.375049,23.251143,21.79973,23.532903,21.88468,21.5452,21.947035,22.954028,21.227367
Billionaire Beast,26.057682,24.261895,23.339244,23.631001,22.312019,22.408305,21.260771,20.933641,21.550219,21.661014,...,18.345376,18.904103,18.303481,17.583982,18.623624,17.500922,17.106663,17.115754,18.05895,17.174887
Billionaire Hero,34.5428,32.011485,29.96648,28.745322,29.149155,29.295114,26.364931,26.024816,26.688924,27.032614,...,23.728996,23.498518,22.728929,23.056892,24.218755,21.817518,22.087032,21.548647,25.023755,21.180575
Billionaire Protector,24.907794,23.736931,22.238328,21.435341,21.301015,21.987804,20.515792,20.002142,20.407266,20.907426,...,17.292545,17.655715,17.713676,17.326291,17.703179,16.966282,16.918548,16.320352,18.673938,16.571359
Boss,34.136145,33.491892,34.784851,33.725797,32.663952,31.35275,32.028181,33.310819,30.342221,27.983953,...,28.345806,27.957741,26.765989,27.802987,24.931169,27.870195,27.067696,28.832335,23.382269,27.525358


Probability_Range for Cluster 2 in k=4:
{'min': {'Topic_72': 6.6902022193, 'Topic_70': 5.96029475794, 'Topic_201': 6.0391171624, 'Topic_102': 7.2027280112, 'Topic_175': 5.8973335354, 'Topic_129': 5.875989118, 'Topic_36': 5.6227633015, 'Topic_0': 6.5209316176, 'Topic_12': 5.9276324582, 'Topic_167': 6.5901355198, 'Topic_117': 6.3728193862, 'Topic_14': 5.7331448988, 'Topic_157': 5.64153411886, 'Topic_51': 5.4680915485, 'Topic_83': 5.32849736646, 'Topic_10': 5.9203868564, 'Topic_178': 5.18784214015, 'Topic_60': 5.47602294465, 'Topic_84': 5.02738840396, 'Topic_187': 5.5543064092, 'Topic_31': 5.903578033, 'Topic_119': 5.5595529533, 'Topic_50': 5.12978856455, 'Topic_99': 5.20229171795, 'Topic_49': 5.2470400084, 'Topic_64': 5.65503210336, 'Topic_98': 4.77266021001, 'Topic_179': 5.2392391498, 'Topic_195': 4.3520545152, 'Topic_11': 5.6356649983}, 'max': {'Topic_72': 40.241389734, 'Topic_70': 39.70435814084, 'Topic_201': 39.4092946666, 'Topic_102': 35.6744589433, 'Topic_175': 35.440430177, 'Topic

Topic,Topic_117,Topic_102,Topic_0,Topic_70,Topic_72,Topic_201,Topic_129,Topic_14,Topic_175,Topic_36,...,Topic_99,Topic_68,Topic_51,Topic_157,Topic_179,Topic_31,Topic_73,Topic_106,Topic_50,Topic_133
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Billionaire's Christmas Bride,108.817997,107.057802,105.841517,102.955869,98.020964,98.554079,100.409149,99.123465,98.985492,97.316764,...,92.08932,92.596583,89.108042,88.065528,89.264143,88.931692,91.285401,90.284275,88.438698,88.106608
The Billionaire's Fake Wife,89.223509,86.520488,83.935525,84.386533,79.632029,80.472241,81.365516,79.384505,82.271574,79.076076,...,75.282352,75.638829,73.293841,71.985576,72.559487,72.751718,72.124742,72.762141,70.055985,71.741811
The Billionaire's Secret,114.721734,114.112702,110.74312,107.86176,103.904531,103.110809,106.219888,102.716733,104.546611,102.069216,...,94.76846,98.312488,94.768671,93.379117,92.256756,95.701019,95.704003,93.95602,92.27604,93.961619
The Casanova,87.049569,85.042473,84.601771,86.247113,86.139146,85.024351,81.500665,81.86617,81.352278,78.02491,...,70.924712,67.695476,69.391945,71.052969,68.642553,68.557049,67.517225,67.100879,68.630814,67.649519
The Stopover,98.855335,97.211921,98.803577,100.761172,97.700626,98.892348,94.681787,96.476589,93.927488,91.332969,...,80.962937,78.209747,80.664001,81.124127,81.665243,77.865082,77.461875,79.358589,79.163749,78.183528


Probability_Range for Cluster 3 in k=4:
{'min': {'Topic_117': 87.04956911925, 'Topic_102': 85.0424728922, 'Topic_0': 83.93552504704, 'Topic_70': 84.38653269954, 'Topic_72': 79.63202864223, 'Topic_201': 80.47224084414, 'Topic_129': 81.36551597921, 'Topic_14': 79.38450461529, 'Topic_175': 81.352278055, 'Topic_36': 78.02491024017, 'Topic_12': 75.32274129505, 'Topic_178': 74.28468664948, 'Topic_64': 71.23333068518, 'Topic_11': 73.28541012869, 'Topic_10': 74.2322794539, 'Topic_167': 71.47765543042, 'Topic_119': 70.67685624663, 'Topic_60': 72.10286327838, 'Topic_84': 72.68204003732, 'Topic_83': 70.14445467266, 'Topic_99': 70.92471162193, 'Topic_68': 67.6954757607, 'Topic_51': 69.39194460805, 'Topic_157': 71.05296864856, 'Topic_179': 68.64255307675, 'Topic_31': 68.55704869556, 'Topic_73': 67.5172249062, 'Topic_106': 67.10087949997, 'Topic_50': 68.63081354456, 'Topic_133': 67.64951946061}, 'max': {'Topic_117': 114.72173433419, 'Topic_102': 114.11270234707, 'Topic_0': 110.74312029796, 'Topic_70

Unnamed: 0,Book Title,Cluster_5
0,A Long Time Coming,1
1,A Not So Meet Cute,1
2,Ache for You,4
3,An Accidental Date with a Billionaire,2
4,Bad Boss,0



Analyzing Cluster 0 for k=5...
Head of dominant_data for Cluster 0 in k=5:


Topic,Topic_72,Topic_70,Topic_201,Topic_102,Topic_175,Topic_0,Topic_117,Topic_129,Topic_36,Topic_12,...,Topic_119,Topic_99,Topic_31,Topic_50,Topic_179,Topic_64,Topic_11,Topic_98,Topic_49,Topic_195
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bad Boss,48.937849,47.072433,48.225659,45.192322,47.101467,43.874309,43.134256,43.017024,44.232852,42.895156,...,38.183474,37.302531,37.725331,37.322676,37.788813,37.592128,36.370452,38.207716,37.366143,36.539431
Beauty and the Boss,43.814597,41.068694,40.376643,38.453629,38.423683,38.155054,33.936989,38.130472,38.336382,35.52738,...,34.181909,31.50806,32.174512,33.843265,31.722651,31.572267,30.21312,29.844581,34.550111,32.981781
Bedding the Billionaire,53.682072,50.571483,50.854971,49.483791,47.392565,48.162375,41.062011,46.359166,49.871963,46.706812,...,43.452,39.776318,40.737828,41.861483,39.469905,40.96202,38.493562,38.119258,42.749482,39.992137
Billionaire Bad Boy,46.440322,42.781625,43.842286,43.204486,40.329222,42.911651,37.591524,39.85096,42.079581,40.577614,...,37.776516,35.750288,34.638026,35.780716,35.298359,35.514808,34.780224,32.801982,36.317262,34.810955
Boss,34.136145,33.491892,34.784851,33.725797,32.663952,33.310819,32.757599,31.35275,32.028181,30.342221,...,27.957741,27.802987,28.345806,26.765989,28.832335,27.870195,27.525358,27.067696,24.931169,23.382269


Probability_Range for Cluster 0 in k=5:
{'min': {'Topic_72': 34.13614529628, 'Topic_70': 33.49189203235, 'Topic_201': 34.78485104286, 'Topic_102': 33.7257968922, 'Topic_175': 32.6639522352, 'Topic_0': 33.31081883046, 'Topic_117': 31.07165464403, 'Topic_129': 31.35274988212, 'Topic_36': 32.02818091595, 'Topic_12': 30.34222108434, 'Topic_167': 27.98395342215, 'Topic_14': 31.57795144472, 'Topic_157': 28.63633388383, 'Topic_178': 29.70123923564, 'Topic_51': 27.40977710702, 'Topic_83': 28.76524919775, 'Topic_84': 30.10412443487, 'Topic_10': 28.865999277, 'Topic_187': 27.16588696689, 'Topic_60': 27.30459030422, 'Topic_119': 27.95774137603, 'Topic_99': 27.8029867269, 'Topic_31': 27.38246105663, 'Topic_50': 26.7659888063, 'Topic_179': 28.83233493008, 'Topic_64': 27.85169497393, 'Topic_11': 27.5253581105, 'Topic_98': 27.06769566347, 'Topic_49': 24.93116858824, 'Topic_195': 23.38226935039}, 'max': {'Topic_72': 57.44719561313, 'Topic_70': 55.11222390215, 'Topic_201': 52.87698727382, 'Topic_102': 

Topic,Topic_72,Topic_70,Topic_201,Topic_102,Topic_117,Topic_175,Topic_129,Topic_0,Topic_14,Topic_36,...,Topic_115,Topic_60,Topic_187,Topic_11,Topic_50,Topic_31,Topic_179,Topic_64,Topic_98,Topic_128
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Long Time Coming,87.735891,89.64046,88.470986,86.409227,89.233251,83.728109,84.696569,79.474938,82.583982,79.086398,...,73.370752,67.879283,68.080449,69.666778,68.434361,66.606954,69.844535,68.470536,68.443,69.605533
A Not So Meet Cute,78.115778,77.12307,76.227428,77.896244,76.576213,73.723201,71.01608,71.655854,73.637646,70.049858,...,61.689096,61.721981,59.656805,62.815676,59.190119,60.841712,61.920565,60.8255,58.682914,60.735797
Blue Eyed Devil,78.029453,78.674587,75.679984,72.43025,71.957314,72.322244,73.388228,68.252744,69.82918,70.960031,...,65.554881,63.130958,60.814763,60.540235,61.63623,59.943694,61.663991,59.407181,57.637508,58.605908
Brooklynaire,66.769018,71.31031,69.729444,67.621515,67.205259,67.362755,65.614438,64.349374,65.35521,64.024782,...,56.123608,55.1055,56.554913,56.610357,53.704689,51.336611,55.127921,54.617966,61.246128,57.281107
Brutal Billionaire,73.254801,78.472603,73.669379,73.063784,70.723266,72.615748,70.83358,64.690243,69.988083,68.302479,...,63.047993,59.123202,57.758085,57.931885,57.247819,57.349795,58.528871,56.878062,59.243987,61.217073


Probability_Range for Cluster 1 in k=5:
{'min': {'Topic_72': 66.7690179022, 'Topic_70': 70.57257841241, 'Topic_201': 67.21159854517, 'Topic_102': 67.62151481264, 'Topic_117': 66.06062545573, 'Topic_175': 67.16201947271, 'Topic_129': 65.61443780191, 'Topic_0': 62.60177242983, 'Topic_14': 61.37780578657, 'Topic_36': 62.85726853198, 'Topic_12': 63.2735934156, 'Topic_178': 60.0637018391, 'Topic_84': 58.09270809869, 'Topic_51': 56.84301729863, 'Topic_157': 56.22084114617, 'Topic_167': 58.17296481693, 'Topic_83': 56.47294895954, 'Topic_10': 57.71011387603, 'Topic_99': 54.45530067876, 'Topic_119': 56.18500456908, 'Topic_115': 54.98460918436, 'Topic_60': 55.10550015266, 'Topic_187': 56.55491323126, 'Topic_11': 55.21731794447, 'Topic_50': 53.70468929226, 'Topic_31': 51.33661134906, 'Topic_179': 53.64854325332, 'Topic_64': 54.61796584246, 'Topic_98': 52.68929581618, 'Topic_128': 51.01283333587}, 'max': {'Topic_72': 87.73589120917, 'Topic_70': 89.64045975956, 'Topic_201': 88.4709859996, 'Topic_10

Topic,Topic_72,Topic_70,Topic_201,Topic_102,Topic_175,Topic_129,Topic_36,Topic_0,Topic_167,Topic_12,...,Topic_31,Topic_119,Topic_49,Topic_99,Topic_50,Topic_64,Topic_195,Topic_98,Topic_179,Topic_24
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
An Accidental Date with a Billionaire,29.594729,28.538562,27.594561,26.564325,26.399104,26.284801,26.769036,26.288002,25.029914,25.012327,...,21.884366,23.375049,23.532903,21.79973,23.251143,21.88468,22.954028,21.5452,21.947035,22.391492
Billionaire Beast,26.057682,24.261895,23.339244,23.631001,22.312019,22.408305,21.260771,20.933641,21.661014,21.550219,...,18.345376,18.904103,18.623624,17.583982,18.303481,17.500922,18.05895,17.106663,17.115754,19.338817
Billionaire Hero,34.5428,32.011485,29.96648,28.745322,29.149155,29.295114,26.364931,26.024816,27.032614,26.688924,...,23.728996,23.498518,24.218755,23.056892,22.728929,21.817518,25.023755,22.087032,21.548647,24.940947
Billionaire Protector,24.907794,23.736931,22.238328,21.435341,21.301015,21.987804,20.515792,20.002142,20.907426,20.407266,...,17.292545,17.655715,17.703179,17.326291,17.713676,16.966282,18.673938,16.918548,16.320352,20.483212
Carter Grayson,35.065839,36.679604,34.920514,30.609085,32.647306,31.292706,31.819703,30.125139,28.650437,32.240451,...,24.426867,24.00737,23.60888,24.094889,27.218057,24.88951,27.141516,27.655639,26.239948,25.085814


Probability_Range for Cluster 2 in k=5:
{'min': {'Topic_72': 6.6902022193, 'Topic_70': 5.96029475794, 'Topic_201': 6.0391171624, 'Topic_102': 7.2027280112, 'Topic_175': 5.8973335354, 'Topic_129': 5.875989118, 'Topic_36': 5.6227633015, 'Topic_0': 6.5209316176, 'Topic_167': 6.5901355198, 'Topic_12': 5.9276324582, 'Topic_117': 6.3728193862, 'Topic_14': 5.7331448988, 'Topic_51': 5.4680915485, 'Topic_157': 5.64153411886, 'Topic_83': 5.32849736646, 'Topic_10': 5.9203868564, 'Topic_60': 5.47602294465, 'Topic_178': 5.18784214015, 'Topic_187': 5.5543064092, 'Topic_84': 5.02738840396, 'Topic_31': 5.903578033, 'Topic_119': 5.5595529533, 'Topic_49': 5.2470400084, 'Topic_99': 5.20229171795, 'Topic_50': 5.12978856455, 'Topic_64': 5.65503210336, 'Topic_195': 4.3520545152, 'Topic_98': 4.77266021001, 'Topic_179': 5.2392391498, 'Topic_24': 5.25364928371}, 'max': {'Topic_72': 36.7621211663, 'Topic_70': 36.6796035038, 'Topic_201': 34.92051411986, 'Topic_102': 33.34814110219, 'Topic_175': 32.6473061453, 'T

Topic,Topic_117,Topic_102,Topic_0,Topic_70,Topic_201,Topic_72,Topic_129,Topic_175,Topic_14,Topic_36,...,Topic_99,Topic_68,Topic_51,Topic_179,Topic_157,Topic_73,Topic_31,Topic_106,Topic_52,Topic_133
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Billionaire's Christmas Bride,108.817997,107.057802,105.841517,102.955869,98.554079,98.020964,100.409149,98.985492,99.123465,97.316764,...,92.08932,92.596583,89.108042,89.264143,88.065528,91.285401,88.931692,90.284275,93.33545,88.106608
The Billionaire's Fake Wife,89.223509,86.520488,83.935525,84.386533,80.472241,79.632029,81.365516,82.271574,79.384505,79.076076,...,75.282352,75.638829,73.293841,72.559487,71.985576,72.124742,72.751718,72.762141,73.097412,71.741811
The Billionaire's Secret,114.721734,114.112702,110.74312,107.86176,103.110809,103.904531,106.219888,104.546611,102.716733,102.069216,...,94.76846,98.312488,94.768671,92.256756,93.379117,95.704003,95.701019,93.95602,97.612517,93.961619
The Stopover,98.855335,97.211921,98.803577,100.761172,98.892348,97.700626,94.681787,93.927488,96.476589,91.332969,...,80.962937,78.209747,80.664001,81.665243,81.124127,77.461875,77.865082,79.358589,74.005676,78.183528
The Takeover,100.133924,98.755546,100.231517,101.65402,101.054947,102.013082,97.187188,94.095525,95.666194,91.223609,...,80.699036,78.446376,80.938291,80.750235,81.414336,79.01258,80.054863,77.826849,74.444545,77.851128


Probability_Range for Cluster 3 in k=5:
{'min': {'Topic_117': 89.22350887143, 'Topic_102': 86.5204883794, 'Topic_0': 83.93552504704, 'Topic_70': 84.38653269954, 'Topic_201': 80.47224084414, 'Topic_72': 79.63202864223, 'Topic_129': 81.36551597921, 'Topic_175': 82.27157422965, 'Topic_14': 79.38450461529, 'Topic_36': 79.07607566407, 'Topic_12': 75.32274129505, 'Topic_178': 77.55937848559999, 'Topic_64': 79.27135050887, 'Topic_11': 74.7457025241, 'Topic_119': 76.12279174031, 'Topic_10': 76.11966118938, 'Topic_60': 75.97108893923, 'Topic_167': 71.47765543042, 'Topic_84': 72.68204003732, 'Topic_83': 74.35756479252, 'Topic_99': 75.28235164419, 'Topic_68': 75.6388288141, 'Topic_51': 73.29384125456, 'Topic_179': 72.55948656087, 'Topic_157': 71.98557558946, 'Topic_73': 72.12474245203, 'Topic_31': 72.75171797122, 'Topic_106': 72.76214059846, 'Topic_52': 73.09741174633, 'Topic_133': 71.74181118747}, 'max': {'Topic_117': 114.72173433419, 'Topic_102': 114.11270234707, 'Topic_0': 110.74312029796, 'To

Topic,Topic_72,Topic_70,Topic_201,Topic_102,Topic_175,Topic_129,Topic_0,Topic_117,Topic_36,Topic_12,...,Topic_60,Topic_99,Topic_31,Topic_50,Topic_49,Topic_11,Topic_64,Topic_179,Topic_98,Topic_115
Book Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ache for You,57.628415,56.880138,55.068065,58.180555,54.437852,55.985111,56.319151,56.039241,51.501959,51.596241,...,47.917208,48.661635,49.415486,46.686634,45.559387,49.823008,49.887579,46.727125,43.722991,45.005019
Banking the Billionaire,67.338625,65.578731,66.372643,65.588555,63.678955,60.742803,63.634949,64.081367,61.743543,61.22785,...,53.324409,55.598779,51.983287,54.627552,53.124814,54.169766,53.932228,52.671945,52.61848,49.379849
Beauty and the Billionaire,58.845881,52.948413,53.467553,53.812438,51.278201,49.456941,52.195326,45.897764,50.635772,49.541668,...,45.127404,42.31575,44.239503,43.384968,45.074004,42.107857,42.761932,42.097425,40.805066,38.788087
Between Commitment and Betrayal,64.770876,64.379214,59.978047,58.486863,59.239506,58.328255,53.18892,57.262236,55.193112,54.731581,...,49.072724,49.019952,47.58213,48.647688,48.297869,45.241027,46.280517,46.980806,47.883278,49.470807
Between Love and Loathing,65.652357,63.42817,61.95684,59.624659,58.216367,57.747023,54.926813,58.457895,56.894002,56.389985,...,47.548566,48.35128,48.217503,49.756389,48.565946,45.199277,45.947559,48.12884,47.002614,50.208413


Probability_Range for Cluster 4 in k=5:
{'min': {'Topic_72': 51.78232845106, 'Topic_70': 51.97685243855, 'Topic_201': 49.37901058366, 'Topic_102': 51.21786653832, 'Topic_175': 49.75909426403, 'Topic_129': 48.0332429687, 'Topic_0': 48.98195218858, 'Topic_117': 42.61638962508, 'Topic_36': 47.31505044821, 'Topic_12': 46.1489227469, 'Topic_14': 46.52930518972, 'Topic_167': 45.1353655735, 'Topic_178': 42.04481353904, 'Topic_51': 44.0876115043, 'Topic_83': 44.31722859836, 'Topic_157': 42.49347559358, 'Topic_84': 42.25994866496, 'Topic_10': 45.00354813667, 'Topic_187': 40.97237416936, 'Topic_119': 43.01622704679, 'Topic_60': 42.13119810984, 'Topic_99': 40.56145178004, 'Topic_31': 42.21945500717, 'Topic_50': 40.1105315662, 'Topic_49': 40.83120101438, 'Topic_11': 41.29993854204, 'Topic_64': 41.2614351248, 'Topic_179': 41.26676127121, 'Topic_98': 40.1209810964, 'Topic_115': 38.14369913799}, 'max': {'Topic_72': 75.1360006544, 'Topic_70': 70.92149856687, 'Topic_201': 68.24951752723, 'Topic_102': 7

In [None]:
# List all files in the output directory to verify saved results
print(f"\nFiles saved in the output directory '{output_dir}':")
for filename in os.listdir(output_dir):
    print(filename)


Files saved in the output directory '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs':
top_30_topics_k3_cluster0.csv
all_topics_k3_cluster0.csv
top_30_topics_k3_cluster1.csv
all_topics_k3_cluster1.csv
top_30_topics_k3_cluster2.csv
all_topics_k3_cluster2.csv
top_5_books_k3_cluster0.csv
all_books_k3_cluster0.csv
top_5_books_k3_cluster1.csv
all_books_k3_cluster1.csv
top_5_books_k3_cluster2.csv
all_books_k3_cluster2.csv
cluster_profiles_k3.csv
thematic_spread_k3.csv
cluster_assignments_k3.csv
top_30_topics_k4_cluster0.csv
all_topics_k4_cluster0.csv
top_30_topics_k4_cluster1.csv
all_topics_k4_cluster1.csv
top_30_topics_k4_cluster2.csv
all_topics_k4_cluster2.csv
top_30_topics_k4_cluster3.csv
all_topics_k4_cluster3.csv
top_5_books_k4_cluster0.csv
all_books_k4_cluster0.csv
top_5_books_k4_cluster1.csv
all_books_k4_cluster1.csv
top_5_books_k4_cluster2.csv
all_books_k4_cluster2.csv
top_5_books_k4_cluster3.csv
all_books_k4_cluster3.csv
cluster_profiles_k

In [None]:
# Corrected Verification of Cluster Assignments Using Merge

for k in cluster_range:
    print(f"\nVerifying cluster assignments for k={k}...")

    # Merge df and df_pivot on 'Book Title' to align cluster labels
    merged_df = df.merge(
        df_pivot[[f'Cluster_{k}']].reset_index(),  # Reset index to have 'Book Title' as a column
        on='Book Title',
        how='left',
        suffixes=('', '_pivot')
    )

    # Check for any missing cluster assignments after merge
    missing_assignments = merged_df[f'Cluster_{k}_pivot'].isnull().sum()
    if missing_assignments > 0:
        print(f"Warning: {missing_assignments} books in df do not have cluster assignments in df_pivot for k={k}.")

    # Compare the cluster labels
    consistency = (merged_df[f'Cluster_{k}'] == merged_df[f'Cluster_{k}_pivot']).all()

    if consistency:
        print(f"✔️ Cluster assignments for k={k} are consistent between df_pivot and original df.")
    else:
        # Identify discrepancies
        discrepancies = merged_df[merged_df[f'Cluster_{k}'] != merged_df[f'Cluster_{k}_pivot']]
        print(f"❌ Cluster assignments for k={k} are NOT consistent between df_pivot and original df.")
        print(f"Number of discrepancies: {discrepancies.shape[0]}")
        print("Sample discrepancies:")
        display(discrepancies[['Book Title', f'Cluster_{k}', f'Cluster_{k}_pivot']].head())


Verifying cluster assignments for k=3...
✔️ Cluster assignments for k=3 are consistent between df_pivot and original df.

Verifying cluster assignments for k=4...
✔️ Cluster assignments for k=4 are consistent between df_pivot and original df.

Verifying cluster assignments for k=5...
✔️ Cluster assignments for k=5 are consistent between df_pivot and original df.


In [None]:
# Check for duplicate 'Book Title's in df
duplicate_books_df = df[df.duplicated(subset=['Book Title'], keep=False)]
if not duplicate_books_df.empty:
    print("Duplicate 'Book Title's found in df:")
    display(duplicate_books_df.head())
else:
    print("No duplicate 'Book Title's found in df.")

# Check for duplicate 'Book Title's in df_pivot
duplicate_books_pivot = df_pivot[df_pivot.index.duplicated(keep=False)]
if not duplicate_books_pivot.empty:
    print("Duplicate 'Book Title's found in df_pivot:")
    display(duplicate_books_pivot.head())
else:
    print("No duplicate 'Book Title's found in df_pivot.")

Duplicate 'Book Title's found in df:


Unnamed: 0,Book Title,Topic,Topic_Label,Probability,Cluster_3,Cluster_4,Cluster_5
0,A Long Time Coming,Topic_0,Expressions of Joy and Amusement,79.474937,1,1,1
1,A Not So Meet Cute,Topic_0,Expressions of Joy and Amusement,71.655853,1,1,1
2,Ache for You,Topic_0,Expressions of Joy and Amusement,56.319153,0,1,4
3,An Accidental Date with a Billionaire,Topic_0,Expressions of Joy and Amusement,26.288002,2,2,2
4,Bad Boss,Topic_0,Expressions of Joy and Amusement,43.87431,0,0,0


No duplicate 'Book Title's found in df_pivot.


In [None]:
# Verify all expected output files are present
expected_files = []
for k in cluster_range:
    for cluster in range(k):
        expected_files.extend([
            f'top_30_topics_k{k}_cluster{cluster}.csv',
            f'all_topics_k{k}_cluster{cluster}.csv',
            f'top_5_books_k{k}_cluster{cluster}.csv',
            f'all_books_k{k}_cluster{cluster}.csv'
        ])
    expected_files.extend([
        f'cluster_profiles_k{k}.csv',
        f'thematic_spread_k{k}.csv',
        f'cluster_assignments_k{k}.csv'
    ])

print("\nVerifying presence of all expected output files:")
for file in expected_files:
    file_path = os.path.join(output_dir, file)
    if os.path.exists(file_path):
        print(f"✔️ {file} exists.")
    else:
        print(f"❌ {file} is missing.")


Verifying presence of all expected output files:
✔️ top_30_topics_k3_cluster0.csv exists.
✔️ all_topics_k3_cluster0.csv exists.
✔️ top_5_books_k3_cluster0.csv exists.
✔️ all_books_k3_cluster0.csv exists.
✔️ top_30_topics_k3_cluster1.csv exists.
✔️ all_topics_k3_cluster1.csv exists.
✔️ top_5_books_k3_cluster1.csv exists.
✔️ all_books_k3_cluster1.csv exists.
✔️ top_30_topics_k3_cluster2.csv exists.
✔️ all_topics_k3_cluster2.csv exists.
✔️ top_5_books_k3_cluster2.csv exists.
✔️ all_books_k3_cluster2.csv exists.
✔️ cluster_profiles_k3.csv exists.
✔️ thematic_spread_k3.csv exists.
✔️ cluster_assignments_k3.csv exists.
✔️ top_30_topics_k4_cluster0.csv exists.
✔️ all_topics_k4_cluster0.csv exists.
✔️ top_5_books_k4_cluster0.csv exists.
✔️ all_books_k4_cluster0.csv exists.
✔️ top_30_topics_k4_cluster1.csv exists.
✔️ all_topics_k4_cluster1.csv exists.
✔️ top_5_books_k4_cluster1.csv exists.
✔️ all_books_k4_cluster1.csv exists.
✔️ top_30_topics_k4_cluster2.csv exists.
✔️ all_topics_k4_cluster2.c

In [96]:
# Comparative Analysis

# Initialize DataFrames for comparative analysis
comparative_profiles = pd.DataFrame()
comparative_thematic = pd.DataFrame()

for k in cluster_range:
    # Load Cluster Profiles
    profiles_path = os.path.join(output_dir, f'cluster_profiles_k{k}.csv')
    profiles_df = pd.read_csv(profiles_path)
    profiles_df['k'] = k
    comparative_profiles = pd.concat([comparative_profiles, profiles_df], ignore_index=True)

    # Load Thematic Spread
    thematic_path = os.path.join(output_dir, f'thematic_spread_k{k}.csv')
    thematic_df = pd.read_csv(thematic_path)
    thematic_df['k'] = k
    comparative_thematic = pd.concat([comparative_thematic, thematic_df], ignore_index=True)

# Save Comparative Profiles
comparative_profiles_output = os.path.join(output_dir, 'comparative_cluster_profiles.csv')
comparative_profiles.to_csv(comparative_profiles_output, index=False)
print(f"Saved comparative cluster profiles to '{comparative_profiles_output}'.")
print(f"Sample comparative cluster profiles:")
display(comparative_profiles.head())

# Save Comparative Thematic Spread
comparative_thematic_output = os.path.join(output_dir, 'comparative_thematic_spread.csv')
comparative_thematic.to_csv(comparative_thematic_output, index=False)
print(f"Saved comparative thematic spread to '{comparative_thematic_output}'.")
print(f"Sample comparative thematic spread:")
display(comparative_thematic.head())


Saved comparative cluster profiles to '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/comparative_cluster_profiles.csv'.
Sample comparative cluster profiles:


Unnamed: 0,Cluster,Topic,Topic_Label,Mean_Probability,Std_Deviation,Cluster_Size,Cluster_Percentage,k
0,0,Topic_72,Love and Affection,56.95968,7.831732,55,52.380952,3
1,0,Topic_70,Leaving and Letting Go,55.085835,7.906873,55,52.380952,3
2,0,Topic_201,Happiness and Gratitude,53.9773,7.330659,55,52.380952,3
3,0,Topic_102,Pleasure and Seduction,53.503047,7.149895,55,52.380952,3
4,0,Topic_175,Moments and Realizations,52.148115,7.278969,55,52.380952,3


Saved comparative thematic spread to '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/comparative_thematic_spread.csv'.
Sample comparative thematic spread:


Unnamed: 0,Cluster,Topic,Min_Probability,Max_Probability,Median_Probability,Std_Deviation,k
0,0,Topic_72,42.246764,75.136001,57.463841,7.831732,3
1,0,Topic_70,42.079282,70.921499,53.589912,7.906873,3
2,0,Topic_201,41.231069,68.249518,53.14141,7.330659,3
3,0,Topic_102,41.48198,70.952675,52.413939,7.149895,3
4,0,Topic_175,39.748322,65.05258,51.047829,7.278969,3


# Stat

In [99]:
comparative_profiles = pd.read_csv('/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/comparative_cluster_profiles.csv')

In [100]:
cluster_sizes = comparative_profiles[['k', 'Cluster', 'Cluster_Size', 'Cluster_Percentage']].drop_duplicates()
print("Cluster Sizes and Percentages:")
display(cluster_sizes)

Cluster Sizes and Percentages:


Unnamed: 0,k,Cluster,Cluster_Size,Cluster_Percentage
0,3,0,55,52.380952
30,3,1,20,19.047619
60,3,2,30,28.571429
90,4,0,42,40.0
120,4,1,31,29.52381
150,4,2,26,24.761905
180,4,3,6,5.714286
210,5,0,30,28.571429
240,5,1,15,14.285714
270,5,2,24,22.857143


In [101]:
for k in sorted(comparative_profiles['k'].unique()):
    print(f"\nTop 30 Topics for k={k}:")
    clusters_k = comparative_profiles[comparative_profiles['k'] == k]
    for cluster in sorted(clusters_k['Cluster'].unique()):
        print(f"\nCluster {cluster}:")
        cluster_data = clusters_k[clusters_k['Cluster'] == cluster]
        # Sort topics by mean probability descending
        cluster_data = cluster_data.sort_values(by='Mean_Probability', ascending=False)
        # Display the top 30 topics
        display(cluster_data[['Topic', 'Topic_Label', 'Mean_Probability', 'Std_Deviation']].head(30))


Top 30 Topics for k=3:

Cluster 0:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
0,Topic_72,Love and Affection,56.95968,7.831732
1,Topic_70,Leaving and Letting Go,55.085835,7.906873
2,Topic_201,Happiness and Gratitude,53.9773,7.330659
3,Topic_102,Pleasure and Seduction,53.503047,7.149895
4,Topic_175,Moments and Realizations,52.148115,7.278969
5,Topic_129,Breaking and Letting Go,51.07328,7.271985
6,Topic_0,Expressions of Joy and Amusement,50.83648,6.574115
7,Topic_117,Possession and Desire,50.48275,7.913357
8,Topic_36,Nods and Agreement,50.172346,6.841607
9,Topic_12,Nightlife and Events,49.672491,6.715273



Cluster 1:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
30,Topic_70,Leaving and Letting Go,83.676551,11.296283
31,Topic_72,Love and Affection,83.332764,10.283198
32,Topic_102,Pleasure and Seduction,81.832056,13.278993
33,Topic_117,Possession and Desire,81.32274,14.630324
34,Topic_201,Happiness and Gratitude,81.109163,11.417902
35,Topic_129,Breaking and Letting Go,79.294983,11.628158
36,Topic_175,Moments and Realizations,79.11932,11.056705
37,Topic_0,Expressions of Joy and Amusement,77.696095,14.997231
38,Topic_14,Conversations and Communication,76.266857,13.108001
39,Topic_36,Nods and Agreement,75.582944,11.561559



Cluster 2:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
60,Topic_72,Love and Affection,27.058034,10.25913
61,Topic_70,Leaving and Letting Go,25.838168,9.624095
62,Topic_201,Happiness and Gratitude,25.287094,9.55792
63,Topic_102,Pleasure and Seduction,25.038694,9.090026
64,Topic_175,Moments and Realizations,24.424998,8.826892
65,Topic_129,Breaking and Letting Go,23.727007,8.857014
66,Topic_0,Expressions of Joy and Amusement,23.686636,8.977523
67,Topic_36,Nods and Agreement,23.548015,8.900502
68,Topic_12,Nightlife and Events,22.952962,8.421589
69,Topic_167,Romance and Feelings,22.890218,8.431321



Top 30 Topics for k=4:

Cluster 0:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
90,Topic_72,Love and Affection,52.140778,6.914671
91,Topic_70,Leaving and Letting Go,49.863856,6.359648
92,Topic_201,Happiness and Gratitude,49.069435,5.970205
93,Topic_102,Pleasure and Seduction,48.796419,5.746704
94,Topic_175,Moments and Realizations,47.298992,6.029522
95,Topic_0,Expressions of Joy and Amusement,46.494164,5.455808
96,Topic_129,Breaking and Letting Go,46.224954,5.823022
97,Topic_36,Nods and Agreement,45.80114,5.824434
98,Topic_117,Possession and Desire,45.257121,5.855231
99,Topic_12,Nightlife and Events,45.040474,5.4332



Cluster 1:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
120,Topic_72,Love and Affection,71.15106,8.486287
121,Topic_70,Leaving and Letting Go,70.337544,8.242015
122,Topic_201,Happiness and Gratitude,68.255702,8.17775
123,Topic_102,Pleasure and Seduction,67.606925,8.18116
124,Topic_175,Moments and Realizations,66.298751,7.780298
125,Topic_117,Possession and Desire,65.911684,8.869083
126,Topic_129,Breaking and Letting Go,65.719156,7.873887
127,Topic_0,Expressions of Joy and Amusement,63.27234,6.824294
128,Topic_36,Nods and Agreement,62.986537,7.316735
129,Topic_12,Nightlife and Events,62.828011,7.053345



Cluster 2:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
150,Topic_72,Love and Affection,24.929721,9.272187
151,Topic_70,Leaving and Letting Go,23.837872,8.710651
152,Topic_201,Happiness and Gratitude,23.292147,8.633838
153,Topic_102,Pleasure and Seduction,22.942474,7.843479
154,Topic_175,Moments and Realizations,22.549413,7.911819
155,Topic_129,Breaking and Letting Go,21.793624,7.848433
156,Topic_36,Nods and Agreement,21.627087,7.90558
157,Topic_0,Expressions of Joy and Amusement,21.622036,7.750614
158,Topic_12,Nightlife and Events,21.139537,7.520903
159,Topic_167,Romance and Feelings,21.021522,7.42252



Cluster 3:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
180,Topic_117,Possession and Desire,99.800345,10.764961
181,Topic_102,Pleasure and Seduction,98.116822,11.33534
182,Topic_0,Expressions of Joy and Amusement,97.359505,10.994203
183,Topic_70,Leaving and Letting Go,97.311078,9.627046
184,Topic_72,Love and Affection,94.568396,9.575648
185,Topic_201,Happiness and Gratitude,94.518129,9.374493
186,Topic_129,Breaking and Letting Go,93.560699,10.156097
187,Topic_14,Conversations and Communication,92.538942,9.583062
188,Topic_175,Moments and Realizations,92.529828,9.172272
189,Topic_36,Nods and Agreement,89.840591,9.644247



Top 30 Topics for k=5:

Cluster 0:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
210,Topic_72,Love and Affection,47.742296,5.61331
211,Topic_70,Leaving and Letting Go,46.04649,5.221198
212,Topic_201,Happiness and Gratitude,45.400268,4.875763
213,Topic_102,Pleasure and Seduction,45.026092,4.92872
214,Topic_175,Moments and Realizations,43.393745,4.785887
215,Topic_0,Expressions of Joy and Amusement,42.780008,4.129971
216,Topic_117,Possession and Desire,42.46376,5.074796
217,Topic_129,Breaking and Letting Go,42.440929,4.662444
218,Topic_36,Nods and Agreement,42.027527,4.601993
219,Topic_12,Nightlife and Events,41.542053,4.579872



Cluster 1:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
240,Topic_72,Love and Affection,79.025603,6.111105
241,Topic_70,Leaving and Letting Go,78.394111,5.566207
242,Topic_201,Happiness and Gratitude,76.006589,6.456326
243,Topic_102,Pleasure and Seduction,75.532177,6.162501
244,Topic_117,Possession and Desire,74.313486,7.244603
245,Topic_175,Moments and Realizations,73.903981,5.477553
246,Topic_129,Breaking and Letting Go,73.735743,5.165542
247,Topic_0,Expressions of Joy and Amusement,70.29111,6.405034
248,Topic_14,Conversations and Communication,70.131311,6.986685
249,Topic_36,Nods and Agreement,70.042684,5.348757



Cluster 2:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
270,Topic_72,Love and Affection,23.908134,8.859
271,Topic_70,Leaving and Letting Go,22.774518,8.142222
272,Topic_201,Happiness and Gratitude,22.141736,7.912488
273,Topic_102,Pleasure and Seduction,21.962669,7.332418
274,Topic_175,Moments and Realizations,21.590848,7.443665
275,Topic_129,Breaking and Letting Go,20.859708,7.408546
276,Topic_36,Nods and Agreement,20.610543,7.323531
277,Topic_0,Expressions of Joy and Amusement,20.556591,7.056362
278,Topic_167,Romance and Feelings,20.214329,7.099354
279,Topic_12,Nightlife and Events,20.187967,6.983725



Cluster 3:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
300,Topic_117,Possession and Desire,102.3505,9.802082
301,Topic_102,Pleasure and Seduction,100.731692,10.456139
302,Topic_0,Expressions of Joy and Amusement,99.911051,10.112502
303,Topic_70,Leaving and Letting Go,99.523871,8.895314
304,Topic_201,Happiness and Gratitude,96.416885,9.100102
305,Topic_72,Love and Affection,96.254246,9.659225
306,Topic_129,Breaking and Letting Go,95.972706,9.235776
307,Topic_175,Moments and Realizations,94.765338,8.226904
308,Topic_14,Conversations and Communication,94.673497,8.978944
309,Topic_36,Nods and Agreement,92.203727,8.624458



Cluster 4:


Unnamed: 0,Topic,Topic_Label,Mean_Probability,Std_Deviation
330,Topic_72,Love and Affection,62.530946,4.966763
331,Topic_70,Leaving and Letting Go,60.544606,5.709485
332,Topic_201,Happiness and Gratitude,59.159826,5.006449
333,Topic_102,Pleasure and Seduction,58.578761,4.805507
334,Topic_175,Moments and Realizations,57.448501,4.687281
335,Topic_129,Breaking and Letting Go,56.354186,4.703318
336,Topic_0,Expressions of Joy and Amusement,55.801518,3.778632
337,Topic_117,Possession and Desire,55.229289,7.074059
338,Topic_36,Nods and Agreement,55.175505,4.09522
339,Topic_12,Nightlife and Events,54.509775,4.499091


# Statistical Analysis

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
import pandas as pd

print(f"Pandas version: {pd.__version__}")

Pandas version: 2.2.2


In [None]:
# Comparative Metrics Across Clusters - Shared Themes Using Pearson Correlation

from scipy.stats import pearsonr
import pandas as pd
import os

# Initialize a list to store Pearson correlation results
correlation_results = []

for k in cluster_range:
    print(f"\nCalculating Pearson Correlation for shared themes in k={k}...")

    # Load cluster profiles for k
    profiles_path = os.path.join(output_dir, f'cluster_profiles_k{k}.csv')
    profiles_df = pd.read_csv(profiles_path)

    # Select only the necessary columns to avoid including grouping columns
    profiles_subset = profiles_df[['Cluster', 'Topic', 'Mean_Probability']]

    # Get the top 30 topics for each cluster
    top_n = 30
    # Remove 'include_groups=False' to avoid the TypeError
    top_topics_df = profiles_subset.groupby('Cluster', group_keys=False).apply(
        lambda x: x.nlargest(top_n, 'Mean_Probability')
    )

    # Get the list of unique clusters
    clusters = top_topics_df['Cluster'].unique()

    # Iterate through each unique pair of clusters
    for i in range(len(clusters)):
        for j in range(i+1, len(clusters)):
            cluster_a = clusters[i]
            cluster_b = clusters[j]

            # Extract mean probabilities for the top 30 topics in each cluster
            topics_a = top_topics_df[top_topics_df['Cluster'] == cluster_a].set_index('Topic')['Mean_Probability']
            topics_b = top_topics_df[top_topics_df['Cluster'] == cluster_b].set_index('Topic')['Mean_Probability']

            # Find common topics between the two clusters
            common_topics = topics_a.index.intersection(topics_b.index)

            if len(common_topics) < 2:
                print(f"Not enough common topics between Cluster {cluster_a} and Cluster {cluster_b} for k={k}. Skipping...")
                continue

            # Extract the probabilities for the common topics
            prob_a = topics_a[common_topics]
            prob_b = topics_b[common_topics]

            # Calculate Pearson correlation
            corr_coef, p_val = pearsonr(prob_a, prob_b)

            # Append the results to the list
            correlation_results.append({
                'k': k,
                'Cluster_A': cluster_a,
                'Cluster_B': cluster_b,
                'Pearson_Correlation': corr_coef,
                'P_Value': p_val
            })

    # Display the correlations for current k
    correlations_df = pd.DataFrame([res for res in correlation_results if res['k'] == k])
    print(f"\nPearson Correlation Coefficients for k={k}:")
    display(correlations_df[['Cluster_A', 'Cluster_B', 'Pearson_Correlation', 'P_Value']])

# Convert the complete list to a DataFrame
shared_themes_df = pd.DataFrame(correlation_results)

# Save the shared themes correlations to CSV
shared_themes_output = os.path.join(output_dir, 'shared_themes_pearson_correlation.csv')
shared_themes_df.to_csv(shared_themes_output, index=False)
print(f"\nSaved shared themes Pearson correlation coefficients to '{shared_themes_output}'.")

# Additional Check: Verify the contents of the CSV
print("\nVerifying the contents of 'shared_themes_pearson_correlation.csv':")
shared_themes_preview = pd.read_csv(shared_themes_output).head()
display(shared_themes_preview)


Calculating Pearson Correlation for shared themes in k=3...

Pearson Correlation Coefficients for k=3:


Unnamed: 0,Cluster_A,Cluster_B,Pearson_Correlation,P_Value
0,0,1,0.961984,9.523581000000001e-17
1,0,2,0.991741,1.2697070000000001e-25
2,1,2,0.927417,1.306236e-12



Calculating Pearson Correlation for shared themes in k=4...

Pearson Correlation Coefficients for k=4:


Unnamed: 0,Cluster_A,Cluster_B,Pearson_Correlation,P_Value
0,0,1,0.973841,6.570738e-19
1,0,2,0.994438,6.411918e-29
2,0,3,0.835185,1.113465e-07
3,1,2,0.964234,4.235807e-17
4,1,3,0.876965,4.180925e-09
5,2,3,0.797634,1.062801e-06



Calculating Pearson Correlation for shared themes in k=5...

Pearson Correlation Coefficients for k=5:


Unnamed: 0,Cluster_A,Cluster_B,Pearson_Correlation,P_Value
0,0,1,0.969128,2.470889e-17
1,0,2,0.981609,5.912423e-21
2,0,3,0.828189,3.213007e-07
3,0,4,0.995526,3.308764e-29
4,1,2,0.931587,1.73836e-12
5,1,3,0.861084,3.306805e-08
6,1,4,0.970265,3.627208e-18
7,2,3,0.747472,2.696304e-05
8,2,4,0.985348,1.678187e-21
9,3,4,0.822111,4.6426e-07



Saved shared themes Pearson correlation coefficients to '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/shared_themes_pearson_correlation.csv'.

Verifying the contents of 'shared_themes_pearson_correlation.csv':


Unnamed: 0,k,Cluster_A,Cluster_B,Pearson_Correlation,P_Value
0,3,0,1,0.961984,9.523581000000001e-17
1,3,0,2,0.991741,1.2697070000000001e-25
2,3,1,2,0.927417,1.306236e-12
3,4,0,1,0.973841,6.570738e-19
4,4,0,2,0.994438,6.411918e-29


In [None]:
from statsmodels.stats.multitest import multipletests
import numpy as np

# Load the shared themes Pearson correlation data
shared_themes_path = os.path.join(output_dir, 'shared_themes_pearson_correlation.csv')
shared_themes_df = pd.read_csv(shared_themes_path)

# Initialize a column for adjusted p-values
shared_themes_df['Adjusted_P_Value'] = np.nan

# Apply Benjamini-Hochberg correction per k
for k in cluster_range:
    mask = shared_themes_df['k'] == k
    pvals = shared_themes_df.loc[mask, 'P_Value']

    # Check if there are p-values to adjust
    if pvals.empty:
        print(f"No p-values to adjust for k={k}.")
        continue

    # Apply Benjamini-Hochberg correction
    reject, pvals_corrected, _, _ = multipletests(pvals, method='fdr_bh')

    # Assign the corrected p-values back to the DataFrame
    shared_themes_df.loc[mask, 'Adjusted_P_Value'] = pvals_corrected

# Save the corrected correlations to CSV
corrected_shared_themes_output = os.path.join(output_dir, 'shared_themes_pearson_correlation_corrected.csv')
shared_themes_df.to_csv(corrected_shared_themes_output, index=False)
print(f"Saved corrected shared themes Pearson correlation coefficients to '{corrected_shared_themes_output}'.")

# Additional Check: Verify the contents of the corrected CSV
print("\nVerifying the contents of 'shared_themes_pearson_correlation_corrected.csv':")
corrected_shared_themes_preview = pd.read_csv(corrected_shared_themes_output).head()
display(corrected_shared_themes_preview)

Saved corrected shared themes Pearson correlation coefficients to '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/shared_themes_pearson_correlation_corrected.csv'.

Verifying the contents of 'shared_themes_pearson_correlation_corrected.csv':


Unnamed: 0,k,Cluster_A,Cluster_B,Pearson_Correlation,P_Value,Adjusted_P_Value
0,3,0,1,0.961984,9.523581000000001e-17,1.428537e-16
1,3,0,2,0.991741,1.2697070000000001e-25,3.809121e-25
2,3,1,2,0.927417,1.306236e-12,1.306236e-12
3,4,0,1,0.973841,6.570738e-19,1.971221e-18
4,4,0,2,0.994438,6.411918e-29,3.847151e-28


In [None]:
# Calculating Silhouette Scores for Each k

from sklearn.metrics import silhouette_score
import pandas as pd
import os
import numpy as np

# Initialize a list to store silhouette scores
silhouette_results = []

for k in cluster_range:
    print(f"\nCalculating Silhouette Score for k={k}...")

    # Load cluster assignments
    cluster_assignments_path = os.path.join(output_dir, f'cluster_assignments_k{k}.csv')
    cluster_assignments_df = pd.read_csv(cluster_assignments_path)

    # Extract cluster labels
    cluster_labels = cluster_assignments_df['Cluster']

    # Calculate silhouette score
    try:
        score = silhouette_score(df_scaled, cluster_labels)
        silhouette_results.append({
            'k': k,
            'Silhouette_Score': score
        })
        print(f"Silhouette Score for k={k}: {score:.4f}")
    except Exception as e:
        print(f"Error calculating silhouette score for k={k}: {e}")
        silhouette_results.append({
            'k': k,
            'Silhouette_Score': np.nan
        })

# Convert the list to a DataFrame
silhouette_scores_df = pd.DataFrame(silhouette_results)

# Save the silhouette scores to CSV
silhouette_scores_output = os.path.join(output_dir, 'silhouette_scores.csv')
silhouette_scores_df.to_csv(silhouette_scores_output, index=False)
print(f"\nSaved silhouette scores to '{silhouette_scores_output}'.")

# Additional Check: Verify the contents of the silhouette scores CSV
print("\nVerifying the contents of 'silhouette_scores.csv':")
silhouette_scores_preview = pd.read_csv(silhouette_scores_output).head()
display(silhouette_scores_preview)


Calculating Silhouette Score for k=3...
Silhouette Score for k=3: 0.5331

Calculating Silhouette Score for k=4...
Silhouette Score for k=4: 0.4907

Calculating Silhouette Score for k=5...
Silhouette Score for k=5: 0.4878

Saved silhouette scores to '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/silhouette_scores.csv'.

Verifying the contents of 'silhouette_scores.csv':


Unnamed: 0,k,Silhouette_Score
0,3,0.533099
1,4,0.49074
2,5,0.48784


In [None]:
# Function to verify CSV file contents
def verify_csv(file_path, expected_columns, num_rows=5):
    """
    Verifies the contents of a CSV file.

    Parameters:
    - file_path (str): Path to the CSV file.
    - expected_columns (list): List of expected column names.
    - num_rows (int): Number of rows to display for verification.

    Returns:
    - None
    """
    if not os.path.exists(file_path):
        print(f"File '{file_path}' does not exist.")
        return

    df = pd.read_csv(file_path)
    actual_columns = df.columns.tolist()

    # Check for expected columns
    missing_columns = set(expected_columns) - set(actual_columns)
    if missing_columns:
        print(f"File '{file_path}' is missing columns: {missing_columns}")
    else:
        print(f"All expected columns are present in '{file_path}'.")

    # Display the first few rows
    print(f"\nFirst {num_rows} rows of '{file_path}':")
    display(df.head(num_rows))

# Example Usage:

# Verify 'shared_themes_pearson_correlation.csv'
verify_csv(
    file_path=os.path.join(output_dir, 'shared_themes_pearson_correlation.csv'),
    expected_columns=['k', 'Cluster_A', 'Cluster_B', 'Pearson_Correlation', 'P_Value']
)

# Verify 'shared_themes_pearson_correlation_corrected.csv'
verify_csv(
    file_path=os.path.join(output_dir, 'shared_themes_pearson_correlation_corrected.csv'),
    expected_columns=['k', 'Cluster_A', 'Cluster_B', 'Pearson_Correlation', 'P_Value', 'Adjusted_P_Value']
)

# Verify 'silhouette_scores.csv'
verify_csv(
    file_path=os.path.join(output_dir, 'silhouette_scores.csv'),
    expected_columns=['k', 'Silhouette_Score']
)

All expected columns are present in '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/shared_themes_pearson_correlation.csv'.

First 5 rows of '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/shared_themes_pearson_correlation.csv':


Unnamed: 0,k,Cluster_A,Cluster_B,Pearson_Correlation,P_Value
0,3,0,1,0.961984,9.523581000000001e-17
1,3,0,2,0.991741,1.2697070000000001e-25
2,3,1,2,0.927417,1.306236e-12
3,4,0,1,0.973841,6.570738e-19
4,4,0,2,0.994438,6.411918e-29


All expected columns are present in '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/shared_themes_pearson_correlation_corrected.csv'.

First 5 rows of '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/shared_themes_pearson_correlation_corrected.csv':


Unnamed: 0,k,Cluster_A,Cluster_B,Pearson_Correlation,P_Value,Adjusted_P_Value
0,3,0,1,0.961984,9.523581000000001e-17,1.428537e-16
1,3,0,2,0.991741,1.2697070000000001e-25,3.809121e-25
2,3,1,2,0.927417,1.306236e-12,1.306236e-12
3,4,0,1,0.973841,6.570738e-19,1.971221e-18
4,4,0,2,0.994438,6.411918e-29,3.847151e-28


All expected columns are present in '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/silhouette_scores.csv'.

First 5 rows of '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/silhouette_scores.csv':


Unnamed: 0,k,Silhouette_Score
0,3,0.533099
1,4,0.49074
2,5,0.48784


In [None]:
import pandas as pd
import os

# Define output directory
output_dir = '/content/drive/MyDrive/BERTTopic_Models/paraphrase-MiniLM-L6-v2/results/clustering_outputs/'

# Specify a k value to inspect
k = 3
cluster_profiles_path = os.path.join(output_dir, f'cluster_profiles_k{k}.csv')

# Load the CSV
cluster_profiles_df = pd.read_csv(cluster_profiles_path)

# Display the first few rows
display(cluster_profiles_df.head())

Unnamed: 0,Cluster,Topic,Topic_Label,Mean_Probability,Std_Deviation,Cluster_Size,Cluster_Percentage
0,0,Topic_72,Love and Affection,56.95968,7.831732,55,52.380952
1,0,Topic_70,Leaving and Letting Go,55.085835,7.906873,55,52.380952
2,0,Topic_201,Happiness and Gratitude,53.9773,7.330659,55,52.380952
3,0,Topic_102,Pleasure and Seduction,53.503047,7.149895,55,52.380952
4,0,Topic_175,Moments and Realizations,52.148115,7.278969,55,52.380952
