## Topic Modeling - BERTopic

### Libraries

In [None]:
import os, sys
import pandas as pd
import numpy as np
import torch 
import math
import re
import seaborn as sns

import matplotlib.pyplot as plt

from transformers import *
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, models
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN

from sklearn.preprocessing import normalize

from sklearn.manifold import trustworthiness
from sklearn.metrics import silhouette_score

from sklearn.feature_extraction.text import CountVectorizer
import unicodedata 

sys.path.append(os.path.dirname(os.path.abspath('..')))
from utils.text_analysis_functions import data_cleaning
from utils.modeling_helpers import split_text_natural_or_equal, clean_text, get_topic_words, summarize_doc

### Initialization

In [None]:
# Checkpoint whose fast tokenizer is handed to the chunk splitter in the
# preprocessing cell.
tokenizer_checkpoint = "nlpaueb/bert-base-greek-uncased-v1"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, use_fast=True)

# Helper object that clean_text() receives for every document.
cleaning_object = data_cleaning()

### Data

In [4]:
# Directory three levels above the current working directory; it is expected
# to contain the working_data/ folder.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

# os.path.join uses the platform's separator. The previous hard-coded "\\"
# only produced valid paths on Windows.
data_path = os.path.join(project_root, "working_data", "transformed_dataset.csv")
embeddings_path = os.path.join(project_root, "working_data", "my_data_embeddings.npy")

In [None]:
# Read the transformed dataset, keep only the columns used below, and preview
# the first three rows.
wanted_columns = ["text", "word_count", "period"]
data = pd.read_csv(data_path)[wanted_columns]
data.head(3)

### Preprocessing

In [None]:
def _clean(txt):
    """Run the shared cleaning helper on one raw document."""
    return clean_text(cleaning_object, txt)


def _to_chunks(txt):
    """Split one cleaned document with the tokenizer and max_length=512."""
    return split_text_natural_or_equal(tokenizer, txt, max_length=512)


# Remember the source row so every chunk can be traced back to its document.
data["doc_id"] = data.index
data["text_clean"] = data["text"].apply(_clean)
data["chunks"] = data["text_clean"].apply(_to_chunks)

# One row per chunk. chunk_id is assigned before the short-chunk filter, so
# the ids are not contiguous once rows have been dropped.
data_exploded = data.explode("chunks").reset_index(drop=True)
data_exploded["chunk_id"] = data_exploded.index

# Drop chunks with fewer than three whitespace-separated words.
words_per_chunk = data_exploded["chunks"].str.split().str.len()
mask = words_per_chunk >= 3
data_exploded = data_exploded.loc[mask].reset_index(drop=True)

# Persist the chunk table next to the notebook.
data_exploded.to_pickle("exploded_chunks.pkl")

final_chunks = data_exploded["chunks"].to_list()

In [9]:
# Number of chunks that survived the minimum-length filter.
len(final_chunks)

20288

In [10]:
# Reuse the embeddings cached at embeddings_path; the Step 01 cells below
# recompute them and write them back to the same file.
embeddings = np.load(embeddings_path)

### Step 01 - Create Embeddings

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Using device:", device)

In [None]:
# Display the CUDA version string reported by the installed PyTorch build.
torch.version.cuda

In [None]:
# Load the Greek BERT checkpoint as a sentence encoder on the selected device.
transformer_sentence_model = SentenceTransformer(
    "nlpaueb/bert-base-greek-uncased-v1",
    device=device,
)

In [None]:
# Encode every chunk in batches of 32, with a progress bar.
embeddings = transformer_sentence_model.encode(final_chunks, batch_size=32, show_progress_bar=True)

In [12]:
# Sanity check: exactly one embedding row per row of the chunk table.
assert len(data_exploded) == len(embeddings)

In [60]:
# Row-wise normalisation of the embeddings (sklearn's normalize, axis=1).
# NOTE(review): emb_norm is not referenced by any later cell visible here;
# UMAP and BERTopic are fed the raw `embeddings` — confirm this is intended.
emb_norm = normalize(embeddings, axis=1)

In [13]:
# Cache the raw embeddings at embeddings_path so a later session can np.load
# them instead of re-encoding.
np.save(embeddings_path, embeddings) # save embeddings

### Step 02 - Dimensionality Reduction & Clustering Fine Tuning

In [11]:
# Number of embedding rows that the tuning cells below operate on.
len(embeddings)

20288

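Run ONCE! The grid search below refits UMAP and HDBSCAN for every parameter combination; its results are saved to CSV and reloaded by the later cells.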

In [None]:
# Grid search over UMAP hyper-parameters. Each configuration is scored by
#   * trustworthiness of the reduced space against the original embeddings, and
#   * silhouette of an HDBSCAN clustering fitted on the reduced space
#     (noise points excluded).
from itertools import product

param_grid = {
    "n_neighbors": [5, 15, 50],
    "min_dist":    [0.0, 0.1, 0.5],
    "n_components":[2, 5, 10]
}

records = []
# product() walks the grid in the same order as three nested loops
# (n_neighbors outermost, n_components innermost).
for n_nb, md, nc in product(
    param_grid["n_neighbors"],
    param_grid["min_dist"],
    param_grid["n_components"],
):
    um = UMAP(
        n_neighbors=n_nb,
        min_dist=md,
        n_components=nc,
        metric="cosine",
        random_state=42
    )
    X_red = um.fit_transform(embeddings)

    tw = trustworthiness(embeddings, X_red, n_neighbors=5)

    clusterer = HDBSCAN(min_cluster_size=10, metric='euclidean')
    labels = clusterer.fit_predict(X_red)

    # silhouette only on non-noise points
    mask = labels >= 0
    # silhouette_score raises ValueError unless at least two distinct labels
    # are present, so a run with zero or one cluster is recorded as NaN
    # instead of aborting the whole search.
    if np.unique(labels[mask]).size > 1:
        sil = silhouette_score(X_red[mask], labels[mask])
    else:
        sil = np.nan

    records.append({
        "n_neighbors": n_nb,
        "min_dist": md,
        "n_components": nc,
        "trustworthiness": tw,
        "silhouette": sil
    })

df_scores = pd.DataFrame(records)

# sort_values puts NaN silhouettes last, so row 0 is the best valid run.
best = df_scores.sort_values("silhouette", ascending=False).iloc[0]
print("Best params by silhouette:", best)

Best params by silhouette: n_neighbors        5.000000
min_dist           0.000000
n_components       5.000000
trustworthiness    0.910923
silhouette         0.562084
Name: 1, dtype: float64


In [13]:
# CSV holding the UMAP grid-search scores. os.path.join replaces the
# hard-coded "\\" separators, which only formed a valid path on Windows.
dimensionality_reduction_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))),
    "working_data",
    "umap_fine_tuning_results.csv",
)

In [14]:
# Persist the UMAP grid-search scores (without the index column) so the search
# does not have to be repeated.
df_scores.to_csv(dimensionality_reduction_path ,index=False)

In [None]:
# Show the runs with a positive silhouette, best first. Filter before sorting:
# applying a mask built from the unsorted frame to the sorted frame relies on
# pandas re-aligning the mask and emits a "Boolean Series key will be
# reindexed" warning.
df_scores[df_scores["silhouette"] > 0].sort_values(by=["silhouette"], ascending=False)

In [None]:
# Reload the saved UMAP grid-search scores and rank the configurations by a
# combined trustworthiness + silhouette score.
df_scores = pd.read_csv(dimensionality_reduction_path)
df_scores_selection = df_scores.copy()

# "mag" is the plain sum of the two scores; "avg" is their mean.
df_scores_selection["aggregate_score_mag"] = (
    df_scores_selection["trustworthiness"] + df_scores_selection["silhouette"]
)
df_scores_selection["aggregate_score_avg"] = df_scores_selection["aggregate_score_mag"] / 2

# Keep runs with a positive silhouette, then sort. The mask is built from the
# frame it is applied to, so no implicit index re-alignment (and no pandas
# reindexing warning) is involved.
positive_silhouette = df_scores_selection["silhouette"] > 0
df_scores_selection_sorted = df_scores_selection[positive_silhouette].sort_values(
    by=["aggregate_score_avg"], ascending=False
)
df_scores_selection_sorted.head(15)

In [34]:
# UMAP settings used for the HDBSCAN tuning that follows: 2 output dimensions,
# cosine metric, fixed seed.
umap_settings = {
    "n_neighbors": 5,
    "min_dist": 0,
    "n_components": 2,
    "metric": "cosine",
    "random_state": 42,
}
umap_model = UMAP(**umap_settings)
reduced_embeddings = umap_model.fit_transform(embeddings)

In [None]:
# Grid search over HDBSCAN hyper-parameters on the reduced embeddings. Each
# run records the number of clusters, the number of noise points and the
# silhouette of the non-noise points.
X2 = reduced_embeddings

param_grid = {
    "min_cluster_size": [3, 5, 10, 20],
    "min_samples":      [1, 3, 5]
}

records = []
for mcs in param_grid["min_cluster_size"]:
    for ms in param_grid["min_samples"]:
        clusterer = HDBSCAN(
            min_cluster_size=mcs,
            min_samples=ms,
            metric='euclidean'
        )
        labels = clusterer.fit_predict(X2)

        # HDBSCAN marks noise with -1; everything >= 0 is a real cluster id.
        mask = labels >= 0
        unique_clusters = set(labels[mask])
        n_clusters = len(unique_clusters)
        n_noise = int((labels == -1).sum())

        # silhouette_score raises ValueError unless at least two distinct
        # labels are present, so runs with fewer clusters are scored NaN
        # instead of aborting the search.
        sil = silhouette_score(X2[mask], labels[mask]) if n_clusters > 1 else np.nan

        records.append({
            "min_cluster_size": mcs,
            "min_samples": ms,
            "n_clusters": n_clusters,
            "n_noise": n_noise,
            "silhouette": sil
        })

df_scores = pd.DataFrame(records)

# sort_values puts NaN silhouettes last, so row 0 is the best valid run.
best = df_scores.sort_values("silhouette", ascending=False).iloc[0]
best_mcs, best_ms = best["min_cluster_size"], best["min_samples"]

# The row is a float Series (it also holds the silhouette), hence the int()
# casts before handing the values back to HDBSCAN.
best_clusterer = HDBSCAN(
    min_cluster_size=int(best_mcs),
    min_samples=int(best_ms),
    metric='euclidean'
)
best_labels = best_clusterer.fit_predict(X2)


In [None]:
# Display the HDBSCAN grid-search runs, highest silhouette first.
df_scores.sort_values(by=["silhouette"], ascending=False)

In [26]:
# Save the HDBSCAN grid-search scores. os.path.join replaces the hard-coded
# "\\" separators, which only formed a valid path on Windows.
clustering_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))),
    "working_data",
    "hdbscan_fine_tuning_results.csv",
)
df_scores.to_csv(clustering_path, index=False)

In [None]:
# Fit HDBSCAN with the chosen settings on the UMAP-reduced embeddings.
clusterer = HDBSCAN(min_cluster_size=20, min_samples=3, metric='euclidean', prediction_data=True)
labels = clusterer.fit_predict(reduced_embeddings)

# Distinct cluster ids with the noise label (-1) left out.
unique_labels = {label for label in labels.tolist() if label != -1}
n_clusters = len(unique_labels)
# Number of points that were labelled as noise.
n_noise = int(np.count_nonzero(labels == -1))

print(f"Found {n_clusters} clusters and {n_noise} noise points")

In [None]:
# Share of all points that HDBSCAN left unassigned (label -1).
is_noise = labels == -1
num_noise = is_noise.sum()
total_docs = len(labels)
print(f"Noise fraction: {num_noise/total_docs:.1%}")

### Step 03 - Dimensionality Reduction

In [38]:
# Final UMAP projection: 2 components, cosine metric, fixed seed. This same
# umap_model object is later passed to BERTopic.
umap_model = UMAP(n_neighbors=5, min_dist=0, n_components=2, metric="cosine", random_state=42)
reduced_embeddings = umap_model.fit_transform(embeddings)

### Step 04 - Clustering

In [39]:
# Final HDBSCAN model; this same object is later passed to BERTopic.
hdbscan_settings = {
    "min_cluster_size": 20,
    "min_samples": 3,
    "metric": 'euclidean',
    "prediction_data": True,
}
cluster_model = HDBSCAN(**hdbscan_settings)
labels = cluster_model.fit_predict(reduced_embeddings)

# Silhouette of the clustered points only (noise label -1 excluded).
mask = labels >= 0
silhouette_score(reduced_embeddings[mask], labels[mask])

np.float32(0.41869235)

In [None]:
# Scatter plot of the 2-D UMAP projection coloured by HDBSCAN cluster: noise
# and small clusters in grey, the largest clusters each in their own colour.
X2 = reduced_embeddings  # 2D UMAP coordinates
n_top_clusters = 30      # how many of the largest clusters get a colour

# noise points carry the label -1
noise_mask = labels == -1
cluster_mask = ~noise_mask

# ids of the largest clusters, biggest first
counts = pd.Series(labels[cluster_mask]).value_counts()
top_k = counts.nlargest(n_top_clusters).index

# clustered points that do not belong to one of the largest clusters
others_mask = cluster_mask & ~np.isin(labels, top_k)

plt.figure(figsize=(8,6))

# noise in light gray
plt.scatter(
    X2[noise_mask,0], X2[noise_mask,1],
    c="#dddddd", s=10, label="noise", alpha=0.5
)

# remaining (smaller) clusters in darker gray
plt.scatter(
    X2[others_mask,0], X2[others_mask,1],
    c="#bbbbbb", s=10, label="others", alpha=0.5
)

# One distinct colour per highlighted cluster. "tab10" only has 10 colours and
# cycles when asked for more, so several clusters would share a colour; "husl"
# yields as many evenly spaced hues as requested.
palette = sns.color_palette("husl", n_colors=len(top_k))
for cluster_id, color in zip(top_k, palette):
    # separate name so the notebook-level `mask` variable is not overwritten
    member_mask = labels == cluster_id
    plt.scatter(
        X2[member_mask,0], X2[member_mask,1],
        c=[color], s=20, label=f"cluster {cluster_id}", alpha=0.8
    )

# legend outside the axes, to the right of the plot
plt.legend(
    bbox_to_anchor=(1.05,1),
    loc="upper left",
    fontsize=8,
    frameon=False
)
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.tight_layout()
plt.show()

### Step 05 - Representation

In [None]:
# Unigram + bigram count vectorizer, passed to BERTopic as vectorizer_model.
# NOTE(review): stop_words="english" filters English stop words only, while the
# tokenizer and encoder used above are Greek checkpoints — confirm whether a
# Greek stop-word list should be supplied instead.
vectorizer = CountVectorizer(ngram_range=(1,2), stop_words="english")

### Step 06 - BERTopic Chain

In [None]:
# Precomputed embeddings are passed to fit_transform, so no embedding model is
# configured; the UMAP, HDBSCAN and vectorizer objects built above are reused.
bertopic_settings = dict(
    embedding_model=None,
    umap_model=umap_model,
    hdbscan_model=cluster_model,
    vectorizer_model=vectorizer,
    language="greek",
    calculate_probabilities=True,
    nr_topics=14,
    top_n_words=10,
)
topic_model = BERTopic(**bertopic_settings)

# Fit on the chunk texts and get one topic id (plus probabilities) per chunk.
topics, probs = topic_model.fit_transform(documents=final_chunks, embeddings=embeddings)

In [94]:
# Bar chart of the top words per topic, opened in the browser renderer.
fig = topic_model.visualize_barchart(top_n_topics=20, n_words=20)
fig.show(renderer="browser")

In [None]:
# Display how many chunks were assigned to each topic id.
from collections import Counter
Counter(topics)

In [None]:
# Persist the fitted topic model under the name "BERTopic_model" (relative to
# the current working directory).
topic_model.save("BERTopic_model")