# 04 Narrative Clustering




In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Read the cleaned text table and the saved embedding matrix from disk.
clean_text_path = "../data/processed/clean_text_data.csv"
embeddings_path = "../data/embeddings/text_embeddings.npy"

df = pd.read_csv(clean_text_path)
embeddings = np.load(embeddings_path)

# Preview both inputs; this tuple is the cell's displayed value.
(df.head(), embeddings.shape)


(                                                text  \
 0  Vaccines cause serious side effects according ...   
 1         5G towers are spreading harmful radiation.   
 2   Government confirms vaccine safety after trials.   
 3  Social media claims about microchips in vaccin...   
 
                                           clean_text  word_count  
 0    vaccine cause serious side effect accord report           7  
 1                   g tower spread harmful radiation           5  
 2            government confirm vaccine safety trial           5  
 3  social medium claim microchip vaccine spread r...           7  ,
 (4, 384))

In [3]:
# The documents and the embedding rows are passed to the topic model side by
# side, so their counts must match. Fail early, and say by how much they
# differ, if the CSV and the .npy file are out of sync.
assert len(df) == embeddings.shape[0], (
    f"Row mismatch: {len(df)} documents vs {embeddings.shape[0]} embedding rows"
)


In [4]:
# The topic model needs a plain list of strings. A clean_text field that is
# blank in the CSV is read back by read_csv as NaN (a float), so replace those
# with "" instead of dropping them: dropping rows would break the one-to-one
# alignment between `documents` and the rows of `embeddings`.
documents = df["clean_text"].fillna("").astype(str).tolist()
len(documents)


4

In [5]:
# NOTE(review): both names assigned in this cell are reassigned by a later
# cell before the model is fitted, so this configuration is never the one
# that gets trained.

# Bag-of-words settings: unigrams and bigrams, English stop words removed,
# terms kept only when they occur in at least two documents.
vectorizer_model = CountVectorizer(
    min_df=2,
    ngram_range=(1, 2),
    stop_words="english",
)

# Topic model with default reduction/clustering and the vectorizer above.
topic_model = BERTopic(
    verbose=True,
    calculate_probabilities=True,
    vectorizer_model=vectorizer_model,
)


In [12]:
from umap import UMAP
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN


In [13]:
# Dimensionality reduction: project the embeddings down to 2 components using
# cosine distance. The fixed random_state is there to make reruns repeatable.
umap_model = UMAP(
    metric="cosine",
    n_components=2,
    n_neighbors=5,
    min_dist=0.0,
    random_state=42,
)

# Clustering: VERY IMPORTANT to keep both thresholds this low, so that a pair
# of documents is already enough to form a topic (presumably because the
# corpus loaded above is tiny -- revisit for a larger dataset).
hdbscan_model = HDBSCAN(
    metric="euclidean",
    min_cluster_size=2,
    min_samples=1,
    prediction_data=True,
)

# Topic representation: unigrams and bigrams, English stop words removed, and
# every term kept regardless of how rarely it occurs (min_df=1).
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=1,
    stop_words="english",
)

# Assemble the pipeline from the three explicitly configured components.
topic_model = BERTopic(
    verbose=True,
    calculate_probabilities=True,
    vectorizer_model=vectorizer_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
)


In [14]:
# Fit the topic model on the documents, supplying the embeddings loaded from
# disk rather than having them computed here.
topics, probs = topic_model.fit_transform(documents, embeddings=embeddings)


2026-02-10 17:26:08,216 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-10 17:26:08,226 - BERTopic - Dimensionality - Completed ✓
2026-02-10 17:26:08,227 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-10 17:26:08,229 - BERTopic - Cluster - Completed ✓
2026-02-10 17:26:08,232 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-10 17:26:08,242 - BERTopic - Representation - Completed ✓


In [15]:
# Attach the per-document topic ids as a `topic` column and preview the table.
df = df.assign(topic=topics)
df.head()


Unnamed: 0,text,clean_text,word_count,topic
0,Vaccines cause serious side effects according ...,vaccine cause serious side effect accord report,7,0
1,5G towers are spreading harmful radiation.,g tower spread harmful radiation,5,0
2,Government confirms vaccine safety after trials.,government confirm vaccine safety trial,5,1
3,Social media claims about microchips in vaccin...,social medium claim microchip vaccine spread r...,7,1


In [16]:
# Summary table of the fitted topics; the bare name on the last line renders it.
topic_info = topic_model.get_topic_info()
topic_info


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,2,0_cause_cause effect_vaccine cause_tower spread,"[cause, cause effect, vaccine cause, tower spr...","[g tower spread harmful radiation, vaccine cau..."
1,1,2,1_vaccine_medium claim_confirm vaccine_safety,"[vaccine, medium claim, confirm vaccine, safet...","[government confirm vaccine safety trial, soci..."


In [17]:
# Pick a topic id to inspect: the one in the second row of the summary table.
# The lookup is positional, so the table must contain at least two rows.
topic_id = int(topic_info["Topic"].iloc[1])

# Show the terms and weights the model stores for that topic.
topic_model.get_topic(topic_id)


[('vaccine', 0.17712016758791652),
 ('medium claim', 0.1323705407705836),
 ('confirm vaccine', 0.1323705407705836),
 ('safety', 0.1323705407705836),
 ('rapidly', 0.1323705407705836),
 ('microchip vaccine', 0.1323705407705836),
 ('microchip', 0.1323705407705836),
 ('social medium', 0.1323705407705836),
 ('medium', 0.1323705407705836),
 ('government confirm', 0.1323705407705836)]

In [18]:
def get_topic_name(topic_id, top_n=3):
    """Build a short human-readable label for a topic.

    Args:
        topic_id: Topic id as stored in the ``topic`` column; ``-1`` is
            treated as the outlier bucket.
        top_n: How many leading terms to join into the label (default 3).

    Returns:
        ``"Outlier / Noise"`` for ``-1``; otherwise the topic's first
        ``top_n`` non-empty terms joined by ``", "``. When the model has no
        terms for ``topic_id``, the fallback ``"Topic <id>"`` is returned.
    """
    if topic_id == -1:
        return "Outlier / Noise"

    # get_topic() hands back a falsy value rather than a list when the id is
    # unknown to the model; slicing that would raise a TypeError.
    words = topic_model.get_topic(topic_id)
    if not words:
        return f"Topic {topic_id}"

    # Skip blank placeholder terms so the label never contains a dangling ", ".
    terms = [entry[0] for entry in words if entry[0]]
    return ", ".join(terms[:top_n]) or f"Topic {topic_id}"

# Label every row with the name of its topic, then preview the key columns.
df["narrative"] = df["topic"].map(get_topic_name)
preview_columns = ["clean_text", "topic", "narrative"]
df[preview_columns].head()


Unnamed: 0,clean_text,topic,narrative
0,vaccine cause serious side effect accord report,0,"cause, cause effect, vaccine cause"
1,g tower spread harmful radiation,0,"cause, cause effect, vaccine cause"
2,government confirm vaccine safety trial,1,"vaccine, medium claim, confirm vaccine"
3,social medium claim microchip vaccine spread r...,1,"vaccine, medium claim, confirm vaccine"


In [19]:
# Number of documents assigned to each topic id.
df["topic"].value_counts()


topic
0    2
1    2
Name: count, dtype: int64

In [20]:
# Write the annotated table next to the other processed data, without the
# index column, and confirm on stdout.
output_path = "../data/processed/text_with_narratives.csv"
df.to_csv(output_path, index=False)
print("Narrative clustering saved successfully.")


Narrative clustering saved successfully.
