# Install & Import Libraries

# BERTopic per Year

In [1]:
#!pip install bertopic[all] umap-learn

In [2]:
from bertopic import BERTopic
from umap import UMAP
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd
from sentence_transformers import SentenceTransformer

In [3]:
df_debates = pd.read_csv('debate_transcripts_cleaned.csv', encoding='utf-8')
df_debates = df_debates[df_debates["is_candidate"] == True].copy()

In [None]:
embedderMiniLM = SentenceTransformer("all-MiniLM-L6-v2")
df_short = df_debates[df_debates["dialogue"].str.split().str.len() > 5]  # remove anything shorter than 6 words
docs = df_short["dialogue"].tolist()
embeddingsMiniLM=embedderMiniLM.encode(docs, show_progress_bar=True)

def run_bertopic_over_metadata(df, groupby_col="year", model=None, embedding_model="all-MiniLM-L6-v2",embedder=embedderMiniLM, embeddings=embeddingsMiniLM, verbose=True):
    """
    Fits BERTopic to the 'dialogue' column of df and returns topic model and topics over time/group.

    Parameters:
        df (pd.DataFrame): DataFrame with at least 'dialogue' and one metadata column
        groupby_col (str): Column in df to group topic evolution by. Options: "year", "date", "debate_title", "actor", "party"
        model (BERTopic or None): Pass an existing BERTopic model to reuse, or None to create new
        embedding_model (str or SentenceTransformer): Sentence embedding model name or object
        verbose (bool): Whether to print status messages

    Returns:
        topic_model (BERTopic)
        topics_over_group (pd.DataFrame)
        topics (List[int])
        probs (List[float])
    """

    assert "dialogue" in df.columns, "DataFrame must have a 'dialogue' column."
    assert groupby_col in df.columns, f"{groupby_col} not found in DataFrame columns."

    df = df[df["dialogue"].str.split().str.len() > 5]  # remove anything shorter than 6 words
    docs = df["dialogue"].tolist()
    timestamps = df[groupby_col].astype(str).tolist()  # convert to string for topics_over_time

    if embedder is None:
        if verbose:
            print(f"Creating embedding model: {embedding_model}")
        embedder=SentenceTransformer(embedding_model)

    if embeddings is None:
        if verbose:
            print(f"Embedding encoding")
        embeddings=embedder.encode(docs, show_progress_bar=True)

    if model is None:
        if verbose:
            print(f"Creating new BERTopic model using embedding model: {embedding_model}")
        model = BERTopic(
            language="english",
            embedding_model=embedder,
            verbose=verbose
        )

    if verbose:
        print("Fitting BERTopic...")
    topics, probs = model.fit_transform(docs, embeddings)
    # Automatically reduce topics based on similarity
    model.reduce_topics(df["dialogue"].tolist(), nr_topics="auto")  # or set a target number like 30
    model.set_topic_labels(model.generate_topic_labels())


    if groupby_col in ["year", "date"]:
        if verbose:
            print(f"Computing topic evolution over '{groupby_col}'...")
        topics_over_group = model.topics_over_time(docs, timestamps)
    else:
        if verbose:
            print(f"Computing topic frequency over '{groupby_col}'...")
        import pandas as pd
        # Build DataFrame with topic and groupby_col
        df_topics = pd.DataFrame({"topic": topics, groupby_col: group_data})
        # Count frequency of each topic per group
        topics_over_group = df_topics.groupby([groupby_col, "topic"]).size().reset_index(name="count")

    return model, topics_over_group, topics, probs


Batches:   0%|          | 0/143 [00:00<?, ?it/s]

In [5]:
model, topics_yearly, topics, probs = run_bertopic_over_metadata(df_debates, groupby_col="year")
model.visualize_topics_over_time(topics_yearly)

2025-05-23 18:02:47,564 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Creating new BERTopic model using embedding model: all-MiniLM-L6-v2
Fitting BERTopic...


2025-05-23 18:03:09,230 - BERTopic - Dimensionality - Completed ✓
2025-05-23 18:03:09,233 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-23 18:03:09,388 - BERTopic - Cluster - Completed ✓
2025-05-23 18:03:09,394 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-23 18:03:09,843 - BERTopic - Representation - Completed ✓
2025-05-23 18:03:10,286 - BERTopic - Topic reduction - Reducing number of topics
2025-05-23 18:03:10,305 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-23 18:03:10,752 - BERTopic - Representation - Completed ✓
2025-05-23 18:03:10,754 - BERTopic - Topic reduction - Reduced number of topics from 81 to 17


Computing topic evolution over 'year'...


14it [00:00, 16.87it/s]


In [6]:
model, topics_by_date, *_ = run_bertopic_over_metadata(df_debates, groupby_col="date")
model.visualize_topics_over_time(topics_by_date)

2025-05-23 18:03:12,246 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Creating new BERTopic model using embedding model: all-MiniLM-L6-v2
Fitting BERTopic...


2025-05-23 18:03:16,337 - BERTopic - Dimensionality - Completed ✓
2025-05-23 18:03:16,341 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-23 18:03:16,527 - BERTopic - Cluster - Completed ✓
2025-05-23 18:03:16,532 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-23 18:03:16,980 - BERTopic - Representation - Completed ✓
2025-05-23 18:03:17,403 - BERTopic - Topic reduction - Reducing number of topics
2025-05-23 18:03:17,421 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-23 18:03:17,814 - BERTopic - Representation - Completed ✓
2025-05-23 18:03:17,816 - BERTopic - Topic reduction - Reduced number of topics from 73 to 34


Computing topic evolution over 'date'...


48it [00:01, 24.11it/s]


In [7]:
_, topics_by_actor, *_ = run_bertopic_over_metadata(df_debates, groupby_col="actor")
model.visualize_topics_over_time(topics_by_actor)

2025-05-23 18:03:20,455 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Creating new BERTopic model using embedding model: all-MiniLM-L6-v2
Fitting BERTopic...


2025-05-23 18:03:24,434 - BERTopic - Dimensionality - Completed ✓
2025-05-23 18:03:24,436 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-23 18:03:24,587 - BERTopic - Cluster - Completed ✓
2025-05-23 18:03:24,592 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-23 18:03:25,008 - BERTopic - Representation - Completed ✓
2025-05-23 18:03:25,408 - BERTopic - Topic reduction - Reducing number of topics
2025-05-23 18:03:25,425 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-23 18:03:25,806 - BERTopic - Representation - Completed ✓
2025-05-23 18:03:25,809 - BERTopic - Topic reduction - Reduced number of topics from 73 to 14


Computing topic evolution over 'actor'...


DateParseError: Unknown datetime string format, unable to parse: Kennedy, at position 0

In [None]:
_, topics_by_party, *_ = run_bertopic_over_metadata(df_debates, groupby_col="party")
model.visualize_topics_over_time(topics_by_party)