In [1]:
from bertopic import BERTopic
from umap import UMAP
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

In [2]:
def get_topic_info(topic_model: BERTopic, text_list: list[str], dataframe: pd.DataFrame):
    doc_info = topic_model.get_document_info(text_list)
    doc_info["class_label"] = dataframe["class_label"]

    # Calculate Yes/No class counts per topic
    topic_class_counts = doc_info.groupby(['Topic', 'Name'])['class_label'].value_counts().unstack(fill_value=0).reset_index()
    topic_class_counts["count"] = topic_class_counts["Yes"] + topic_class_counts["No"]
    topic_class_counts["pos_ratio"] = topic_class_counts["Yes"] / (topic_class_counts["Yes"] + topic_class_counts["No"])

    # Add class counts to topic info
    topic_info = topic_model.get_topic_info()
    topic_info["No"] = topic_class_counts["No"]
    topic_info["Yes"] = topic_class_counts["Yes"]
    topic_info["pos_ratio"] = topic_class_counts["pos_ratio"]

    return topic_info

In [3]:
def reduce_topic(topic_model: BERTopic, topic_id, text_list, topics, threshold, strategy, embeddings=None):
    old_topic_info = topic_model.get_topic_info()
    new_topics = topic_model.reduce_outliers(text_list, topics, threshold=threshold, strategy=strategy, embeddings=embeddings, topic_id=topic_id)
    topic_model.update_topics(docs=text_list, topics=new_topics)
    new_topic_info = topic_model.get_topic_info()
    
    # How many docs changed topic
    num_reassigned = old_topic_info.loc[old_topic_info["Topic"] == topic_id]["Count"] - new_topic_info.loc[new_topic_info["Topic"] == topic_id]["Count"]
    num_reassigned = num_reassigned.iloc[0]
    
    # How many target topics did the docs change to
    num_changed = (old_topic_info["Count"] != new_topic_info["Count"]).sum() - (1 if num_reassigned != 0 else 0)
    old_topic_name = (old_topic_info.loc[old_topic_info["Topic"] == topic_id]["Name"]).iloc[0]

    print(f"Reassigned {num_reassigned} documents from \"{old_topic_name}\" to {num_changed} new topics")

    return new_topics

In [4]:
df_train = pd.read_csv("../data/CT24_checkworthy_english/train.csv")
df_dev = pd.read_csv("../data/CT24_checkworthy_english/dev.csv")
df_dev_test = pd.read_csv("../data/CT24_checkworthy_english/dev-test.csv")
df_test_gold = pd.read_csv("../data/CT24_checkworthy_english/test-gold.csv")
df = pd.concat([df_train, df_dev, df_dev_test, df_test_gold], ignore_index=True)
text_list = df.Text.to_list()

In [5]:
# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(text_list, show_progress_bar=True)

Batches:   0%|          | 0/756 [00:00<?, ?it/s]

In [19]:
umap_model = UMAP(n_neighbors=30, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
topic_model = BERTopic(min_topic_size=50, umap_model=umap_model, vectorizer_model=vectorizer_model)
topics, probs = topic_model.fit_transform(documents=text_list, embeddings=embeddings)

In [20]:
get_topic_info(topic_model=topic_model, text_list=text_list, dataframe=df)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,No,Yes,pos_ratio
0,-1,12844,-1_people_think_going_country,"[people, think, going, country, know, presiden...","[You know, back in 1987, he took out a $100,00...",10101,2743,0.213563
1,0,1280,0_health_care_social security_social,"[health, care, social security, social, medica...",[I want to do something about Social Security....,775,505,0.394531
2,1,756,1_tax_taxes_tax cut_cut,"[tax, taxes, tax cut, cut, tax cuts, increase,...","[This is a family tax cut., I have a tax cut.,...",403,353,0.466931
3,2,542,2_jobs_unemployment_million_small,"[jobs, unemployment, million, small, business,...",[Most small businesses -- 70 percent of the ne...,305,237,0.437269
4,3,532,3_schools_education_school_teachers,"[schools, education, school, teachers, college...","[We can do better in our public schools., We c...",387,145,0.272556
...,...,...,...,...,...,...,...,...
63,62,56,62_africa_southern africa_southern_african,"[africa, southern africa, southern, african, a...","[Well, I understand, and Africa is important.,...",35,21,0.375000
64,63,55,63_test_testing_litmus test_litmus,"[test, testing, litmus test, litmus, tests, at...","[And the answer is, no, I will not have a litm...",51,4,0.072727
65,64,54,64_dole_senator dole_senator_bob dole,"[dole, senator dole, senator, bob dole, bob, d...",[And members of Senator Dole's own party in th...,31,23,0.425926
66,65,50,65_peace_world peace_peaceful_kept peace,"[peace, world peace, peaceful, kept peace, pea...","[World peace is important, and we have enhance...",47,3,0.060000


In [21]:
# Reduce outliers
topics = reduce_topic(
    topic_model=topic_model,
    topic_id=-1,
    text_list=text_list,
    topics=topics,
    threshold=0.4,
    strategy="embeddings",
    embeddings=embeddings)



Reassigned 9685 documents from "-1_people_think_going_country" to 67 new topics


In [22]:
topic_info = get_topic_info(topic_model=topic_model, text_list=text_list, dataframe=df)
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,No,Yes,pos_ratio
0,-1,3159,-1_you_he_it_was,"[you, he, it, was, and, the, of, they, that, t...","[You know, back in 1987, he took out a $100,00...",2677,482,0.152580
1,0,1347,0_health_care_social_medicare,"[health, care, social, medicare, insurance, se...",[I want to do something about Social Security....,819,528,0.391982
2,1,955,1_tax_taxes_cut_for,"[tax, taxes, cut, for, cuts, increase, percent...","[This is a family tax cut., I have a tax cut.,...",519,436,0.456545
3,2,790,2_jobs_unemployment_million_small,"[jobs, unemployment, million, small, business,...",[Most small businesses -- 70 percent of the ne...,481,309,0.391139
4,3,609,3_education_schools_school_teachers,"[education, schools, school, teachers, college...","[We can do better in our public schools., We c...",441,168,0.275862
...,...,...,...,...,...,...,...,...
63,62,136,62_africa_nations_african_countries,"[africa, nations, african, countries, southern...","[Well, I understand, and Africa is important.,...",82,54,0.397059
64,63,73,63_test_testing_tests_litmus,"[test, testing, tests, litmus, any, atmosphere...","[And the answer is, no, I will not have a litm...",66,7,0.095890
65,64,64,64_dole_senator_doles_bob,"[dole, senator, doles, bob, damato, against, b...",[And members of Senator Dole's own party in th...,36,28,0.437500
66,65,224,65_peace_world_freedom_we,"[peace, world, freedom, we, our, to, the, and,...","[World peace is important, and we have enhance...",208,16,0.071429


In [23]:
topic_model.visualize_topics()

In [24]:
topic_model.visualize_documents(text_list, embeddings=embeddings, width=1000, hide_document_hover=False, hide_annotations=True)

In [25]:
df["Topic"] = topics

In [26]:
# Explore a specific topic id
topic_id = 42
print("Topic name: ", topic_info[topic_info["Topic"] == topic_id]["Name"].to_list()[0])
print("Examples:")
print("\n".join(topic_model.get_representative_docs(topic_id)))
print("Yes ratio: ", topic_info[topic_info["Topic"] == topic_id]["pos_ratio"].to_list()[0])
df[df["Topic"] == topic_id]

Topic name:  42_difference_different_opinion_differences
Examples:
It's a fundamental difference of opinion here, folks.
It's a fundamental difference of opinion.
We have a fundamental difference of opinion.
Yes ratio:  0.027624309392265192


Unnamed: 0,Sentence_id,Text,class_label,Topic
14,8355,His experience has been different from mine.,No,42
139,13956,I just have an honest philosophical difference.,No,42
193,22158,We may differ about what the elements of that ...,No,42
217,1335,"And in those two cases, I would favor them.",No,42
299,24987,And I think that perhaps we established a litt...,No,42
...,...,...,...,...
23259,16258,We have differences about how to achieve them.,No,42
23416,24105,"You see, I have a fundamental difference.",No,42
23435,24432,It's a different world now.,No,42
23967,39011,"Well, not really because what's happening is t...",Yes,42


In [27]:
# Manually identify unspecific topics containing samples that should be members of more specific topics
# Apply "outlier" reduction to these topics with appropriate threshold
# Merge the remaining samples in the unspecific topics and outliers into a "unspecific" category


# Topics containing samples that should be members of more specific topics
topics_to_dissolve = [4, 6, 8, 16, 22, 25, 26, 36, 37, 40]
# Topics that should be merged into an "unspecific" topic class
unspecific_topics = []

In [28]:
for tid in topics_to_dissolve:
    topics = reduce_topic(
        topic_model=topic_model,
        topic_id=tid,
        text_list=text_list,
        topics=topics,
        threshold=0.6,
        strategy="embeddings",
        embeddings=embeddings)



Reassigned 144 documents from "4_do_we_to_going" to 3 new topics




Reassigned 39 documents from "6_let_me_just_you" to 1 new topics




Reassigned 255 documents from "8_president_the_to_be" to 9 new topics




Reassigned 91 documents from "16_problem_issue_problems_issues" to 4 new topics




Reassigned 78 documents from "22_washington_congress_house_dc" to 9 new topics




Reassigned 138 documents from "25_american_america_people_country" to 6 new topics




Reassigned 43 documents from "26_thank_jim_university_bob" to 3 new topics




Reassigned 46 documents from "36_responsibility_make_decisions_we" to 5 new topics




Reassigned 110 documents from "37_safe_we_protect_security" to 7 new topics
Reassigned 79 documents from "40_record_look_records_at" to 2 new topics


In [29]:
get_topic_info(topic_model=topic_model, text_list=text_list, dataframe=df)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,No,Yes,pos_ratio
0,-1,3159,-1_you_he_it_was,"[you, he, it, was, and, the, of, they, that, t...","[You know, back in 1987, he took out a $100,00...",2677,482,0.152580
1,0,1347,0_health_care_social_medicare,"[health, care, social, medicare, insurance, se...",[I want to do something about Social Security....,819,528,0.391982
2,1,955,1_tax_taxes_cut_for,"[tax, taxes, cut, for, cuts, increase, percent...","[This is a family tax cut., I have a tax cut.,...",519,436,0.456545
3,2,790,2_jobs_unemployment_million_small,"[jobs, unemployment, million, small, business,...",[Most small businesses -- 70 percent of the ne...,481,309,0.391139
4,3,609,3_education_schools_school_teachers,"[education, schools, school, teachers, college...","[We can do better in our public schools., We c...",441,168,0.275862
...,...,...,...,...,...,...,...,...
63,62,136,62_africa_nations_african_countries,"[africa, nations, african, countries, southern...","[Well, I understand, and Africa is important.,...",82,54,0.397059
64,63,73,63_test_testing_tests_litmus,"[test, testing, tests, litmus, any, atmosphere...","[And the answer is, no, I will not have a litm...",66,7,0.095890
65,64,65,64_dole_senator_doles_bob,"[dole, senator, doles, bob, damato, against, b...",[And members of Senator Dole's own party in th...,37,28,0.430769
66,65,1179,65_peace_president_do_to,"[peace, president, do, to, we, record, the, is...","[World peace is important, and we have enhance...",1138,41,0.034775
