In [1]:
import os
from bertopic import BERTopic
from umap import UMAP
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
def get_topic_info(topic_model: BERTopic, text_list: list[str], dataframe: pd.DataFrame):
    doc_info = topic_model.get_document_info(text_list)
    doc_info["class_label"] = dataframe["class_label"]

    # Calculate Yes/No class counts per topic
    topic_class_counts = doc_info.groupby(['Topic', 'Name'])['class_label'].value_counts().unstack(fill_value=0).reset_index()
    topic_class_counts["count"] = topic_class_counts["Yes"] + topic_class_counts["No"]
    topic_class_counts["pos_ratio"] = topic_class_counts["Yes"] / (topic_class_counts["Yes"] + topic_class_counts["No"])

    # Add class counts to topic info
    topic_info = topic_model.get_topic_info()
    topic_info["No"] = topic_class_counts["No"]
    topic_info["Yes"] = topic_class_counts["Yes"]
    topic_info["pos_ratio"] = topic_class_counts["pos_ratio"]

    return topic_info

In [3]:
def reduce_topic(topic_model: BERTopic, topic_id, text_list, topics, threshold, strategy, embeddings=None):
    old_topic_info = topic_model.get_topic_info()
    new_topics = topic_model.reduce_outliers(text_list, topics, threshold=threshold, strategy=strategy, embeddings=embeddings, topic_id=topic_id)
    topic_model.update_topics(docs=text_list, topics=new_topics)
    new_topic_info = topic_model.get_topic_info()
    
    # How many docs changed topic
    num_reassigned = old_topic_info.loc[old_topic_info["Topic"] == topic_id]["Count"] - new_topic_info.loc[new_topic_info["Topic"] == topic_id]["Count"]
    num_reassigned = num_reassigned.iloc[0]
    
    # How many target topics did the docs change to
    num_changed = (old_topic_info["Count"] != new_topic_info["Count"]).sum() - (1 if num_reassigned != 0 else 0)
    old_topic_name = (old_topic_info.loc[old_topic_info["Topic"] == topic_id]["Name"]).iloc[0]

    print(f"Reassigned {num_reassigned} documents from \"{old_topic_name}\" to {num_changed} new topics")

    return new_topics

In [4]:
df_train = pd.read_csv("../data/CT24_checkworthy_english/train.csv")
df_dev = pd.read_csv("../data/CT24_checkworthy_english/dev.csv")
df_dev_test = pd.read_csv("../data/CT24_checkworthy_english/dev-test.csv")
df_test_gold = pd.read_csv("../data/CT24_checkworthy_english/test-gold.csv")
df = pd.concat([df_train, df_dev, df_dev_test, df_test_gold], ignore_index=True)
text_list = df.Text.to_list()

In [5]:
# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(text_list, show_progress_bar=True)

Batches:   0%|          | 0/756 [00:00<?, ?it/s]

In [6]:
umap_model = UMAP(n_neighbors=30, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
topic_model = BERTopic(min_topic_size=50, umap_model=umap_model, vectorizer_model=vectorizer_model)
topics, probs = topic_model.fit_transform(documents=text_list, embeddings=embeddings)

In [7]:
get_topic_info(topic_model=topic_model, text_list=text_list, dataframe=df)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,No,Yes,pos_ratio
0,-1,12844,-1_people_think_going_country,"[people, think, going, country, know, presiden...","[You know, back in 1987, he took out a $100,00...",10101,2743,0.213563
1,0,1280,0_health_care_social security_social,"[health, care, social security, social, medica...",[I want to do something about Social Security....,775,505,0.394531
2,1,756,1_tax_taxes_tax cut_cut,"[tax, taxes, tax cut, cut, tax cuts, increase,...","[This is a family tax cut., I have a tax cut.,...",403,353,0.466931
3,2,542,2_jobs_unemployment_million_small,"[jobs, unemployment, million, small, business,...",[Most small businesses -- 70 percent of the ne...,305,237,0.437269
4,3,532,3_schools_education_school_teachers,"[schools, education, school, teachers, college...","[We can do better in our public schools., We c...",387,145,0.272556
...,...,...,...,...,...,...,...,...
63,62,56,62_africa_southern africa_southern_african,"[africa, southern africa, southern, african, a...","[Well, I understand, and Africa is important.,...",35,21,0.375000
64,63,55,63_test_testing_litmus test_litmus,"[test, testing, litmus test, litmus, tests, at...","[And the answer is, no, I will not have a litm...",51,4,0.072727
65,64,54,64_dole_senator dole_senator_bob dole,"[dole, senator dole, senator, bob dole, bob, d...",[And members of Senator Dole's own party in th...,31,23,0.425926
66,65,50,65_peace_world peace_peaceful_kept peace,"[peace, world peace, peaceful, kept peace, pea...","[World peace is important, and we have enhance...",47,3,0.060000


In [8]:
topic_model.visualize_heatmap(n_clusters=20)

In [9]:
# Manually select similar topics and merge them
merge_topics = [
    [12, 14, 27, 34, 66, 53] # economy
]
topic_model.merge_topics(docs=text_list, topics_to_merge=merge_topics)
# Update topics list after merge
topics = topic_model.get_document_info(docs=text_list)["Topic"].to_list()

In [10]:
# Reduce outliers
topics = reduce_topic(
    topic_model=topic_model,
    topic_id=-1,
    text_list=text_list,
    topics=topics,
    threshold=0.4,
    strategy="embeddings",
    embeddings=embeddings)



Reassigned 9631 documents from "-1_people_think_going_country" to 62 new topics


In [11]:
topic_info = get_topic_info(topic_model=topic_model, text_list=text_list, dataframe=df)
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,No,Yes,pos_ratio
0,-1,3213,-1_you_he_it_was,"[you, he, it, was, and, the, of, they, that, t...","[You know, back in 1987, he took out a $100,00...",2701,512,0.159353
1,0,1353,0_health_care_social_medicare,"[health, care, social, medicare, insurance, se...",[I want to do something about Social Security....,822,531,0.392461
2,1,981,1_tax_taxes_cut_for,"[tax, taxes, cut, for, cuts, the, increase, pe...","[This is a family tax cut., I have a tax cut.,...",527,454,0.462793
3,2,1407,2_economy_budget_spending_billion,"[economy, budget, spending, billion, deficit, ...","[I will pay down the national debt., When Pres...",769,638,0.453447
4,3,841,3_jobs_unemployment_million_small,"[jobs, unemployment, million, small, business,...",[We've had record numbers of new small busines...,519,322,0.382878
...,...,...,...,...,...,...,...,...
58,57,134,57_mistake_wrong_made_mistakes,"[mistake, wrong, made, mistakes, was, think, h...","[That was not a mistake., That was more than a...",126,8,0.059701
59,58,220,58_plan_my_plans_you,"[plan, my, plans, you, have, proposed, ive, fu...","[And give us a plan., But I have a plan to do ...",194,26,0.118182
60,59,73,59_test_testing_tests_litmus,"[test, testing, tests, litmus, any, atmosphere...","[And the answer is, no, I will not have a litm...",66,7,0.095890
61,60,65,60_dole_senator_doles_bob,"[dole, senator, doles, bob, damato, against, b...",[And members of Senator Dole's own party in th...,36,29,0.446154


In [12]:
#topic_model.visualize_topics()

In [13]:
#topic_model.visualize_documents(text_list, embeddings=embeddings, width=1000, hide_document_hover=False, hide_annotations=True)

In [14]:
df["Topic"] = topics

In [15]:
# Explore a specific topic id
topic_id = 61
print("Topic name: ", topic_info[topic_info["Topic"] == topic_id]["Name"].to_list()[0])
print("Examples:")
print("\n".join(topic_model.get_representative_docs(topic_id)))
print("Yes ratio: ", topic_info[topic_info["Topic"] == topic_id]["pos_ratio"].to_list()[0])
df[df["Topic"] == topic_id]

Topic name:  61_peace_world_freedom_we
Examples:
World peace is important, and we have enhanced the peace.
And that's how best it is to keep the peace.
He talks about peace and I'm thankful for peace.
Yes ratio:  0.07142857142857142


Unnamed: 0,Sentence_id,Text,class_label,Topic
23,3187,"What is more moral than peace, and the United ...",No,61
42,26856,If we do those things we will build that bridg...,No,61
194,12467,I would like to unite this country to get an a...,No,61
225,12673,"And when we are strong and when we are first, ...",No,61
421,12516,How can we extend freedom - extend it without ...,No,61
...,...,...,...,...
23178,15426,And we must extend the benefits of our own com...,No,61
23202,15646,You have to learn from experience that making ...,No,61
23278,16410,"Working together, we can do wonderful things f...",No,61
23408,24016,If you look at the cost of not keeping the pea...,No,61


In [16]:
# Manually identify unspecific topics containing samples that should be members of more specific topics
# Apply "outlier" reduction to these topics with appropriate threshold
# Merge the remaining samples in the unspecific topics and outliers into a "unspecific" category

# Topics containing samples that should be members of more specific topics
topics_to_dissolve = [5, 7, 9, 15, 21, 25, 34, 37, 38, 39, 40, 54, 58, 59, 61]
# Topics that should be merged into an "unspecific" topic class
unspecific_topics = []

In [17]:
for tid in topics_to_dissolve:
    topics = reduce_topic(
        topic_model=topic_model,
        topic_id=tid,
        text_list=text_list,
        topics=topics,
        threshold=0.7,
        strategy="embeddings",
        embeddings=embeddings)



Reassigned 18 documents from "5_do_we_to_going" to 2 new topics




Reassigned 0 documents from "7_let_me_just_you" to 0 new topics




Reassigned 31 documents from "9_president_the_to_be" to 1 new topics




Reassigned 16 documents from "15_problem_issue_problems_issues" to 1 new topics




Reassigned 13 documents from "21_washington_congress_the_house" to 2 new topics




Reassigned 18 documents from "25_thank_jim_university_bob" to 1 new topics




Reassigned 32 documents from "34_safe_we_protect_security" to 1 new topics




Reassigned 39 documents from "37_record_look_records_at" to 1 new topics




Reassigned 19 documents from "38_years_four_ago_last" to 1 new topics




Reassigned 26 documents from "39_difference_different_opinion_differences" to 1 new topics




Reassigned 44 documents from "40_leadership_leader_lead_kind" to 2 new topics




Reassigned 15 documents from "54_programs_program_that_uh" to 1 new topics




Reassigned 29 documents from "58_plan_my_plans_you" to 2 new topics




Reassigned 10 documents from "59_test_testing_tests_litmus" to 1 new topics
Reassigned 348 documents from "61_peace_we_leadership_world" to 14 new topics


In [18]:
df["Topic"] = topics

In [19]:
topic_info = get_topic_info(topic_model=topic_model, text_list=text_list, dataframe=df)
topic_info
topic_info.sort_values(by=["Count"], ascending=False)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,No,Yes,pos_ratio
0,-1,3213,-1_you_he_it_was,"[you, he, it, was, and, the, of, they, that, t...","[You know, back in 1987, he took out a $100,00...",2701,512,0.159353
3,2,1407,2_economy_budget_spending_billion,"[economy, budget, spending, billion, deficit, ...","[I will pay down the national debt., When Pres...",769,638,0.453447
1,0,1353,0_health_care_social_medicare,"[health, care, social, medicare, insurance, se...",[I want to do something about Social Security....,822,531,0.392461
6,5,994,5_do_we_to_going,"[do, we, to, going, it, were, can, what, weve,...","[Now, there's more work to do., But we have so...",968,26,0.026157
2,1,981,1_tax_taxes_cut_for,"[tax, taxes, cut, for, cuts, the, increase, pe...","[This is a family tax cut., I have a tax cut.,...",527,454,0.462793
...,...,...,...,...,...,...,...,...
37,36,108,36_romney_governor_romneys_he,"[romney, governor, romneys, he, said, says, wh...","[Governor Romney, here's what we did., Governo...",64,44,0.407407
45,44,102,44_mccain_senator_mccains_john,"[mccain, senator, mccains, john, earmarks, cam...",[What Senator McCain doesn't mention is he's b...,77,25,0.245098
56,55,86,55_ford_mr_uh_has,"[ford, mr, uh, has, president, the, administra...","[Mr. Ford hasn't moved on this., I hope Mr. Fo...",45,41,0.476744
54,53,75,53_islands_formosa_quemoy_matsu,"[islands, formosa, quemoy, matsu, pescadores, ...",[But I do not believe that that line in case o...,41,34,0.453333


In [20]:
# Select which topics will be excluded
excluded_topics = [-1, 5, 9, 7, 24, 15, 33, 34, 38, 58, 40, 39, 61, 25, 48, 57, 45, 54, 37, 53, 59]
topic_info_selection = topic_info[~topic_info["Topic"].isin(excluded_topics)]
# Reduce dataset to selected topics
df_filtered = df[~df["Topic"].isin(excluded_topics)]
df_filtered["Topic_Name"] = df_filtered["Topic"].map(topic_info.set_index("Topic")["Name"])

topic_info_selection = topic_info_selection.sort_values(by="Count", ascending=False)
topic_info_selection

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,No,Yes,pos_ratio
3,2,1407,2_economy_budget_spending_billion,"[economy, budget, spending, billion, deficit, ...","[I will pay down the national debt., When Pres...",769,638,0.453447
1,0,1353,0_health_care_social_medicare,"[health, care, social, medicare, insurance, se...",[I want to do something about Social Security....,822,531,0.392461
2,1,981,1_tax_taxes_cut_for,"[tax, taxes, cut, for, cuts, the, increase, pe...","[This is a family tax cut., I have a tax cut.,...",527,454,0.462793
4,3,841,3_jobs_unemployment_million_small,"[jobs, unemployment, million, small, business,...",[We've had record numbers of new small busines...,519,322,0.382878
17,16,682,16_military_troops_our_war,"[military, troops, our, war, we, forces, to, a...","[Some of our troops are not well-equipped., Bu...",526,156,0.228739
5,4,611,4_education_schools_school_teachers,"[education, schools, school, teachers, college...",[So we've got to get our education system righ...,443,168,0.274959
15,14,607,14_senator_senate_republican_voted,"[senator, senate, republican, voted, obama, wa...","[Of course, we have to do some things tactical...",395,212,0.349259
7,6,479,6_energy_oil_coal_gas,"[energy, oil, coal, gas, production, we, our, ...",[I want to control our own energy by developin...,292,187,0.390397
20,19,429,19_government_federal_reform_the,"[government, federal, reform, the, of, to, we,...","[I'd like the government to have that., Here i...",357,72,0.167832
9,8,423,8_she_her_shes_hillary,"[she, her, shes, hillary, emails, and, was, wo...","[The problem is, you talk, but you don't get a...",322,101,0.238771


In [21]:
# Create two non-overlapping datasets
set1 = pd.DataFrame()
set2 = pd.DataFrame()
# Iterate size sorted topics in pairs
for t1, t2 in zip(topic_info_selection.iloc[::2].iterrows(), topic_info_selection.iloc[1::2].iterrows()):
    # Get samples corresponding to the topics
    t1_df = df_filtered[df_filtered["Topic"] == t1[1]["Topic"]]
    t2_df = df_filtered[df_filtered["Topic"] == t2[1]["Topic"]]

    # Concatenate the bigger topic with the smaller set and vice versa
    if len(set1) > len(set2):
        if len(t1_df) > len(t2_df):
            set1 = pd.concat([set1, t2_df])
            set2 = pd.concat([set2, t1_df])
        else:
            set1 = pd.concat([set1, t1_df])
            set2 = pd.concat([set2, t2_df])
    else:
        if len(t1_df) > len(t2_df):
            set1 = pd.concat([set1, t1_df])
            set2 = pd.concat([set2, t2_df])
        else:
            set1 = pd.concat([set1, t2_df])
            set2 = pd.concat([set2, t1_df])

print(len(set1))
print(len(set2))
print(len(set1[set1["class_label"] == "Yes"]) / len(set1))
print(len(set2[set2["class_label"] == "Yes"]) / len(set2))

7326
7313
0.33551733551733554
0.33365239983590866


In [22]:
# Adjustable split ratios
train_ratio = 0.7
dev_ratio = 0.1
test_ratio = 0.2

# Split set-wise
set_splits = []
for s in [set1, set2]:
    train_df, temp_df = train_test_split(s, train_size=train_ratio, random_state=42, shuffle=True)
    dev_df, test_df = train_test_split(temp_df, train_size=dev_ratio/(dev_ratio + test_ratio), random_state=42, shuffle=True)
    set_splits.append((train_df, dev_df, test_df))

In [23]:
#Save datasets
dest_path = "../data/CT24_checkworthy_english/topic_split"

for i, splits in enumerate(set_splits, start=1):
    train, dev, test = splits
    train.to_csv(os.path.join(dest_path, f"train_set_{i}.csv"), index=False)
    dev.to_csv(os.path.join(dest_path, f"dev_set_{i}.csv"), index=False)
    test.to_csv(os.path.join(dest_path, f"test_set_{i}.csv"), index=False)