## Semi-Supervised Topic Modelling with BERTopic

##### 

###### https://github.com/MaartenGr/BERTopic/issues/292

In [1]:
import pandas as pd
# Import libraries for visualisation
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Import libraries for Topic Modelling with BERTopic
# Documentation on BERTopic https://maartengr.github.io/BERTopic/
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Convert data into a DataFrame
df = pd.read_csv("../data/Euronews.csv")

In [None]:
df.head(5)

In [15]:
# Extract embeddings using a model all-MiniLM-L6-v2
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [16]:
# Pre-compute embeddings
embeddings = embedding_model.encode(df['text'], show_progress_bar=False)

In [17]:
# Reduce dimensionality (the distance between words in an embedding space that are meaningful)
# NOTE: n_components=5 for Euronews, n_components=10 for Kyivpost
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='euclidean', random_state=42)

In [18]:
# Cluster reduced embeddings
#NOTE: min_cluster_size=10 for Euronews, min_cluster_size=5 for Kyivpost 
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [19]:
# Tokenize
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
#X = vectorizer_model.fit_transform(df_eu['text'])

In [20]:
#Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [27]:
# Set λ to 0.5 to get the optimal mix of diversity and accuracy in the result set
representation_model = MaximalMarginalRelevance(diversity=0.5, top_n_words=30)

In [28]:
# Set the parameters for the model
# Increase the min_topic_size parameter to filter out small, less meaningful topics.
topic_model = BERTopic(
  embedding_model=embedding_model,            # Step 1 - Extract embeddings
  umap_model=umap_model,                      # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
  representation_model=representation_model,  # Step 6 - Diversify topic words         
  nr_topics=None,
  min_topic_size=1, 
  verbose=True,
  top_n_words=30                        
)

In [29]:
# Initialize BERT model
topics, probabilities = topic_model.fit_transform(df['text'], embeddings)

2025-07-22 16:50:52,895 - BERTopic - Reduced dimensionality
2025-07-22 16:50:52,904 - BERTopic - Clustered reduced embeddings


In [30]:
# Reduce outliers using the `embeddings` strategy
# Documentation https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html#strategies
new_topics = topic_model.reduce_outliers(df['text'], topics)

100%|██████████| 1/1 [00:00<00:00,  1.72it/s]


In [31]:
# Update topic representation
topic_model.update_topics(df['text'], topics=new_topics, vectorizer_model=vectorizer_model, top_n_words=100)

In [32]:
# Print the topics
freq = topic_model.get_topic_info()
print("Number of topics: {}".format( len(freq)))
freq.head(20)

Number of topics: 11


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,160,0_ukraine_russian_said_russia,"[ukraine, russian, said, russia, ukrainian, wa...","[1. Two dead after Russian strike on Kherson, ..."
1,1,54,1_refugees_said_people_italy,"[refugees, said, people, italy, frontex, ukrai...","[Italy's newly appointed prime minister, Giorg..."
2,2,49,2_said_bank_inflation_ukraine,"[said, bank, inflation, ukraine, prices, 000, ...",[You paid how much for that? Estonians have se...
3,3,76,3_russia_ukraine_said_russian,"[russia, ukraine, said, russian, nato, putin, ...","[Before his conversation with Biden, the Russi..."
4,4,37,4_energy_gas_europe_wind,"[energy, gas, europe, wind, eu, climate, price...",[Energy costs for households across Europe nea...
5,5,41,5_ukraine_said_nato_defence,"[ukraine, said, nato, defence, tanks, military...",[WARSAW/KYIV -Poland said on Monday it would a...
6,6,32,6_grain_ukraine_said_food,"[grain, ukraine, said, food, sea, tonnes, turk...",[As the United Nations tries to broker a path ...
7,7,40,7_eu_european_ukraine_kosovo,"[eu, european, ukraine, kosovo, moldova, war, ...",[The opinions expressed in this article are th...
8,8,32,8_ukraine_zelenskyy_johnson_said,"[ukraine, zelenskyy, johnson, said, ukrainian,...",[By Tom Balmforth and Pavel Polityuk\nKYIV – U...
9,9,23,9_ukraine_music_russian_eurovision,"[ukraine, music, russian, eurovision, russia, ...",[Ukrainian artists and illustrators have been ...


In [33]:
# Print the keywords
a_topic = freq.iloc[1]["Topic"] # Select the 1st topic
topic_model.get_topic(a_topic) # Show the words and their c-TF-IDF scores

[('refugees', 0.01742854538901152),
 ('said', 0.016439236840459972),
 ('people', 0.015191293977435358),
 ('italy', 0.013672750444110275),
 ('frontex', 0.012079660498027223),
 ('ukrainian', 0.012075643631754396),
 ('ukraine', 0.011325105980166306),
 ('million', 0.01076337334443668),
 ('countries', 0.010323808772068772),
 ('country', 0.010166095444172523),
 ('russian', 0.008742189326914323),
 ('border', 0.008654847988771181),
 ('travel', 0.008423416098306291),
 ('year', 0.008315886469430914),
 ('invasion', 0.0082185530071432),
 ('migration', 0.00811171711476733),
 ('government', 0.008110503985212732),
 ('europe', 0.008095419239356094),
 ('eu', 0.007982944783061068),
 ('ukrainians', 0.007858075038218818),
 ('war', 0.007628662234601453),
 ('restrictions', 0.007551886462237747),
 ('poland', 0.007481309278429326),
 ('000', 0.007250689078239809),
 ('covid', 0.007183257364146936),
 ('ukrainian refugees', 0.007089132231396033),
 ('european', 0.007074582162458453),
 ('far', 0.006967962499203704)

In [35]:
#Visualise the topics and their keywords
topic_model.visualize_barchart(n_words=10)

In [None]:
# Visualise clusters of topics
topic_model.visualize_topics() # too small a num ot topics to visualise

In [None]:
# Visualise the topic hierarchy
topic_model.visualize_hierarchy(top_n_topics=13)

In [None]:
# Approximate the topic distribution for documents
topic_distr, _ = topic_model.approximate_distribution(df['text'])
topic_model.visualize_distribution(topic_distr[2])

100%|██████████| 1/1 [00:01<00:00,  1.49s/it]


In [None]:
# Select 3 topics that are semantically similar to an input query
similar_topics, similarity = topic_model.find_topics("displacement", top_n = 4)
similar_topics

[4, 2, 6, 1]

In [None]:
# Find and display the most similar topic to a given topic
most_similar = similar_topics[3]
print("Most Similar Topic Info: \n{}".format(topic_model.get_topic(most_similar)))
print("Similarity Score: {}".format(similarity[0]))

Most Similar Topic Info: 
[('refugees', 0.01742854538901152), ('said', 0.016439236840459972), ('people', 0.015191293977435358), ('italy', 0.013672750444110275), ('frontex', 0.012079660498027223), ('ukrainian', 0.012075643631754396), ('ukraine', 0.011325105980166306), ('million', 0.01076337334443668), ('countries', 0.010323808772068772), ('country', 0.010166095444172523), ('russian', 0.008742189326914323), ('border', 0.008654847988771181), ('travel', 0.008423416098306291), ('year', 0.008315886469430914), ('invasion', 0.0082185530071432), ('migration', 0.00811171711476733), ('government', 0.008110503985212732), ('europe', 0.008095419239356094), ('eu', 0.007982944783061068), ('ukrainians', 0.007858075038218818), ('war', 0.007628662234601453), ('restrictions', 0.007551886462237747), ('poland', 0.007481309278429326), ('000', 0.007250689078239809), ('covid', 0.007183257364146936), ('ukrainian refugees', 0.007089132231396033), ('european', 0.007074582162458453), ('far', 0.006967962499203704),

In [None]:
# Create a new DataFrame with a topic column
df_BERTopics = pd.DataFrame({"text": df['text'], "Topic_BERTopic": new_topics})
df_BERTopics.head(5) 

In [None]:
# Merge the two DataFrames on the index
df_BERTopics = pd.merge(df, df_BERTopics, on='text', how='inner')
df_BERTopics.head(5)

In [None]:
df_BERTopics.Topic_BERTopic.value_counts()

Topic_BERTopic
0     160
3      76
1      54
2      49
5      41
7      40
4      37
8      32
6      32
9      23
10     18
Name: count, dtype: int64