Topic Modelling with BERTopic

###### Documentation: https://maartengr.github.io/BERTopic/

<div style="color: light grey;">

- **Description**: BERTopic is a topic modeling technique that leverages transformers and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.
- **Purpose**: It is used in this project to understand the diversity of topics and their distribution across articles. It can be run on either the title or the whole article. Transformer models ensure quality for both short-length and full-length texts.
- **Deployment**: The results are deployed for further Sentiment Analysis and News Summarization tasks.
- **Input**: The raw BBC news dataset for the category 'Politics'.

</div>

In [3]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
# Import libraries for Topic Modelling with BERTopic
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
#from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2
# local imports
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.query import bbc_news_politics
from src.preprocessing import summarize_long_articles, clean_text, drop_columns, filter_articles_by_length
# Setting secret credentials
from dotenv import load_dotenv #pip install python-dotenv
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anna_verbytska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use mps:0


False

In [4]:
# Fetch the data from BigQuery
df = bbc_news_politics()
df.head()

This query will process 5114621 bytes.


Unnamed: 0,body,title,filename,category
0,"The ""best person for the job"" should be appoin...",'Best person' for top legal job,bbc/politics/273.txt,politics
1,A cap on donations to political parties should...,'Debate needed' on donations cap,bbc/politics/059.txt,politics
2,A cap on donations to political parties should...,'Debate needed' on donations cap,bbc/politics/298.txt,politics
3,It could cost £80m to run a UK referendum on t...,'EU referendum could cost £80m',bbc/politics/391.txt,politics
4,The initial attempt to sell the Millennium Dom...,'Errors' doomed first Dome sale,bbc/politics/006.txt,politics


In [5]:
# Preprocess the data
df = clean_text(df)
df = drop_columns(df)
df = filter_articles_by_length(df) # 150-1000 words

In [6]:
# Initialize BART summarization pipeline
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
# Summarize long articles (> 1000 words) and replace the original text with the summary
df = summarize_long_articles(df)  

Device set to use mps:0


In [7]:
df.head()

Unnamed: 0,body,title,category,word_count
0,"The ""best person for the job"" should be appoin...",'Best person' for top legal job,politics,465
1,A cap on donations to political parties should...,'Debate needed' on donations cap,politics,549
2,A cap on donations to political parties should...,'Debate needed' on donations cap,politics,549
3,It could cost £80m to run a UK referendum on t...,'EU referendum could cost £80m',politics,194
4,The initial attempt to sell the Millennium Dom...,'Errors' doomed first Dome sale,politics,380


In [8]:
# Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L12-v2")

In [9]:
# Pre-compute embeddings
embeddings = embedding_model.encode(df['body'], show_progress_bar=False)

In [10]:
# Reduce dimensionality with UMAP
umap_model = UMAP(
    n_neighbors=10,  # capture more fine-grained relationships for a small dataset
    n_components=5,  # reduce dimensionality further for better visualization
    min_dist=0.0,  # keep clusters tight
    metric='cosine',  # best for text embeddings
    random_state=42
)

In [11]:
hdbscan_model = HDBSCAN(
    min_cluster_size=15,  # to allow more but smaller clusters
    min_samples=5,  # controls how "noise-tolerant" clustering is, detects smaller clusters
    metric='euclidean',  # distance metric
    cluster_selection_method='eom',  # extracts dense clusters
    prediction_data=True  # allows predicting cluster membership for new points
)

In [12]:
# Tokenize
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

In [13]:
# Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [14]:
# Set λ to 0.5 to get the optimal mix of diversity and accuracy in the result set
representation_model = MaximalMarginalRelevance(diversity=0.5, top_n_words=15)

In [15]:
# Set the parameters for the model
topic_model = BERTopic(
  embedding_model=embedding_model,            # extract embeddings
  umap_model=umap_model,                      # reduce dimensionality
  hdbscan_model=hdbscan_model,                # cluster reduced embeddings
  vectorizer_model=vectorizer_model,          # tokenize topics
  ctfidf_model=ctfidf_model,                  # extract topic words
  representation_model=representation_model,  # diversify topic words         
  nr_topics=None,                             # no forced merging of topics BUT nr_topics='auto' to merge similar ones
  min_topic_size=20,                          # filter out small, less meaningful topics
  verbose=True,
  top_n_words=15                              # number of words per topic
  )                            

In [16]:
# Initialize BERT model
topics, probabilities = topic_model.fit_transform(df['body'], embeddings)

2025-04-11 12:59:22,020 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-04-11 12:59:25,185 - BERTopic - Dimensionality - Completed ✓
2025-04-11 12:59:25,185 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-11 12:59:25,196 - BERTopic - Cluster - Completed ✓
2025-04-11 12:59:25,198 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-11 12:59:26,214 - BERTopic - Representation - Completed ✓


In [17]:
# Print the topics
freq = topic_model.get_topic_info()
print("Number of topics: {}".format( len(freq)))
freq.head(15)   

Number of topics: 12


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,136,-1_labour_new_minister_mr blunkett,"[labour, new, minister, mr blunkett, hunting, ...",[Michael Howard has denied his shadow cabinet ...
1,0,43,0_kennedy_students_education_party,"[kennedy, students, education, party, lib dems...",[The Liberal Democrats are attempting to woo f...
2,1,43,1_police_human rights_suspects_trial,"[police, human rights, suspects, trial, terror...",[UK citizens suspected of involvement in terro...
3,2,38,2_mr blair_prime minister_election_milburn,"[mr blair, prime minister, election, milburn, ...",[Gordon Brown has made an appeal for unity aft...
4,3,24,3_tory_spending_party_tax cuts,"[tory, spending, party, tax cuts, election, la...","[By any measure, New Yorker Dick Morris is tha..."
5,4,20,4_aid_africa_g8_world,"[aid, africa, g8, world, poverty, nations, tsu...",[Tony Blair has had the chance to tackle the p...
6,5,19,5_workers_unions_pensions_public sector,"[workers, unions, pensions, public sector, str...",[Talks aimed at averting national strikes over...
7,6,19,6_uk_tb_asylum seekers_borders,"[uk, tb, asylum seekers, borders, mr howard, q...",[The UK' opposition Conservatives have unveile...
8,7,18,7_straw_constitution_china_turkey,"[straw, constitution, china, turkey, embargo, ...",[French president Jacques Chirac has called th...
9,8,17,8_lord chancellor_courts_house lords_falconer,"[lord chancellor, courts, house lords, falcone...",[In a locked room at the heart of Parliament t...


**Note:**  
All documents labeled as `-1` are considered **outliers**. This means they either:  
1. Contain **complex or ambiguous content**, making it difficult to assign a single topic.  
2. Lack distinct features, preventing clear topic classification.  

##### 🔹 How to Handle These Outliers?  
- **Option 1 (Recommended for Now):** Exclude `-1` documents from summarization and analysis to ensure topic clarity.  
- **Option 2 (Alternative Approach):** Assign them to the closest topic using similarity-based methods (e.g., cosine similarity with topic centroids).  

For now, we will proceed with **Option 1**, focusing only on clearly defined topics.

In [18]:
# Print the keywords
a_topic = freq.iloc[9]["Topic"] # select the topic by index
topic_model.get_topic(a_topic) # Show the words and their c-TF-IDF scores   

[('lord chancellor', np.float64(0.3202256225816858)),
 ('courts', np.float64(0.30798862530066556)),
 ('house lords', np.float64(0.2892111087825098)),
 ('falconer', np.float64(0.288655220640543)),
 ('access', np.float64(0.277932748422755)),
 ('kinnock', np.float64(0.2706330777108257)),
 ('marry', np.float64(0.2639951859750712)),
 ('reform', np.float64(0.2579608624747341)),
 ('peers', np.float64(0.25067812763234604)),
 ('lord scarman', np.float64(0.24332070908542083)),
 ('mediation', np.float64(0.24332070908542083)),
 ('parker bowles', np.float64(0.23952216251629546)),
 ('fathers', np.float64(0.23305109245915656)),
 ('archives', np.float64(0.22760568212613455)),
 ('contact', np.float64(0.219016854601594))]

In [19]:
# Visualise the topics and their keywords
topic_model.visualize_barchart(n_words=10)

In [20]:
# Visualise clusters of topics
topic_model.visualize_topics()

In [21]:
# Visualise the topic hierarchy
topic_model.visualize_hierarchy(top_n_topics=11)

In [22]:

# Visualise a similarity matrix
topic_model.visualize_heatmap(top_n_topics=30)

In [23]:
# Create a new column filled with topics
df['topic'] = topics
df.head(5) 

Unnamed: 0,body,title,category,word_count,topic
0,"The ""best person for the job"" should be appoin...",'Best person' for top legal job,politics,465,8
1,A cap on donations to political parties should...,'Debate needed' on donations cap,politics,549,-1
2,A cap on donations to political parties should...,'Debate needed' on donations cap,politics,549,-1
3,It could cost £80m to run a UK referendum on t...,'EU referendum could cost £80m',politics,194,7
4,The initial attempt to sell the Millennium Dom...,'Errors' doomed first Dome sale,politics,380,-1


In [24]:
# Select topics that are semantically similar to an input query
similar_topics, similarity = topic_model.find_topics('migration', top_n=2)
similar_topics

[6, 9]