# Topic modelling using BERTopic

## Libraries/data required

In [1]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os

# helper functions
from helper_functions import get_relevant_topics

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
2023-10-12 16:08:36.590725: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-12 16:08:36.590761: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-12 16:08:36.590794: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-12 16:08:36.599265: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compil

In [2]:
# Read the data and perform preprocessing

df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"]) # Read data into 'df' dataframe
print(df.shape) # Print dataframe shape

docs = df["summary"].tolist() # Create a list containing all article summaries

df.head() # Show first 5 dataframe entries

(18520, 5)


Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125


## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [3]:
if os.path.exists('southsudan_model'):
    bertopic = BERTopic.load('southsudan_model')
else:
    bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True) # Initialize the BERTopic model

    bertopic.fit_transform(docs) # Fit the model to the list of article summaries
    print("dupa")
    bertopic.save("southsudan_model") # Save the trained model as "southsudan_model"

## Interactive visualization of the vector space

As you can see, documents with related topics are close in the space.

In [4]:
# bertopic.visualize_documents(docs) # Create a plot of the topics, this may take a while

### Creating smaller topics

Within our list of topics, we find topics that are semantically closest to 4 keywords:

"Hunger", "Refugees", "Conflict", and "Humanitarian".

**Feel free to change this approach!**

In [5]:
# Get the top 10 topics related to the keywords 'hunger' and 'food insecurity'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['hunger',
                                                                           'food insecurity',
                                                                           'famine',
                                                                           'malnutrition'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["hunger"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

121 0.5489128
7 0.51720685
171 0.4492692
63 0.4309224
25 0.3723547
14 0.36774674
172 0.35742354
97 0.33924705
31 0.33701497
45 0.32990852


ValueError: Length of values (18226) does not match length of index (18520)

In [None]:
bertopic.get_topic_info()["Name"][3]

'2_education_students_university_school'

In [None]:
# Get the top 10 topics related to the keywords 'refugees' and 'displaced'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['refugees', 'displaced'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["refugees"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

33 0.63444126
143 0.6034603
182 0.5878446
204 0.5816232
39 0.5686121
189 0.5625057
55 0.5625042
95 0.5536663
184 0.5500035
96 0.531736


ValueError: Length of values (18226) does not match length of index (18520)

In [None]:
# Get the top 10 topics related to the keyword 'humanitarian'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['humanitarian'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["humanitarian"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

31 0.6197233
222 0.61494356
84 0.59816206
201 0.5787014
220 0.55819273
161 0.54312235
198 0.5380315
12 0.53338945
209 0.52747047
88 0.5249189


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31,89,31_humanitarian_million_aid_assistance,"[humanitarian, million, aid, assistance, fundi...",[International donors today pledged more than ...
222,10,222_humanitarian_eu_million_european,"[humanitarian, eu, million, european, aid, mll...",[Brussels — EU Commissioner for Humanitarian A...
84,38,84_workers_aid_humanitarian_killed,"[workers, aid, humanitarian, killed, worker, a...","[Juba, South Sudan — Six staff members of a na..."
201,13,201_humanitarian_returns_displaced_violence,"[humanitarian, returns, displaced, violence, c...",[Juba — Aid workers warn that while psychosoci...
220,10,220_red_cross_ifrc_crescent,"[red, cross, ifrc, crescent, volunteers, icrc,...",[The South Sudan Red Cross (SSRC) has today be...
161,19,161_oxfam_oxfams_aid_humanitarian,"[oxfam, oxfams, aid, humanitarian, clean, wate...",[A new wave of violence in Darfur has forced m...
198,13,198_maban_aid_workers_bunj,"[maban, aid, workers, bunj, mabaan, mabanese, ...","[Addis Ababa — Early this week, a militia grou..."
12,149,12_unmiss_un_civilians_mission,"[unmiss, un, civilians, mission, peacekeepers,...",[Juba — The United Nations Mission in South Su...
209,12,209_ngos_aid_workers_bill,"[ngos, aid, workers, bill, permit, work, ngo, ...",[Foreign aid groups are facing pressure in sev...
88,37,88_darfur_unamid_dabanga_radio,"[darfur, unamid, dabanga, radio, camp, west, d...","[The failure of the UN/African Union (""hybrid""..."


In [None]:
# Get the top 10 topics related to the keywords 'conflict', 'fighting', and 'murder'
relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=['conflict', 'fighting', 'murder'], top_n=10)

topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
    print(topic_id, relevancy)
    
df["conflict"] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

# View the Count, Name, Representation, and Representative Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

217 0.3455478
216 0.3360504
128 0.33460623
205 0.33157223
169 0.32353568
158 0.31698716
200 0.31670505
87 0.31475306
137 0.31329644
0 0.30282426


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
217,11,217_dinka_is_african_equatorians,"[dinka, is, african, equatorians, nuer, for, t...",[The Big Bang of South Sudan is on since Sunda...
216,11,216_society_civil_civilsociety_games,"[society, civil, civilsociety, games, process,...",[The inclusion of civil society in efforts to ...
128,25,128_penalty_death_executions_moratorium,"[penalty, death, executions, moratorium, legal...",[Juba — South Sudan should join the great majo...
205,12,205_peace_splma_political_dividends,"[peace, splma, political, dividends, is, these...",[It seems the deadlock in the negotiation over...
169,17,169_troika_rarcss_parties_transitional,"[troika, rarcss, parties, transitional, peace,...",[The following statement was issued jointly by...
158,19,158_journalists_cpj_journalist_killed,"[journalists, cpj, journalist, killed, alzinta...",[Somalia continues to stand out as the African...
200,13,200_mbugua_gredo_kimani_ngarama,"[mbugua, gredo, kimani, ngarama, she, her, ken...",[Monica Kimani's father Paul Ngarama has dismi...
87,37,87_district_moyo_lamwo_border,"[district, moyo, lamwo, border, were, police, ...",[A suspected South Sudan rebel group operating...
137,23,137_toner_question_lyman_you,"[toner, question, lyman, you, kordofan, we, nu...","[Washington, DC — Excerpts from the United Sta..."
0,931,0_igad_machar_peace_agreement,"[igad, machar, peace, agreement, talks, partie...","[From the very outset, I want to lay a renunci..."


In [None]:
new_keywords_added = ["summary", "hunger", "refugees", "humanitarian"]

In [None]:
keywords_list_extensive = ['Governance',
                           'Diplomacy',
                           'Conflict',
                           'Crisis',
                           'Security',
                           'Society',
                           'Health',
                           'Development',
                           'Education',
                           'Survival',
                           'International',
                           'Opposition',
                           'Welfare',
                           'Media',
                           'Leadership']

In [None]:
# Define the list of keywords
keywords_list_extensive = ['Governance', 'Diplomacy', 'Conflict', 'Crisis', 'Security', 'Society', 'Health', 'Development', 'Education', 'Survival', 'International', 'Opposition', 'Welfare', 'Media', 'Leadership']

# Initialize an empty dictionary to store the results for each keyword
keyword_results = {}

for keyword in keywords_list_extensive:
    # Get relevant topics for the current keyword
    relevant_topics = get_relevant_topics(bertopic_model=bertopic, keywords=[keyword], top_n=10)
    
    # Extract topic IDs
    topic_ids = [el[0] for el in relevant_topics]
    
    # Add a boolean column to df for the current keyword
    df[keyword] = [t in topic_ids for t in bertopic.topics_]
    
    # Store relevant topic information in the dictionary
    keyword_results[keyword] = bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

# Print the results for each keyword
for keyword, result_df in keyword_results.items():
    print(f"Keyword: {keyword}")
    print(result_df)
    print("\n")

Keyword: Governance
       Count                                               Name  \
Topic                                                             
54        51                54_federal_system_federalism_kokora   
216       11               216_society_civil_civilsociety_games   
169       17             169_troika_rarcss_parties_transitional   
195       13               195_child_children_rights_convention   
90        34  90_conference_development_international_engage...   
140       22  140_constitution_commission_society_constituti...   
192       14         192_elections_constitution_bill_commission   
40        67                         40_splm_party_political_it   
15       138                     15_au_commission_african_union   
114       29             114_rights_human_violations_commission   

                                          Representation  \
Topic                                                      
54     [federal, system, federalism, kokora, powers, ..

In [None]:
final_keywords= new_keywords_added + keywords_list_extensive

In [None]:
df

In [None]:
original_df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"])


# Combine article summaries with the newly created features
df = original_df.merge(
    df[final_keywords],
    how="left",
    left_on="summary",
    right_on="summary",
)

df.to_csv("data/articles_topics.csv", index=False) # Save DataFrame to articles_topics.csv

KeyError: "['summary'] not in index"

In [None]:
# original_df = pd.read_csv("data/all_africa_southsudan.csv", parse_dates=["date"])

# Combine article summaries with the newly created features
# df = original_df.merge(
#     df[final_keywords],
#     how="left",
#     left_on="paragraphs",
#     right_on="paragraphs",
# )

# df.to_csv("data/articles_topics.csv", index=False) # Save DataFrame to articles_topics.csv

In [None]:
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18226 entries, 0 to 18225
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       18226 non-null  object        
 1   date        18226 non-null  datetime64[ns]
 2   paragraphs  18226 non-null  object        
 3   publisher   18226 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 569.7+ KB


In [None]:
print(len(df))
print(len(df[(df["hunger"]==False) & (df["refugees"] == False) & (df["humanitarian"] == False) & (df["conflict"] == False)]))

18226
15752


There are a lot of articles that do not get sorted into either of the categories. So, feel free to change or expand this approach!