In [1]:
# IMPORTS
from bertopic import BERTopic
import warnings
import pandas as pd
import os
from sklearn.cluster import KMeans, AgglomerativeClustering, OPTICS, SpectralClustering
warnings.filterwarnings("ignore")

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
# Read the data and perform preprocessing

df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"]) # Read data into 'df' dataframe
print(df.shape) # Print dataframe shape

docs = df["summary"].tolist() # Create a list containing all article summaries

(18520, 5)


In [3]:
if os.path.exists('southsudan_model'):
    bertopic = BERTopic.load('southsudan_model')
else:
    bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True) # Initialize the BERTopic model

    bertopic.fit_transform(docs) # Fit the model to the list of article summaries
    bertopic.save("southsudan_model") # Save the trained model as "southsudan_model"
 
if os.path.exists('kmeans_model'):
    kmeans_model = BERTopic.load('kmeans_model')
else:
    cluster_model = KMeans(n_clusters=10)
    kmeans_model = BERTopic(language="english", calculate_probabilities=True, verbose=True, hdbscan_model=cluster_model) # Initialize the BERTopic model

    kmeans_model.fit_transform(docs) # Fit the model to the list of article summaries
    kmeans_model.save("kmeans_model") # Save the trained model 

if os.path.exists('agglomerative_model'):
    agglomerative_model = BERTopic.load('agglomerative_model')
else:
    cluster_model = AgglomerativeClustering(n_clusters=10)
    agglomerative_model = BERTopic(language="english", calculate_probabilities=True, verbose=True, hdbscan_model=cluster_model) # Initialize the BERTopic model

    agglomerative_model.fit_transform(docs) # Fit the model to the list of article summaries
    agglomerative_model.save("agglomerative_model") # Save the trained model 

if os.path.exists('optics_model'):
    optics_model = BERTopic.load('optics_model')
else:
    cluster_model = OPTICS(min_samples=5)  # Customize the OPTICS parameters as needed
    optics_model = BERTopic(language="english", calculate_probabilities=True, verbose=True, hdbscan_model=cluster_model)

    optics_model.fit_transform(docs)
    optics_model.save("optics_model")

if os.path.exists('spectral_model'):
    spectral_model = BERTopic.load('spectral_model')
else:
    cluster_model = SpectralClustering(n_clusters=10)  # Customize the Spectral Clustering parameters as needed
    spectral_model = BERTopic(language="english", calculate_probabilities=True, verbose=True, hdbscan_model=cluster_model)

    spectral_model.fit_transform(docs)
    spectral_model.save("spectral_model")
    
models = [kmeans_model, agglomerative_model, optics_model, bertopic]

In [4]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    
    if type(keywords) is str: keywords = [keywords] # If a single string is provided convert it to list type
    
    relevant_topics = list() # Initilize an empty list of relevant topics
    
    for keyword in keywords: # Iterate through list of keywords
        
        # Find the top n number of topics related to the current keyword(s)
        topics = bertopic_model.find_topics(keyword, top_n = top_n)
        
        # Add the topics to the list of relevant topics in the form of (topic_id, relevancy)
        relevant_topics.extend(
            zip(topics[0], topics[1]) # topics[0] = topic_id, topics[1] = relevancy
        )
    
    
    relevant_topics.sort(key=lambda x: x[1]) # Sort the list of topics on ASCENDING ORDER of relevancy
    
    # Get a list of the set of unique topics (with greates relevancy in case of duplicate topics)
    relevant_topics = list(dict(relevant_topics).items())
    
    
    relevant_topics.sort(key=lambda x: x[1], reverse=True) # Now sort the list of topics on DESCENDING ORDER of relevancy
    
    return relevant_topics[:10] # Return a list of the top_n unique relevant topics

In [5]:
keyword_sets = keyword_sets = [
    (['hunger', 'food insecurity', 'conflict'], 'hunger'),
    (['refugees', 'displaced'], 'refugees'),
    (['humanitarian'], 'humanitarian'),
    (['conflict', 'fighting', 'murder', 'military'], 'conflict'),
    (["politics", "government", "elections", "independence"], 'politics'),
    (['aid', 'assistance', 'relief'], 'aid')
]

In [6]:
for model in models:
    for keywords, label in keyword_sets:
        # Get the top 10 topics related to the current set of keywords
        relevant_topics = get_relevant_topics(bertopic_model=agglomerative_model, keywords=keywords, top_n=15)

        # Create a list of topic IDs
        topic_ids = [el[0] for el in relevant_topics]

        # # Print the relevant topics
        # print(f"Top 10 topics related to '{label}':")
        # for topic_id, relevancy in relevant_topics:
        #     print(topic_id, relevancy)

        # Add a boolean column to the 'df' DataFrame if the topic is in the list of relevant topics
        df[label] = [t in topic_ids for t in model.topics_]
    print(f"Model: {model.hdbscan_model}")
    print(len(df))
    print(len(df[(df["hunger"]==False) & (df["refugees"] == False) & (df["humanitarian"] == False) & (df["conflict"] == False)]))
    print(len(df[(df["hunger"]==False) & (df["refugees"] == False) & (df["humanitarian"] == False) & (df["conflict"] == False) & (df["politics"] == False) & (df["aid"] == False)]))
    print(20* "-")

Model: KMeans(n_clusters=10)
18520
0
0
--------------------
Model: AgglomerativeClustering(n_clusters=10)
18520
0
0
--------------------
Model: OPTICS()
18520
18177
18177
--------------------
Model: HDBSCAN(min_cluster_size=10, prediction_data=True)
18520
16224
16224
--------------------


In [7]:
df.columns

Index(['summary', 'date', 'location_article', 'lat', 'lng', 'hunger',
       'refugees', 'humanitarian', 'conflict', 'politics', 'aid'],
      dtype='object')

In [8]:
unsorted = df[(df["hunger"]==False) & (df["refugees"] == False) & (df["humanitarian"] == False) & (df["conflict"] == False) & (df["politics"] == False)]

In [9]:
#Refit models on unsorted data
if os.path.exists("refit_bertopic"):
    refit_bertopic = BERTopic.load("refit_bertopic")
else:
    refit_bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True)
    refit_bertopic.fit_transform(unsorted["summary"].tolist())
    refit_bertopic.save("refit_bertopic")


if os.path.exists('refit_kmeans'):
    refit_kmeans = BERTopic.load('refit_kmeans')
else:
    refit_kmeans = BERTopic(language="english", calculate_probabilities=True, verbose=True, hdbscan_model=KMeans(n_clusters=10))
    refit_kmeans.fit_transform(unsorted["summary"].tolist())
    refit_kmeans.save("refit_kmeans")

if os.path.exists('refit_agglomerative'):
    refit_agglomerative = BERTopic.load('refit_agglomerative')
else:
    refit_agglomerative = BERTopic(language="english", calculate_probabilities=True, verbose=True, hdbscan_model=AgglomerativeClustering(n_clusters=10))
    refit_agglomerative.fit_transform(unsorted["summary"].tolist())
    refit_agglomerative.save("refit_agglomerative")

if os.path.exists('refit_spectral'):
    refit_spectral = BERTopic.load('refit_spectral')
else:
    refit_spectral = BERTopic(language="english", calculate_probabilities=True, verbose=True, hdbscan_model=SpectralClustering(n_clusters=10))
    refit_spectral.fit_transform(unsorted["summary"].tolist())
    refit_spectral.save("refit_spectral")

if os.path.exists('refit_optics'):
    refit_optics = BERTopic.load('refit_optics')
else:
    refit_optics = BERTopic(language="english", calculate_probabilities=True, verbose=True, hdbscan_model=OPTICS(min_samples=5))
    refit_optics.fit_transform(unsorted["summary"].tolist())
    refit_optics.save("refit_optics")

refitted_models = [refit_kmeans, refit_agglomerative, refit_spectral, refit_optics, refit_bertopic]

Batches:   0%|          | 0/507 [00:00<?, ?it/s]

2023-09-25 14:37:14,986 - BERTopic - Transformed documents to Embeddings
2023-09-25 14:37:50,307 - BERTopic - Reduced dimensionality
  File "C:\Users\20210777\Anaconda3\envs\DC3\lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(
2023-09-25 14:37:51,307 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/507 [00:00<?, ?it/s]

2023-09-25 14:47:56,636 - BERTopic - Transformed documents to Embeddings
2023-09-25 14:48:06,334 - BERTopic - Reduced dimensionality
2023-09-25 14:48:16,429 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/507 [00:00<?, ?it/s]

2023-09-25 14:57:56,179 - BERTopic - Transformed documents to Embeddings
2023-09-25 14:58:05,640 - BERTopic - Reduced dimensionality
2023-09-25 14:59:20,853 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/507 [00:00<?, ?it/s]

2023-09-25 15:07:09,114 - BERTopic - Transformed documents to Embeddings
2023-09-25 15:07:16,095 - BERTopic - Reduced dimensionality
2023-09-25 15:07:36,613 - BERTopic - Clustered reduced embeddings


In [10]:
for model in refitted_models:
    for keywords, label in keyword_sets:
        # Get the top 10 topics related to the current set of keywords
        relevant_topics = get_relevant_topics(bertopic_model=model, keywords=keywords, top_n=10)
        
        # Create a list of topic IDs
        topic_ids = [el[0] for el in relevant_topics]
        
        # # Print the relevant topics
        # print(f"Top 10 topics related to '{label}':")
        # for topic_id, relevancy in relevant_topics:
        #     print(topic_id, relevancy)
        
        # Add a boolean column to 'unsorted' DataFrame if the topic is in the list of relevant topics
        unsorted[label] = [t in topic_ids for t in refit_bertopic.topics_]
    print(f"Model: {model.hdbscan_model}")
    print(len(unsorted))
    print(len(unsorted[(unsorted["hunger"]==False) & (unsorted["refugees"] == False) & (unsorted["humanitarian"] == False) & (unsorted["conflict"] == False)]))
    print(len(unsorted[(unsorted["hunger"]==False) & (unsorted["refugees"] == False) & (unsorted["humanitarian"] == False) & (unsorted["conflict"] == False) & (unsorted["politics"] == False) & (unsorted["aid"] == False)]))
    print(20* "-")

Model: KMeans(n_clusters=10)
16224
14656
14656
--------------------
Model: AgglomerativeClustering(n_clusters=10)
16224
14656
14656
--------------------
Model: SpectralClustering(n_clusters=10)
16224
14656
14656
--------------------
Model: OPTICS()
16224
15793
15585
--------------------
Model: HDBSCAN(min_cluster_size=10, prediction_data=True)
16224
14860
14274
--------------------


In [12]:
#Update the original dataframe with the new labels
df.update(unsorted)
unsorted2 = df[(df["hunger"]==False) & (df["refugees"] == False) & (df["humanitarian"] == False) & (df["conflict"] == False) & (df["politics"] == False)]

In [13]:
#Refit a second time
if os.path.exists("refit2_bertopic"):
    refit2_bertopic = BERTopic.load("refit2_bertopic")
else:
    refit2_bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True)
    refit2_bertopic.fit_transform(unsorted2["summary"].tolist())
    refit2_bertopic.save("refit2_bertopic")


Batches:   0%|          | 0/451 [00:00<?, ?it/s]

2023-09-25 15:24:40,424 - BERTopic - Transformed documents to Embeddings
2023-09-25 15:24:46,794 - BERTopic - Reduced dimensionality
2023-09-25 15:25:21,070 - BERTopic - Clustered reduced embeddings


In [16]:
for keywords, label in keyword_sets:
        # Get the top 10 topics related to the current set of keywords
        relevant_topics = get_relevant_topics(bertopic_model=refit2_bertopic, keywords=keywords, top_n=10)
        
        # Create a list of topic IDs
        topic_ids = [el[0] for el in relevant_topics]
        
        # # Print the relevant topics
        # print(f"Top 10 topics related to '{label}':")
        # for topic_id, relevancy in relevant_topics:
        #     print(topic_id, relevancy)
        
        # Add a boolean column to 'unsorted' DataFrame if the topic is in the list of relevant topics
        unsorted2[label] = [t in topic_ids for t in refit2_bertopic.topics_]
print(f"Model: {refit2_bertopic.hdbscan_model}")
print(len(unsorted2))
print(len(unsorted2[(unsorted2["hunger"]==False) & (unsorted2["refugees"] == False) & (unsorted2["humanitarian"] == False) & (unsorted2["conflict"] == False)]))
print(len(unsorted2[(unsorted2["hunger"]==False) & (unsorted2["refugees"] == False) & (unsorted2["humanitarian"] == False) & (unsorted2["conflict"] == False) & (unsorted2["politics"] == False) & (unsorted2["aid"] == False)]))
print(20* "-")

Model: HDBSCAN(min_cluster_size=10, prediction_data=True)
14412
12954
12545
--------------------


In [17]:
df.update(unsorted2)