### BERTopic Model

Load Dataset

In [1]:
import pandas as pd

# Path of the CSV file holding the articles used to fit the topic model
dataset_path = 'cnn_news_articles_final_downsampled_cleaned.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(dataset_path)

# The 'text' column, as a plain Python list, is what the model is trained on
texts = df['text'].to_list()

BERTopic Model

In [2]:
from bertopic import BERTopic

# Build a BERTopic model with its default configuration.
# NOTE(review): no random seed is set here, so topic assignments may differ
# between runs — confirm whether reproducibility matters for this analysis.
bertopic_model = BERTopic()

# Fit on the article texts; returns one topic id per document plus probabilities
topics, probabilities = bertopic_model.fit_transform(documents=texts)

  from .autonotebook import tqdm as notebook_tqdm


MLflow Server

In [3]:
import mlflow
from mlflow.tracking import MlflowClient

# Class name of the model (a plain string), used as the run-name prefix
model_name = type(bertopic_model).__name__

# Point MLflow at the local tracking server and select the experiment
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("topic-modelling")

# Client used to query the runs already recorded for this experiment
client = MlflowClient()
experiment = client.get_experiment_by_name("topic-modelling")
experiment_id = experiment.experiment_id
runs = client.search_runs(experiment_ids=[experiment_id])

# Highest "<model_name>_v<N>" version seen so far; stays 0 when none exist
max_version = 0
for run in runs:
    run_name = run.data.tags.get('mlflow.runName')

    # Ignore unnamed runs and runs that belong to a different model
    if not run_name or not run_name.startswith(model_name):
        continue

    # Everything after the last '_v' is expected to be the version number
    suffix = run_name.rpartition('_v')[2]
    try:
        version = int(suffix)
    except ValueError:
        continue  # run name does not end in an integer version

    if version > max_version:
        max_version = version

# The new run gets the next free version number
new_version = max_version + 1
new_run_name = f"{model_name}_v{new_version}"
mlflow.start_run(run_name=new_run_name)

# Record what was trained and on which data
mlflow.log_param("model name", model_name)
mlflow.log_param("dataset_name", dataset_path)
mlflow.log_param("data size", df.shape)


(9570, 2)

Evaluate by Coherence Scores

In [4]:
import gensim
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import word_tokenize

# Tokenize every document (lower-cased) so gensim can build its word statistics
tokenized_texts = [word_tokenize(text.lower()) for text in texts]

# create a dictionary and bag-of-words corpus from the tokenized documents
dictionary = gensim.corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

topic_words = bertopic_model.get_topics()

# format the topics into a list of word lists for the coherence calculation
# (the per-word score in each (word, score) pair is dropped).
# NOTE(review): every key of topic_words is scored; if that includes BERTopic's
# outlier topic (-1) it contributes to the averages — confirm this is intended.
formatted_topics = [[word for word, _ in topic_words[topic_num]] for topic_num in topic_words]

# calculate c_v
coherence_model_cv = CoherenceModel(topics=formatted_topics, texts=tokenized_texts, dictionary=dictionary, coherence='c_v')
cv_score = coherence_model_cv.get_coherence()

# calculate u_mass (computed from the bag-of-words corpus rather than raw texts)
coherence_model_umass = CoherenceModel(topics=formatted_topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
umass_score = coherence_model_umass.get_coherence()

# calculate NPMI
coherence_model = CoherenceModel(topics=formatted_topics, texts=tokenized_texts, dictionary=dictionary, coherence='c_npmi')
npmi_score = coherence_model.get_coherence()

print(f"c_v Score: {cv_score}")
print(f"u_mass Score: {umass_score}")
print(f"NPMI Score: {npmi_score}")

# Attach the scores to the currently active MLflow run
mlflow.log_metric("c_v coherence", cv_score)
mlflow.log_metric("u_mass coherence", umass_score)
mlflow.log_metric("NPMI coherence", npmi_score)

# These are the last values logged for this run, so close it explicitly.
# Leaving it open keeps the run in RUNNING state and makes a later
# mlflow.start_run() call fail because a run is already active.
mlflow.end_run()

c_v Score: 0.6862945390942535
u_mass Score: -2.3836138897041694
NPMI Score: 0.14719946215949392


Select Top 10 topics

In [5]:
# Select up to 10 topic ids to visualize.
# The outlier topic (-1) is skipped and only ids that actually exist in
# topic_words are used, so the selection stays valid when the model found
# fewer than 10 topics. Sorting ascending keeps the lowest ids first, which
# in the topic table shown in this notebook are the most frequent topics.
top_topics = sorted(topic_id for topic_id in topic_words if topic_id != -1)[:10]

# Topic numbers passed to the visualizations (same ids as top_topics)
top_topic_nums = list(top_topics)

In [6]:
# Display the per-topic summary table (Topic, Count, Name, Representation,
# Representative_Docs); the row with Topic == -1 appears first in the output.
bertopic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2266,-1_crisis_said_photo_russia,"[crisis, said, photo, russia, hide, caption, y...",[title cover cass sun stein book normal printe...
1,0,564,0_film_movie_series_actor,"[film, movie, series, actor, lost, character, ...",[touching tale across social medium following ...
2,1,406,1_vaccine_pandemic_virus_health,"[vaccine, pandemic, virus, health, mask, varia...",[food drug administration gave green light vac...
3,2,209,2_exit_minister_prime_party,"[exit, minister, prime, party, parliament, gov...",[may spending weekend trying plot way exit sta...
4,3,186,3_league_club_goal_football,"[league, club, goal, football, champion, best,...",[story highlight crush real first leg champion...
...,...,...,...,...,...
192,191,11,191_war_lynch_moment_life,"[war, lynch, moment, life, cull, invasion, hus...",[evening news site bath candle young woman go ...
193,192,10,192_drone_aircraft_military_pilot,"[drone, aircraft, military, pilot, intercept, ...",[minute international black sea morning local ...
194,193,10,193_autopsy_theater_wound_massacre,"[autopsy, theater, wound, massacre, colorado, ...",[family environmental activist law enforcement...
195,194,10,194_russia_missile_attack_military,"[russia, missile, attack, military, war, strik...",[russia full scale invasion approach one year ...


In [7]:
# Render BERTopic's hierarchy visualization for the fitted topics
bertopic_model.visualize_hierarchy()

In [8]:
# Render BERTopic's bar chart visualization for the topic ids in top_topic_nums
bertopic_model.visualize_barchart(top_topic_nums)

Save Model

In [11]:
import joblib

# Persist the fitted BERTopic model to disk with joblib so it can be reloaded later
joblib.dump(value=bertopic_model, filename='berTopic.pkl')

# Confirmation message once the file has been written
print('BERTopic model saved')

BERTopic model saved


Plot Topics over time

Condition

The dataset used to train the model should ideally include a date column.

When the saved model is applied to a new dataset that has a date column, the rows of that dataset must line up one-to-one (same documents, same order) with the data the model was trained on.

For example, if the BERTopic model was trained on 1,000 rows, applying it to a new dataset with 1,500 rows will not work; the index must be the same.

In [10]:
import pandas as pd
import joblib

# Load the dataset that still carries the publication date and draw the same
# number of rows (9570) the model was trained on, with a fixed seed.
# NOTE(review): this assumes the sampled rows are the same documents, in the
# same order, as the training file — confirm how the downsampled CSV was made.
df = pd.read_csv('cnn_news_articles_final_cleaned.csv')
df = df.sample(n=9570, random_state=42)

# Parse the publication date so it can be binned over time
df['date published'] = pd.to_datetime(df['date published'])

# extract timestamps and texts
timestamps = df['date published'].tolist()
texts = df['text'].tolist()

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that were produced by this notebook / a trusted source.
bertopic_model = joblib.load('berTopic.pkl')
print('BERTopic model loaded')

# topics_over_time pairs each document with the topic assigned during fitting,
# so the number of documents must equal the number of stored assignments.
if len(texts) != len(bertopic_model.topics_):
    raise ValueError(
        f"Expected {len(bertopic_model.topics_)} documents to match the fitted "
        f"model, but got {len(texts)}."
    )

# Compute the topic representation per time bin once. The original cell ran
# this twice; the first result was overwritten immediately and the tuning
# flags below are already the library defaults, so the extra call was wasted.
topics_over_time = bertopic_model.topics_over_time(docs=texts,
                                                    timestamps=timestamps,
                                                    nr_bins=20,
                                                    global_tuning=True,
                                                    evolution_tuning=True)

# Plot how the 20 most frequent topics evolve across the time bins
bertopic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)


BERTopic model loaded


#### Performance

c_v Score    = 0.69

u_mass Score = -2.38

NPMI Score   = 0.15