Topic Modelling Using LSA

In [None]:
import pandas as pd

# Read the CSV named by dataset_path into a DataFrame.
dataset_path = 'balanced_category.csv'
df = pd.read_csv(dataset_path)
# df = df[['text','category']]

# Report how many rows fall under each category label.
category_counts = df['category'].value_counts()
print(category_counts)

Feature Extraction (TF-IDF and BoW)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the text column and build the
# document-term matrix in a single pass.
tfidf_vectorizer = TfidfVectorizer()
tfidf_doc_term_matrix = tfidf_vectorizer.fit_transform(raw_documents=df['text'])

In [None]:
# Show the dimensions of the TF-IDF document-term matrix.
tfidf_doc_term_matrix.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words (raw term count) vectorizer on the text column and
# build the document-term matrix in a single pass.
bow_vectorizer = CountVectorizer()
bow_doc_term_matrix = bow_vectorizer.fit_transform(raw_documents=df['text'])


In [None]:
# Show the dimensions of the bag-of-words document-term matrix.
bow_doc_term_matrix.shape

LSA Model

In [None]:
# Patch scikit-learn through sklearnex before importing the estimators below.
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

# Number of latent topics (SVD components) to extract.
n_topics = 10

# Fit the LSA model on the bag-of-words matrix (fit returns the estimator),
# then project the same documents into the topic space.
lsa = TruncatedSVD(
    algorithm='randomized',
    n_components=n_topics,
    random_state=42,
).fit(bow_doc_term_matrix)
tsvd_mat = lsa.transform(bow_doc_term_matrix)

In [None]:
# Reduce the LSA document embeddings to two dimensions with t-SNE so they
# can be drawn on a scatter plot.
tsne_lsa = TSNE(
    random_state=42,
    n_components=2,
)
tsne_lsa_mat = tsne_lsa.fit_transform(tsvd_mat)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Base colours for the category hue.
bright_colors = ["#FF6347", "#4682B4", "#32CD32", "#FFD700", "#FF69B4"]

# Size the palette to the number of categories actually present. seaborn
# expects one colour per hue level, so a fixed five-colour list only fits a
# five-category dataset. With n_colors set, color_palette truncates or cycles
# the base colours as needed (the result is unchanged for five categories).
n_categories = df['category'].nunique()
bright_palette = sns.color_palette(bright_colors, n_colors=n_categories)

# Scatter the 2-D t-SNE embedding of the LSA document vectors, coloured by category.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x=tsne_lsa_mat[:, 0],
    y=tsne_lsa_mat[:, 1],
    hue=df['category'],
    palette=bright_palette,
    ax=ax,
)
ax.set_title('t-SNE visualization of LSA topics')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
plt.show()


MLflow Server

In [None]:
import mlflow
from mlflow.tracking import MlflowClient

# Class names of the fitted model and vectorizer, logged as run parameters.
model_name = lsa.__class__.__name__
vectorizeer_name = bow_vectorizer.__class__.__name__   #can CHANGE

# Point MLflow at the local tracking server and select the experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("topic-modelling")

# Look up the existing runs of the experiment through the client API.
client = MlflowClient()
experiment_id = client.get_experiment_by_name("topic-modelling").experiment_id
runs = client.search_runs(experiment_ids=[experiment_id])

# Highest version already used by a run named "<model_name>..._v<N>"; 0 if none.
max_version = 0
for run in runs:
    run_name = run.data.tags.get('mlflow.runName')
    if not run_name or not run_name.startswith(model_name):
        continue
    try:
        # The version is the integer after the last '_v' in the run name.
        version = int(run_name.split('_v')[-1])
    except ValueError:
        continue  # run name does not end in an integer version
    max_version = max(max_version, version)

# The new run gets the next free version number.
new_version = max_version + 1
new_run_name = f"{model_name}_v{new_version}"

# Re-executing this cell would otherwise fail in start_run, because the run
# opened by the previous execution is still active. Close it first.
if mlflow.active_run() is not None:
    mlflow.end_run()
mlflow.start_run(run_name=new_run_name)

mlflow.log_param("model name", model_name)
mlflow.log_param("vectorizer name", vectorizeer_name)
mlflow.log_param("dataset_name", dataset_path)
mlflow.log_param("data size", df.shape)


Word Cloud for 10 topics

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

feature_names = bow_vectorizer.get_feature_names_out()   # can CHANGE

# Draw one word cloud per LSA component.
for i, topic in enumerate(lsa.components_):
    # LSA loadings can be zero or negative, but WordCloud interprets the
    # values as frequencies and scales word sizes relative to them. Keep only
    # the positively weighted terms so signed loadings cannot distort sizing.
    weights = {
        word: float(score)
        for word, score in zip(feature_names, topic)
        if score > 0
    }
    if not weights:
        # fit_words needs at least one word; nothing sensible to draw here.
        print(f'Topic {i}: no positively weighted terms, skipping word cloud')
        continue

    wordcloud = WordCloud(width=800, height=400, background_color='white').fit_words(weights)

    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    # 0-based label, the same numbering as the component index used for the
    # per-topic bar chart titles and the coherence plots.
    plt.title(f'Topic {i}')
    plt.show()


Top Word Scores per Topic

In [None]:
import numpy as np

# Derive the layout from the fitted model instead of assuming ten topics.
topic_matrix = lsa.components_
n_plotted = len(topic_matrix)

# One tab10 colour per topic, cycling when there are more than ten topics.
colors = plt.cm.tab10(np.arange(n_plotted) % 10)

# Grid with up to five panels per row (2 x 5 at 15 x 6 inches for ten topics).
n_cols = min(5, n_plotted)
n_rows = -(-n_plotted // n_cols)  # ceiling division
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(3 * n_cols, 3 * n_rows),
    sharex=False,
    sharey=False,
    squeeze=False,  # always a 2-D array, even for a single row or column
)
axes = axes.flatten()

for i, (topic, color) in enumerate(zip(topic_matrix, colors)):
    # Indices of the ten highest-weighted terms, strongest first.
    top_words_idx = np.argsort(topic)[::-1][:10]
    top_words = feature_names[top_words_idx]
    top_scores = topic[top_words_idx]

    ax = axes[i]
    ax.barh(top_words, top_scores, color=color)
    ax.set_title(f'Topic {i}', fontsize=12, fontweight='bold')
    ax.invert_yaxis()  # put the strongest term at the top of the panel
    ax.tick_params(axis='both', which='major', labelsize=10)
    for spine in ax.spines.values():
        spine.set_visible(False)

# Hide panels left over when the topic count does not fill the grid.
for unused_ax in axes[n_plotted:]:
    unused_ax.set_visible(False)

fig.suptitle('Top Words per Topic', fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # leave room for the suptitle
plt.show()


Using Gensim to Calculate Coherence Scores and Visualize the Intertopic Distance Map

In [None]:
from gensim.corpora.dictionary import Dictionary

# Tokenize with the same analyzer the BoW vectorizer applies, so that the
# terms in its vocabulary are also tokens of the Gensim dictionary. A plain
# str.split() keeps case and attached punctuation, which the vectorizer's
# analyzer may normalise away; top topic words could then be missing from the
# dictionary when coherence is computed.
analyzer = bow_vectorizer.build_analyzer()
tokenized_docs = [analyzer(doc) for doc in df['text']]

# Map tokens to integer ids and encode each document as (token_id, count) pairs.
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(text) for text in tokenized_docs]

Visualize the intertopic distance map with pyLDAvis (fitted on a Gensim LDA model)

In [None]:
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# NOTE: this chart is built from a separate Gensim LDA model trained on the
# same corpus; it does not visualise the scikit-learn LSA model fitted earlier.
# random_state pins the LDA initialisation so the chart is reproducible,
# in line with the seed used for the SVD and t-SNE steps.
lda_gensim = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=n_topics,
    random_state=42,
)
vis_data = gensimvis.prepare(lda_gensim, corpus, dictionary)
pyLDAvis.display(vis_data)

Evaluate by Coherence Scores with c_v and u_mass

In [None]:
from gensim.models import CoherenceModel
topics = lsa.components_

# Ten highest-weighted words per topic, strongest first. The order matters:
# u_mass coherence scores each word against the words ranked before it, so an
# ascending list (argsort()[-10:]) would score the topic in reverse.
top_words_per_topic = []
for topic in topics:
    top_words_idx = topic.argsort()[::-1][:10]
    top_words = [feature_names[i] for i in top_words_idx]
    top_words_per_topic.append(top_words)

# C_V coherence score (computed from the tokenized texts)
cv_coherence_model = CoherenceModel(topics=top_words_per_topic, texts=tokenized_docs, dictionary=dictionary, coherence='c_v')
cv_coherence = cv_coherence_model.get_coherence()

# U_Mass coherence score (computed from the bag-of-words corpus)
umass_coherence_model = CoherenceModel(topics=top_words_per_topic, corpus=corpus, dictionary=dictionary, coherence='u_mass')
umass_coherence = umass_coherence_model.get_coherence()

# Per-topic scores, computed once and reused for plotting.
cv_per_topic = cv_coherence_model.get_coherence_per_topic()
umass_per_topic = umass_coherence_model.get_coherence_per_topic()

# Plot both measures against the topic index on a shared axis.
plt.figure(figsize=(10, 6))

plt.plot(range(len(cv_per_topic)), cv_per_topic, marker='o', label='C_V Coherence')

plt.plot(range(len(umass_per_topic)), umass_per_topic, marker='o', label='U_Mass Coherence')

plt.xlabel('Topic')
plt.ylabel('Coherence Score')
plt.title('Coherence Scores per Topic')
plt.grid(True)
plt.legend()

plt.tight_layout()
plt.show()

print(f"Overall C_V Coherence Score: {cv_coherence}")
print(f"Overall U_Mass Coherence Score: {umass_coherence}")

# Record the overall scores with MLflow.
mlflow.log_metric("c_v coherence", cv_coherence)
mlflow.log_metric("u_mass coherence", umass_coherence)

In [None]:
# Draw the per-topic C_V and U_Mass coherence scores in two side-by-side panels.
fig, (cv_ax, umass_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: C_V coherence for each topic index.
cv_ax.plot(range(n_topics), cv_coherence_model.get_coherence_per_topic())
cv_ax.set_xlabel('Topic')
cv_ax.set_ylabel('C_V Coherence Score')
cv_ax.set_title('C_V Coherence Scores per Topic')

# Right panel: U_Mass coherence for each topic index.
umass_ax.plot(range(n_topics), umass_coherence_model.get_coherence_per_topic())
umass_ax.set_xlabel('Topic')
umass_ax.set_ylabel('U_Mass Coherence Score')
umass_ax.set_title('U_Mass Coherence Scores per Topic')

fig.tight_layout()
plt.show()

# Repeat the overall (aggregated) scores below the figure.
print(f"Overall C_V Coherence Score: {cv_coherence}")
print(f"Overall U_Mass Coherence Score: {umass_coherence}")

Save Model

In [None]:
import joblib

# Persist the fitted LSA model and the vectorizer it was trained with, so
# both can be reloaded together later.
joblib.dump(value=lsa, filename='lsa_bow_with_balance.pkl')
joblib.dump(value=bow_vectorizer, filename='bow_vectorizer_with_balance.pkl')   #can CHANGE
