### Imports and MongoDB Connection

In [4]:
import pymongo
import os
from pathlib import Path
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from sklearn.metrics import silhouette_score
import networkx as nx
from pyvis.network import Network
import json
import pickle
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

# Load environment
project_root = Path.cwd().parents[0] if 'parents' in dir(Path.cwd()) else Path.cwd()
backend_env_path = project_root / "3_app_system" / "backend" / ".env"
load_dotenv(backend_env_path)

client = pymongo.MongoClient(os.getenv("MONGO_URI"))
db = client["MyParliament"]

# Load same raw text as Pipeline 1 (fair comparison)
raw_col = db["hansard_core500"]
docs = list(raw_col.find({"split_type": "train"}, {"full_text": 1}))
texts = [doc["full_text"] for doc in docs if doc.get("full_text") and doc["full_text"].strip()]
print(f"Loaded {len(texts)} raw documents for Pipeline 2 (TF-IDF + LDA)")

Loaded 350 raw documents for Pipeline 2 (TF-IDF + LDA)


### Bilingual TF-IDF Preprocessing 

In [5]:
nltk.download('stopwords', quiet=True)

malay_stopwords_path = "stopwords-ms-MannualOp.txt"  
with open(malay_stopwords_path, "r", encoding="utf-8") as f:
    malay_stopwords = [line.strip() for line in f if line.strip()]

english_stopwords = list(nltk_stopwords.words('english'))
combined_stopwords = english_stopwords + malay_stopwords
print(f"Using same bilingual stopwords: {len(combined_stopwords)} total")

vectorizer = TfidfVectorizer(
    max_df=0.7,
    min_df=5,
    stop_words=combined_stopwords,
    ngram_range=(1, 3),
    sublinear_tf=True,
    lowercase=True
)

tfidf_matrix = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names_out()
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

Using same bilingual stopwords: 673 total
TF-IDF matrix shape: (350, 226228)


### Tokenization + LDA Topic Number Tuning (10-50)

In [6]:
tokenized_texts = [text.lower().split() for text in texts]
dictionary = Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

best_coherence = -np.inf
best_perplexity = np.inf
best_model = None
best_num_topics = 0

print("Tuning LDA topic number (10-50)...")
for num_topics in range(10, 51, 5):
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
        iterations=100,
        eval_every=1
    )
    
    perplexity = lda_model.log_perplexity(corpus)
    
    topics = [[dictionary[id] for id, prob in lda_model.get_topic_terms(topic_id, topn=10)] 
              for topic_id in range(num_topics)]
    coherence_model = CoherenceModel(topics=topics, texts=tokenized_texts, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    
    print(f"Topics={num_topics}: Perplexity={perplexity:.2f}, C_V={coherence:.4f}")
    
    if coherence > best_coherence:
        best_coherence = coherence
        best_perplexity = perplexity
        best_model = lda_model 
        best_num_topics = num_topics

print(f"\nBest LDA model: {best_num_topics} topics (C_V={best_coherence:.4f}, Perplexity={best_perplexity:.2f})")

with open("model/best_lda_model.pkl", "wb") as f:
    pickle.dump(best_model, f)
print("Best LDA model saved to model/best_lda_model.pkl")

Tuning LDA topic number (10-50)...
Topics=10: Perplexity=-8.10, C_V=0.4296
Topics=15: Perplexity=-8.20, C_V=0.4322
Topics=20: Perplexity=-8.31, C_V=0.4312
Topics=25: Perplexity=-8.43, C_V=0.4292
Topics=30: Perplexity=-8.55, C_V=0.4240
Topics=35: Perplexity=-8.67, C_V=0.4256
Topics=40: Perplexity=-8.80, C_V=0.4285
Topics=45: Perplexity=-8.91, C_V=0.4232
Topics=50: Perplexity=-9.07, C_V=0.4185

Best LDA model: 15 topics (C_V=0.4322, Perplexity=-8.20)
Best LDA model saved to results/best_lda_model.pkl


In [7]:
# Matrix construction
doc_topic_matrix = np.zeros((len(corpus), best_num_topics))
for i, bow in enumerate(corpus):
    if bow:
        dist = best_model.get_document_topics(bow, minimum_probability=0.0)
        for topic_id, prob in dist:
            if topic_id < best_num_topics:
                doc_topic_matrix[i, topic_id] = prob

# Normalize
doc_topic_matrix = np.nan_to_num(doc_topic_matrix)
row_sums = doc_topic_matrix.sum(axis=1, keepdims=True)
doc_topic_matrix = np.divide(doc_topic_matrix, row_sums, out=np.zeros_like(doc_topic_matrix), where=row_sums!=0)

labels = np.argmax(doc_topic_matrix, axis=1)
silhouette = silhouette_score(doc_topic_matrix, labels, sample_size=15000, random_state=42)
print(f"Pipeline 2 - Silhouette Score: {silhouette:.4f}")

Pipeline 2 - Silhouette Score: 0.8402


### Extract Top Words + Compute Metrics

In [8]:
top_words_per_topic = []
for topic_id in range(best_num_topics):
    words_probs = best_model.get_topic_terms(topic_id, topn=10)
    words = [dictionary[id] for id, prob in words_probs]
    top_words_per_topic.append(words)

# Filter empty
valid_top_words = [words for words in top_words_per_topic if words]
valid_n_topics = len(valid_top_words)

# Coherence 
coherence_cv = best_coherence
coherence_npmi = CoherenceModel(topics=valid_top_words, texts=tokenized_texts, dictionary=dictionary, coherence='c_npmi').get_coherence()

# Topic Diversity
all_top_words_set = set(word for cluster in valid_top_words for word in cluster[:10])
td = len(all_top_words_set) / (valid_n_topics * 10) if valid_n_topics > 0 else 0

print(f"Pipeline 2 - C_V Coherence: {coherence_cv:.4f}")
print(f"Pipeline 2 - NPMI Coherence: {coherence_npmi:.4f}")
print(f"Pipeline 2 - Topic Diversity: {td:.4f} (based on {valid_n_topics} topics)")
print(f"Pipeline 2 - Perplexity: {best_perplexity:.4f}")

Pipeline 2 - C_V Coherence: 0.4322
Pipeline 2 - NPMI Coherence: -0.0236
Pipeline 2 - Topic Diversity: 0.1600 (based on 15 topics)
Pipeline 2 - Perplexity: -8.1974


### Network Graph

In [10]:
def generate_network_graph(top_words_per_topic, num_topics):
    network_graph_dir = project_root / "2_ml_modeling/network_graph"
    network_graph_dir.mkdir(parents=True, exist_ok=True)
    html_path = network_graph_dir / "pipeline2_network.html"

    G = nx.Graph()
    for topic_id, words in enumerate(top_words_per_topic):
        if not words:
            continue
        topic_node = f"Topic_{topic_id}"
        G.add_node(topic_node, size=80, group=topic_id, title=f"Topic {topic_id}", shape='ellipse', color='red')
        for word in words:
            G.add_node(word, size=30, group=topic_id, color='lightblue')
            G.add_edge(topic_node, word, weight=5)
        for i in range(len(words)):
            for j in range(i+1, len(words)):
                G.add_edge(words[i], words[j], weight=2)
    
    net = Network(height="900px", width="100%", notebook=True, cdn_resources='in_line')
    net.from_nx(G)
    net.show_buttons(filter_=['physics'])
    net.force_atlas_2based()
    net.show(str(html_path))
    print("Network graph saved: {html_path}")

generate_network_graph(top_words_per_topic, best_num_topics)

/mnt/data/MyParliament/2_ml_modeling/network_graph/pipeline2_network.html
Network graph saved: {html_path}


### Save Results

In [11]:
results = {
    "pipeline": "02_tfidf_lda",
    "best_num_topics": best_num_topics,
    "silhouette": float(silhouette),
    "coherence_cv": float(coherence_cv),
    "coherence_npmi": float(coherence_npmi),
    "topic_diversity": float(td),
    "perplexity": float(best_perplexity),
    "valid_topics": valid_n_topics
}

with open("results/pipeline2_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("Pipeline 2 COMPLETED! Results saved.")

Pipeline 2 COMPLETED! Results saved.
