# Pipeline 1: TF-IDF + KMeans Clustering (Classical Baseline)

## Purpose
This notebook implements **Pipeline 1** of the six comparative experiments defined in the FYP project "NLP-based AI Platform for Parliament Proceedings Analysis to Enhance Governmental Transparency and Civic Engagement".

- **Input**: Raw full_text from hansard_core500 collection (train split only, **no CPATF preprocessing**)
- **Method**: Classical TF-IDF vectorization + KMeans clustering
- **Objective**: Establish the weakest baseline performance to highlight the incremental contributions of subsequent pipelines (especially CPATF preprocessing, MEHTC hybrid similarity, neural embeddings, and domain-specific fine-tuning).
- **Clustering**: Fixed to 20 clusters (standard baseline setting)
- **Hardware**: c2d-highcpu-32 (32 vCPUs, 64 GB RAM) with multi-threading optimization
- **Metrics** (uniform across all 6 pipelines):
  - Silhouette Score (extrinsic)
  - C_V Coherence
  - NPMI Coherence
  - Topic Diversity
- **Visualization**: Interactive network graph (big nodes = topics, small nodes = top words/sub-issues)
- **Output**: Results saved as `results/results_pipeline1.json` for final comparison in `07_comparison_summary.ipynb`

This pipeline intentionally uses raw noisy text and simple TF-IDF to produce the lowest expected scores, serving as the reference point for ablation study.

### Imports and MongoDB Connection

In [7]:
import pymongo
import os
from pathlib import Path
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import networkx as nx
from pyvis.network import Network
import json
import warnings
from joblib import parallel_backend 
import multiprocessing

warnings.filterwarnings("ignore")

n_jobs = -1  # Use all available cores 

# Load environment variables
project_root = Path.cwd().parents[0] if 'parents' in dir(Path.cwd()) else Path.cwd()
backend_env_path = project_root / "3_app_system" / "backend" / ".env"
load_dotenv(backend_env_path)

# Connect to MongoDB
client = pymongo.MongoClient(os.getenv("MONGO_URI"))
db = client["MyParliament"]

# For Pipeline 1: Use RAW text from hansard_core500 (not CPATF cleaned)
raw_col = db["hansard_core500"]
docs = list(raw_col.find({"split_type": "train"}, {"full_text": 1}))
texts = [doc["full_text"] for doc in docs if doc.get("full_text") and doc["full_text"].strip()]
print(f"Loaded {len(texts)} raw documents for Pipeline 1 (TF-IDF + KMeans)")
print(f"Using n_jobs={n_jobs}")

Loaded 350 raw documents for Pipeline 1 (TF-IDF + KMeans)
Using n_jobs=-1


### Bilingual TF-IDF Vectorization and KMeans Clustering

In [12]:
# Download NLTK English stopwords if not already (run once)
nltk.download('stopwords', quiet=True)

# Load custom Malay stopwords from local file (stopwords-ms collection)
malay_stopwords_path = "stopwords-ms-MannualOp.txt" 
with open(malay_stopwords_path, "r", encoding="utf-8") as f:
    malay_stopwords = [line.strip() for line in f if line.strip()]

# Combine English (NLTK) + Malay (stopwords-ms)
english_stopwords = list(nltk_stopwords.words('english'))
combined_stopwords = english_stopwords + malay_stopwords

print(f"Loaded {len(english_stopwords)} English + {len(malay_stopwords)} Malay stopwords")
print(f"Total bilingual stopwords: {len(combined_stopwords)}")

# Optimized TF-IDF for Malay-English code-switched parliament text
vectorizer = TfidfVectorizer(
    max_df=0.7,           
    min_df=5,            
    stop_words=combined_stopwords,   
    ngram_range=(1, 3),   
    sublinear_tf=True,    
    lowercase=True  
)

tfidf_matrix = vectorizer.fit_transform(texts)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# KMeans clustering 
n_clusters = 20
kmeans = KMeans(
    n_clusters=n_clusters,
    random_state=42,
    n_init=15,
)
labels = kmeans.fit_predict(tfidf_matrix)

# Sampled silhouette for large matrices
silhouette = silhouette_score(tfidf_matrix, labels, sample_size=15000, random_state=42)
print(f"Pipeline 1 (Bilingual) - Silhouette Score: {silhouette:.4f}")

Loaded 198 English + 475 Malay stopwords
Total bilingual stopwords: 673
TF-IDF matrix shape: (350, 226228)
Pipeline 1 (Bilingual) - Silhouette Score: 0.0382


### Parallel Coherence (C_V and NPMI) and Topic Diversity Computation

In [16]:
def get_top_words_per_cluster(tfidf_matrix, labels, feature_names, top_n=10):
    top_words = []
    for i in range(n_clusters):
        cluster_mask = (labels == i)
        if cluster_mask.sum() == 0:
            top_words.append([])
            continue
        cluster_tfidf = tfidf_matrix[cluster_mask].mean(axis=0)
        cluster_tfidf = np.array(cluster_tfidf).flatten()
        top_indices = cluster_tfidf.argsort()[-top_n:][::-1]
        top_words.append([feature_names[idx] for idx in top_indices])
    return top_words

# Get top words
feature_names = vectorizer.get_feature_names_out()
top_words_str_per_cluster = get_top_words_per_cluster(tfidf_matrix, labels, feature_names, top_n=10)

# Tokenize for gensim
tokenized_texts = [text.lower().split() for text in texts]
dictionary = Dictionary(tokenized_texts)

# Convert to token IDs for CoherenceModel
top_words_ids_per_cluster = []
for cluster_words in top_words_str_per_cluster:
    ids = [dictionary.token2id[word] for word in cluster_words if word in dictionary.token2id]
    top_words_ids_per_cluster.append(ids)

# Filter empty clusters
valid_top_words_ids = [ids for ids in top_words_ids_per_cluster if ids]
valid_n_clusters = len(valid_top_words_ids)
print(f"Found {n_clusters - valid_n_clusters} empty clusters")

if valid_n_clusters == 0:
    coherence_cv = coherence_npmi = 0.0
    td = 0.0
else:
    coherence_cv = CoherenceModel(topics=valid_top_words_ids, texts=tokenized_texts, dictionary=dictionary, coherence='c_v').get_coherence()
    coherence_npmi = CoherenceModel(topics=valid_top_words_ids, texts=tokenized_texts, dictionary=dictionary, coherence='c_npmi').get_coherence()

# Topic Diversity
all_top_words_set = set(word for cluster in top_words_str_per_cluster if cluster for word in cluster[:10])
td = len(all_top_words_set) / (valid_n_clusters * 10) if valid_n_clusters > 0 else 0

print(f"Pipeline 1 - C_V Coherence: {coherence_cv:.4f}")
print(f"Pipeline 1 - NPMI Coherence: {coherence_npmi:.4f}")
print(f"Pipeline 1 - Topic Diversity: {td:.4f} (based on {valid_n_clusters} topics)")

Found 2 empty clusters
Pipeline 1 - C_V Coherence: 0.7946
Pipeline 1 - NPMI Coherence: nan
Pipeline 1 - Topic Diversity: 0.8500 (based on 18 topics)


### Network Graph

In [27]:
def generate_network_graph(top_words_str_per_cluster, n_clusters):
    # Create the directory if it doesn't exist
    network_graph_dir = project_root / "2_ml_modeling/network_graph"
    network_graph_dir.mkdir(parents=True, exist_ok=True)
    html_path = network_graph_dir / "pipeline1_network.html"

    G = nx.Graph()
    for cluster_id, words in enumerate(top_words_str_per_cluster):
        if not words:
            continue
        topic_node = f"Topic_{cluster_id}"
        G.add_node(topic_node, size=60, group=cluster_id, title=f"Topic {cluster_id}", shape='ellipse')
        for word in words:
            G.add_node(word, size=25, group=cluster_id)
            G.add_edge(topic_node, word, weight=3)
        for i in range(len(words)):
            for j in range(i+1, len(words)):
                G.add_edge(words[i], words[j], weight=1)
    
    net = Network(height="900px", width="100%", notebook=True, cdn_resources='in_line')
    net.from_nx(G)
    net.show_buttons(filter_=['physics'])
    net.save_graph(str(html_path))
    print(f"Network graph saved: {html_path}")

generate_network_graph(top_words_str_per_cluster, n_clusters)

Network graph saved: /mnt/data/MyParliament/2_ml_modeling/network_graph/pipeline1_network.html


### Save Results

In [19]:
results = {
    "pipeline": "01_tfidf_kmeans",
    "silhouette": float(silhouette),
    "coherence_cv": float(coherence_cv),
    "coherence_npmi": float(coherence_npmi),
    "topic_diversity": float(td),
    "valid_clusters": valid_n_clusters,
    "total_clusters": n_clusters
}

Path("results").mkdir(exist_ok=True)
with open("results/pipeline1_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("Pipeline 1 COMPLETED! All results saved.")

Pipeline 1 COMPLETED! All results saved.


### Save the TF-IDF + KMeans model to pickle file

In [25]:
import pickle
from datetime import datetime

save_dir = project_root / "2_ml_modeling/model"
save_dir.mkdir(parents=True, exist_ok=True)  

model_filename = save_dir / "tfidf_kmeans_model.pkl"

# Bundle all important objects for future reuse
model_data = {
    "kmeans": kmeans,
    "vectorizer": vectorizer,
    "n_clusters": n_clusters,
    "feature_names": feature_names.tolist(),  
    "top_words_per_cluster": top_words_str_per_cluster,
    "labels": labels.tolist(),                
    "silhouette_score": silhouette,
    "coherence_cv": coherence_cv,
    "coherence_npmi": coherence_npmi,
    "topic_diversity": td,
    "saved_at": datetime.now().isoformat(),
    "description": "Pipeline 1: TF-IDF + KMeans (20 clusters) on raw hansard_core500 with bilingual stopwords and n-grams (1-3)"
}

# Save to pickle file
with open(model_filename, "wb") as f:
    pickle.dump(model_data, f)

print(f"Model successfully saved to: {model_filename}")
print(f"Included objects: kmeans, vectorizer, top words, cluster labels, evaluation metrics, etc.")

Model successfully saved to: /mnt/data/MyParliament/2_ml_modeling/model/tfidf_kmeans_model.pkl
Included objects: kmeans, vectorizer, top words, cluster labels, evaluation metrics, etc.
