### Imports and load .env

In [1]:
import pymongo
import json
import re
import random
import os
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from dotenv import load_dotenv
from pathlib import Path
from fuzzywuzzy import process

project_root = Path.cwd().parents[1]
backend_env_path = project_root / "3_app_system" / "backend" / ".env"

if not backend_env_path.exists():
    raise FileNotFoundError(f".env not found at {backend_env_path}")

load_dotenv(backend_env_path)

MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise ValueError("MONGO_URI not found in .env")

client = pymongo.MongoClient(MONGO_URI)
db = client["MyParliament"]

hansard_col = db["hansard_core500"]
honorific_col = db["honorific_dictionary"]
segmented_col = db["hansard_segmented500"]
mp_col = db["MP"]

mp_names = [mp["full_name_with_titles"] for mp in mp_col.find({}, {"full_name_with_titles": 1})]

thread_local = threading.local()
def get_db_connection():
    if not hasattr(thread_local, "client"):
        thread_local.client = pymongo.MongoClient(MONGO_URI)
    return thread_local.client["MyParliament"]

print(f"MongoDB connected (.env loaded from {backend_env_path})")

MongoDB connected (.env loaded from c:\Users\User\Desktop\MyParliament\MyParliament\3_app_system\backend\.env)


### Load honorific_dictionary and analysis JSON

In [2]:
honorific_doc = honorific_col.find_one({"version": "2.0"})
if not honorific_doc:
    raise ValueError("honorific_dictionary version 2.0 not found")

all_honorifics = set()
for category in honorific_doc["categories"].values():
    all_honorifics.update([h.strip().rstrip("'") for h in category])

all_honorifics.update(["Yang Berhormat", "Timbalan Yang di-Pertua", "Enche'", "Mr."])

honorific_regex = "|".join(sorted(all_honorifics, key=len, reverse=True))
print(f"Dynamic honorific regex built: {len(all_honorifics)} terms")

with open("../03_patternAnalysis/combined_parliament_analysis.json", "r", encoding="utf-8") as f:
    analysis_data = json.load(f)

Dynamic honorific regex built: 24 terms


### Regex patterns

In [3]:
PRIMARY_PATTERN = re.compile(
    rf"^({honorific_regex})\s+([A-Za-z'\s]+?)\s*(\[([A-Za-z\s\-]+)\])?:?\s*",
    re.IGNORECASE
)
FALLBACK_PATTERN = re.compile(r"^([A-Z][A-Za-z'\s]+?)\s*(\[.*?\])?:?\s*", re.IGNORECASE)
ENGLISH_OLD_PATTERN = re.compile(r"^(Mr\.|Encik|Tuan|Enche')\s+([A-Za-z\s]+?):", re.IGNORECASE)

def get_decade(year: int) -> str:
    return "pre1970" if year < 1970 else "post1970"

### Helper function

In [4]:
def clean_line(line):
    line = re.sub(r'\s+', ' ', line.strip())
    if not line or re.match(r'^[\d\W]+$', line):  # Only skip empty or pure symbols/numbers
        return None
    return line

def skip_header_and_doa(lines, max_lines=300):
    start_idx = 0
    in_attendance = False
    attendance_keywords = ["KEHADIRAN AHLI-AHLI", "AHLI-AHLI YANG HADIR", "KEHADIRAN"]
    for i, line in enumerate(lines[:max_lines]):
        stripped = line.strip().upper()
        if any(kw in stripped for kw in attendance_keywords + ["NASKHAH BELUM DISEM", "DEWAN RAKYAT", "PENGGAL", "MESYUARAT", "KANDUNGAN", "WAKTU PERTANYAAN", "BIL."]):
            start_idx = i + 1
            if any(kw in stripped for kw in attendance_keywords):
                in_attendance = True
        if "DOA" in stripped and len(stripped.split()) < 10:  # Short DOA line
            start_idx = max(start_idx, i + 1)
        # Exit attendance mode when real speech starts
        if in_attendance and (":" in stripped and any(h in stripped.upper() for h in all_honorifics)):
            in_attendance = False
            start_idx = i
    return start_idx

def extract_speaker(line, decade):
    if not ':' in line:  # Must have colon for speaker tag
        return None, None
    
    if decade == "pre1970":
        m = ENGLISH_OLD_PATTERN.match(line)
        if m:
            candidate = m.group(1) + " " + m.group(2).strip()
            best_match, score = process.extractOne(candidate, mp_names)
            if score > 85:
                return best_match, None
    
    m = PRIMARY_PATTERN.match(line)
    if m:
        honorific = m.group(1).strip()
        name = m.group(2).strip()
        candidate = f"{honorific} {name}"
        best_match, score = process.extractOne(candidate, mp_names)
        if score > 85:
            constituency = m.group(4) if m.group(4) else None
            return best_match, constituency
    
    m = FALLBACK_PATTERN.match(line)
    if m:
        candidate = m.group(1).strip()
        best_match, score = process.extractOne(candidate, mp_names)
        if score > 85:
            return best_match, m.group(2)[1:-1] if m.group(2) else None
    return None, None

### Core segmentation function

In [5]:
def segment_document(doc_id, text, year):
    lines = [clean_line(l) for l in text.splitlines() if clean_line(l)]
    decade = get_decade(year)
    start_idx = skip_header_and_doa(lines)
    
    segments = []
    current_speaker = None
    current_constituency = None
    current_text = []
    current_start_line = start_idx
    
    for i, line in enumerate(lines[start_idx:], start=start_idx):
        speaker, constituency = extract_speaker(line, decade)
        if speaker:
            if current_speaker and current_text:
                segments.append({
                    "speaker": current_speaker,
                    "constituency": current_constituency,
                    "start_line": current_start_line,
                    "text": " ".join(current_text).strip()
                })
            current_speaker = speaker
            current_constituency = constituency
            # Pure speech content after colon
            content = line.split(':', 1)[1].strip() if ':' in line else line
            current_text = [content] if content else []
            current_start_line = i
        elif current_speaker:
            current_text.append(line)
    
    if current_speaker and current_text:
        segments.append({
            "speaker": current_speaker,
            "constituency": current_constituency,
            "start_line": current_start_line,
            "text": " ".join(current_text).strip()
        })
    
    return {
        "document_id": str(doc_id),
        "hansardDate": year,
        "decade": decade,
        "segment_count": len(segments),
        "segments": segments
    }

### Test 10 random samples + QA/Injection insight

In [6]:
all_docs = list(hansard_col.find({}, {"_id": 1, "full_text": 1, "content_text": 1, "hansardDate": 1}))
random.shuffle(all_docs)
test_docs = all_docs[:10]

print("=== TESTING 10 RANDOM SAMPLES ===")
for idx, doc in enumerate(test_docs):
    text = doc.get("full_text") or doc.get("content_text") or ""
    if not text:
        continue
    result = segment_document(doc["_id"], text, doc["hansardDate"].year)
    print(f"\nSample {idx+1} | ID: {doc['_id']} | Date: {doc['hansardDate'].date()} | Decade: {result['decade']} | Segments: {result['segment_count']}")
    
    if result['segments']:
        print(f"   First speaker: {result['segments'][0]['speaker']} [{result['segments'][0]['constituency'] or 'None'}]")
        print(f"   First text: {result['segments'][0]['text'][:150]}{'...' if len(result['segments'][0]['text']) > 150 else ''}")
        
        # QA & Injection detection example
        qa_examples = []
        injection_examples = []
        for i, seg in enumerate(result['segments']):
            text_lower = seg['text'].lower()
            if "?" in seg['text'] or "pertanyaan" in text_lower or "soalan" in text_lower:
                qa_examples.append(f"Q ({seg['speaker']}): {seg['text'][:100]}...")
                if i+1 < len(result['segments']):
                    next_text = result['segments'][i+1]['text'].lower()
                    if "jawab" in next_text or "menjawab" in next_text:
                        qa_examples.append(f"   A ({result['segments'][i+1]['speaker']}): {result['segments'][i+1]['text'][:100]}...")
            # Injection: other speaker name appears in text
            for other_seg in result['segments']:
                if other_seg['speaker'] != seg['speaker'] and other_seg['speaker'] in seg['text']:
                    injection_examples.append(f"Injection in {seg['speaker']}'s speech: mentions {other_seg['speaker']}")
        
        if qa_examples:
            print("   QA Example:")
            for ex in qa_examples[:4]:
                print(f"     {ex}")
        if injection_examples:
            print("   Injection Example:")
            for ex in injection_examples[:2]:
                print(f"     {ex}")

print("\nCheck output above. ")

=== TESTING 10 RANDOM SAMPLES ===

Sample 1 | ID: 673390ead052c3fbabd83b44 | Date: 2000-03-30 | Decade: post1970 | Segments: 273
   First speaker: Tuan Ong Tin Kim [None]
   First text: Tuan Yang di-Pertua, peruntukan untuk menubuhkan Bursa Buruh Elektronik (ELX) ini adalah sebanyak RM28.26 juta. Kerajaan sedang dalam proses peringkat...
   QA Example:
     Q (YB Tuan Haji Abdul Latiff bin Abdul Rahman): Terima kasih Tuan Yang di Pertua, soalan tambahan. Saya ingin mengucap tahniah di atas usaha-usaha k...
     Q (Dr V. David): Tuan Yang di-Pertua, terima kasih Yang Berhormat bagi Sungai Benut. Sepatutnya soalan tambahan ini s...
     Q (YB Tuan Lim Lip Eng): Tuan Yang di-Pertua, terima kasih. Soalan tambahan saya adalah jikalau peguam asing akan dibenarkan ...
     Q (YB Tuan Haji Abdul Latiff bin Abdul Rahman): Tuan Yang di-Pertua, terima kasih. Soalan mengenai ada sekatan di antara peguam Semenanjung dengan p...

Sample 2 | ID: 67338d77d052c3fbabd83b09 | Date: 1999-04-19 | Decade: p

### Multi-threaded run

In [None]:
print("=== FULL MULTI-THREADED RUN - SAVING 500 DOCUMENTS WITH METADATA ===")

def process_single_doc(doc):
    text = doc.get("full_text") or doc.get("content_text") or ""
    if not text:
        return None
    year = doc["hansardDate"].year
    
    # Run segmentation
    segmented_result = segment_document(doc["_id"], text, year)
    
    # Combine original metadata + segmentation output
    saved_doc = {
        "original_id": str(doc["_id"]),
        "hansardDate": doc.get("hansardDate"),
        "full_text": text,  # Keep raw text for reference
        "split_type": doc.get("split_type"),
        "mesyuarat": doc.get("mesyuarat"),
        "parlimen": doc.get("parlimen"),
        "parlimen_range": doc.get("parlimen_range"),
        "penggal": doc.get("penggal"),
        "decade": segmented_result["decade"],
        "segment_count": segmented_result["segment_count"],
        "segmentation_output": segmented_result["segments"]  # Main result
    }
    return saved_doc

all_documents = list(hansard_col.find({}, {
    "_id": 1,
    "full_text": 1,
    "content_text": 1,
    "hansardDate": 1,
    "split_type": 1,
    "mesyuarat": 1,
    "parlimen": 1,
    "parlimen_range": 1,
    "penggal": 1
}))

MAX_WORKERS = 20
batch_size = 50
inserted_count = 0

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_single_doc, doc) for doc in all_documents]
    batch = []
    
    for future in as_completed(futures):
        result = future.result()
        if result:
            batch.append(result)
            inserted_count += 1
        
        if len(batch) >= batch_size:
            local_db = get_db_connection()
            local_db["hansard_segmented500"].insert_many(batch)
            print(f"Inserted batch of {len(batch)} | Total: {inserted_count}/{len(all_documents)}")
            batch = []
    
    if batch:
        local_db = get_db_connection()
        local_db["hansard_segmented500"].insert_many(batch)
        print(f"Final batch inserted: {len(batch)}")

print(f"\nSegmentation completed! {inserted_count} documents with full metadata saved to 'hansard_segmented500'")

=== FULL MULTI-THREADED RUN - SAVING 500 DOCUMENTS WITH METADATA ===
Inserted batch of 50 | Total: 50/500
Inserted batch of 50 | Total: 100/500
Inserted batch of 50 | Total: 150/500
Inserted batch of 50 | Total: 200/500
Inserted batch of 50 | Total: 250/500
Inserted batch of 50 | Total: 300/500
Inserted batch of 50 | Total: 350/500
Inserted batch of 50 | Total: 400/500


### Continue run after interruption

In [7]:
from bson import ObjectId  

print("=== FULL MULTI-THREADED RUN - SAVING 500 DOCUMENTS WITH METADATA ===")
db = get_db_connection()  # Get DB connection early to check progress

# Count already processed and total documents
already_processed = db["hansard_segmented500"].count_documents({})
total_docs = hansard_col.count_documents({})
print(f"Already processed: {already_processed} documents | Total: {total_docs} | Remaining: {total_docs - already_processed}")

# Get list of original_id values that have already been processed
processed_ids = db["hansard_segmented500"].distinct("original_id")

# Build query to fetch only unprocessed documents
if processed_ids:
    query_filter = {"_id": {"$nin": [ObjectId(pid) for pid in processed_ids]}}
else:
    query_filter = {}  # First run: process everything

all_documents = list(hansard_col.find(query_filter, {
    "_id": 1,
    "full_text": 1,
    "content_text": 1,
    "hansardDate": 1,
    "split_type": 1,
    "mesyuarat": 1,
    "parlimen": 1,
    "parlimen_range": 1,
    "penggal": 1
}))

if not all_documents:
    print("All documents have already been processed! Nothing to do.")
    exit()

print(f"This run will process {len(all_documents)} new documents\n")

# === Proceed segmentation and saving ===

def process_single_doc(doc):
    text = doc.get("full_text") or doc.get("content_text") or ""
    if not text:
        return None
    year = doc["hansardDate"].year
    
    # Run segmentation
    segmented_result = segment_document(doc["_id"], text, year)
    
    # Combine original metadata + segmentation output
    saved_doc = {
        "original_id": str(doc["_id"]),
        "hansardDate": doc.get("hansardDate"),
        "full_text": text,  # Keep raw text for reference
        "split_type": doc.get("split_type"),
        "mesyuarat": doc.get("mesyuarat"),
        "parlimen": doc.get("parlimen"),
        "parlimen_range": doc.get("parlimen_range"),
        "penggal": doc.get("penggal"),
        "decade": segmented_result["decade"],
        "segment_count": segmented_result["segment_count"],
        "segmentation_output": segmented_result["segments"]  # Main result
    }
    return saved_doc

MAX_WORKERS = 20
batch_size = 50
inserted_count = 0  # Count of documents inserted in this run

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_single_doc, doc) for doc in all_documents]
    batch = []
    
    for future in as_completed(futures):
        result = future.result()
        if result:
            batch.append(result)
            inserted_count += 1
        
        if len(batch) >= batch_size:
            local_db = get_db_connection()
            local_db["hansard_segmented500"].insert_many(batch)
            print(f"Inserted batch of {len(batch)} | This run: {inserted_count}/{len(all_documents)} | "
                  f"Total processed: {already_processed + inserted_count}/{total_docs}")
            batch = []
    
    # Insert any remaining documents in the final batch
    if batch:
        local_db = get_db_connection()
        local_db["hansard_segmented500"].insert_many(batch)
        print(f"Final batch inserted: {len(batch)}")

print(f"\nThis run completed! Inserted {inserted_count} new documents.")
print(f"Overall: {already_processed + inserted_count} / {total_docs} documents saved to 'hansard_segmented500'")

=== FULL MULTI-THREADED RUN - SAVING 500 DOCUMENTS WITH METADATA ===
Already processed: 450 documents | Total: 500 | Remaining: 50
This run will process 50 new documents

Inserted batch of 50 | This run: 50/50 | Total processed: 500/500

This run completed! Inserted 50 new documents.
Overall: 500 / 500 documents saved to 'hansard_segmented500'
