## BM25 Model - Information Retrieval System
Model: BM25Okapi over whitespace-tokenized documents (title + preprocessed question).
Tujuan: membangun baseline BM25, fungsi pencarian, dan evaluasi P@k/Recall@k/AP/MAP/NDCG/MRR.

In [1]:
!pip install rank-bm25 Sastrawi

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import time
from pathlib import Path
from tqdm import tqdm
import re
from rank_bm25 import BM25Okapi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import warnings
warnings.filterwarnings('ignore')
print("Libraries loaded for BM25")

Libraries loaded for BM25


In [None]:
# Load the full processed dataset (kept aligned with the TF-IDF notebook setup)

data_path = Path('../data/processed/processed_full.csv')
print(f"Loading: {data_path}")

df = pd.read_csv(data_path)
# Rows without a processed question cannot be indexed; drop them and renumber
# the index from 0 so row labels and row positions coincide.
df = df.dropna(subset=['processed_question']).reset_index(drop=True)

# Ensure text columns are strings. Missing values must be filled BEFORE the
# cast: astype(str) turns NaN into the literal string 'nan', after which
# fillna('') has nothing left to fill and 'nan' leaks into the indexed text.
df['title'] = df['title'].fillna('').astype(str)
df['processed_question'] = df['processed_question'].astype(str)
df['processed_answer'] = df['processed_answer'].fillna('').astype(str)

print(f"Loaded shape: {df.shape}")
print(df[['title','processed_question','topic_set']].head(3))

Loading: ..\data\processed\processed_full.csv
Loaded shape: (360513, 20)
                                              title  \
0                        Khasiat obat zinc sulphate   
1                      Perbedaan jenis formula zinc   
2  Mengkonsumsi suplemen zinc yang sudah kadaluarsa   

                                  processed_question      topic_set  
0  khasiat obat zinc sulphate dok mau tanya anak ...  zinc-sulphate  
1  beda jenis formula zinc siang dokter dokter sa...  zinc-sulphate  
2  konsumsi suplemen zinc kadaluarsa malam dok ba...  zinc-sulphate  


In [None]:
# Build the combined corpus: title + processed_question (the question text was
# already stemmed during preprocessing; the title is used as stored).
print("Menyusun korpus gabungan (title + processed_question) tanpa cleaning tambahan...")

title_and_question = df['title'] + ' ' + df['processed_question']
# strip() removes the stray separator left when one of the two parts is empty
df['doc_text'] = title_and_question.str.strip()
documents = df['doc_text'].tolist()

print(f"Total documents: {len(documents):,}")
print("Sample document (gabungan):")
print(documents[0][:200])
print("\n Dokumen = title + processed_question; tidak ada pembersihan tambahan")

Menyusun korpus gabungan (title + processed_question) tanpa cleaning tambahan...
Total documents: 360,513
Sample document (gabungan):
Khasiat obat zinc sulphate khasiat obat zinc sulphate dok mau tanya anak kan nak fimosis terus kasih obat zincpro zinc sulphate drops 10mg ml kata buat vitamin lancar bak la terus tak baca kok malah o

 Dokumen = title + processed_question; tidak ada pembersihan tambahan


In [None]:
# TF-IDF vectorization (meant for a relevance construction comparable to the
# TF-IDF notebook).
# NOTE(review): in the cells visible here neither `tfidf_matrix_rel` nor
# `cosine_similarity` is used afterwards -- confirm whether this step is still needed.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("\nInitializing TF-IDF for relevance construction...")
tfidf_rel_params = {
    'max_features': 30000,             # cap on vocabulary size
    'min_df': 2,                       # ignore terms seen in fewer than 2 documents
    'max_df': 0.90,                    # ignore terms seen in more than 90% of documents
    'ngram_range': (1, 2),             # unigrams and bigrams
    'sublinear_tf': True,
    'norm': 'l2',
    'lowercase': True,
    'token_pattern': r'(?u)\b\w+\b',   # keeps single-character tokens
}
tfidf_vectorizer_rel = TfidfVectorizer(**tfidf_rel_params)

start_time = time.time()
tfidf_matrix_rel = tfidf_vectorizer_rel.fit_transform(df['doc_text'].tolist())
print(f"TF-IDF (relevance) built in {time.time()-start_time:.2f}s | shape={tfidf_matrix_rel.shape}")


Initializing TF-IDF for relevance construction...
TF-IDF (relevance) built in 37.71s | shape=(360513, 30000)


In [6]:
# Whitespace-tokenize every document. No lowercasing or extra cleaning happens
# here, so tokens are exactly the space-separated pieces of `doc_text`.
tokenized_corpus = list(map(str.split, df['doc_text'].tolist()))

# Build the BM25 index with the library's default parameters (baseline approach)
bm25 = BM25Okapi(tokenized_corpus)

print("BM25 index built")
print(f"Docs: {len(tokenized_corpus):,}")

BM25 index built
Docs: 360,513


In [7]:
# Query preprocessing (mirrors the TF-IDF pipeline)
stemmer = StemmerFactory().create_stemmer()
stopwords = StopWordRemoverFactory().get_stop_words()
# Set view of the stop words so each token is checked in O(1) instead of
# scanning the list. `stopwords` itself is kept as-is because it is written
# out with json.dump when the artifacts are saved.
_STOPWORD_SET = frozenset(stopwords)

# Ordered regex clean-up steps as (compiled pattern, replacement) pairs.
# NOTE(review): the 'padding|margin|...' and 'wib|wit|wita' alternations have no
# word boundaries, so they also delete those letters from inside longer words
# (e.g. 'fontanel' -> 'anel'). Left unchanged because this function is meant to
# mirror how the corpus was preprocessed -- confirm before tightening.
_QUERY_CLEANUP_STEPS = (
    (re.compile(r'http\S+|www\.\S+'), ''),                     # URLs
    (re.compile(r'\S+@\S+'), ''),                              # e-mail-like tokens
    (re.compile(r'@\w+|#\w+'), ''),                            # mentions / hashtags
    (re.compile(r'\d+px'), ''),                                # CSS pixel sizes
    (re.compile(r'padding|margin|font|vertical|align'), ''),   # CSS keywords
    (re.compile(r'\b\w*\d+\w*\b'), ' '),                       # any word containing a digit
    (re.compile(r'\b\d+\b'), ''),                              # bare numbers
    (re.compile(r'\d{1,2}:\d{2}'), ''),                        # clock times
    (re.compile(r'wib|wit|wita', flags=re.IGNORECASE), ''),    # time-zone suffixes
    (re.compile(r'[^\w\s]'), ' '),                             # punctuation -> space
    (re.compile(r'\s+'), ' '),                                 # collapse whitespace
)


def preprocess_query(query):
    """Normalise a raw user query into a space-joined string of stemmed tokens.

    Steps: lowercase, apply the regex clean-up steps in order, split on
    whitespace, drop tokens that are a single character or contain a digit,
    drop stop words, then stem every remaining token.

    Args:
        query: Raw query text. A missing value (per ``pd.isna``) or an empty
            string yields an empty result.

    Returns:
        str: The preprocessed query; ``""`` when the input is missing or empty.
        The result is also empty when no token survives the filters.
    """
    if pd.isna(query) or query == "":
        return ""
    text = str(query).lower()
    for pattern, replacement in _QUERY_CLEANUP_STEPS:
        text = pattern.sub(replacement, text)
    text = text.strip()
    tokens = [
        t for t in text.split()
        if len(t) > 1 and not any(c.isdigit() for c in t)
    ]
    tokens = [t for t in tokens if t not in _STOPWORD_SET]
    return ' '.join(stemmer.stem(t) for t in tokens)
# Sample raw queries used to eyeball what the preprocessing step produces
# (the last one contains numbers, which the preprocessing removes).
test_queries = [
    "anak saya demam tinggi dan batuk",
    "bagaimana cara mengatasi asam lambung?",
    "Ibu hamil boleh minum obat apa ya dok",
    "anak 2 tahun demam 39 derajat"
]
print("\nQuery preprocessing examples:")
# Print each raw query next to its preprocessed form, numbered from 1.
for i,q in enumerate(test_queries,1):
    processed = preprocess_query(q)
    print(f"{i}. {q} -> {processed}")


Query preprocessing examples:
1. anak saya demam tinggi dan batuk -> anak demam tinggi batuk
2. bagaimana cara mengatasi asam lambung? -> bagaimana cara atas asam lambung
3. Ibu hamil boleh minum obat apa ya dok -> ibu hamil minum obat apa dok
4. anak 2 tahun demam 39 derajat -> anak tahun demam derajat


In [8]:
def search_bm25(query, top_k=10, preprocessed=False):
    """Return the ``top_k`` highest-scoring documents for ``query`` under BM25.

    Uses the module-level ``bm25`` index and the ``df`` frame it was built from.

    Args:
        query: Query text. Passed through ``preprocess_query`` unless
            ``preprocessed`` is true.
        top_k: Maximum number of rows to return. A value <= 0 yields an empty
            frame.
        preprocessed: When true, ``query`` is used as-is and only split on
            whitespace.

    Returns:
        pandas.DataFrame: Rows of ``df`` (original index kept) ordered by
        descending score, with added ``rank`` (1-based) and ``bm25_score``
        columns, restricted to the display columns present in ``df``. An empty
        DataFrame is returned when the query has no tokens or ``top_k <= 0``.
    """
    processed_query = query if preprocessed else preprocess_query(query)
    # A whitespace-only or non-string (e.g. NaN) query has no tokens; scoring an
    # empty token list would just return arbitrary zero-score documents.
    tokens = processed_query.split() if isinstance(processed_query, str) else []
    if not tokens:
        print("Empty query after preprocessing")
        return pd.DataFrame()
    # Guard explicitly: with top_k == 0 the slice below becomes [-0:], which
    # selects the WHOLE array rather than nothing.
    if top_k <= 0:
        return pd.DataFrame()
    scores = bm25.get_scores(tokens)
    # argsort is ascending: keep the last top_k positions, then reverse them
    top_indices = np.argsort(scores)[-top_k:][::-1]
    top_scores = scores[top_indices]
    results = df.iloc[top_indices].copy()
    results['bm25_score'] = top_scores
    results['rank'] = range(1, len(results)+1)
    cols = ['rank','bm25_score','title','answer','topic_set','answer_count','year']
    # Only keep display columns that actually exist in the frame
    cols = [c for c in cols if c in results.columns]
    return results[cols]
# Short smoke-test queries: run each through the full search path
# (preprocessing + BM25 scoring) and show the top 3 hits.
test_queries_short = [
    "anak demam tinggi",
    "sakit kepala dan mual",
    "cara mengatasi asam lambung",
    "ibu hamil minum obat",
    "batuk berdahak tidak sembuh"
]
for q in test_queries_short:
    print(f"\nQuery: {q}")
    res = search_bm25(q, top_k=3)
    # One line per hit: rank, score (4 decimals), title cut to 70 characters
    for _,row in res.iterrows():
        print(f"{row['rank']}. [{row['bm25_score']:.4f}] {row['title'][:70]}")


Query: anak demam tinggi
1. [16.2118] Demam tinggi pada anak
2. [15.8561] Penanganan demam tinggi pada anak secara tiba-tiba
3. [15.6844] Solusi atasi demam tinggi pada anak

Query: sakit kepala dan mual
1. [12.1610] sakit kepala disertai mual
2. [12.0896] sakit kepala dan mual pada sinusitis
3. [12.0202] Apa obat untuk sakit kepala dan mual

Query: cara mengatasi asam lambung
1. [21.0566] Cara mengatasi asam lambung
2. [20.9223] cara mengatasi mual karena asam lambung
3. [20.9223] Cara mengatasi gejala asam lambung

Query: ibu hamil minum obat
1. [15.4001] Bahaya tidak ibu hamil minum obat gatal
2. [15.2865] Bolehkah ibu hamil minum obat mabuk?
3. [15.2865] Bolehkah ibu hamil minum obat mabuk?

Query: batuk berdahak tidak sembuh
1. [18.2365] Batuk berdahak putih tidak sembuh-sembuh
2. [18.2356] Solusi atasi batuk berdahak pada bayi tak kunjung sembuh
3. [17.5963] Penyebab batuk berdahak pada anak tidak kunjung sembuh dan menjadi bat


In [9]:
def precision_at_k(relevant_docs, retrieved_docs, k):
    """Precision@k: share of the first ``k`` retrieved ids that are relevant.

    The denominator is always ``k`` (not the number of ids actually
    retrieved); a non-positive ``k`` yields 0.0.
    """
    top_k_ids = set(retrieved_docs[:k])
    hits = len(top_k_ids & set(relevant_docs))
    if k > 0:
        return hits / k
    return 0.0
def recall_at_k(relevant_docs, retrieved_docs, k):
    """Recall@k: share of the distinct relevant ids found in the first ``k`` results.

    Args:
        relevant_docs: Iterable of relevant document ids (duplicates ignored).
        retrieved_docs: Ranked sequence of retrieved document ids.
        k: Cut-off rank; a non-positive value yields 0.0.

    Returns:
        float: hits / number of distinct relevant ids, or 0.0 when there are
        no relevant ids.
    """
    # De-duplicate so the denominator counts the same things as the numerator
    relevant_set = set(relevant_docs)
    if not relevant_set or k <= 0:
        return 0.0
    hits = len(relevant_set.intersection(retrieved_docs[:k]))
    return hits / len(relevant_set)
def average_precision(relevant_docs, retrieved_docs, k=None):
    """Average precision over the first ``k`` retrieved ids.

    Sums precision@i at every rank i where a relevant id first appears, then
    divides by the TOTAL number of distinct relevant ids. Because of that
    denominator the value cannot exceed k / |relevant| when k < |relevant|.

    Args:
        relevant_docs: Iterable of relevant document ids (duplicates ignored).
        retrieved_docs: Ranked sequence of retrieved document ids.
        k: Cut-off rank; ``None`` means the whole ranking. A non-positive
            value yields 0.0.

    Returns:
        float: AP in [0, 1]; 0.0 when there are no relevant ids or no hits.
    """
    relevant_set = set(relevant_docs)
    if not relevant_set:
        return 0.0
    if k is None:
        k = len(retrieved_docs)
    if k <= 0:
        return 0.0
    hits = 0
    precision_sum = 0.0
    credited = set()
    for rank, doc_id in enumerate(retrieved_docs[:k], 1):
        # Credit each relevant id once: a repeated id must not count as a new
        # hit, otherwise the result could exceed 1.
        if doc_id in relevant_set and doc_id not in credited:
            credited.add(doc_id)
            hits += 1
            precision_sum += hits / rank
    return precision_sum / len(relevant_set)
def ndcg_at_k(relevant_docs, retrieved_docs, k):
    """Binary-relevance NDCG@k.

    DCG adds 1 / log2(rank + 1) for every rank in the first ``k`` results that
    holds a relevant id; the ideal DCG assumes the first
    min(|relevant|, k) ranks are all relevant.

    Args:
        relevant_docs: Iterable of relevant document ids (duplicates ignored).
        retrieved_docs: Ranked sequence of retrieved document ids.
        k: Cut-off rank.

    Returns:
        float: DCG / IDCG, or 0.0 when the ideal DCG is zero (no relevant ids
        or non-positive ``k``).
    """
    relevant_set = set(relevant_docs)
    dcg = 0.0
    credited = set()
    for rank, doc_id in enumerate(retrieved_docs[:k], 1):
        # Credit each relevant id once; a repeated id would otherwise push the
        # DCG above the ideal and the ratio above 1.
        if doc_id in relevant_set and doc_id not in credited:
            credited.add(doc_id)
            dcg += 1 / np.log2(rank + 1)
    ideal_hits = min(len(relevant_set), k)
    idcg = sum(1 / np.log2(rank + 1) for rank in range(1, ideal_hits + 1))
    return dcg / idcg if idcg > 0 else 0.0
print("Metrics ready: Precision/Recall/AP/MAP/NDCG")

Metrics ready: Precision/Recall/AP/MAP/NDCG


In [10]:
# Deterministic (BM25-based) relevance: for each sampled query document, the
# up-to-30 highest-scoring documents of the same topic, excluding the query
# document itself, are treated as relevant.
# NOTE(review): the relevance labels come from the same `bm25` index that is
# evaluated afterwards, and the query document stays in the index while being
# excluded from its own relevant set -- confirm this is the intended protocol.
k_values = [5, 10, 20]
test_topics = df['topic_set'].value_counts().head(10).index.tolist()
test_queries_data = []

for topic in test_topics:
    topic_docs = df[df['topic_set'] == topic]
    if len(topic_docs) < 5:
        continue
    # Row labels of the topic do not depend on the query, so list them once
    topic_indices = topic_docs.index.tolist()
    query_samples = topic_docs.sample(n=3, random_state=42)

    for idx, query_row in query_samples.iterrows():
        query_tokens = query_row['doc_text'].split()

        # Score against the whole corpus, then keep same-topic documents only
        scores = bm25.get_scores(query_tokens)
        ranked = [
            (doc_idx, scores[doc_idx])
            for doc_idx in topic_indices
            if doc_idx != idx
        ]
        # Stable sort, best score first
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        # The 30 best-scoring neighbours are the relevant set
        relevant_indices = [pair[0] for pair in ranked[:30]]

        test_queries_data.append({
            'query': query_row['title'],
            'query_processed': query_row['processed_question'],
            'topic': topic,
            'relevant_doc_indices': relevant_indices
        })

print(f"\n Created {len(test_queries_data)} test queries (BM25-based relevance)")
if not test_queries_data:
    print("No test queries created; cek ukuran topik.")
else:
    avg_rel = np.mean([len(entry['relevant_doc_indices']) for entry in test_queries_data])
    print(f"Average relevant docs per query: {avg_rel:.1f}")
    print("\n  Relevansi = top-30 terdekat (BM25 score) dalam topik, excl. diri sendiri")


 Created 30 test queries (BM25-based relevance)
Average relevant docs per query: 30.0

  Relevansi = top-30 terdekat (BM25 score) dalam topik, excl. diri sendiri


In [None]:
# Evaluate every test query at each cut-off in k_values and collect one record
# per (query, k) pair.
results_summary = []
print(f"Evaluating {len(test_queries_data)} queries...")

# Retrieve once per query at the largest cut-off; smaller k reuse the prefix
max_k = max(k_values)

for query_data in tqdm(test_queries_data, desc="Evaluating"):
    query_raw = query_data['query']
    query_text = query_data['query_processed']
    relevant_indices = set(query_data['relevant_doc_indices'])

    search_results = search_bm25(query_text, top_k=max_k, preprocessed=True)
    if search_results.empty:
        # Queries that return nothing are left out of the summary entirely
        continue

    # search_bm25 keeps the row labels of `df`, which are the document ids
    retrieved_indices = search_results.index.tolist()
    num_relevant = len(relevant_indices)

    for k in k_values:
        retrieved_at_k = retrieved_indices[:k]
        hits_k = len(relevant_indices.intersection(retrieved_at_k))
        success_k = int(hits_k > 0)

        # Recall normalised by the best achievable hit count at this k
        if num_relevant > 0:
            recall_cap_k = hits_k / min(k, num_relevant)
        else:
            recall_cap_k = 0.0

        # Reciprocal rank of the first relevant hit within the top k (0.0 if none)
        rr_k = next(
            (1.0 / position
             for position, candidate in enumerate(retrieved_at_k, 1)
             if candidate in relevant_indices),
            0.0,
        )

        results_summary.append({
            'query': query_raw[:50],
            'topic': query_data['topic'],
            'k': k,
            'precision': precision_at_k(relevant_indices, retrieved_indices, k),
            'recall': recall_at_k(relevant_indices, retrieved_indices, k),
            'ap': average_precision(relevant_indices, retrieved_indices, k),
            'ndcg': ndcg_at_k(relevant_indices, retrieved_indices, k),
            'num_relevant': num_relevant,
            'hits@k': hits_k,
            'success@k': success_k,
            'recall_cap': recall_cap_k,
            'rr@k': rr_k
        })

eval_df = pd.DataFrame(results_summary)

print("Evaluation done")

Evaluating 30 queries...


Evaluating: 100%|██████████| 30/30 [02:52<00:00,  5.75s/it]

Evaluation done





In [None]:
# Aggregate the per-query records into one summary row per k (plus MAP/MRR)
if eval_df.empty:
    print("⚠️ eval_df kosong; pastikan test_queries_data terisi.")

else:
    agg_rows = []
    map_rows = []

    for k in k_values:
        k_results = eval_df[eval_df['k'] == k]

        # Means and standard deviations across queries at this cut-off
        precision_mean = k_results['precision'].mean()
        precision_std = k_results['precision'].std()
        recall_mean = k_results['recall'].mean()
        recall_std = k_results['recall'].std()
        ap_mean = k_results['ap'].mean()
        ap_std = k_results['ap'].std()
        ndcg_mean = k_results['ndcg'].mean()
        ndcg_std = k_results['ndcg'].std()
        hit_rate = k_results['hits@k'].mean()  # average hit COUNT, not a rate
        success_rate = k_results['success@k'].mean()
        recall_cap_mean = k_results['recall_cap'].mean()
        recall_cap_std = k_results['recall_cap'].std()
        mrr_mean = k_results['rr@k'].mean()

        # MAP@k is the mean of AP@k across queries
        map_score = ap_mean

        agg_rows.append({
            'k': k,
            'precision_mean': precision_mean,
            'precision_std': precision_std,
            'recall_mean': recall_mean,
            'recall_std': recall_std,
            'ap_mean': ap_mean,
            'ap_std': ap_std,
            'ndcg_mean': ndcg_mean,
            'ndcg_std': ndcg_std,
            'hit_rate_mean': hit_rate,
            'success_rate_mean': success_rate,
            'recall_cap_mean': recall_cap_mean,
            'recall_cap_std': recall_cap_std,
            'mrr_mean': mrr_mean
        })
        map_rows.append({'k': k, 'map': map_score})

        banner = '=' * 70
        print(f"\n{banner}")
        print(f"Results @ k={k}")
        print(banner)
        print(f"Precision@{k}:   {precision_mean:.4f} ± {precision_std:.4f}")
        print(f"Recall@{k}:      {recall_mean:.4f} ± {recall_std:.4f}")
        print(f"AP@{k}:          {ap_mean:.4f} ± {ap_std:.4f}")
        print(f"NDCG@{k}:        {ndcg_mean:.4f} ± {ndcg_std:.4f}")
        print(f"Hits@{k}:        {hit_rate:.2f} (avg count)")
        print(f"Success@{k}:     {success_rate:.2f} (prop query dgn >=1 relevan)")
        print(f"RecallCap@{k}:   {recall_cap_mean:.4f} ± {recall_cap_std:.4f}")
        print(f"MRR@{k}:         {mrr_mean:.4f}")
        print(f"MAP@{k}:         {map_score:.4f}")

    agg_df = pd.DataFrame(agg_rows)
    map_df = pd.DataFrame(map_rows)

    eval_summary = agg_df.merge(map_df, on='k').sort_values('k')
    numeric_cols = [
        'precision_mean', 'precision_std',
        'recall_mean', 'recall_std',
        'ap_mean', 'ap_std',
        'ndcg_mean', 'ndcg_std',
        'hit_rate_mean', 'success_rate_mean',
        'recall_cap_mean', 'recall_cap_std',
        'mrr_mean', 'map',
    ]
    eval_summary[numeric_cols] = eval_summary[numeric_cols].round(4)

    print("\nSummary table (dibulatkan 4 desimal):")
    display(eval_summary)

    print("\nMarkdown summary (salin ke laporan jika perlu):")
    print(eval_summary.to_markdown(index=False))


Results @ k=5
Precision@5:   0.3133 ± 0.2609
Recall@5:      0.0522 ± 0.0435
AP@5:          0.0274 ± 0.0299
NDCG@5:        0.2624 ± 0.2192
Hits@5:        1.57 (avg count)
Success@5:     0.77 (prop query dgn >=1 relevan)
RecallCap@5:   0.3133 ± 0.2609
MRR@5:         0.3100
MAP@5:         0.0274

Results @ k=10
Precision@10:   0.3167 ± 0.2451
Recall@10:      0.1056 ± 0.0817
AP@10:          0.0548 ± 0.0585
NDCG@10:        0.2824 ± 0.2169
Hits@10:        3.17 (avg count)
Success@10:     0.87 (prop query dgn >=1 relevan)
RecallCap@10:   0.3167 ± 0.2451
MRR@10:         0.3226
MAP@10:         0.0548

Results @ k=20
Precision@20:   0.3183 ± 0.2002
Recall@20:      0.2122 ± 0.1335
AP@20:          0.1016 ± 0.0954
NDCG@20:        0.2955 ± 0.1925
Hits@20:        6.37 (avg count)
Success@20:     0.93 (prop query dgn >=1 relevan)
RecallCap@20:   0.3183 ± 0.2002
MRR@20:         0.3274
MAP@20:         0.1016

Summary table (dibulatkan 4 desimal):


Unnamed: 0,k,precision_mean,precision_std,recall_mean,recall_std,ap_mean,ap_std,ndcg_mean,ndcg_std,hit_rate_mean,success_rate_mean,recall_cap_mean,recall_cap_std,mrr_mean,map
0,5,0.3133,0.2609,0.0522,0.0435,0.0274,0.0299,0.2624,0.2192,1.5667,0.7667,0.3133,0.2609,0.31,0.0274
1,10,0.3167,0.2451,0.1056,0.0817,0.0548,0.0585,0.2824,0.2169,3.1667,0.8667,0.3167,0.2451,0.3226,0.0548
2,20,0.3183,0.2002,0.2122,0.1335,0.1016,0.0954,0.2955,0.1925,6.3667,0.9333,0.3183,0.2002,0.3274,0.1016



Markdown summary (salin ke laporan jika perlu):
|   k |   precision_mean |   precision_std |   recall_mean |   recall_std |   ap_mean |   ap_std |   ndcg_mean |   ndcg_std |   hit_rate_mean |   success_rate_mean |   recall_cap_mean |   recall_cap_std |   mrr_mean |    map |
|----:|-----------------:|----------------:|--------------:|-------------:|----------:|---------:|------------:|-----------:|----------------:|--------------------:|------------------:|-----------------:|-----------:|-------:|
|   5 |           0.3133 |          0.2609 |        0.0522 |       0.0435 |    0.0274 |   0.0299 |      0.2624 |     0.2192 |          1.5667 |              0.7667 |            0.3133 |           0.2609 |     0.31   | 0.0274 |
|  10 |           0.3167 |          0.2451 |        0.1056 |       0.0817 |    0.0548 |   0.0585 |      0.2824 |     0.2169 |          3.1667 |              0.8667 |            0.3167 |           0.2451 |     0.3226 | 0.0548 |
|  20 |           0.3183 |          0.2002 

In [None]:
# Save BM25 artifacts for deployment (baseline approach)
import pickle
import json

artifacts_dir = Path('../artifacts/bm25')
artifacts_dir.mkdir(parents=True, exist_ok=True)

# Fitted index. Pickle files must only ever be loaded from a trusted
# location: unpickling can execute arbitrary code.
bm25_path = artifacts_dir / 'bm25.pkl'
with bm25_path.open('wb') as f:
    pickle.dump(bm25, f)

# Metadata stored row-for-row with the indexed corpus (doc_text)
meta_columns = ['title','answer','topic_set','answer_count','year','doc_text','processed_question']
df_meta = df[meta_columns].copy()
df_meta.to_pickle(artifacts_dir / 'corpus_meta.pkl')

# Stop-word list used by the query preprocessing
stopwords_path = artifacts_dir / 'stopwords.json'
with stopwords_path.open('w') as f:
    json.dump(stopwords, f)

# Save config (the index was built with the library's default parameters)
config = {
    'k1': bm25.k1,
    'b': bm25.b,
    'n_docs': len(df_meta),
    'relevance_method': 'BM25-based top-30 within topic',
}
config_path = artifacts_dir / 'bm25_config.json'
with config_path.open('w') as f:
    json.dump(config, f, indent=2)

print(f"✅ BM25 artifacts saved (baseline approach) to {artifacts_dir.resolve()}")