
# MDP — Coverage & Faithfulness Dataset Builder (WHO + Pathology Textbook)

This notebook constructs two evaluation datasets:
- **Coverage / Ground-truth Recall:** Does top‑K retrieved context contain the gold evidence?
- **Context Consistency / Faithfulness:** Are answers faithful to the evidence (and are citations precise)?

It uses **DeepSeek** to generate informative QA pairs from each chunk (and, optionally, to score faithfulness via NLI).


In [1]:

# ==== Configuration (edit paths here) ====

WHO_CSV = "/home/gulizhu/MDP/combined_health_topics_with_source.csv"
TEXTBOOK_PATH = "/home/gulizhu/MDP/textbook_pathology.txt"
OUT_DIR = "/home/gulizhu/MDP/benchmark_data/coverage_faithfulness"
DEEPSEEK_API_KEY="sk-9788b084799748b9ac49471f46225d8f"

# Retrieval & sampling
TOP_K = 10
MAX_QAS_PER_CHUNK = 2      # QA generated per selected chunk
SAMPLE_CHUNKS = 400        # how many chunks to sample for QA generation

# Faithfulness scoring with DeepSeek NLI
USE_DEEPSEEK_NLI = True    # set False to skip NLI scoring (much faster)

# Chunking params
MAX_CHARS = 1100
OVERLAP = 100

# Random seed for reproducibility
SEED = 42


In [16]:

import os
os.environ["DEEPSEEK_API_KEY"] = "sk-9788b084799748b9ac49471f46225d8f"
print("Key set?", bool(os.environ.get("DEEPSEEK_API_KEY")))



Key set? True


In [None]:
import sys, subprocess, pkgutil, os, sys
print("Python:", sys.executable)

def pip_install(pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U"] + pkgs)
pip_install(["rank_bm25", "openai>=1.0.0"])

print("rank_bm25 installed?", pkgutil.find_loader("rank_bm25") is not None)
print("openai installed?", pkgutil.find_loader("openai") is not None)


Python: /home/gulizhu/envguli/venv311/bin/python
rank_bm25 installed? True
openai installed? True



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [18]:

# ==== Imports ====
import os, re, json, math, hashlib, random
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Dict, Optional

import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Optional: BM25 hybrid (pip install rank_bm25)
try:
    from rank_bm25 import BM25Okapi
    HAS_BM25 = True
except Exception:
    HAS_BM25 = False

# DeepSeek client (OpenAI-compatible)
try:
    from openai import OpenAI
    HAS_OPENAI = True
except Exception:
    HAS_OPENAI = False

random.seed(SEED)
np.random.seed(SEED)

Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

print("BM25 available:", HAS_BM25)
print("OpenAI client available:", HAS_OPENAI)
print("OUT_DIR:", OUT_DIR)


BM25 available: True
OpenAI client available: True
OUT_DIR: /home/gulizhu/MDP/benchmark_data/coverage_faithfulness


In [19]:

# ==== DeepSeek API Key Check ====
# Set DEEPSEEK_API_KEY in your environment before running:
#   export DEEPSEEK_API_KEY="sk-..."
api_key = DEEPSEEK_API_KEY
masked = api_key[:6] + "..." if api_key else "(missing)"
print("DEEPSEEK_API_KEY:", masked)
if not api_key:
    print("⚠️ No API key found. QA/NLI will fail. Set DEEPSEEK_API_KEY and re-run this cell.")


DEEPSEEK_API_KEY: sk-978...


In [20]:

def md5(s: str) -> str:
    return hashlib.md5(s.encode("utf-8")).hexdigest()[:10]

def normalize_ws(x: str) -> str:
    return re.sub(r"\s+", " ", str(x)).strip()


In [21]:

@dataclass
class Doc:
    doc_id: str
    source: str
    title: str
    url: Optional[str]
    text: str

@dataclass
class Chunk:
    doc_id: str
    chunk_id: str
    text: str
    start_char: int
    end_char: int

def load_who_csv(path: str) -> List[Doc]:
    df = pd.read_csv(path)
    docs = []
    for _, r in df.iterrows():
        title = normalize_ws(r.get("topic", ""))
        url = None if pd.isna(r.get("url")) else str(r.get("url"))
        text = normalize_ws(r.get("text", ""))
        source = normalize_ws(r.get("source", "WHO"))
        doc_id = f"WHO::{md5(title + (url or ''))}"
        docs.append(Doc(doc_id=doc_id, source=source, title=title, url=url, text=text))
    return docs

def load_textbook(path: str) -> List[Doc]:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()
    parts = re.split(r"\n\s*\n+", raw)
    docs = []
    for i, part in enumerate(parts):
        title = f"Pathology Section {i+1}"
        doc_id = f"TEXTBOOK::{i+1:04d}"
        docs.append(Doc(doc_id=doc_id, source="TEXTBOOK", title=title, url=None, text=normalize_ws(part)))
    return docs

def greedy_chunk(text: str, max_chars: int = 1200, overlap: int = 120) -> List[Tuple[str,int,int]]:
    text = normalize_ws(text)
    n = len(text)
    chunks = []
    i = 0
    while i < n:
        j = min(i + max_chars, n)
        cut = text.rfind(". ", i, j)
        if cut == -1 or cut < i + 200:
            cut = j
        else:
            cut = cut + 1
        chunks.append((text[i:cut], i, cut))
        i = max(cut - overlap, i + 1)
    return chunks

def build_corpus_chunks(docs: List[Doc], max_chars=1200, overlap=120) -> List[Chunk]:
    out = []
    for d in docs:
        for idx, (ctext, s, e) in enumerate(greedy_chunk(d.text, max_chars, overlap)):
            out.append(Chunk(doc_id=d.doc_id, chunk_id=f"{d.doc_id}::CH{idx:04d}", text=ctext, start_char=s, end_char=e))
    return out


In [22]:

who_docs = load_who_csv(WHO_CSV)
tb_docs = load_textbook(TEXTBOOK_PATH)
all_docs = who_docs + tb_docs

chunks = build_corpus_chunks(all_docs, max_chars=MAX_CHARS, overlap=OVERLAP)
chunks_by_id = {c.chunk_id: c for c in chunks}

print(f"Loaded WHO docs: {len(who_docs)}")
print(f"Loaded textbook docs: {len(tb_docs)}")
print(f"Total chunks: {len(chunks)}")

pd.DataFrame([{
    "doc_id": c.doc_id, "chunk_id": c.chunk_id, "chars": len(c.text), "preview": c.text[:160] + ("..." if len(c.text)>160 else "")
} for c in chunks[:5]])


Loaded WHO docs: 1285
Loaded textbook docs: 19
Total chunks: 139943


Unnamed: 0,doc_id,chunk_id,chars,preview
0,WHO::8d7aa84649,WHO::8d7aa84649::CH0000,1057,Common goods for health are population-based f...
1,WHO::8d7aa84649,WHO::8d7aa84649::CH0001,1020,gislation (ex. environmental regulations and g...
2,WHO::8d7aa84649,WHO::8d7aa84649::CH0002,950,"ge of legal instruments (such as laws, decrees..."
3,WHO::8d7aa84649,WHO::8d7aa84649::CH0003,1036,"ealth coverage, including preventing and mitig..."
4,WHO::8d7aa84649,WHO::8d7aa84649::CH0004,246,"zed over time and across levels of government,..."


In [23]:

class HybridRetriever:
    def __init__(self, chunks: List[Chunk]):
        self.chunks = chunks
        self.texts = [c.text for c in chunks]
        self.ids = [c.chunk_id for c in chunks]
        self.vectorizer = TfidfVectorizer(max_features=60000, ngram_range=(1,2))
        self.tf_matrix = self.vectorizer.fit_transform(self.texts)
        if HAS_BM25:
            self.bm25 = BM25Okapi([t.split() for t in self.texts])
        else:
            self.bm25 = None

    def search(self, query: str, k: int = 10):
        tf_q = self.vectorizer.transform([query])
        tf_scores = cosine_similarity(tf_q, self.tf_matrix).ravel()
        if self.bm25 is not None:
            bm_scores = self.bm25.get_scores(query.split())
            # min-max normalize both then average
            def norm(a): 
                return (a - a.min()) / (a.max() - a.min() + 1e-9)
            hybrid = 0.5*norm(tf_scores) + 0.5*norm(np.array(bm_scores))
            order = np.argsort(-hybrid)[:k]
            return [(self.ids[i], float(hybrid[i])) for i in order]
        else:
            order = np.argsort(-tf_scores)[:k]
            return [(self.ids[i], float(tf_scores[i])) for i in order]

retriever = HybridRetriever(chunks)
print("Retriever built. vocab size:", len(retriever.vectorizer.vocabulary_))


Retriever built. vocab size: 60000


In [24]:

def get_ds_client() -> Optional[OpenAI]:
    if not HAS_OPENAI:
        return None
    key = os.environ.get("DEEPSEEK_API_KEY")
    if not key:
        return None
    return OpenAI(api_key=key, base_url="https://api.deepseek.com")

def llm_make_qas(client: OpenAI, text: str, n: int = 2):
    prompt = (
        "You are a biomedical editor. Read the passage and create concise, factual QA pairs.\n"
        "Rules: Focus on atomic facts answerable directly from the passage; avoid multi-sentence answers.\n"
        "Return a JSON list of objects with 'q' and 'a'.\n\n"
        f"PASSAGE:\n{text}\n"
    )
    try:
        resp = client.chat.completions.create(
            model="deepseek-chat",
            messages=[{"role":"user","content":prompt}],
            temperature=0.2, max_tokens=600
        )
        content = resp.choices[0].message.content
        m = re.search(r"\[.*\]", content, flags=re.S)
        items = json.loads(m.group(0)) if m else []
        out = []
        for it in items[:n]:
            q = normalize_ws(it.get("q",""))
            a = normalize_ws(it.get("a",""))
            if q and a:
                out.append((q,a))
        return out
    except Exception as e:
        # surface error for debugging
        print("DeepSeek QA error:", e)
        return []


In [25]:

def build_coverage_dataset(chunks: List[Chunk], retriever: HybridRetriever, k: int, max_qas_per_chunk: int, sample_chunks: int):
    client = get_ds_client()
    rows = []
    retrieval_lists = {}
    # sample
    idxs = list(range(len(chunks)))
    random.shuffle(idxs)
    pool = idxs[: min(sample_chunks, len(chunks))]
    print(f"Sampling {len(pool)} chunks for QA generation...")

    for i in tqdm(pool):
        ch = chunks[i]
        qas = llm_make_qas(client, ch.text, n=max_qas_per_chunk)
        if not qas:
            continue
        for (q,a) in qas:
            qid = f"Q::{md5(q + ch.chunk_id)}"
            top = retriever.search(q, k=k)
            retrieval_lists[qid] = top
            top_ids = [cid for cid,_ in top]

            hit_doc = int(any(cid.startswith(ch.doc_id) for cid in top_ids))
            hit_chunk = int(ch.chunk_id in top_ids)

            joined_ctx = " \n\n".join([retriever.chunks[retriever.ids.index(cid)].text for cid in top_ids if cid in retriever.ids])
            ans_tokens = set([t.lower() for t in re.findall(r"\b\w+\b", a) if len(t)>3])
            ctx_tokens = set([t.lower() for t in re.findall(r"\b\w+\b", joined_ctx) if len(t)>3])
            overlap = ans_tokens.intersection(ctx_tokens)
            context_recall = len(overlap) / (len(ans_tokens) + 1e-9)

            rows.append({
                "qid": qid,
                "question": q,
                "answer": a,
                "gt_doc_id": ch.doc_id,
                "gt_chunk_id": ch.chunk_id,
                "hit_doc@K": hit_doc,
                "hit_chunk@K": hit_chunk,
                "context_recall@K": round(float(context_recall),4),
            })
    df = pd.DataFrame(rows)
    return df, retrieval_lists


In [26]:

cov_df, retrieval_lists = build_coverage_dataset(chunks, retriever, k=TOP_K, 
                                                 max_qas_per_chunk=MAX_QAS_PER_CHUNK, 
                                                 sample_chunks=SAMPLE_CHUNKS)
print("Coverage rows:", len(cov_df))
display(cov_df.head(10))

snapshot = {}
if len(cov_df):
    snapshot = {
        "num_questions": int(len(cov_df)),
        f"hit_doc@{TOP_K}": float(cov_df["hit_doc@K"].mean()),
        f"hit_chunk@{TOP_K}": float(cov_df["hit_chunk@K"].mean()),
        f"avg_context_recall@{TOP_K}": float(cov_df["context_recall@K"].mean()),
    }
snapshot


Sampling 400 chunks for QA generation...


100%|██████████| 400/400 [27:32<00:00,  4.13s/it]

Coverage rows: 458





Unnamed: 0,qid,question,answer,gt_doc_id,gt_chunk_id,hit_doc@K,hit_chunk@K,context_recall@K
0,Q::b7b927d779,What is the purpose of the assistance mentioned?,Rehabilitation,WHO::def5effffe,WHO::def5effffe::CH0075,0,0,0.0
1,Q::a4aedf3d70,What does the acronym NIH stand for?,National Institutes of Health,WHO::ba091c3aa0,WHO::ba091c3aa0::CH0022,0,0,0.3333
2,Q::1a6c0caae5,What is the full name of the NIDCD?,National Institute on Deafness and Other Commu...,WHO::ba091c3aa0,WHO::ba091c3aa0::CH0022,0,0,0.1667
3,Q::4d04fecd09,What is the full name of the organization abbr...,Centers for Disease Control and Prevention,WHO::c746a8289b,WHO::c746a8289b::CH0048,0,0,0.25
4,Q::be034ca29c,How long do these symptoms typically last?,A couple of days.,WHO::5d456f490d,WHO::5d456f490d::CH0041,0,0,0.0
5,Q::6a276532ab,Who has a higher risk of getting cancer in the...,Someone who has had cancer in one testicle.,WHO::8c8bdba1fe,WHO::8c8bdba1fe::CH0003,1,1,0.6667
6,Q::bc30540a16,What is important to do regularly for the othe...,Check it regularly.,WHO::8c8bdba1fe,WHO::8c8bdba1fe::CH0003,1,1,1.0
7,Q::181a4da1f6,What is one key measure to prevent diarrhoea r...,Access to safe drinking-water.,WHO::82af9760db,WHO::82af9760db::CH0002,0,0,0.0
8,Q::f0aac8e0b2,What type of sanitation helps prevent diarrhoea?,Use of improved sanitation.,WHO::82af9760db,WHO::82af9760db::CH0002,0,0,0.5
9,Q::e7c93c3f5f,What is the subject of the passage?,Policy frameworks for good urban governance.,WHO::efe533a7d1,WHO::efe533a7d1::CH0025,0,0,0.0


{'num_questions': 458,
 'hit_doc@10': 0.34934497816593885,
 'hit_chunk@10': 0.12445414847161572,
 'avg_context_recall@10': 0.36204606986899557}

In [27]:

cov_csv = Path(OUT_DIR) / "coverage_dataset.csv"
cov_df.to_csv(cov_csv, index=False)

topk_path = Path(OUT_DIR) / "coverage_retrieval_topk.jsonl"
with open(topk_path, "w", encoding="utf-8") as f:
    for qid, items in retrieval_lists.items():
        rec = {"qid": qid, "topk": [{"chunk_id": cid, "score": s} for cid,s in items]}
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

with open(Path(OUT_DIR) / "coverage_snapshot.json", "w", encoding="utf-8") as f:
    json.dump(snapshot, f, ensure_ascii=False, indent=2)

print("Saved:", cov_csv)
print("Saved:", topk_path)
print("Saved:", Path(OUT_DIR) / "coverage_snapshot.json")


Saved: /home/gulizhu/MDP/benchmark_data/coverage_faithfulness/coverage_dataset.csv
Saved: /home/gulizhu/MDP/benchmark_data/coverage_faithfulness/coverage_retrieval_topk.jsonl
Saved: /home/gulizhu/MDP/benchmark_data/coverage_faithfulness/coverage_snapshot.json


In [28]:

NEG_SWAP_TABLE = {
    r"\bincrease(s|d)?\b": "decrease",
    r"\bdecrease(s|d)?\b": "increase",
    r"\bhigher\b": "lower",
    r"\blower\b": "higher",
    r"\badenocarcinoma\b": "squamous cell carcinoma",
    r"\bsquamous( cell)? carcinoma\b": "adenocarcinoma",
    r"\bnon-small cell lung cancer\b": "small cell lung cancer",
    r"\bsmall cell lung cancer\b": "non-small cell lung cancer",
}

def make_unfaithful(answer: str) -> Optional[str]:
    cand = answer
    flips = 0
    for pat, repl in NEG_SWAP_TABLE.items():
        if re.search(pat, cand, flags=re.I):
            cand = re.sub(pat, repl, cand, flags=re.I)
            flips += 1
    return cand if flips>0 else None

def llm_faithfulness_score(client: OpenAI, answer: str, evidence: str) -> float:
    prompt = (
        "You are a strict NLI judge. Score if the ANSWER is entailed by the EVIDENCE from 0.0 to 1.0.\n"
        "Return only a JSON object: {\"score\": number}.\n\n"
        f"EVIDENCE:\n{evidence}\n\nANSWER:\n{answer}\n"
    )
    try:
        resp = client.chat.completions.create(
            model="deepseek-chat",
            messages=[{"role":"user","content":prompt}],
            temperature=0.0, max_tokens=60
        )
        content = resp.choices[0].message.content
        m = re.search(r"\{.*\}", content, flags=re.S)
        obj = json.loads(m.group(0)) if m else {"score": 0.0}
        score = float(obj.get("score", 0.0))
        return max(0.0, min(1.0, score))
    except Exception as e:
        print("DeepSeek NLI error:", e)
        return 0.0


In [29]:

def build_faithfulness_dataset(cov_df: pd.DataFrame, chunks_by_id: Dict[str, Chunk], retrieval_lists: Dict[str, list], use_deepseek_nli: bool):
    client = get_ds_client() if use_deepseek_nli else None
    rows = []
    for _, r in cov_df.iterrows():
        qid = r["qid"]; q = r["question"]; a = r["answer"]
        gt_chunk_id = r["gt_chunk_id"]
        evidence_text = chunks_by_id.get(gt_chunk_id).text if gt_chunk_id in chunks_by_id else ""

        # positive
        pos_score = llm_faithfulness_score(client, a, evidence_text) if client else 1.0
        rows.append({
            "qid": qid, "question": q, "answer": a, "label_faithful": 1,
            "evidence_chunk_id": gt_chunk_id, "faithfulness_score": round(float(pos_score),3)
        })

        # negative (synthetic swap)
        neg_a = make_unfaithful(a)
        if neg_a:
            neg_score = llm_faithfulness_score(client, neg_a, evidence_text) if client else 0.0
            rows.append({
                "qid": qid, "question": q, "answer": neg_a, "label_faithful": 0,
                "evidence_chunk_id": gt_chunk_id, "faithfulness_score": round(float(neg_score),3)
            })

        # top1 evidence eval (citation precision proxy)
        top = retrieval_lists.get(qid, [])
        if top:
            top1_id = top[0][0]
            top1_text = chunks_by_id.get(top1_id).text if top1_id in chunks_by_id else ""
            nli_top1 = llm_faithfulness_score(client, a, top1_text) if client else (1.0 if top1_id==gt_chunk_id else 0.5)
            rows.append({
                "qid": qid, "question": q, "answer": a,
                "label_faithful": int(top1_id == gt_chunk_id),
                "evidence_chunk_id": top1_id,
                "faithfulness_score": round(float(nli_top1),3),
                "note": "top1_evidence_eval"
            })
    return pd.DataFrame(rows)

faith_df = build_faithfulness_dataset(cov_df, chunks_by_id, retrieval_lists, USE_DEEPSEEK_NLI)
print("Faithfulness rows:", len(faith_df))
display(faith_df.head(10))


Faithfulness rows: 919


Unnamed: 0,qid,question,answer,label_faithful,evidence_chunk_id,faithfulness_score,note
0,Q::b7b927d779,What is the purpose of the assistance mentioned?,Rehabilitation,1,WHO::def5effffe::CH0075,1.0,
1,Q::b7b927d779,What is the purpose of the assistance mentioned?,Rehabilitation,0,WHO::1a5d8db1de::CH0035,0.0,top1_evidence_eval
2,Q::a4aedf3d70,What does the acronym NIH stand for?,National Institutes of Health,1,WHO::ba091c3aa0::CH0022,0.0,
3,Q::a4aedf3d70,What does the acronym NIH stand for?,National Institutes of Health,0,WHO::3a2b2c11ed::CH0056,0.0,top1_evidence_eval
4,Q::1a6c0caae5,What is the full name of the NIDCD?,National Institute on Deafness and Other Commu...,1,WHO::ba091c3aa0::CH0022,1.0,
5,Q::1a6c0caae5,What is the full name of the NIDCD?,National Institute on Deafness and Other Commu...,0,TEXTBOOK::0006::CH0529,0.0,top1_evidence_eval
6,Q::4d04fecd09,What is the full name of the organization abbr...,Centers for Disease Control and Prevention,1,WHO::c746a8289b::CH0048,1.0,
7,Q::4d04fecd09,What is the full name of the organization abbr...,Centers for Disease Control and Prevention,0,TEXTBOOK::0006::CH0529,0.0,top1_evidence_eval
8,Q::be034ca29c,How long do these symptoms typically last?,A couple of days.,1,WHO::5d456f490d::CH0041,1.0,
9,Q::be034ca29c,How long do these symptoms typically last?,A couple of days.,0,WHO::7f7d82b63a::CH0094,0.0,top1_evidence_eval


In [30]:

faith_csv = Path(OUT_DIR) / "faithfulness_dataset.csv"
faith_df.to_csv(faith_csv, index=False)
print("Saved:", faith_csv)


Saved: /home/gulizhu/MDP/benchmark_data/coverage_faithfulness/faithfulness_dataset.csv


In [31]:

chunk_index_csv = Path(OUT_DIR) / "chunk_index.csv"
pd.DataFrame([{
    "chunk_id": c.chunk_id,
    "doc_id": c.doc_id,
    "source": next((d.source for d in (who_docs+tb_docs) if d.doc_id==c.doc_id), ""),
    "title": next((d.title for d in (who_docs+tb_docs) if d.doc_id==c.doc_id), ""),
    "url": next((d.url for d in (who_docs+tb_docs) if d.doc_id==c.doc_id), ""),
    "chars": len(c.text),
    "text": c.text
} for c in chunks]).to_csv(chunk_index_csv, index=False)

print("Saved:", chunk_index_csv)


Saved: /home/gulizhu/MDP/benchmark_data/coverage_faithfulness/chunk_index.csv
