In [9]:
#sk-9788b084799748b9ac49471f46225d8f

In [None]:
# === Cell 1: Load BioLLM weights & base deps ===
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_PATH = "/home/gulizhu/MDP/biogpt_local"   
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

# === Config & Imports ===
import pandas as pd
import torch
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple

from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re, math, numpy as np
from collections import Counter, defaultdict


In [None]:
# === Cell 2: Load sources & QA ===

# Paths to your data files
CSV_PATH = Path("/home/gulizhu/MDP/combined_health_topics_with_source.csv")  
TXT_PATH = Path("/home/gulizhu/MDP/textbook_pathology.txt")                  
XLSX_PATH = Path("/home/gulizhu/MDP/LLM Questions.xlsx")                     

# --- Load CSV (WHO topics) ---
df_csv = pd.read_csv(CSV_PATH)
df_csv = df_csv.rename(columns={"text": "context"})
df_csv["source"] = "WHO"

# --- Load TXT (pathology textbook) and chunk ---
with open(TXT_PATH, "r", encoding="utf-8") as f:
    txt_content = f.read()

chunk_size = 800  # adjust as needed
txt_chunks = [txt_content[i:i+chunk_size] for i in range(0, len(txt_content), chunk_size)]
df_txt = pd.DataFrame([{"context": chunk, "source": "textbook_pathology"} for chunk in txt_chunks])

print("Textbook chunks:", len(df_txt))

# --- Load Excel QA ---
df_qa = pd.read_excel(XLSX_PATH)
df_qa = df_qa.rename(columns={c: c.lower() for c in df_qa.columns})

if "question" not in df_qa.columns:
    if "q" in df_qa.columns:       df_qa = df_qa.rename(columns={"q": "question"})
    elif "prompt" in df_qa.columns: df_qa = df_qa.rename(columns={"prompt": "question"})
    elif "ques" in df_qa.columns:   df_qa = df_qa.rename(columns={"ques": "question"})
if "question" not in df_qa.columns:
    raise ValueError("Excel QA file must contain a 'question' column")

# --- Combine knowledge sources ---
docs_df = pd.concat([df_csv[["context","source"]], df_txt], ignore_index=True)
print("Knowledge base size:", len(docs_df))
display(docs_df.head(2))


Textbook chunks: 4759
Knowledge base size: 6044


Unnamed: 0,context,source
0,Common goods for health are population-based f...,WHO
1,The social determinants of health (SDH) are th...,WHO


In [12]:
# === Cell 3: Retrievers, Embeddings, RAG backend (your code, consolidated) ===

# Quick peek of QA
print(df_qa.columns)
display(df_qa.head())

class TFIDFRetriever:
    def __init__(self, docs: List[str]):
        self.vectorizer = TfidfVectorizer(max_features=50000)
        self.doc_mat = self.vectorizer.fit_transform(docs)
        self.docs = docs
    def search(self, query: str, k=5):
        q_vec = self.vectorizer.transform([query])
        sims = cosine_similarity(q_vec, self.doc_mat)[0]
        idxs = sims.argsort()[::-1][:k]
        return [(int(i), float(sims[i])) for i in idxs]

class BM25Retriever:
    def __init__(self, docs: List[str], k1=1.5, b=0.75):
        self.docs = docs
        self.k1, self.b = k1, b
        self.tokenizer = re.compile(r"\w+").findall
        self.tokenized = [self.tokenizer(d.lower()) for d in docs]
        self.doc_lens = [len(t) for t in self.tokenized]
        self.avgdl = sum(self.doc_lens)/max(1,len(self.doc_lens))
        df = defaultdict(int)
        for toks in self.tokenized:
            for w in set(toks):
                df[w]+=1
        self.N = len(docs)
        self.idf = {w: math.log(1+(self.N-c+0.5)/(c+0.5)) for w,c in df.items()}
        self.tf = [Counter(toks) for toks in self.tokenized]
    def _score(self, q_toks, idx):
        score=0.0; dl=self.doc_lens[idx]; tf_d=self.tf[idx]
        for w in q_toks:
            if w not in self.idf: continue
            idf=self.idf[w]; f=tf_d.get(w,0)
            denom=f+self.k1*(1-self.b+self.b*dl/(self.avgdl or 1))
            score+=idf*(f*(self.k1+1))/(denom or 1e-12)
        return score
    def search(self, query:str,k=5):
        q_toks=self.tokenizer(query.lower())
        scores=[(i,self._score(q_toks,i)) for i in range(self.N)]
        scores.sort(key=lambda x:x[1], reverse=True)
        return scores[:k]

@dataclass
class Message:
    role: str
    content: str

class BioLLMBackend:
    def __init__(self, model, tokenizer, device=None):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.device = device
    def generate(self, messages: List[Message]) -> str:
        query = next((m.content for m in messages[::-1] if m.role == "user"), "")
        context = "\n\n".join(m.content for m in messages if m.role in ("system", "tool"))
        context = context[:2000]
        prompt = f"Context:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs, max_new_tokens=256, do_sample=True, top_p=0.95, temperature=0.7
            )
        raw = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = raw.split("Answer:")[-1].strip()
        return answer

# Embedding backends config
EMBED_MODELS = [
    ("minilm", "sentence-transformers/all-MiniLM-L6-v2"),
    ("bge-small", "BAAI/bge-small-en-v1.5"),
]

class EmbeddingBackend:
    def embed_texts(self, texts): raise NotImplementedError
    def embed_query(self, text):  raise NotImplementedError

class SentenceTransformersEmbedding(EmbeddingBackend):
    def __init__(self, model_id: str, device: str = None):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer(model_id, device=device)
    def embed_texts(self, texts):
        vecs = self.model.encode(texts, batch_size=64, show_progress_bar=False,
                                 convert_to_numpy=True, normalize_embeddings=True)
        return vecs
    def embed_query(self, text):
        return self.embed_texts([text])[0]

class HFMeanPoolingEmbedding(EmbeddingBackend):
    def __init__(self, model_id: str, device: str = None):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        from transformers import AutoModel, AutoTokenizer
        self.tok = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModel.from_pretrained(model_id).to(device)
        self.device = device
    def _mean_pool(self, outputs, attention_mask):
        last_hidden = outputs.last_hidden_state
        mask = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        summed = (last_hidden * mask).sum(1)
        counts = mask.sum(1).clamp(min=1e-9)
        return (summed / counts).detach().cpu().numpy()
    def embed_texts(self, texts):
        import torch, numpy as _np
        all_vecs = []
        bs = 16
        for i in range(0, len(texts), bs):
            batch = texts[i:i+bs]
            enc = self.tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)
            with torch.no_grad():
                out = self.model(**enc)
            vecs = self._mean_pool(out, enc["attention_mask"])
            vecs = vecs / (_np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-9)
            all_vecs.append(vecs)
        return _np.vstack(all_vecs)
    def embed_query(self, text):
        return self.embed_texts([text])[0]

class EmbeddingRetriever:
    def __init__(self, docs, backend: EmbeddingBackend):
        self.docs = docs
        self.backend = backend
        self.doc_vecs = self.backend.embed_texts(docs)
    def search(self, query: str, k=5):
        q = self.backend.embed_query(query)
        sims = (self.doc_vecs @ q)
        idxs = np.argsort(-sims)[:k]
        return [(int(i), float(sims[i])) for i in idxs]

class HybridRetriever:
    """lexical shortlist (TF-IDF/BM25) -> embedding re-rank"""
    def __init__(self, docs, base: str, embed_backend: EmbeddingBackend, top_m: int = 50):
        self.docs = docs
        self.top_m = top_m
        self.embed_backend = embed_backend
        if base == "tfidf": self.base = TFIDFRetriever(docs)
        elif base == "bm25": self.base = BM25Retriever(docs)
        else: raise ValueError("base must be 'tfidf' or 'bm25'")
        self.doc_vecs = self.embed_backend.embed_texts(docs)
    def search(self, query: str, k=5):
        base_hits = self.base.search(query, k=self.top_m)
        cand_idxs = [i for i,_ in base_hits]
        q_vec = self.embed_backend.embed_query(query)
        sims = (self.doc_vecs[cand_idxs] @ q_vec)
        order = np.argsort(-sims)[:k]
        return [(int(cand_idxs[i]), float(sims[i])) for i in order]

class SimpleRAG:
    def __init__(self, docs_df: pd.DataFrame, retriever="tfidf", llm=None,
                 embed_backend: EmbeddingBackend = None, hybrid_top_m: int = 50):
        self.df = docs_df.reset_index(drop=True)
        self.contexts = self.df["context"].astype(str).tolist()
        self.llm = llm
        if retriever == "tfidf":
            self.retriever = TFIDFRetriever(self.contexts); self.retriever_name="tfidf"; self.embedding_name="-"
        elif retriever == "bm25":
            self.retriever = BM25Retriever(self.contexts);  self.retriever_name="bm25";  self.embedding_name="-"
        elif retriever == "embed":
            if embed_backend is None: raise ValueError("embed_backend required for retriever='embed'")
            self.retriever = EmbeddingRetriever(self.contexts, embed_backend)
            self.retriever_name="embed"; self.embedding_name=str(embed_backend.__class__.__name__)
        elif retriever in ("hybrid_tfidf", "hybrid_bm25"):
            if embed_backend is None: raise ValueError("embed_backend required for hybrid")
            base = "tfidf" if retriever=="hybrid_tfidf" else "bm25"
            self.retriever = HybridRetriever(self.contexts, base=base, embed_backend=embed_backend, top_m=hybrid_top_m)
            self.retriever_name=retriever; self.embedding_name=str(embed_backend.__class__.__name__)
        else:
            raise ValueError(f"Unknown retriever: {retriever}")
    def ask(self, query: str, k=3):
        hits = self.retriever.search(query, k)
        msgs=[Message(role="tool", content=self.contexts[i][:2000]) for i,_ in hits]
        msgs.append(Message(role="user", content=query))
        ans = self.llm.generate(msgs)
        return {"query":query,
                "answer":ans,
                "hits":hits,
                "context":" ".join(self.contexts[i][:500] for i,_ in hits)}


Index(['question'], dtype='object')


Unnamed: 0,question
0,What is the role of a pathologist in cancer di...
1,Which biomarkers are key in the analysis of br...
2,How does a pathologist prepare and analyze a t...
3,What are key features that a pathologist looks...
4,What is immunohistochemistry and how is it use...


In [None]:
# === Cell 4: Instantiate LLM backend & quick sanity (optional) ===
llm = BioLLMBackend(model, tokenizer)

def compare_answers(df_qa: pd.DataFrame, retrievers=["tfidf","bm25"],
                    llms=[("biollm", None)], n=5):
    if llms[0][1] is None:
        llms = [("biollm", llm)]
    sample = df_qa.sample(min(n, len(df_qa)), random_state=0)
    rows=[]
    for _,row in sample.iterrows():
        q = str(row["question"])
        for rname in retrievers:
            rag = SimpleRAG(docs_df, retriever=rname, llm=llm)
            out = rag.ask(q, k=3)
            rows.append({"question":q,"retriever":rname,"model":"biollm","answer":out["answer"]})
    return pd.DataFrame(rows)

_ = compare_answers(df_qa, retrievers=["tfidf","bm25"], llms=[("biollm", llm)], n=3)
display(_)


Unnamed: 0,question,retriever,model,answer
0,How is pathology used in diagnosing soft tissu...,tfidf,biollm,"We review the literature on the clinical, hist..."
1,How is pathology used in diagnosing soft tissu...,bm25,biollm,An update of the classification of soft tissue...
2,What is the importance of margins in pathology...,tfidf,biollm,"""For Wilms tumor, margins should be at least 2..."
3,What is the importance of margins in pathology...,bm25,biollm,What is the significance of margins after brea...
4,Describe fluorescence in situ hybridization (F...,tfidf,biollm,• FISH is a molecular technique that provides ...
5,Describe fluorescence in situ hybridization (F...,bm25,biollm,a primer: What is the role of fluorescence in ...


In [None]:
# === Cell 5: Load two-dimension benchmark files ===
BENCH_DIR = Path("/home/gulizhu/MDP/benchmark_data/coverage_faithfulness")  
COVERAGE_CSV = BENCH_DIR / "coverage_dataset.csv"
FAITH_CSV    = BENCH_DIR / "faithfulness_dataset.csv"
CHUNK_INDEX  = BENCH_DIR / "chunk_index.csv"

assert COVERAGE_CSV.exists() and FAITH_CSV.exists() and CHUNK_INDEX.exists(), "benchmark files missing"

coverage_df = pd.read_csv(COVERAGE_CSV)  
faith_df    = pd.read_csv(FAITH_CSV)     
chunk_idx   = pd.read_csv(CHUNK_INDEX)    

print("coverage rows =", len(coverage_df))
print("faith rows    =", len(faith_df))
print("chunks        =", len(chunk_idx))

chunk_text = dict(zip(chunk_idx["chunk_id"], chunk_idx["text"]))
chunk_doc  = dict(zip(chunk_idx["chunk_id"], chunk_idx["doc_id"]))


coverage rows = 458
faith rows    = 919
chunks        = 139943


In [None]:
# === Cell 6 (FIXED): Coverage evaluation that USES each retriever's own top-K ===
import hashlib
from functools import lru_cache
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _norm_text(s: str) -> str:
    s = re.sub(r"\s+", " ", str(s)).strip().lower()
    return s

def _hash_text(s: str) -> str:
    return hashlib.sha1(_norm_text(s).encode("utf-8")).hexdigest()

_CHUNK_TFIDF = {}
_CHUNK_TFIDF["vec"] = TfidfVectorizer(max_features=120000, ngram_range=(1,2))
_CHUNK_TFIDF["X"]   = _CHUNK_TFIDF["vec"].fit_transform(chunk_idx["text"].astype(str).tolist())
_CHUNK_TFIDF["ids"] = chunk_idx["chunk_id"].astype(str).tolist()
_CHUNK_ID_POS = {cid: i for i, cid in enumerate(_CHUNK_TFIDF["ids"])}
_chunk_hash_to_id = {}
for cid, ctext in zip(chunk_idx["chunk_id"].astype(str), chunk_idx["text"].astype(str)):
    _chunk_hash_to_id[_hash_text(ctext)] = cid

_DOCIDX_TO_CHUNKID = {}

@lru_cache(maxsize=None)
def _map_doc_text_to_chunk_id(doc_text_norm: str) -> str:
    h = hashlib.sha1(doc_text_norm.encode("utf-8")).hexdigest()
    if h in _chunk_hash_to_id:
        return _chunk_hash_to_id[h]
    qv = _CHUNK_TFIDF["vec"].transform([doc_text_norm])
    sim = cosine_similarity(qv, _CHUNK_TFIDF["X"])[0]
    j = int(np.argmax(sim))
    return _CHUNK_TFIDF["ids"][j]

def map_doc_index_to_chunk_id(doc_idx: int) -> str:

    if doc_idx in _DOCIDX_TO_CHUNKID:
        return _DOCIDX_TO_CHUNKID[doc_idx]
    raw = str(docs_df.iloc[doc_idx]["context"])
    cid = _map_doc_text_to_chunk_id(_norm_text(raw))
    _DOCIDX_TO_CHUNKID[doc_idx] = cid
    return cid

def _tokset(s: str):
    return {t.lower() for t in re.findall(r"\b\w+\b", str(s)) if len(t)>3}

def build_embed_backend(model_id: str, device=None):
    try:
        return SentenceTransformersEmbedding(model_id, device=device)
    except Exception:
        return HFMeanPoolingEmbedding(model_id, device=device)

RETRIEVER_MATRIX = [
    ("tfidf",        None),                                        
    ("bm25",         None),                                        
    ("embed",        "sentence-transformers/all-MiniLM-L6-v2"),    
    ("embed",        "BAAI/bge-small-en-v1.5"),
    ("hybrid_tfidf", "sentence-transformers/all-MiniLM-L6-v2"),    
    ("hybrid_bm25",  "BAAI/bge-small-en-v1.5"),                    
]

K = 30  

def eval_coverage_for_combo(retriever_name: str, embed_model_id: str|None):
    if embed_model_id is None:
        rag = SimpleRAG(docs_df, retriever=retriever_name, llm=llm)
        embed_short = "-"
    else:
        be = build_embed_backend(embed_model_id)
        rag = SimpleRAG(docs_df, retriever=retriever_name, llm=llm, embed_backend=be)
        embed_short = embed_model_id.split("/")[-1]

    rows = []
    for _, r in coverage_df.iterrows():
        qid, q, ans = r["qid"], str(r["question"]), str(r["answer"])
        gt_doc, gt_chunk = str(r["gt_doc_id"]), str(r["gt_chunk_id"])

        hits = rag.retriever.search(q, k=K)              # [(doc_idx_in_docs_df, score)]
        doc_indices = [i for (i, _) in hits]

        top_chunk_ids = [map_doc_index_to_chunk_id(i) for i in doc_indices]

        hit_doc   = int(any(str(cid).startswith(gt_doc) for cid in top_chunk_ids))
        hit_chunk = int(gt_chunk in top_chunk_ids)

        ctx = " \n\n".join([chunk_text.get(cid, "") for cid in top_chunk_ids])
        A = _tokset(ans); C = _tokset(ctx)
        ctx_recall = len(A & C) / (len(A) + 1e-9)

        rows.append({
            "qid": qid,
            "hit_doc@K": hit_doc,
            "hit_chunk@K": hit_chunk,
            "context_recall@K": ctx_recall
        })

    df = pd.DataFrame(rows)
    return {
        "retriever": retriever_name,
        "embedding": embed_short,
        "K": K,
        "Doc-Hit@K": float(df["hit_doc@K"].mean()),
        "Chunk-Hit@K": float(df["hit_chunk@K"].mean()),
        "ContextRecall@K": float(df["context_recall@K"].mean()),
        "detail": df
    }

coverage_summary = []
coverage_details = {}

for retriever_name, embed_model_id in RETRIEVER_MATRIX:
    res = eval_coverage_for_combo(retriever_name, embed_model_id)
    coverage_summary.append({k: res[k] for k in ["retriever","embedding","K","Doc-Hit@K","Chunk-Hit@K","ContextRecall@K"]})
    key = f"{res['retriever']}__{res['embedding']}"
    coverage_details[key] = res["detail"]

coverage_table = pd.DataFrame(coverage_summary).sort_values(
    ["Doc-Hit@K","Chunk-Hit@K","ContextRecall@K"], ascending=False
)
display(coverage_table)


Unnamed: 0,retriever,embedding,K,Doc-Hit@K,Chunk-Hit@K,ContextRecall@K
1,bm25,-,10,0.358079,0.028384,0.503284
4,hybrid_tfidf,all-MiniLM-L6-v2,10,0.299127,0.034934,0.50372
0,tfidf,-,10,0.296943,0.034934,0.475534
5,hybrid_bm25,bge-small-en-v1.5,10,0.29476,0.034934,0.535352
2,embed,all-MiniLM-L6-v2,10,0.229258,0.034934,0.536444
3,embed,bge-small-en-v1.5,10,0.216157,0.034934,0.553757


In [16]:
# === Cell 7a: Faithfulness to retrieved evidence mapped to chunk_index ===
def token_recall(pred, evid):
    A = {t.lower() for t in re.findall(r"\b\w+\b", str(pred)) if len(t)>3}
    E = {t.lower() for t in re.findall(r"\b\w+\b", str(evid)) if len(t)>3}
    return len(A & E) / (len(A) + 1e-9)

def eval_faithfulness_retrieved_chunks(retriever_name: str, embed_model_id: str|None,
                                       K_gen=3, N=None):
    if embed_model_id is None:
        rag = SimpleRAG(docs_df, retriever=retriever_name, llm=llm)
        embed_short = "-"
    else:
        be = build_embed_backend(embed_model_id)
        rag = SimpleRAG(docs_df, retriever=retriever_name, llm=llm, embed_backend=be)
        embed_short = embed_model_id.split("/")[-1]

    gold = faith_df[faith_df["note"].isna()].copy() 
    if N is not None and len(gold) > N:
        gold = gold.sample(N, random_state=0)

    rows=[]
    for _, r in gold.iterrows():
        qid, q = r["qid"], str(r["question"])
        out = rag.ask(q, k=K_gen)                         
        ans = out["answer"]
        doc_idxs = [i for i,_ in out["hits"]]             
        chunk_ids = [map_doc_index_to_chunk_id(i) for i in doc_idxs]
        evid = " ".join(chunk_text[cid][:500] for cid in chunk_ids)
        score = token_recall(ans, evid)
        rows.append({
            "qid": qid, "question": q,
            "retriever": retriever_name, "embedding": embed_short,
            "answer": ans, "evidence_chunks": "|".join(chunk_ids),
            "faithfulness_score_retrieved": score
        })
    return pd.DataFrame(rows)

faith_out_retr = []
for retriever_name, embed_model_id in RETRIEVER_MATRIX:
    df = eval_faithfulness_retrieved_chunks(retriever_name, embed_model_id,
                                            K_gen=3, N=None)
    faith_out_retr.append(df)

faith_retrieved = pd.concat(faith_out_retr, ignore_index=True)
display(faith_retrieved.head())
print("Avg faithfulness (to retrieved evidence) by combo:")
display(faith_retrieved.groupby(["retriever","embedding"])["faithfulness_score_retrieved"]
        .mean().reset_index().sort_values("faithfulness_score_retrieved", ascending=False))


Unnamed: 0,qid,question,retriever,embedding,answer,evidence_chunks,faithfulness_score_retrieved
0,Q::b7b927d779,What is the purpose of the assistance mentioned?,tfidf,-,"In the following, we present a review of the m...",TEXTBOOK::0001::CH0052|TEXTBOOK::0007::CH0152|...,0.157895
1,Q::a4aedf3d70,What does the acronym NIH stand for?,tfidf,-,...,TEXTBOOK::0001::CH0052|WHO::3b019d8021::CH0002...,0.0
2,Q::1a6c0caae5,What is the full name of the NIDCD?,tfidf,-,The major etiological factor responsible for t...,TEXTBOOK::0001::CH0052|TEXTBOOK::0006::CH1693|...,0.444444
3,Q::4d04fecd09,What is the full name of the organization abbr...,tfidf,-,What is the full name of the CDC?,TEXTBOOK::0003::CH0226|TEXTBOOK::0003::CH0223|...,0.333333
4,Q::be034ca29c,How long do these symptoms typically last?,tfidf,-,What is the best treatment for them?,WHO::0363a8d07f::CH0000|WHO::acbab8c20b::CH000...,0.5


Avg faithfulness (to retrieved evidence) by combo:


Unnamed: 0,retriever,embedding,faithfulness_score_retrieved
0,bm25,-,0.475341
3,hybrid_bm25,bge-small-en-v1.5,0.45795
2,embed,bge-small-en-v1.5,0.425229
1,embed,all-MiniLM-L6-v2,0.408902
5,tfidf,-,0.394703
4,hybrid_tfidf,all-MiniLM-L6-v2,0.392703


In [17]:
# === Cell 7b: Faithfulness to GOLD evidence (labels in faithfulness_dataset.csv) ===
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

def eval_faithfulness_gold_evidence(retriever_name: str, embed_model_id: str|None,
                                    K_gen=3, N=None, threshold=0.5):
    if embed_model_id is None:
        rag = SimpleRAG(docs_df, retriever=retriever_name, llm=llm)
        embed_short = "-"
    else:
        be = build_embed_backend(embed_model_id)
        rag = SimpleRAG(docs_df, retriever=retriever_name, llm=llm, embed_backend=be)
        embed_short = embed_model_id.split("/")[-1]

    gold = faith_df[faith_df["note"].isna()].copy()  
    if N is not None and len(gold) > N:
        gold = gold.sample(N, random_state=0)

    scores, labels, rows = [], [], []
    for _, r in gold.iterrows():
        qid, q = r["qid"], str(r["question"])
        ans = rag.ask(q, k=K_gen)["answer"]  
        evid_cid = str(r["evidence_chunk_id"])
        evid = chunk_text.get(evid_cid, "")
        s = token_recall(ans, evid)          
        scores.append(float(s))
        labels.append(int(r["label_faithful"]))
        rows.append({
            "qid": qid, "question": q, "retriever": retriever_name,
            "embedding": embed_short, "answer": ans,
            "gold_evidence_chunk": evid_cid, "faithfulness_score_gold": float(s),
            "label": int(r["label_faithful"])
        })

    df = pd.DataFrame(rows)
    auc = roc_auc_score(labels, scores) if len(set(labels))>1 else float("nan")
    pred = [s>=threshold for s in scores]
    acc = accuracy_score(labels, pred)
    f1  = f1_score(labels, pred)
    summary = {"retriever": retriever_name, "embedding": embed_short,
               "AUC": auc, "ACC": acc, "F1": f1, "threshold": threshold}
    return df, summary

gold_rows, gold_summ = [], []
for retriever_name, embed_model_id in RETRIEVER_MATRIX:
    df, summ = eval_faithfulness_gold_evidence(retriever_name, embed_model_id,
                                               K_gen=3, N=None, threshold=0.5)
    gold_rows.append(df); gold_summ.append(summ)

faith_gold = pd.concat(gold_rows, ignore_index=True)
faith_gold_summary = pd.DataFrame(gold_summ).sort_values("AUC", ascending=False)

display(faith_gold.head())
display(faith_gold_summary)


Unnamed: 0,qid,question,retriever,embedding,answer,gold_evidence_chunk,faithfulness_score_gold,label
0,Q::b7b927d779,What is the purpose of the assistance mentioned?,tfidf,-,Answer yes!,WHO::def5effffe::CH0075,0.0,1
1,Q::a4aedf3d70,What does the acronym NIH stand for?,tfidf,-,"inactivity or inactivity, inactivity, low acti...",WHO::ba091c3aa0::CH0022,0.0,1
2,Q::1a6c0caae5,What is the full name of the NIDCD?,tfidf,-,We should learn about the nosology of NIDCD an...,WHO::ba091c3aa0::CH0022,0.0,1
3,Q::4d04fecd09,What is the full name of the organization abbr...,tfidf,-,We have to recognize the clinical importance o...,WHO::c746a8289b::CH0048,0.090909,1
4,Q::be034ca29c,How long do these symptoms typically last?,tfidf,-,Do you have them?,WHO::5d456f490d::CH0041,0.0,1


Unnamed: 0,retriever,embedding,AUC,ACC,F1,threshold
5,hybrid_bm25,bge-small-en-v1.5,0.47671,0.041215,0.071429,0.5
0,tfidf,-,0.471616,0.030369,0.050955,0.5
3,embed,bge-small-en-v1.5,0.388646,0.041215,0.067511,0.5
1,bm25,-,0.345706,0.032538,0.051064,0.5
4,hybrid_tfidf,all-MiniLM-L6-v2,0.326419,0.043384,0.071579,0.5
2,embed,all-MiniLM-L6-v2,0.312955,0.0282,0.046809,0.5


In [None]:

try:
    from transformers import pipeline as hf_pipeline
    DEVICE = 0 if torch.cuda.is_available() else -1
    _nli = hf_pipeline("text-classification", model="microsoft/deberta-v3-base-mnli", device=DEVICE)
    def nli_entail_prob(answer: str, evidence: str) -> float:
        inp = f"premise: {evidence}\nhypothesis: {answer}"
        out = _nli(inp, truncation=True)[0]
        return float(out["score"]) if "ENTAIL" in out["label"].upper() else 1.0 - float(out["score"])
    USE_NLI = True
    print("NLI judge loaded.")
except Exception as e:
    print("NLI judge not available:", repr(e))
    USE_NLI = False

if USE_NLI:
    rows=[]
    for _, r in faith_result.iterrows():
        score = nli_entail_prob(r["answer"], r.get("evidence", r.get("context","")))
        rows.append(score)
    faith_result["faithfulness_nli"] = rows
    display(faith_result.head())
    print("Avg NLI faithfulness by combo:")
    display(faith_result.groupby(["retriever","embedding"])["faithfulness_nli"].mean().reset_index().sort_values("faithfulness_nli", ascending=False))
