# RAG Pipeline with BioLLM (Pluggable Retrievers & Models)

In [1]:
# Optional one-time setup: uncomment to download the BioGPT checkpoint into ./biogpt_local
# from huggingface_hub import snapshot_download
# snapshot_download(repo_id="microsoft/biogpt", local_dir="biogpt_local")

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the BioGPT tokenizer and causal-LM weights from a local checkpoint directory.
# NOTE(review): MODEL_PATH is assigned again in the "Config & Imports" cell and the
# tokenizer/model are loaded again under "5. Initialize Model", so this cell looks
# redundant — confirm before removing it.
MODEL_PATH = "/home/gulizhu/MDP/biogpt_local"   
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# === Config & Imports ===
import pandas as pd
import torch
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple

from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re, math
from collections import Counter, defaultdict

# Base directory that holds every input file and the local model checkpoint.
# Point this at your own data folder; all paths below are derived from it.
DATA_DIR = Path("/home/gulizhu/MDP")

# Paths to your data files
CSV_PATH = DATA_DIR / "combined_health_topics_with_source.csv"
TXT_PATH = DATA_DIR / "textbook_pathology.txt"
XLSX_PATH = DATA_DIR / "LLM Questions.xlsx"

# Model path (adjust to your BioLLM model); kept as a plain string, as before
MODEL_PATH = (DATA_DIR / "biogpt_local").as_posix()


## 1. Load Data (CSV + TXT + Excel QA)

In [4]:

# --- Load CSV (WHO topics) ---
# Read the topics table, expose its 'text' column under the name 'context',
# and tag every row with the "WHO" source label.
# NOTE(review): if the CSV already carries a 'source' column (the file name suggests
# it might), that column is overwritten here — confirm this is intended.
df_csv = (
    pd.read_csv(CSV_PATH)
    .rename(columns={"text": "context"})
    .assign(source="WHO")
)

# --- Load TXT (pathology textbook) and chunk ---
with open(TXT_PATH, "r", encoding="utf-8") as f:
    txt_content = f.read()

# Fixed-width character windows with no overlap; the final chunk may be shorter.
chunk_size = 800  # adjust as needed
txt_chunks = [
    txt_content[start:start + chunk_size]
    for start in range(0, len(txt_content), chunk_size)
]
df_txt = pd.DataFrame(
    [{"context": piece, "source": "textbook_pathology"} for piece in txt_chunks]
)

print("Textbook chunks:", len(df_txt))

# --- Load Excel QA ---
df_qa = pd.read_excel(XLSX_PATH)
# Normalize headers. str() guards against non-string headers (e.g. numeric column
# labels, which have no .lower()); strip() handles padded names such as "Question ".
df_qa = df_qa.rename(columns={c: str(c).strip().lower() for c in df_qa.columns})

# normalize question column: when "question" is absent, adopt the first known alias
if "question" not in df_qa.columns:
    for alias in ("q", "prompt", "ques"):
        if alias in df_qa.columns:
            df_qa = df_qa.rename(columns={alias: "question"})
            break
if "question" not in df_qa.columns:
    raise ValueError("Excel QA file must contain a question-like column")

# --- Combine knowledge sources ---
# Stack the WHO rows on top of the textbook chunks under a fresh 0..n-1 index.
docs_df = pd.concat(
    [df_csv.loc[:, ["context", "source"]], df_txt],
    ignore_index=True,
)
print("Knowledge base size:", len(docs_df))
docs_df.head(2)


Textbook chunks: 4759
Knowledge base size: 6044


Unnamed: 0,context,source
0,Common goods for health are population-based f...,WHO
1,The social determinants of health (SDH) are th...,WHO


In [5]:
import pandas as pd

# Inspect the QA table that the data-loading cell already read from XLSX_PATH and
# normalized. Re-reading "LLM Questions.xlsx" through a relative path here would
# depend on the working directory and would discard the column normalization.
print(df_qa.columns)
print(df_qa.head())


Index(['question'], dtype='object')
                                            question
0  What is the role of a pathologist in cancer di...
1  Which biomarkers are key in the analysis of br...
2  How does a pathologist prepare and analyze a t...
3  What are key features that a pathologist looks...
4  What is immunohistochemistry and how is it use...


## 2. Define Retrievers (TF-IDF, BM25)

In [6]:

class TFIDFRetriever:
    """Ranks documents by cosine similarity between TF-IDF vectors."""

    def __init__(self, docs: List[str]):
        """Fit a TF-IDF vocabulary on ``docs`` and keep the resulting document matrix."""
        self.docs = docs
        self.vectorizer = TfidfVectorizer(max_features=50000)
        self.doc_mat = self.vectorizer.fit_transform(docs)

    def search(self, query: str, k=5):
        """Return up to ``k`` ``(doc_index, similarity)`` pairs, best match first."""
        query_vec = self.vectorizer.transform([query])
        # cosine_similarity yields a 1 x n_docs matrix; row 0 holds the query's scores.
        similarities = cosine_similarity(query_vec, self.doc_mat)[0]
        ranked = similarities.argsort()[::-1]
        return [(int(doc_id), float(similarities[doc_id])) for doc_id in ranked[:k]]

class BM25Retriever:
    """BM25 lexical ranker over an in-memory list of documents."""

    def __init__(self, docs: List[str], k1=1.5, b=0.75):
        """Tokenize ``docs`` and precompute lengths, term frequencies and IDF weights.

        Args:
            docs: Documents to index.
            k1: Term-frequency saturation parameter.
            b: Document-length normalization strength.
        """
        self.docs = docs
        self.k1 = k1
        self.b = b
        # Tokens are maximal runs of word characters in the lower-cased text.
        self.tokenizer = re.compile(r"\w+").findall
        self.tokenized = [self.tokenizer(doc.lower()) for doc in docs]
        self.doc_lens = [len(tokens) for tokens in self.tokenized]
        self.N = len(docs)
        # max(1, ...) keeps the average defined (as 0.0) for an empty corpus.
        self.avgdl = sum(self.doc_lens) / max(1, len(self.doc_lens))

        # Per-document term counts, then the number of documents containing each term.
        self.tf = [Counter(tokens) for tokens in self.tokenized]
        doc_freq = Counter()
        for term_counts in self.tf:
            doc_freq.update(term_counts.keys())
        self.idf = {
            term: math.log(1 + (self.N - count + 0.5) / (count + 0.5))
            for term, count in doc_freq.items()
        }

    def _score(self, q_toks, idx):
        """Return the BM25 score of document ``idx`` for the tokenized query ``q_toks``."""
        term_counts = self.tf[idx]
        # The length normalization depends only on the document, not on the query term.
        length_norm = self.k1 * (1 - self.b + self.b * self.doc_lens[idx] / (self.avgdl or 1))
        total = 0.0
        for term in q_toks:
            weight = self.idf.get(term)
            if weight is None:  # term never occurs in the corpus
                continue
            freq = term_counts.get(term, 0)
            denom = freq + length_norm
            total += weight * (freq * (self.k1 + 1)) / (denom or 1e-12)
        return total

    def search(self, query: str, k=5):
        """Return the ``k`` best ``(doc_index, score)`` pairs, highest score first."""
        query_terms = self.tokenizer(query.lower())
        scored = ((doc_id, self._score(query_terms, doc_id)) for doc_id in range(self.N))
        # sorted() is stable, so equally scored documents stay in index order.
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        return ranked[:k]


## 3. BioLLM Backend (swappable with other models)

In [7]:

@dataclass
class Message:
    """One turn handed to an LLM backend."""
    role: str     # "user" turns carry the question; "system"/"tool" turns carry context
    content: str  # raw text of the turn


class BioLLMBackend:
    """Adapter exposing a Hugging Face causal LM + tokenizer as ``generate(messages)``."""

    def __init__(self, model, tokenizer, device="cuda"):
        """Move ``model`` to ``device`` and keep the tokenizer for prompt encoding."""
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.device = device

    def generate(self, messages: List[Message]) -> str:
        """Answer the most recent user message using system/tool messages as context.

        Args:
            messages: Conversation turns. The last ``user`` turn is the question;
                every ``system``/``tool`` turn is joined into the context.

        Returns:
            The generated continuation only (prompt excluded), whitespace-stripped.
            Sampling is enabled, so repeated calls may return different text.
        """
        query = next((m.content for m in reversed(messages) if m.role == "user"), "")
        context = "\n\n".join(m.content for m in messages if m.role in ("system", "tool"))
        # Cap the context by characters so the question and the "Answer:" cue
        # normally survive the token-level truncation applied below.
        context = context[:2000]
        prompt = f"Context:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=True,
                top_p=0.95,
                temperature=0.7
            )
        # A causal LM returns prompt tokens followed by the new tokens. Decode only
        # the new ones: splitting the full text on "Answer:" loses part of the answer
        # when the model itself emits "Answer:", and returns the entire prompt when
        # truncation removed the cue.
        prompt_len = inputs["input_ids"].shape[-1]
        new_tokens = outputs[0][prompt_len:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


## 4. RAG Pipeline

In [8]:

class SimpleRAG:
    """Minimal retrieve-then-generate pipeline over a DataFrame of text passages.

    NOTE: a later cell in this notebook defines ``SimpleRAG`` again (adding an
    embedding retriever); once that cell runs, it replaces this class.
    """

    def __init__(self, docs_df: pd.DataFrame, retriever="tfidf", llm=None):
        """Index the ``context`` column of ``docs_df`` with the chosen retriever.

        Args:
            docs_df: Frame with a ``context`` column holding the passages.
            retriever: ``"tfidf"`` or ``"bm25"``.
            llm: Backend exposing ``generate(messages)``; required by ``ask``.

        Raises:
            ValueError: If ``retriever`` is not a supported name.
        """
        self.df = docs_df.reset_index(drop=True)
        self.contexts = self.df["context"].astype(str).tolist()
        if retriever == "tfidf":
            self.retriever = TFIDFRetriever(self.contexts)
        elif retriever == "bm25":
            self.retriever = BM25Retriever(self.contexts)
        else:
            # Any unrecognized name (including typos) used to select BM25 silently.
            raise ValueError(f"Unknown retriever: {retriever}")
        self.llm = llm

    def ask(self, query: str, k=3):
        """Retrieve ``k`` passages for ``query`` and let the LLM answer from them.

        Returns:
            Dict with ``query``, ``context`` (first 500 characters of each hit,
            space-joined), ``answer`` and ``hits`` (``(doc_index, score)`` pairs).
        """
        hits = self.retriever.search(query, k)
        passages = [self.contexts[i] for i, _ in hits]
        msgs = [Message(role="tool", content=passage) for passage in passages]
        msgs.append(Message(role="user", content=query))
        ans = self.llm.generate(msgs)
        return {
            "query": query,
            "context": " ".join(passage[:500] for passage in passages),
            "answer": ans,
            "hits": hits
        }



## 5. Initialize Model

In [9]:

# Prefer the GPU but fall back to CPU, so the notebook still runs on machines
# without CUDA (BioLLMBackend's own default device is "cuda").
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
llm = BioLLMBackend(model, tokenizer, device=DEVICE)


## 6. Compare Outputs Across Retrievers / Models

In [10]:

def compare_answers(df_qa: pd.DataFrame, retrievers=["tfidf","bm25"], llms=[("biogpt", llm)], n=5):
    """Answer a sample of questions with every retriever/model combination.

    Args:
        df_qa: Frame with a ``question`` column.
        retrievers: Retriever names understood by ``SimpleRAG``.
        llms: ``(label, backend)`` pairs.
        n: Maximum number of questions to sample (fixed seed, so the sample is stable).

    Returns:
        DataFrame with one row per (question, retriever, model) and the columns
        ``question``, ``retriever``, ``model``, ``answer``.

    Reads the module-level ``docs_df`` as the knowledge base.
    """
    sample = df_qa.sample(min(n, len(df_qa)), random_state=0)
    # Constructing SimpleRAG re-indexes the whole knowledge base, and that work does
    # not depend on the question: build each pipeline once instead of once per question.
    pipelines = [
        (rname, lname, SimpleRAG(docs_df, retriever=rname, llm=lbackend))
        for rname in retrievers
        for lname, lbackend in llms
    ]
    rows = []
    for raw_question in sample["question"]:
        q = str(raw_question)
        for rname, lname, rag in pipelines:
            out = rag.ask(q, k=3)
            rows.append({"question": q, "retriever": rname, "model": lname, "answer": out["answer"]})
    return pd.DataFrame(rows)

# Run both lexical retrievers over up to 100 sampled questions and show the table.
results = compare_answers(
    df_qa,
    retrievers=["tfidf", "bm25"],
    llms=[("biollm", llm)],
    n=100,
)
results


Unnamed: 0,question,retriever,model,answer
0,How is pathology used in diagnosing soft tissu...,tfidf,biollm,Soft tissue sarcomas are a complex group of tu...
1,How is pathology used in diagnosing soft tissu...,bm25,biollm,"In this review, we have discussed the patholog..."
2,What is the importance of margins in pathology...,tfidf,biollm,The authors of this paper are the first to dis...
3,What is the importance of margins in pathology...,bm25,biollm,Total mastectomy is the best treatment for bre...
4,Describe fluorescence in situ hybridization (F...,tfidf,biollm,'How can we identify and quantify DNA and RNA ...
...,...,...,...,...
105,What is the role of a pathologist in cancer di...,bm25,biollm,A pathologist should be involved in making the...
106,What are the differences between ductal and lo...,tfidf,biollm,The biological markers for invasive breast car...
107,What are the differences between ductal and lo...,bm25,biollm,The breast carcinoma is a disease of the femal...
108,"What is the difference between sarcomas, carci...",tfidf,biollm,The distinction between the two groups of epit...


In [11]:
# Persist the TF-IDF vs. BM25 comparison table; index=False omits the row-index column.
results.to_csv("rag_results.csv", index=False, encoding="utf-8")

## Added: Embedding-based retrieval & comparison

In [12]:

# === Embedding backends config ===
# Each entry is (short label, model id). compare_answers() writes the label to the
# "embedding" column of its output and passes the model id to the embedding backend.
EMBED_MODELS = [
    ("minilm", "sentence-transformers/all-MiniLM-L6-v2"),
    ("bge-small", "BAAI/bge-small-en-v1.5"),
]


In [13]:

import numpy as np

class EmbeddingBackend:
    """Interface for the text-embedding models consumed by EmbeddingRetriever.

    Subclasses override both methods; the base versions only raise.
    """
    def embed_texts(self, texts):
        """Embed a batch of texts.

        The subclasses in this notebook return a NumPy array with one
        L2-normalized row per input text.
        """
        raise NotImplementedError
    def embed_query(self, text):
        """Embed a single query string.

        The subclasses in this notebook return the first row of ``embed_texts([text])``.
        """
        raise NotImplementedError

class SentenceTransformersEmbedding(EmbeddingBackend):
    """Embedding backend backed by a ``sentence_transformers.SentenceTransformer`` model."""

    def __init__(self, model_id: str, device: str = "cuda"):
        """Load ``model_id`` on ``device``.

        Raises:
            RuntimeError: If importing ``sentence_transformers`` fails.
        """
        # Imported lazily so the package is only required when this backend is used.
        try:
            from sentence_transformers import SentenceTransformer
        except Exception as e:
            raise RuntimeError("sentence-transformers not installed. pip install sentence-transformers") from e
        else:
            self.model = SentenceTransformer(model_id, device=device)

    def embed_texts(self, texts):
        """Encode ``texts`` into L2-normalized NumPy vectors, 64 texts per batch."""
        encode_options = dict(
            batch_size=64,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
        return self.model.encode(texts, **encode_options)

    def embed_query(self, text):
        """Encode one query by treating it as a batch of size one."""
        single_batch = self.embed_texts([text])
        return single_batch[0]

class HFMeanPoolingEmbedding(EmbeddingBackend):
    """Embedding backend that mean-pools the last hidden state of a Hugging Face ``AutoModel``.

    ``compare_answers`` uses it as the fallback when the sentence-transformers
    backend cannot be constructed.
    """

    def __init__(self, model_id: str, device: str = "cuda"):
        """Load the tokenizer and encoder for ``model_id`` and move the encoder to ``device``."""
        from transformers import AutoModel, AutoTokenizer
        self.tok = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModel.from_pretrained(model_id).to(device)
        self.device = device

    def _mean_pool(self, outputs, attention_mask):
        """Average ``outputs.last_hidden_state`` over dimension 1, ignoring masked positions.

        Returns a NumPy array on the CPU.
        """
        last_hidden = outputs.last_hidden_state
        # Broadcast the mask across the hidden dimension so padding contributes zero.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        masked = last_hidden * mask
        summed = masked.sum(1)
        # Clamp so a fully masked row cannot cause a division by zero.
        counts = mask.sum(1).clamp(min=1e-9)
        return (summed / counts).detach().cpu().numpy()

    def embed_texts(self, texts):
        """Embed ``texts`` in batches of 16 and return L2-normalized row vectors.

        Returns:
            NumPy array with one row per text. For empty input, an array with
            zero rows is returned instead of raising.
        """
        import torch
        import numpy as _np
        if len(texts) == 0:
            # np.vstack([]) raises ValueError, so short-circuit. The width is taken
            # from the model config when it exposes ``hidden_size``.
            dim = getattr(self.model.config, "hidden_size", 0)
            return _np.zeros((0, dim), dtype=_np.float32)
        all_vecs = []
        bs = 16
        for i in range(0, len(texts), bs):
            batch = texts[i:i+bs]
            enc = self.tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)
            with torch.no_grad():
                out = self.model(**enc)
            vecs = self._mean_pool(out, enc["attention_mask"])
            # The small epsilon keeps an all-zero vector from dividing by zero.
            vecs = vecs / (_np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-9)
            all_vecs.append(vecs)
        return _np.vstack(all_vecs)

    def embed_query(self, text):
        """Embed one query by treating it as a batch of size one."""
        return self.embed_texts([text])[0]

class EmbeddingRetriever:
    """Dense retriever that ranks documents by dot product with the query embedding.

    The backends defined in this notebook L2-normalize their vectors, so for them
    the dot product equals cosine similarity.
    """

    def __init__(self, docs, backend: EmbeddingBackend):
        """Embed the whole corpus once up front; ``search`` only embeds the query."""
        self.docs = docs
        self.backend = backend
        self.doc_vecs = self.backend.embed_texts(docs)

    def search(self, query: str, k=5):
        """Return up to ``k`` ``(doc_index, score)`` pairs, highest score first."""
        query_vec = self.backend.embed_query(query)
        scores = self.doc_vecs @ query_vec
        # Negating the scores turns the ascending argsort into best-first order.
        best_first = np.argsort(-scores)
        return [(int(doc_id), float(scores[doc_id])) for doc_id in best_first[:k]]


In [14]:

class SimpleRAG:
    """Retrieve-then-generate pipeline with a TF-IDF, BM25 or embedding retriever."""

    def __init__(self, docs_df: pd.DataFrame, retriever="tfidf", llm=None, embed_backend: EmbeddingBackend = None):
        """Index the ``context`` column of ``docs_df`` with the chosen retriever.

        Args:
            docs_df: Frame with a ``context`` column holding the passages.
            retriever: ``"tfidf"``, ``"bm25"`` or ``"embed"``.
            llm: Backend exposing ``generate(messages)``; required by ``ask``.
            embed_backend: Embedding backend; required when ``retriever="embed"``.

        Raises:
            ValueError: On an unknown retriever name, or when ``retriever="embed"``
                is requested without ``embed_backend``.
        """
        self.df = docs_df.reset_index(drop=True)
        self.contexts = self.df["context"].astype(str).tolist()
        self.llm = llm

        if retriever == "tfidf":
            self.retriever = TFIDFRetriever(self.contexts)
            self.retriever_name = "tfidf"
            self.embedding_name = "-"
        elif retriever == "bm25":
            self.retriever = BM25Retriever(self.contexts)
            self.retriever_name = "bm25"
            self.embedding_name = "-"
        elif retriever == "embed":
            if embed_backend is None:
                raise ValueError("embed_backend must be provided when retriever='embed'")
            self.retriever = EmbeddingRetriever(self.contexts, embed_backend)
            self.retriever_name = "embed"
            # Class name of the wrapped model when the backend exposes ``.model``,
            # otherwise of the backend itself. The previous fallback resolved to the
            # metaclass and therefore always produced "type".
            self.embedding_name = type(getattr(embed_backend, "model", embed_backend)).__name__
        else:
            raise ValueError(f"Unknown retriever: {retriever}")

    def ask(self, query: str, k=3):
        """Retrieve ``k`` passages for ``query`` and let the LLM answer from them.

        Each passage is cut to 2000 characters before it is sent to the LLM.

        Returns:
            Dict with ``query``, ``answer``, ``hits`` (``(doc_index, score)`` pairs)
            and ``context`` (first 500 characters of each hit, space-joined).
        """
        hits = self.retriever.search(query, k)
        msgs = [Message(role="tool", content=self.contexts[i][:2000]) for i,_ in hits]
        msgs.append(Message(role="user", content=query))
        ans = self.llm.generate(msgs)
        combined_ctx = " ".join(self.contexts[i][:500] for i,_ in hits)
        return {"query":query, "answer":ans, "hits":hits, "context": combined_ctx}


In [15]:

def compare_answers(df_qa: pd.DataFrame, retrievers, llms, embed_models=None, n=5, device="cuda"):
    """Answer a sample of questions with every model/retriever/embedding combination.

    Args:
        df_qa: Frame with a ``question`` column.
        retrievers: Retriever names; ``"embed"`` expands to one run per embedding model.
        llms: ``(label, backend)`` pairs.
        embed_models: ``(short label, model id)`` pairs; ``"embed"`` is skipped when empty.
        n: Maximum number of questions to sample (fixed seed, so the sample is stable).
        device: Device on which the embedding backends are created.

    Returns:
        DataFrame with the columns ``question``, ``retriever``, ``embedding``,
        ``model``, ``answer``, ``context``; rows are ordered by question, then
        model, then retriever, then embedding model.

    Reads the module-level ``docs_df`` as the knowledge base.
    """
    sample = df_qa.sample(min(n, len(df_qa)), random_state=0)

    def build_backend(model_id: str):
        # Best effort: prefer sentence-transformers, fall back to plain mean pooling.
        try:
            return SentenceTransformersEmbedding(model_id, device=device)
        except Exception:
            return HFMeanPoolingEmbedding(model_id, device=device)

    # Loading an embedding model and indexing/embedding the knowledge base do not
    # depend on the question, so every pipeline is built exactly once, in output order.
    # All pipelines stay in memory until the function returns.
    backends = {}
    pipelines = []  # (model label, retriever label, embedding label, SimpleRAG)
    for lname, lbackend in llms:
        for rname in retrievers:
            if rname != "embed":
                rag = SimpleRAG(docs_df, retriever=rname, llm=lbackend)
                pipelines.append((lname, rname, "-", rag))
                continue
            for embed_short, embed_id in (embed_models or []):
                if embed_id not in backends:
                    backends[embed_id] = build_backend(embed_id)
                rag = SimpleRAG(docs_df, retriever="embed", llm=lbackend, embed_backend=backends[embed_id])
                pipelines.append((lname, "embed", embed_short, rag))

    rows = []
    for raw_question in sample["question"]:
        q = str(raw_question)
        for lname, rname, embed_short, rag in pipelines:
            out = rag.ask(q, k=3)
            rows.append({
                "question": q,
                "retriever": rname,
                "embedding": embed_short,
                "model": lname,
                "answer": out["answer"],
                "context": out["context"]
            })
    return pd.DataFrame(rows)


In [None]:

# Compare across TF-IDF, BM25, and Embedding retrievers; include embedding column
results = compare_answers(
    df_qa,
    retrievers=["tfidf", "bm25", "embed"],
    llms=[("biollm", llm)],
    embed_models=EMBED_MODELS,
    n=5
)

# Save
results.to_csv("rag_results_with_embeddings.csv", index=False, encoding="utf-8")
print("Saved -> rag_results_with_embeddings.csv")

# Show the table. Only the last expression of a cell is rendered, so the bare
# `results` that used to sit before the save step never produced any output.
results


Saved -> rag_results_with_embeddings.csv


: 