# RAG Pipeline with BioLLM (Pluggable Retrievers & Models)

In [None]:
from huggingface_hub import snapshot_download


snapshot_download(repo_id="microsoft/biogpt", local_dir="biogpt_local")


Fetching 6 files: 100%|██████████| 6/6 [00:13<00:00,  2.28s/it]


'/home/gulizhu/MDP/biogpt_local'

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_PATH = "/home/gulizhu/MDP/biogpt_local"   
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)


In [15]:

# === Config & Imports ===
import pandas as pd
import torch
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple

from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re, math
from collections import Counter, defaultdict

# Paths to your data files
CSV_PATH = Path("/home/gulizhu/MDP/combined_health_topics_with_source.csv")
TXT_PATH = Path("/home/gulizhu/MDP/textbook_pathology.txt")
XLSX_PATH = Path("/home/gulizhu/MDP/LLM Questions.xlsx")

# Model path (adjust to your BioLLM model)
MODEL_PATH = "/home/gulizhu/MDP/biogpt_local"  


## 1. Load Data (CSV + TXT + Excel QA)

In [16]:

# --- Load CSV ---
df_csv = pd.read_csv(CSV_PATH)

# --- Load TXT ---
with open(TXT_PATH, "r", encoding="utf-8") as f:
    txt_content = f.read()
df_txt = pd.DataFrame([{"context": txt_content, "source": "textbook_pathology"}])

# --- Load Excel QA ---
df_qa = pd.read_excel(XLSX_PATH)

# Normalize QA columns (expect 'question', 'answer' at least)
df_qa = df_qa.rename(columns={c: c.lower() for c in df_qa.columns})
if "question" not in df_qa.columns:
    raise ValueError("Excel QA file must contain a 'question' column")

# --- Combine knowledge sources ---
if "context" not in df_csv.columns:
    # assume one column holds text (choose first non-id column)
    text_col = [c for c in df_csv.columns if c not in ["id","source"]][0]
    df_csv = df_csv.rename(columns={text_col: "context"})
df_csv["source"] = "health_topics"

docs_df = pd.concat([df_csv[["context","source"]], df_txt], ignore_index=True)
print("Knowledge base size:", len(docs_df))
docs_df.head(2)


Knowledge base size: 1286


Unnamed: 0,context,source
0,Common goods for health,health_topics
1,Social determinants of health,health_topics


In [17]:
import pandas as pd

df_qa = pd.read_excel("LLM Questions.xlsx")
print(df_qa.columns)
print(df_qa.head())


Index(['question'], dtype='object')
                                            question
0  What is the role of a pathologist in cancer di...
1  Which biomarkers are key in the analysis of br...
2  How does a pathologist prepare and analyze a t...
3  What are key features that a pathologist looks...
4  What is immunohistochemistry and how is it use...


## 2. Define Retrievers (TF-IDF, BM25)

In [18]:

class TFIDFRetriever:
    def __init__(self, docs: List[str]):
        self.vectorizer = TfidfVectorizer(max_features=50000)
        self.doc_mat = self.vectorizer.fit_transform(docs)
        self.docs = docs

    def search(self, query: str, k=5):
        q_vec = self.vectorizer.transform([query])
        sims = cosine_similarity(q_vec, self.doc_mat)[0]
        idxs = sims.argsort()[::-1][:k]
        return [(int(i), float(sims[i])) for i in idxs]

class BM25Retriever:
    def __init__(self, docs: List[str], k1=1.5, b=0.75):
        self.docs = docs
        self.k1, self.b = k1, b
        self.tokenizer = re.compile(r"\w+").findall
        self.tokenized = [self.tokenizer(d.lower()) for d in docs]
        self.doc_lens = [len(t) for t in self.tokenized]
        self.avgdl = sum(self.doc_lens)/max(1,len(self.doc_lens))
        df = defaultdict(int)
        for toks in self.tokenized:
            for w in set(toks):
                df[w]+=1
        self.N = len(docs)
        self.idf = {w: math.log(1+(self.N-c+0.5)/(c+0.5)) for w,c in df.items()}
        self.tf = [Counter(toks) for toks in self.tokenized]

    def _score(self, q_toks, idx):
        score=0.0; dl=self.doc_lens[idx]; tf_d=self.tf[idx]
        for w in q_toks:
            if w not in self.idf: continue
            idf=self.idf[w]; f=tf_d.get(w,0)
            denom=f+self.k1*(1-self.b+self.b*dl/(self.avgdl or 1))
            score+=idf*(f*(self.k1+1))/(denom or 1e-12)
        return score

    def search(self, query:str,k=5):
        q_toks=self.tokenizer(query.lower())
        scores=[(i,self._score(q_toks,i)) for i in range(self.N)]
        scores.sort(key=lambda x:x[1], reverse=True)
        return scores[:k]


## 3. BioLLM Backend (swappable with other models)

In [19]:

@dataclass
class Message:
    role: str
    content: str

class BioLLMBackend:
    def __init__(self, model, tokenizer, device="cuda"):
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.device = device

    def generate(self, messages: List[Message]) -> str:
        query = next((m.content for m in messages[::-1] if m.role=="user"), "")
        context = "\n\n".join(m.content for m in messages if m.role in ("system","tool"))
        prompt = f"Question: {query}\nContext: {context}\nAnswer:"
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_length=256)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


## 4. RAG Pipeline

In [20]:

class SimpleRAG:
    def __init__(self, docs_df: pd.DataFrame, retriever="tfidf", llm=None):
        self.df = docs_df.reset_index(drop=True)
        self.contexts = self.df["context"].astype(str).tolist()
        if retriever=="tfidf":
            self.retriever = TFIDFRetriever(self.contexts)
        else:
            self.retriever = BM25Retriever(self.contexts)
        self.llm = llm

    def ask(self, query: str, k=3):
        hits = self.retriever.search(query, k)
        msgs=[Message(role="tool", content=self.contexts[i]) for i,_ in hits]
        msgs.append(Message(role="user", content=query))
        ans = self.llm.generate(msgs)
        return {"query":query, "answer":ans, "hits":hits}


## 5. Initialize Model

In [21]:

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
llm = BioLLMBackend(model, tokenizer)


## 6. Compare Outputs Across Retrievers / Models

In [None]:

def compare_answers(df_qa: pd.DataFrame, retrievers=["tfidf","bm25"], llms=[("biollm", llm)], n=5):
    sample = df_qa.sample(min(n, len(df_qa)), random_state=0)
    rows=[]
    for _,row in sample.iterrows():
        q = str(row["question"])
        for rname in retrievers:
            for lname, lbackend in llms:
                rag = SimpleRAG(docs_df, retriever=rname, llm=lbackend)
                out = rag.ask(q, k=3)
                rows.append({"question":q,"retriever":rname,"model":lname,"answer":out["answer"]})
    return pd.DataFrame(rows)

results = compare_answers(df_qa, retrievers=["tfidf","bm25"], llms=[("biollm", llm)], n=5)
results


ValueError: Input length of input_ids is 766492, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

: 