In [1]:
import os
# Show the kernel's current working directory: the relative "../" paths used in the
# following cells are resolved against it.
os.getcwd()

'/Users/fereshteh/Documents/MyGithub/RAG/rag-eval-medical-manual/notebooks'

In [9]:
import os, json, time
import pandas as pd

# JSON file holding the evaluation questions (relative to the kernel's working directory).
QA_PATH = "../data/eval/qa_set.json"
# Directory and CSV file that receive the evaluation results.
OUT_DIR = "../results"
OUT_PATH = os.path.join(OUT_DIR, "eval_results.csv")

# Create the results directory up front; exist_ok=True means this does not raise
# when the directory already exists, so the cell can be re-run.
os.makedirs(OUT_DIR, exist_ok=True)

In [12]:
# Load the evaluation set for inspection.
# The context manager closes the file handle deterministically, the explicit
# encoding keeps the non-ASCII characters in the data (e.g. em dashes) intact on
# platforms whose default encoding is not UTF-8, and QA_PATH is reused so the
# location is defined in one place only.
with open(QA_PATH, "r", encoding="utf-8") as f:
    qa = json.load(f)
qa

[{'id': 'q01',
  'question': 'What is the recommended approach to a patient with upper GI complaints? Summarize the key diagnostic steps.',
  'expected_answer': '',
  'source_hint': 'Gastrointestinal Disorders — Chapter 7: Approach to the Patient With Upper GI Complaints',
  'tags': ['gi', 'approach', 'diagnosis']},
 {'id': 'q02',
  'question': 'What is the recommended approach to a patient with lower GI complaints? Summarize key history and workup steps.',
  'expected_answer': '',
  'source_hint': 'Gastrointestinal Disorders — Chapter 8: Approach to the Patient With Lower GI Complaints',
  'tags': ['gi', 'approach', 'diagnosis']},
 {'id': 'q03',
  'question': 'List common causes of GI bleeding and describe how the diagnostic approach differs depending on presentation severity.',
  'expected_answer': '',
  'source_hint': 'Gastrointestinal Disorders — Chapter 10: GI Bleeding',
  'tags': ['gi', 'bleeding', 'triage']},
 {'id': 'q04',
  'question': 'What are key diagnostic and management c

In [13]:
# Read the evaluation question set from QA_PATH and report how many entries it has
with open(QA_PATH, mode="r", encoding="utf-8") as qa_file:
    qa = json.loads(qa_file.read())

print("Loaded questions:", len(qa))
qa[0]



Loaded questions: 10


{'id': 'q01',
 'question': 'What is the recommended approach to a patient with upper GI complaints? Summarize the key diagnostic steps.',
 'expected_answer': '',
 'source_hint': 'Gastrointestinal Disorders — Chapter 7: Approach to the Patient With Upper GI Complaints',
 'tags': ['gi', 'approach', 'diagnosis']}

In [14]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

# Vector index settings
# Directory of the saved FAISS index that is loaded below.
INDEX_DIR = "../data/index/faiss_merck_hf"
# Model name handed to HuggingFaceEmbeddings.
# NOTE(review): presumably this must be the same model the index was built with — confirm.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Number of chunks the retriever is configured to return per query; also the
# default value of rag_answer's k parameter.
TOP_K = 5

# LLM settings
# Model name and sampling temperature passed to ChatOllama.
OLLAMA_MODEL = "llama3.1"
TEMPERATURE = 0

embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
# NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in; per the
# LangChain documentation the saved index is restored with pickle, which can execute
# arbitrary code. Only point INDEX_DIR at an index you built yourself — confirm.
vectorstore = FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)
# Retriever over the loaded index, fixed to TOP_K results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

# Chat model used to generate the answers.
llm = ChatOllama(model=OLLAMA_MODEL, temperature=TEMPERATURE)

# Chat prompt with two messages:
#  - system: restricts the model to the supplied context, fixes the exact refusal
#    sentence, and asks for (page, chunk_id) citations after each key claim;
#  - human: carries the {question} and {context} placeholders that are filled in
#    when the prompt is invoked.
prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You answer using ONLY the provided context from a medical manual. "
     "If the answer is not in the context, say: 'I don't know based on the provided document.' "
     "Cite sources as (page=<page>, chunk_id=<chunk_id>) after each key claim."),
    ("human", "Question: {question}\n\nContext:\n{context}\n\nAnswer:")
])

def format_context(docs):
    """Render retrieved documents as a single context string.

    Each document contributes a ``(page=..., chunk_id=...)`` header line followed
    by its ``page_content``. A metadata key that is missing is shown as ``?``.
    The per-document sections are joined by a ``---`` line with a blank line on
    either side. An empty ``docs`` yields an empty string.
    """
    def render(doc):
        # Header values come from the document metadata, with "?" as the fallback.
        meta = doc.metadata
        page, chunk_id = meta.get("page", "?"), meta.get("chunk_id", "?")
        return f"(page={page}, chunk_id={chunk_id})\n{doc.page_content}"

    return "\n\n---\n\n".join(map(render, docs))

def rag_answer(question: str, k: int = TOP_K):
    """Answer ``question`` with the LLM, using the top-``k`` retrieved chunks as context.

    Args:
        question: The question to answer.
        k: Number of retrieved chunks to place in the prompt context.

    Returns:
        A ``(answer, docs)`` tuple: ``resp.content`` of the LLM response and the
        list of documents that were formatted into the context.
    """
    if k > TOP_K:
        # The shared retriever is configured with k=TOP_K, so slicing its result
        # can never produce more than TOP_K chunks. Build a wider retriever when
        # the caller asks for more, instead of silently capping the request.
        docs = vectorstore.as_retriever(search_kwargs={"k": k}).invoke(question)
    else:
        docs = retriever.invoke(question)
    # Slice once and reuse, so the context and the returned sources always agree.
    docs = docs[:k]

    msg = prompt.invoke({"question": question, "context": format_context(docs)})
    resp = llm.invoke(msg)
    return resp.content, docs


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Run every evaluation question through the RAG pipeline and record, per question,
# the answer, the retrieved (page, chunk_id) pairs, and the call latency.
rows = []

for item in qa:
    qid = item.get("id")
    question = item["question"]

    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() is wall-clock time and can jump if the system clock
    # is adjusted during a (long) LLM call.
    t0 = time.perf_counter()
    answer, src_docs = rag_answer(question)
    latency_s = time.perf_counter() - t0

    retrieved = [
        {"page": d.metadata.get("page", "?"), "chunk_id": d.metadata.get("chunk_id", "?")}
        for d in src_docs
    ]

    rows.append({
        "id": qid,
        "question": question,
        "answer": answer,
        # stored as a JSON string so the list survives the CSV round trip in one cell
        "retrieved_sources": json.dumps(retrieved),
        "latency_s": round(latency_s, 3),
        # human eval fields (fill later)
        "correctness": "",
        "faithfulness": "",
        "notes": ""
    })

df = pd.DataFrame(rows)
# NOTE: this rewrites OUT_PATH on every run, so any manual values already entered
# in the correctness / faithfulness / notes columns of that file are replaced.
df.to_csv(OUT_PATH, index=False)
print("Saved results to:", OUT_PATH)
df.head(3)


Saved results to: ../results/eval_results.csv


Unnamed: 0,id,question,answer,retrieved_sources,latency_s,correctness,faithfulness,notes
0,q01,What is the recommended approach to a patient ...,The recommended approach to a patient with upp...,"[{""page"": 130, ""chunk_id"": 491}, {""page"": 129,...",33.3,,,
1,q02,What is the recommended approach to a patient ...,To approach a patient with lower GI complaints...,"[{""page"": 140, ""chunk_id"": 529}, {""page"": 141,...",22.524,,,
2,q03,List common causes of GI bleeding and describe...,**Common causes of GI bleeding:**\n\n1. Upper ...,"[{""page"": 159, ""chunk_id"": 615}, {""page"": 159,...",21.995,,,


In [16]:
# Summary statistics over the per-question latencies recorded in df
latency_series = df["latency_s"]
print("Mean latency (s):", latency_series.mean())
print("Max latency (s):", latency_series.max())


Mean latency (s): 31.331400000000002
Max latency (s): 67.358
