In [1]:
import os
# Display the kernel's current working directory: the relative paths used by
# later cells ("../data/...", "../results") are resolved against it.
os.getcwd()

'/Users/fereshteh/Documents/MyGithub/RAG/rag-eval-medical-manual/notebooks'

In [2]:
import os, json, time
import pandas as pd

# Evaluation input (QA set) and output folder, both given relative to the
# current working directory.
QA_PATH = "../data/eval/qa_set.json"
OUT_DIR = "../results"

# Create the results folder up front (no error if it already exists), then
# derive the CSV path of the baseline run from it.
os.makedirs(OUT_DIR, exist_ok=True)
OUT_PATH = os.path.join(OUT_DIR, "eval_results.csv")

In [3]:
# Read the QA set through a context manager so the file handle is closed as
# soon as parsing finishes (a bare `open(...)` passed to json.load is never
# closed explicitly). The encoding is pinned to UTF-8 because the file holds
# non-ASCII characters (e.g. the dash in `source_hint`) and the platform
# default encoding is not guaranteed to be UTF-8. QA_PATH is the same path
# that was previously spelled out here as a literal.
with open(QA_PATH, "r", encoding="utf-8") as f:
    qa = json.load(f)
qa

[{'id': 'q01',
  'question': 'What is the recommended approach to a patient with upper GI complaints? Summarize the key diagnostic steps.',
  'expected_answer': '',
  'source_hint': 'Gastrointestinal Disorders — Chapter 7: Approach to the Patient With Upper GI Complaints',
  'tags': ['gi', 'approach', 'diagnosis']},
 {'id': 'q02',
  'question': 'What is the recommended approach to a patient with lower GI complaints? Summarize key history and workup steps.',
  'expected_answer': '',
  'source_hint': 'Gastrointestinal Disorders — Chapter 8: Approach to the Patient With Lower GI Complaints',
  'tags': ['gi', 'approach', 'diagnosis']},
 {'id': 'q03',
  'question': 'List common causes of GI bleeding and describe how the diagnostic approach differs depending on presentation severity.',
  'expected_answer': '',
  'source_hint': 'Gastrointestinal Disorders — Chapter 10: GI Bleeding',
  'tags': ['gi', 'bleeding', 'triage']},
 {'id': 'q04',
  'question': 'What are key diagnostic and management c

In [4]:
# Parse the evaluation questions stored at QA_PATH and report how many were read.
with open(QA_PATH, encoding="utf-8") as f:
    qa = json.loads(f.read())

print("Loaded questions:", len(qa))
qa[0]



Loaded questions: 10


{'id': 'q01',
 'question': 'What is the recommended approach to a patient with upper GI complaints? Summarize the key diagnostic steps.',
 'expected_answer': '',
 'source_hint': 'Gastrointestinal Disorders — Chapter 7: Approach to the Patient With Upper GI Complaints',
 'tags': ['gi', 'approach', 'diagnosis']}

In [5]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

# Vector index settings
# INDEX_DIR: folder the FAISS index is loaded from.
# EMBED_MODEL: embedding model name handed to HuggingFaceEmbeddings
#   (presumably has to be the model the index was built with -- TODO confirm).
# TOP_K: number of chunks requested from the retriever per query.
INDEX_DIR = "../data/index/faiss_merck_hf"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
TOP_K = 5

# LLM settings
# Model name and sampling temperature passed to ChatOllama below.
OLLAMA_MODEL = "llama3.1"
TEMPERATURE = 0

embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
# NOTE(review): `allow_dangerous_deserialization=True` opts in to a load path
# the library itself labels as dangerous (pickle-based according to the
# LangChain docs -- confirm). Only point INDEX_DIR at an index you built
# yourself; never at a file from an untrusted source.
vectorstore = FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

llm = ChatOllama(model=OLLAMA_MODEL, temperature=TEMPERATURE)

# Two-message chat prompt. The system message restricts the model to the
# supplied context, fixes the refusal sentence, and asks for citations in the
# form (page=..., chunk_id=...). The human message has two placeholders,
# {question} and {context}, which must both be supplied when the prompt is
# invoked.
prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You answer using ONLY the provided context from a medical manual. "
     "If the answer is not in the context, say: 'I don't know based on the provided document.' "
     "Cite sources as (page=<page>, chunk_id=<chunk_id>) after each key claim."),
    ("human", "Question: {question}\n\nContext:\n{context}\n\nAnswer:")
])




  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import os, json, time
import pandas as pd

def evaluate_rag_on_questions(
    qa_list,
    retriever,
    llm,
    prompt,
    out_path,
    top_k: int = 5,
):
    """
    Run the RAG pipeline over a list of QA items and save one row per item to CSV.

    For each item the question is sent to ``retriever``; at most ``top_k`` of
    the returned documents are formatted into a context string; ``prompt`` is
    rendered with that question and context and the result is passed to
    ``llm``.

    Parameters
    ----------
    qa_list : list[dict]
        QA items. ``"question"`` is required; ``"id"``, ``"expected_answer"``,
        ``"source_hint"`` and ``"tags"`` are optional.
    retriever
        Object whose ``invoke(question)`` returns documents exposing a
        ``metadata`` dict and a ``page_content`` string (e.g. a LangChain
        retriever).
    llm
        Object whose ``invoke(messages)`` returns a response with a
        ``content`` attribute (e.g. ChatOllama).
    prompt
        Object whose ``invoke({"question": ..., "context": ...})`` returns the
        input for ``llm`` (e.g. ChatPromptTemplate).
    out_path : str
        Destination CSV path. Its parent directory is created when missing; a
        bare file name (no directory part) is written to the current working
        directory.
    top_k : int
        Maximum number of retrieved documents used as context. The value is
        also stored in the ``top_k`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per QA item with the columns listed in ``columns`` below. The
        frame always carries these columns, even when ``qa_list`` is empty.
        ``correctness``, ``faithfulness`` and ``notes`` are written as empty
        strings (presumably to be filled in during manual review).

    Raises
    ------
    KeyError
        If an item has no ``"question"`` key.
    """
    # Fixed column order. Passing it to the DataFrame constructor keeps the
    # schema stable for an empty `qa_list`, so callers can still access
    # e.g. df["latency_s"] instead of hitting a KeyError.
    columns = [
        "id",
        "question",
        "expected_answer",
        "source_hint",
        "tags",
        "top_k",
        "answer",
        "retrieved_sources",
        "latency_s",
        "correctness",
        "faithfulness",
        "notes",
    ]

    def format_context(docs):
        # Prefix every chunk with "(page=..., chunk_id=...)" so the chunk can be
        # cited in that form; chunks are separated by a "---" rule.
        parts = []
        for d in docs:
            page = d.metadata.get("page", "?")
            chunk_id = d.metadata.get("chunk_id", "?")
            parts.append(f"(page={page}, chunk_id={chunk_id})\n{d.page_content}")
        return "\n\n---\n\n".join(parts)

    rows = []

    for item in qa_list:
        qid = item.get("id")
        question = item["question"]
        expected = item.get("expected_answer", "")
        source_hint = item.get("source_hint", "")
        tags = item.get("tags", [])

        # Retrieve. perf_counter() is a monotonic clock, so the measured
        # interval cannot be distorted by system clock adjustments the way a
        # time.time() difference can.
        t0 = time.perf_counter()
        docs = retriever.invoke(question)
        retrieved_docs = docs[:top_k]

        # Generate. The latency covers retrieval, prompt rendering and the LLM call.
        context = format_context(retrieved_docs)
        msg = prompt.invoke({"question": question, "context": context})
        resp = llm.invoke(msg)
        latency_s = time.perf_counter() - t0

        retrieved = [
            {"page": d.metadata.get("page", "?"), "chunk_id": d.metadata.get("chunk_id", "?")}
            for d in retrieved_docs
        ]

        rows.append({
            "id": qid,
            "question": question,
            "expected_answer": expected,
            "source_hint": source_hint,
            # Lists are stored as JSON strings so they survive the CSV round trip.
            "tags": json.dumps(tags),
            "top_k": top_k,
            "answer": resp.content,
            "retrieved_sources": json.dumps(retrieved),
            "latency_s": round(latency_s, 3),
            "correctness": "",
            "faithfulness": "",
            "notes": ""
        })

    df = pd.DataFrame(rows, columns=columns)

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(out_path, index=False)

    return df


In [8]:
# Baseline run. `top_k` is taken from the TOP_K constant (the value the
# `retriever` was built with) instead of a repeated literal, so the slice
# applied inside evaluate_rag_on_questions and the `top_k` column written to
# the CSV cannot drift from the retriever's actual k when TOP_K is changed.
df = evaluate_rag_on_questions(
    qa,
    retriever,
    llm,
    prompt,
    OUT_PATH,
    top_k=TOP_K,
)

In [9]:
# Summarize the per-question latency recorded by the baseline run.
latencies = df["latency_s"]
print("Mean latency (s):", latencies.mean())
print("Max latency (s):", latencies.max())


Mean latency (s): 24.2226
Max latency (s): 30.959


In [10]:
def run_topk_experiments(
    qa_list,
    vectorstore,
    llm,
    prompt,
    top_k_list=[3, 5, 10],
    results_dir="../results",
    file_prefix="eval_results"
):
    """
    Repeat the RAG evaluation once per TOP_K value.

    For every value in `top_k_list` a retriever is derived from `vectorstore`
    with that `k`, `evaluate_rag_on_questions` is run on `qa_list`, and the
    rows are written to `<results_dir>/<file_prefix>_k<K>.csv`. A progress
    line is printed before each run and a summary line (path and mean
    latency) after it.

    Returns a dict mapping each top_k value to the dataframe of that run.
    """
    frames_by_k = {}

    for top_k in top_k_list:
        print(f"Experiment TOP_K={top_k} ...")

        # Fresh retriever per configuration so `k` matches the experiment.
        k_retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
        csv_path = os.path.join(results_dir, f"{file_prefix}_k{top_k}.csv")

        frames_by_k[top_k] = evaluate_rag_on_questions(
            qa_list=qa_list,
            retriever=k_retriever,
            llm=llm,
            prompt=prompt,
            out_path=csv_path,
            top_k=top_k,
        )

        mean_latency = frames_by_k[top_k]["latency_s"].mean()
        print(f"Saved: {csv_path} | Mean latency: {mean_latency:.2f}s")

    return frames_by_k


In [11]:
# Sweep several TOP_K values. The output folder comes from OUT_DIR (the folder
# the baseline CSV is written to) instead of repeating the "../results"
# literal, so all result files stay together if OUT_DIR is changed.
topk_results = run_topk_experiments(
    qa_list=qa,
    vectorstore=vectorstore,
    llm=llm,
    prompt=prompt,
    top_k_list=[3, 5, 10],
    results_dir=OUT_DIR,
    file_prefix="eval_results"
)

Experiment TOP_K=3 ...
Saved: ../results/eval_results_k3.csv | Mean latency: 21.23s
Experiment TOP_K=5 ...
Saved: ../results/eval_results_k5.csv | Mean latency: 32.10s
Experiment TOP_K=10 ...
Saved: ../results/eval_results_k10.csv | Mean latency: 44.60s
