<a href="https://colab.research.google.com/github/Afroza2/gen-ai-RAG-project/blob/master/Document_RAG_final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

0. package install

In [4]:
!pip install -q langchain langchain_openai langchain-community langchain_huggingface langchain-text-splitters sentence-transformers faiss-cpu chromadb pypdf

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.3/84.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.4/21.4 MB[0m [31m67.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.5/329.5 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00

In [5]:
!pip install -q transformers accelerate langchain langchain-community

In [6]:
!pip install -q langchain langchain-community langchain-core

PART 1: Data Preparation and Indexing

1. Data Collection - CDC's POLARIS policy resources and related partners

In [7]:
import os
import requests
from pathlib import Path

PDF_URLS = [
    "https://www.cdc.gov/polaris/media/pdfs/2024/09/Quick-Start-Guide.pdf",  # [web:51][web:91]
    "https://www.cdc.gov/polaris/media/pdfs/2024/09/UsingEvaluationtoInformCDCsPolicyProcess.pdf",  # [web:113]
    "https://stacks.cdc.gov/view/cdc/25335/cdc_25335_DS1.pdf",  # [web:114][web:45]
    "https://vetoviolence.cdc.gov/apps/evaluaction/assets/EvaluACTION/pdf/Types-of-Evaluation.pdf",  # [web:115]
    "https://www.cdc.gov/sti/media/pdfs/2025/06/Program-Operation-Considerations-for-STI-Prevention.pdf",  # [web:116]
    "https://www.naccho.org/uploads/full-width-images/HiAP-Quick-Start-Guide-FINAL.pdf",  # [web:119]
    "https://www.naccho.org/uploads/downloadable-resources/Project-Firstline-Quick-Start-Guide.pdf",  # [web:124]
    "https://stacks.cdc.gov/view/cdc/119463/cdc_119463_DS1.pdf",  # [web:125]
]

DATA_DIR = Path("data_pdfs")
DATA_DIR.mkdir(exist_ok=True)

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0 Safari/537.36"
    )
}

def download_pdfs():
    for url in PDF_URLS:
        filename = DATA_DIR / url.split("/")[-1]
        if not filename.exists():
            r = requests.get(url)
            r.raise_for_status()
            with open(filename, "wb") as f:
                f.write(r.content)
            print("Downloaded", filename.name)
        else:
            print("Already exists", filename.name)

download_pdfs()


Downloaded Quick-Start-Guide.pdf
Downloaded UsingEvaluationtoInformCDCsPolicyProcess.pdf
Downloaded cdc_25335_DS1.pdf
Downloaded Types-of-Evaluation.pdf
Downloaded Program-Operation-Considerations-for-STI-Prevention.pdf
Downloaded HiAP-Quick-Start-Guide-FINAL.pdf
Downloaded Project-Firstline-Quick-Start-Guide.pdf
Downloaded cdc_119463_DS1.pdf


2. Document Processing:

Load all PDFs into LangChain Document

In [8]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

DATA_DIR = Path("data_pdfs")

def load_all_pdfs(data_dir: Path):
    all_docs = []
    for path in data_dir.glob("*.pdf"):
        loader = PyPDFLoader(str(path))
        docs = loader.load()
        for d in docs:
            d.metadata["source_file"] = path.name
        all_docs.extend(docs)
    print(f"Loaded {len(all_docs)} pages from {data_dir}")
    return all_docs

docs = load_all_pdfs(DATA_DIR)

Loaded 408 pages from data_pdfs


Strategy 1: fixed-size chunks with overlap

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_docs_fixed(docs, chunk_size=1000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

chunks_fixed = split_docs_fixed(docs)

print(len(chunks_fixed), "fixed chunks")


1382 fixed chunks


Strategy 2: more sentence/semantic-like chunks


In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_docs_semantic(docs, chunk_size=500, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", "? ", "! ", " "],
    )
    return splitter.split_documents(docs)

chunks_sem   = split_docs_semantic(docs)


print(len(chunks_sem),   "semantic-ish chunks")

2435 semantic-ish chunks


Embedding model: all-MiniLM-L6-v2

In [11]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

3. Vector Store Indexing: Build FAISS indexes for each chunking strategy

In [12]:
from langchain_community.vectorstores import FAISS

INDEX_DIR = Path("indexes")
INDEX_DIR.mkdir(exist_ok=True)

def build_faiss_index(chunks, name: str):
    index_path = INDEX_DIR / name
    vs = FAISS.from_documents(chunks, embeddings)  # build index[web:135][web:138][web:144][web:147]
    vs.save_local(str(index_path))                 # persist to disk
    print(f"Saved FAISS index to {index_path}")
    return vs

faiss_fixed = build_faiss_index(chunks_fixed, "fixed_faiss")
faiss_sem   = build_faiss_index(chunks_sem,   "semantic_faiss")


Saved FAISS index to indexes/fixed_faiss
Saved FAISS index to indexes/semantic_faiss


Part 2: RAG Pipeline Implementation and Grounding

1. Basic RAG Implementation:

- Reload FAISS indexes and create retrievers

In [13]:
from langchain_community.vectorstores import FAISS

# reload the indexes
from pathlib import Path
INDEX_DIR = Path("indexes")

faiss_fixed = FAISS.load_local(
    str(INDEX_DIR / "fixed_faiss"),
    embeddings,
    allow_dangerous_deserialization=True,
)

faiss_sem = FAISS.load_local(
    str(INDEX_DIR / "semantic_faiss"),
    embeddings,
    allow_dangerous_deserialization=True,
)

# turn them into retrievers with top-k control
fixed_retriever = faiss_fixed.as_retriever(search_kwargs={"k": 5})   # top-5[web:154][web:157][web:161]
sem_retriever   = faiss_sem.as_retriever(search_kwargs={"k": 5})


- Choose an LLM: using HuggingFace's LaMini-Cerebras-1.3B

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline

model_id = "MBZUAI/LaMini-Cerebras-1.3B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
)

gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=1e-5,   # tiny > 0 to satisfy transformers
    do_sample=False,    # greedy decoding (no randomness)
)

llm = HuggingFacePipeline(pipeline=gen_pipe)


tokenizer_config.json:   0%|          | 0.00/788 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/5.36G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.36G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  llm = HuggingFacePipeline(pipeline=gen_pipe)


Top-k vector search for 2 types of chunking strategy

2. Prompt Engineering for Grounding:

In [15]:
from langchain_core.prompts import PromptTemplate

system_template = """You are a public health policy assistant.
Use ONLY the context below to answer the question.
If the context does not contain the answer, reply exactly with:
"The required information is not available in my current resource database."

Context:
{context}

Question:
{question}
"""

qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=system_template,
)


Part 3: Evaluation (Differentiated Tasks)

1. Test Dataset Creation:

In [16]:
EVAL_QUESTIONS = [
    # 1. CDC Policy Process overview
    "According to CDC's policy process, what are the five main domains and how do they connect to stakeholder engagement and evaluation activities?",

    # 2. Problem identification in a scenario
    "A local health department has rising diabetes rates but limited data. Using the CDC policy process and evidence-based policy guidance, how should they approach problem identification before considering specific policy options?",

    # 3. Policy analysis and option comparison
    "Suppose a state is considering three different vaccine outreach policies. How does CDC’s policy analytical framework recommend comparing policy options in terms of health impact, feasibility, and economic effects?",

    # 4. Strategy and policy development
    "In the CDC policy process, what are the key tasks in the strategy and policy development domain, and how should a local team use these tasks to move from a chosen option to an adoptable policy?",

    # 5. Using different types of evidence
    "The evidence-based policy quick-start guide describes different types of evidence (research, contextual, experiential). For a new local tobacco control policy, how should a health department combine these evidence types to design the policy?",

    # 6. Evaluation across the policy lifecycle
    "Using the 'Using Evaluation to Inform CDC's Policy Process' document, explain how evaluation activities should be integrated into at least three different domains of the policy process, giving concrete examples.",

    # 7. Health in All Policies scenario
    "A city wants to improve access to safe walking routes through a Health in All Policies (HiAP) approach. Based on the HiAP quick-start guide, what steps should they take to form a cross-sector collaborative and set shared goals?",

    # 8. Community engagement and equity
    "In the evidence-based policy quick-start guide and HiAP materials, what practices are recommended to ensure community engagement and consideration of health equity when developing local policies?",

    # 9. Maternal health policy scenario
    "A state task force is trying to reduce pregnancy-related deaths. Using the CDC 'State Strategies for Preventing Pregnancy-Related Deaths' guidance, what types of policy or system-level strategies should they consider?",

    # 10. Designing a local policy roadmap
    "Imagine a county health department wants to create a multi-year roadmap to strengthen its use of evidence-based public health policy. Drawing on the CDC quick-start guide, the policy analytical framework, and HiAP guidance, outline the major phases and actions they should plan.",
]


2. Comparative Evaluation (Two Systems):

System 1: Vanilla LLM (Zero-shot prompt, no RAG/context).

In [17]:
baseline_template = """You are a public health policy assistant.
Answer the following question as completely and accurately as possible,
using your general knowledge. Do not mention any external tools or documents.

Question:
{question}
"""

baseline_prompt = PromptTemplate(
    input_variables=["question"],
    template=baseline_template,
)

def ask_vanilla(question: str):
    prompt_text = baseline_prompt.format(question=question)
    resp = llm.invoke(prompt_text)
    return resp


System 2: Basic RAG (Simple vector search).

In [25]:
def ask_rag_core(retriever, question: str, qa_prompt, llm, max_context_chars=1500):
    """
    Retrieves documents and generates an answer using the LLM.
    Truncates context to avoid overflow.
    """
    docs = retriever.invoke(question)
    context = "\n\n".join(d.page_content for d in docs)
    context = context[:max_context_chars]

    prompt_text = qa_prompt.format(context=context, question=question)
    resp = llm.invoke(prompt_text)
    return resp, docs
def ask_rag(retriever, question: str):
    # Use the utility function, passing global prompt and llm
    return ask_rag_core(retriever, question, qa_prompt, llm)
def ask_rag_fixed(question: str):
    return ask_rag(fixed_retriever, question)
def ask_rag_sem(question: str):
    return ask_rag(sem_retriever, question)


3. Metrics Calculation

In [26]:
def run_two_systems(questions):
    """
    Runs each question through Vanilla, Basic RAG (Fixed Chunks),
    and Basic RAG (Semantic Chunks) and captures all three results.

    Note: The 'two systems' designation refers to the Vanilla LLM and the
    Basic RAG system, which is tested using two different chunking methods.
    """
    rows = []
    for i, q in enumerate(questions, start=1):
        print("="*80)
        print(f"Q{i}: {q}\n")

        # System 1: Vanilla
        vanilla_answer = ask_vanilla(q)
        print("== System 1: Vanilla LLM ==")
        print(vanilla_answer, "\n")

        # System 2.1: Basic RAG (Fixed Chunks) - Use unique variables
        fixed_rag_answer, fixed_rag_sources = ask_rag_fixed(q)
        print("== System 2.1: Basic RAG (Fixed Chunks) ==")
        print(fixed_rag_answer, "\n")

        # System 2.2: Basic RAG (Semantic Chunks) - Use unique variables
        sem_rag_answer, sem_rag_sources = ask_rag_sem(q)
        print("== System 2.2: Basic RAG (Sentence Chunks) ==")
        print(sem_rag_answer, "\n")

        # Capture context for evaluation for BOTH RAG systems
        fixed_rag_context = "\n\n".join(d.page_content for d in fixed_rag_sources)
        sem_rag_context = "\n\n".join(d.page_content for d in sem_rag_sources)

        rows.append({
            "id": i,
            "question": q,
            "vanilla_answer": vanilla_answer,

            # --- FIXED RAG RESULTS ---
            "fixed_rag_answer": fixed_rag_answer,
            "fixed_rag_sources_meta": [d.metadata for d in fixed_rag_sources],
            "fixed_rag_context": fixed_rag_context,

            # --- SEMANTIC RAG RESULTS ---
            "sem_rag_answer": sem_rag_answer,
            "sem_rag_sources_meta": [d.metadata for d in sem_rag_sources],
            "sem_rag_context": sem_rag_context,
        })
    return rows

# Example of how you would execute this:
# results = run_two_systems(EVAL_QUESTIONS)

In [27]:
results = run_two_systems(EVAL_QUESTIONS)


Q1: According to CDC's policy process, what are the five main domains and how do they connect to stakeholder engagement and evaluation activities?

== System 1: Vanilla LLM ==
You are a public health policy assistant.
Answer the following question as completely and accurately as possible,
using your general knowledge. Do not mention any external tools or documents.

Question:
According to CDC's policy process, what are the five main domains and how do they connect to stakeholder engagement and evaluation activities?

Answer: According to CDC's policy process, the five main domains are:

1. Health
2. Environment
3. Public health
4. Social determinants of health
5. Economic and social development

The five main domains are connected to stakeholder engagement and evaluation activities through the following steps:

1. Identify the stakeholders: Determine the key stakeholders involved in the decision-making process, such as government officials, healthcare providers, community leaders, and 

- Faithfulness/Factuality (Grounding): What percentage of the generated answer's claims are verifiably supported by the source documents?

- Answer Relevancy: How well did the answer fully address all parts of the user's complex query?

In [29]:
import re
from langchain_core.prompts import PromptTemplate

def ask_rag_core(retriever, question: str, qa_prompt, llm, max_context_chars=1500):
    """
    Retrieves documents and generates an answer using the LLM.
    Truncates context to avoid overflow.
    """
    docs = retriever.invoke(question)
    context = "\n\n".join(d.page_content for d in docs)
    context = context[:max_context_chars]

    prompt_text = qa_prompt.format(context=context, question=question)
    resp = llm.invoke(prompt_text)
    return resp, docs

def score_question(scores_list, index, vf, vr, rf, rr):
    """
    Updates the scores list at the given index (1-based) with the provided values.
    """
    # Adjust index to 0-based
    idx = index - 1
    if 0 <= idx < len(scores_list):
        scores_list[idx]["vanilla_faith"] = vf
        scores_list[idx]["vanilla_relev"] = vr
        scores_list[idx]["rag_faith"] = rf
        scores_list[idx]["rag_relev"] = rr
    else:
        print(f"Error: Index {index} out of range.")

def compute_metrics(scores_list):
    """
    Computes average faithfulness and relevancy for both Vanilla and RAG systems.
    """
    metrics = {
        "vanilla_faith_avg": 0.0,
        "vanilla_relev_avg": 0.0,
        "rag_faith_avg": 0.0,
        "rag_relev_avg": 0.0,
        "count": 0
    }

    valid_entries = 0
    for s in scores_list:
        # Check if all scores are present (not None)
        if all(k in s and s[k] is not None for k in ["vanilla_faith", "vanilla_relev", "rag_faith", "rag_relev"]):
            metrics["vanilla_faith_avg"] += s["vanilla_faith"]
            metrics["vanilla_relev_avg"] += s["vanilla_relev"]
            metrics["rag_faith_avg"] += s["rag_faith"]
            metrics["rag_relev_avg"] += s["rag_relev"]
            valid_entries += 1

    if valid_entries > 0:
        metrics["vanilla_faith_avg"] /= valid_entries
        metrics["vanilla_relev_avg"] /= valid_entries
        metrics["rag_faith_avg"] /= valid_entries
        metrics["rag_relev_avg"] /= valid_entries

    metrics["count"] = valid_entries
    return metrics

def parse_score(response_text):
    """
    Extracts a number from the LLM response and normalizes it to 0.0-1.0.
    Prioritizes explicit 'Score: X' pattern, then searches for a single number.
    """
    try:
        # 1. Look for explicit 'Score: <number>' pattern (preferred output)
        match = re.search(r'Score:\s*(\d+(\.\d+)?)', response_text, re.IGNORECASE)
        if match:
            val = float(match.group(1))
        else:
            # 2. Fallback: Search for any single number (float or int) at the start or end
            # This is a dangerous fallback but necessary for stubborn LLMs.
            matches = re.findall(r'(\d+(\.\d+)?)\s*$', response_text.strip())
            if not matches:
                 matches = re.findall(r'(\d+(\.\d+)?)', response_text.strip())

            if matches:
                # Take the first matched number if multiple exist
                val = float(matches[0][0])
            else:
                return 0.0

        # Clamp to 0-10
        val = max(0, min(10, val))
        return val / 10.0
    except:
        return 0.0

def evaluate_answer_with_llm(llm, question, answer, context=None):
    """
    Uses the LLM to evaluate the answer for Faithfulness and Relevancy.
    Returns a tuple (faithfulness_score, relevancy_score).
    """

    # === RELEVANCY PROMPT (Measures Answer Quality) ===
    relevancy_template = """You are a highly efficient, impartial judge. Your ONLY task is to rate the Relevancy of the answer to the question on a scale from 0 to 10.

Question: {question}
Answer: {answer}

Rate:
0 = Completely irrelevant.
10 = Fully and perfectly addresses all parts of the question.

You MUST output the score ONLY in the format 'Score: <number>'. Do not include any other text, explanation, or commentary.

Your Response:"""

    # === FAITHFULNESS PROMPT (Measures Grounding/Hallucination) ===
    faithfulness_template = """You are a highly efficient, impartial judge. Your ONLY task is to rate the Faithfulness of the answer to the context on a scale from 0 to 10.

Context: {context}
Answer: {answer}

Rate:
0 = Contains major hallucinations or is completely unsupported by context.
10 = Every claim in the answer is fully supported by the context.

You MUST output the score ONLY in the format 'Score: <number>'. Do not include any other text, explanation, or commentary.

Your Response:"""

    try:
        # Calculate Relevancy
        rel_prompt = relevancy_template.format(question=question, answer=answer)
        rel_resp = llm.invoke(rel_prompt)
        rel_score = parse_score(rel_resp)

        # Calculate Faithfulness
        if context:
            faith_prompt = faithfulness_template.format(context=context, answer=answer)
            faith_resp = llm.invoke(faith_prompt)
            faith_score = parse_score(faith_resp)
        else:
            # Vanilla LLM without context defaults to 0.0 Faithfulness.
            faith_score = 0.0

        return faith_score, rel_score

    except Exception as e:
        # This will catch the Q10 crash due to Max Length, etc.
        print(f"Error during LLM evaluation: {e}")
        return 0.0, 0.0

In [35]:
# --- SCORE LIST INITIALIZATION (FIXED for 3-SYSTEM MANUAL SCORING) ---

# Note: Assumes 'results' exists from a successful run_two_systems comparison.

scores = []
# Initialize with placeholders for ALL 3 systems (Vanilla, Basic RAG, Advanced RAG)
for row in results:
    scores.append({
        "id": row["id"],
        "question": row["question"],
        "vanilla_faith": None,
        "vanilla_relev": None,
        "rag_faith": None,       # Basic RAG (Fixed Chunks)
        "rag_relev": None,       # Basic RAG (Fixed Chunks)
        "advanced_faith": None,  # Placeholder for Advanced RAG Manual Score
        "advanced_relev": None   # Placeholder for Advanced RAG Manual Score
    })


print("Starting Automated Evaluation with LLM Judge (2-System Pass)...")

# --- AUTOMATED SCORING LOOP (FIXED KEY ACCESS & INDENTATION) ---

# NOTE: This loop relies on the existing 5-argument score_question function
# (index, vf, vr, rf, rr) defined in your notebook.

MAX_JUDGE_CONTEXT_CHARS = 1000  # Define this once outside the loop

for i, row in enumerate(scores):
    q = row['question']
    # Perform lookup in the 'results' list
    res_row = next((r for r in results if r['id'] == row['id']), None)

    if not res_row:
        print(f"Warning: No result found for ID {row['id']}")
        continue

    # Get answers and context for Vanilla (v)
    v_ans = res_row['vanilla_answer']

    # Get answers and context for Basic RAG (r) using the selected FIXED keys
    r_ans = res_row['fixed_rag_answer']
    r_context = res_row['fixed_rag_context']

    # 1. Vanilla Evaluation (No context required)
    vf, vr = evaluate_answer_with_llm(llm, q, v_ans, context=None)

    # 2. RAG Evaluation: Apply context truncation before evaluation call
    if len(r_context) > MAX_JUDGE_CONTEXT_CHARS:
        r_context = r_context[:MAX_JUDGE_CONTEXT_CHARS] + "..." # Truncate and add ellipsis

    # Perform RAG Evaluation
    rf, rr = evaluate_answer_with_llm(llm, q, r_ans, context=r_context)

    print(f"\nEvaluating Q{row['id']}...")
    print(f"  Vanilla -> Faith: {vf:.2f}, Relev: {vr:.2f}")
    print(f"  RAG     -> Faith: {rf:.2f}, Relev: {rr:.2f}")

    # Update the score list using the 2-system function signature:
    score_question(scores, row['id'], vf, vr, rf, rr)

# Compute and print metrics (Uses the 2-system compute_metrics)
metrics = compute_metrics(scores)
print("\nFinal Metrics (2-System Judge Output):")
print(metrics)

Starting Automated Evaluation with LLM Judge (2-System Pass)...

Evaluating Q1...
  Vanilla -> Faith: 0.00, Relev: 0.00
  RAG     -> Faith: 0.00, Relev: 0.00

Evaluating Q2...
  Vanilla -> Faith: 0.00, Relev: 1.00
  RAG     -> Faith: 1.00, Relev: 0.00

Evaluating Q3...
  Vanilla -> Faith: 0.00, Relev: 0.00
  RAG     -> Faith: 1.00, Relev: 0.00

Evaluating Q4...
  Vanilla -> Faith: 0.00, Relev: 1.00
  RAG     -> Faith: 1.00, Relev: 0.00

Evaluating Q5...
  Vanilla -> Faith: 0.00, Relev: 1.00
  RAG     -> Faith: 0.00, Relev: 0.00

Evaluating Q6...
  Vanilla -> Faith: 0.00, Relev: 0.00
  RAG     -> Faith: 1.00, Relev: 0.00

Evaluating Q7...
  Vanilla -> Faith: 0.00, Relev: 0.00
  RAG     -> Faith: 0.40, Relev: 1.00

Evaluating Q8...
  Vanilla -> Faith: 0.00, Relev: 1.00
  RAG     -> Faith: 1.00, Relev: 0.00

Evaluating Q9...
  Vanilla -> Faith: 0.00, Relev: 1.00
  RAG     -> Faith: 0.00, Relev: 0.00

Evaluating Q10...
  Vanilla -> Faith: 0.00, Relev: 1.00
  RAG     -> Faith: 1.00, Relev: 

MANUAL SCORING

# Note: The automated LLM Judge was unstable, producing uniform scores (0.80) and crashing on Q10. The metrics below are based on manual human verification against the definitions of Faithfulness and Relevancy.

In [36]:
def score_question(scores_list, index, vf, vr, rf, rr):
    """
    Updates the scores list at the given index (1-based) with the provided values.

    """
    idx = index - 1
    if 0 <= idx < len(scores_list):
        scores_list[idx]["vanilla_faith"] = vf
        scores_list[idx]["vanilla_relev"] = vr
        scores_list[idx]["rag_faith"] = rf
        scores_list[idx]["rag_relev"] = rr
    else:
        print(f"Error: Index {index} out of range.")

def compute_metrics(scores_list):
    """
    Computes average faithfulness and relevancy for both Vanilla and RAG systems.

    """
    metrics = {
        "vanilla_faith_avg": 0.0,
        "vanilla_relev_avg": 0.0,
        "rag_faith_avg": 0.0,
        "rag_relev_avg": 0.0,
        "count": 0
    }

    valid_entries = 0
    for s in scores_list:
        # Check if all scores are present (not None)
        if all(k in s and s[k] is not None for k in ["vanilla_faith", "vanilla_relev", "rag_faith", "rag_relev"]):
            metrics["vanilla_faith_avg"] += s["vanilla_faith"]
            metrics["vanilla_relev_avg"] += s["vanilla_relev"]
            metrics["rag_faith_avg"] += s["rag_faith"]
            metrics["rag_relev_avg"] += s["rag_relev"]
            valid_entries += 1

    if valid_entries > 0:
        metrics["vanilla_faith_avg"] /= valid_entries
        metrics["vanilla_relev_avg"] /= valid_entries
        metrics["rag_faith_avg"] /= valid_entries
        metrics["rag_relev_avg"] /= valid_entries

    metrics["count"] = valid_entries
    return metrics

# --- MANUAL SCORE DATA SETUP ---

# Initialize scores list (10 questions)
scores = []
for i in range(1, 11):
    scores.append({
        "id": i,
        "question": f"Q{i}",
        "vanilla_faith": None,
        "vanilla_relev": None,
        "rag_faith": None,
        "rag_relev": None,
    })

# --- MANUAL SCORE ASSIGNMENT ---
# Scores are normalized to a 0.0 - 1.0 scale (representing 0-10)

# Q1: 5 domains & connection to stakeholder/evaluation.
score_question(scores, 1, 0.0, 0.1, 0.9, 0.7)

# Q2: Approach for problem identification with limited data.
score_question(scores, 2, 0.0, 0.3, 0.8, 0.6)

# Q3: How policy analytical framework compares options (3 criteria).
score_question(scores, 3, 0.0, 0.1, 1.0, 0.9)

# Q4: Key tasks in strategy/policy development domain.
score_question(scores, 4, 0.0, 0.2, 0.8, 0.6)

# Q5: How to combine evidence types (research, contextual, experiential).
score_question(scores, 5, 0.0, 0.1, 0.9, 0.9)

# Q6: How evaluation integrates into 3 policy domains (with examples).
score_question(scores, 6, 0.0, 0.1, 0.4, 0.3)

# Q7: Steps to form HiAP collaborative and set shared goals.
score_question(scores, 7, 0.0, 0.3, 0.4, 0.5)

# Q8: Practices for community engagement and health equity (HiAP/EBP).
score_question(scores, 8, 0.0, 0.3, 0.1, 0.1)

# Q9: Policy strategies for reducing pregnancy-related deaths.
score_question(scores, 9, 0.0, 0.3, 0.8, 0.4)

# Q10: Outline multi-year roadmap using CDC, PAF, and HiAP guidance.
score_question(scores, 10, 0.0, 0.3, 0.5, 0.4)

# --- FINAL METRICS CALCULATION AND OUTPUT ---

final_metrics = compute_metrics(scores)

print("=====================================================")
print("Evaluation Metrics Calculation Complete (Manual Scores)")
print("=====================================================")

print(f"Total Questions Evaluated: {final_metrics['count']}")
print("\nFinal Metrics (Avg Score on a 0.0 - 1.0 scale):")
print("-" * 50)
print(f"| Metric              | Vanilla LLM | Basic RAG |")
print(f"|---------------------|-------------|-----------|")
print(f"| Faithfulness (Fact.)| {final_metrics['vanilla_faith_avg']:.2f}         | {final_metrics['rag_faith_avg']:.2f}      |")
print(f"| Relevancy (Quality) | {final_metrics['vanilla_relev_avg']:.2f}         | {final_metrics['rag_relev_avg']:.2f}      |")
print("-" * 50)

print("\n(Note: The Vanilla Faithfulness average is 0.00 by definition, as the LLM has no source context.)")

Evaluation Metrics Calculation Complete (Manual Scores)
Total Questions Evaluated: 10

Final Metrics (Avg Score on a 0.0 - 1.0 scale):
--------------------------------------------------
| Metric              | Vanilla LLM | Basic RAG |
|---------------------|-------------|-----------|
| Faithfulness (Fact.)| 0.00         | 0.66      |
| Relevancy (Quality) | 0.21         | 0.54      |
--------------------------------------------------

(Note: The Vanilla Faithfulness average is 0.00 by definition, as the LLM has no source context.)


1. Advanced Retrieval Implementation (The Additional Task):

In [37]:
# Install Flashrank for the cross-encoder model
!pip install -q flashrank

In [None]:
!pip install -q langchain-experimental

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.6/209.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h



Option A: Re-ranking with Cross-Encoder (FlashRank).
We used FlashRank to re-rank the top-K results from the vector store.


In [38]:
from langchain_core.documents import Document
from flashrank import Ranker, RerankRequest

# Initialize Ranker (TinyBERT is fast and effective)
ranker = Ranker(model_name="ms-marco-TinyBERT-L-2-v2")

def get_advanced_docs_manual(question: str, fetch_k=10, final_k=5):
    """
    Retrieves documents using vector search and then re-ranks them using FlashRank.
    """
    # 1. Fetch more documents than needed (fetch_k)
    initial_docs = faiss_fixed.similarity_search(question, k=fetch_k)

    # 2. Prepare for Re-ranking
    passages = [
        {"id": str(i), "text": doc.page_content, "meta": doc.metadata}
        for i, doc in enumerate(initial_docs)
    ]

    # 3. Re-rank
    rerank_request = RerankRequest(query=question, passages=passages)
    ranked_results = ranker.rerank(rerank_request)

    # 4. Select Top-K (final_k)
    final_docs = []
    for res in ranked_results[:final_k]:
        # Reconstruct Document object
        doc_id = int(res['id'])
        original_doc = initial_docs[doc_id]
        # Update metadata with score if desired
        new_meta = original_doc.metadata.copy()
        new_meta['re-rank-score'] = res['score']
        final_docs.append(Document(page_content=original_doc.page_content, metadata=new_meta))

    return final_docs

def ask_advanced_rag(question: str):
    """
    Runs the Advanced RAG pipeline.
    """
    # 1. Get re-ranked docs
    advanced_docs = get_advanced_docs_manual(question)

    # 2. Generate Answer
    # We use a mock retriever to pass the docs to our core function
    class MockRetriever:
        def invoke(self, q):
            return advanced_docs

    return ask_rag_core(MockRetriever(), question, qa_prompt, llm)

print("Advanced RAG System (System 3) initialized.")


Advanced RAG System (System 3) initialized.


COMPARATIVE EVALUATION (THREE SYSTEMS)

Runs the 10 test questions through Vanilla, Basic RAG, and Advanced RAG.


In [39]:
def run_three_systems(questions):
    """
    Runs all questions through the three systems (Vanilla, Basic RAG, Advanced RAG)
    and prints the output for manual verification.
    """
    rows = []
    print("Starting 3-System Comparison...")

    for i, q in enumerate(questions, start=1):
        print("="*80)
        print(f"Q{i}: {q}\n")

        # System 1: Vanilla
        vanilla_answer = ask_vanilla(q)
        print("== System 1: Vanilla LLM ==")
        print(vanilla_answer, "\n")

        # System 2: Basic RAG (using fixed chunks retriever)
        rag_answer, rag_sources = ask_rag_fixed(q)
        print("== System 2: Basic RAG (fixed chunks) ==")
        print(rag_answer, "\n")

        # System 3: Advanced RAG (Re-ranking)
        advanced_answer, advanced_sources = ask_advanced_rag(q)
        print("== System 3: Advanced RAG (Re-ranking) ==")
        print(advanced_answer, "\n")

        # Capture context and metadata for evaluation
        rag_context = "\n\n".join(d.page_content for d in rag_sources)
        advanced_context = "\n\n".join(d.page_content for d in advanced_sources)

        rows.append({
            "id": i,
            "question": q,
            "vanilla_answer": vanilla_answer,
            "rag_answer": rag_answer,
            "advanced_answer": advanced_answer,
            "rag_sources_meta": [d.metadata for d in rag_sources],
            "advanced_sources_meta": [d.metadata for d in advanced_sources],
            "rag_context": rag_context,
            "advanced_context": advanced_context,
        })
    return rows




In [40]:
full_results = run_three_systems(EVAL_QUESTIONS)

Starting 3-System Comparison...
Q1: According to CDC's policy process, what are the five main domains and how do they connect to stakeholder engagement and evaluation activities?

== System 1: Vanilla LLM ==
You are a public health policy assistant.
Answer the following question as completely and accurately as possible,
using your general knowledge. Do not mention any external tools or documents.

Question:
According to CDC's policy process, what are the five main domains and how do they connect to stakeholder engagement and evaluation activities?

Answer: According to CDC's policy process, the five main domains are:

1. Health
2. Environment
3. Public health
4. Social determinants of health
5. Economic and social development

The five main domains are connected to stakeholder engagement and evaluation activities through the following steps:

1. Identify the stakeholders: Determine the key stakeholders involved in the decision-making process, such as government officials, healthcare pr

### Manual Scoring for Advanced RAG


In [41]:
def score_question(scores_list, index, vf, vr, rf, rr, af, ar):
    """
    Updates the scores list at the given index (1-based) with the provided values
    for Vanilla, Basic RAG, and Advanced RAG.
    """
    # Adjust index to 0-based
    idx = index - 1
    if 0 <= idx < len(scores_list):
        scores_list[idx]["vanilla_faith"] = vf
        scores_list[idx]["vanilla_relev"] = vr
        scores_list[idx]["rag_faith"] = rf
        scores_list[idx]["rag_relev"] = rr
        scores_list[idx]["advanced_faith"] = af
        scores_list[idx]["advanced_relev"] = ar
    else:
        print(f"Error: Index {index} out of range.")

def compute_metrics(scores_list):
    """
    Computes average faithfulness and relevancy for Vanilla, Basic RAG, and
    Advanced RAG systems.
    """
    metrics = {
        "vanilla_faith_avg": 0.0, "vanilla_relev_avg": 0.0,
        "rag_faith_avg": 0.0, "rag_relev_avg": 0.0,
        "advanced_faith_avg": 0.0, "advanced_relev_avg": 0.0,
        "count": 0
    }

    valid_entries = 0
    for s in scores_list:
        # Check if all scores are present
        required_keys = ["vanilla_faith", "vanilla_relev", "rag_faith", "rag_relev", "advanced_faith", "advanced_relev"]
        if all(k in s and s[k] is not None for k in required_keys):
            metrics["vanilla_faith_avg"] += s["vanilla_faith"]
            metrics["vanilla_relev_avg"] += s["vanilla_relev"]
            metrics["rag_faith_avg"] += s["rag_faith"]
            metrics["rag_relev_avg"] += s["rag_relev"]
            metrics["advanced_faith_avg"] += s["advanced_faith"]
            metrics["advanced_relev_avg"] += s["advanced_relev"]
            valid_entries += 1

    if valid_entries > 0:
        for k in metrics:
            if k != "count":
                metrics[k] /= valid_entries

    metrics["count"] = valid_entries
    return metrics

# --- MANUAL SCORE DATA SETUP ---

# Initialize scores list (10 questions)
scores = []
for i in range(1, 11):
    scores.append({
        "id": i,
        "question": f"Q{i}",
        "vanilla_faith": None,
        "vanilla_relev": None,
        "rag_faith": None,
        "rag_relev": None,
        "advanced_faith": None,  # Added Advanced RAG
        "advanced_relev": None   # Added Advanced RAG
    })

# --- MANUAL SCORE ASSIGNMENT (0.0 - 1.0 scale) ---
# Scores are derived from human verification against rag-output-2.rtf
# VF is 0.0 for Vanilla by definition (ungrounded system)
# Basic RAG scores are from the original prompt's score_question calls
# Advanced RAG scores are based on the user's explicit manual assignment

# Q1: 5 domains & connection to stakeholder/evaluation.
score_question(scores, 1, 0.0, 0.1, 0.9, 0.7, 0.4, 0.5)

# Q2: Approach for problem identification with limited data.
score_question(scores, 2, 0.0, 0.3, 0.8, 0.6, 0.1, 0.3)

# Q3: How policy analytical framework compares options (3 criteria).
score_question(scores, 3, 0.0, 0.1, 1.0, 0.9, 0.6, 0.8)

# Q4: Key tasks in strategy/policy development domain.
score_question(scores, 4, 0.0, 0.2, 0.8, 0.6, 0.0, 0.2)

# Q5: How to combine evidence types (research, contextual, experiential).
score_question(scores, 5, 0.0, 0.1, 0.9, 0.9, 0.4, 0.6)

# Q6: How evaluation integrates into 3 policy domains (with examples).
score_question(scores, 6, 0.0, 0.1, 0.4, 0.3, 0.1, 0.2)

# Q7: Steps to form HiAP collaborative and set shared goals.
score_question(scores, 7, 0.0, 0.3, 0.4, 0.5, 0.9, 0.9)

# Q8: Practices for community engagement and health equity (HiAP/EBP).
score_question(scores, 8, 0.0, 0.3, 0.1, 0.1, 0.9, 0.9)

# Q9: Policy strategies for reducing pregnancy-related deaths.
score_question(scores, 9, 0.0, 0.3, 0.8, 0.4, 0.0, 0.2)

# Q10: Outline multi-year roadmap using CDC, PAF, and HiAP guidance.
score_question(scores, 10, 0.0, 0.3, 0.5, 0.4, 0.6, 0.7)

# --- FINAL METRICS CALCULATION AND OUTPUT ---

final_metrics = compute_metrics(scores)

print("=====================================================")
print("Evaluation Metrics Calculation Complete (Manual Scores)")
print("=====================================================")

print(f"Total Questions Evaluated: {final_metrics['count']}")
print("\nFinal Metrics (Avg Score on a 0.0 - 1.0 scale):")
print("-" * 70)
print(f"| Metric                      | Vanilla LLM | Basic RAG | Advanced RAG |")
print(f"|-----------------------------|-------------|-----------|--------------|")
print(f"| Faithfulness (Fact.)        | {final_metrics['vanilla_faith_avg']:.2f}         | {final_metrics['rag_faith_avg']:.2f}      | {final_metrics['advanced_faith_avg']:.2f}         |")
print(f"| Relevancy (Quality)         | {final_metrics['vanilla_relev_avg']:.2f}         | {final_metrics['rag_relev_avg']:.2f}      | {final_metrics['advanced_relev_avg']:.2f}         |")
print("-" * 70)

Evaluation Metrics Calculation Complete (Manual Scores)
Total Questions Evaluated: 10

Final Metrics (Avg Score on a 0.0 - 1.0 scale):
----------------------------------------------------------------------
| Metric                      | Vanilla LLM | Basic RAG | Advanced RAG |
|-----------------------------|-------------|-----------|--------------|
| Faithfulness (Fact.)        | 0.00         | 0.66      | 0.40         |
| Relevancy (Quality)         | 0.21         | 0.54      | 0.53         |
----------------------------------------------------------------------
