<a href="https://colab.research.google.com/github/Afroza2/gen-ai-RAG-project/blob/master/Document_RAG_final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

0. package install

In [18]:
!pip install -q langchain langchain_openai langchain-community langchain_huggingface langchain-text-splitters sentence-transformers faiss-cpu chromadb pypdf

In [6]:
!pip install -q transformers accelerate langchain langchain-community

In [20]:
!pip install -q langchain langchain-community langchain-core

PART 1: Data Preparation and Indexing

1. Data Collection - CDC's POLARIS policy resources and related partners

In [7]:
import os
import requests
from pathlib import Path

PDF_URLS = [
    "https://www.cdc.gov/polaris/media/pdfs/2024/09/Quick-Start-Guide.pdf",  # [web:51][web:91]
    "https://www.cdc.gov/polaris/media/pdfs/2024/09/UsingEvaluationtoInformCDCsPolicyProcess.pdf",  # [web:113]
    "https://stacks.cdc.gov/view/cdc/25335/cdc_25335_DS1.pdf",  # [web:114][web:45]
    "https://vetoviolence.cdc.gov/apps/evaluaction/assets/EvaluACTION/pdf/Types-of-Evaluation.pdf",  # [web:115]
    "https://www.cdc.gov/sti/media/pdfs/2025/06/Program-Operation-Considerations-for-STI-Prevention.pdf",  # [web:116]
    "https://www.naccho.org/uploads/full-width-images/HiAP-Quick-Start-Guide-FINAL.pdf",  # [web:119]
    "https://www.naccho.org/uploads/downloadable-resources/Project-Firstline-Quick-Start-Guide.pdf",  # [web:124]
    "https://stacks.cdc.gov/view/cdc/119463/cdc_119463_DS1.pdf",  # [web:125]
]

DATA_DIR = Path("data_pdfs")
DATA_DIR.mkdir(exist_ok=True)

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0 Safari/537.36"
    )
}

def download_pdfs():
    for url in PDF_URLS:
        filename = DATA_DIR / url.split("/")[-1]
        if not filename.exists():
            r = requests.get(url)
            r.raise_for_status()
            with open(filename, "wb") as f:
                f.write(r.content)
            print("Downloaded", filename.name)
        else:
            print("Already exists", filename.name)

download_pdfs()


Already exists Quick-Start-Guide.pdf
Already exists UsingEvaluationtoInformCDCsPolicyProcess.pdf
Already exists cdc_25335_DS1.pdf
Already exists Types-of-Evaluation.pdf
Already exists Program-Operation-Considerations-for-STI-Prevention.pdf
Already exists HiAP-Quick-Start-Guide-FINAL.pdf
Already exists Project-Firstline-Quick-Start-Guide.pdf
Already exists cdc_119463_DS1.pdf


2. Document Processing:

Load all PDFs into LangChain Document

In [4]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

DATA_DIR = Path("data_pdfs")

def load_all_pdfs(data_dir: Path):
    all_docs = []
    for path in data_dir.glob("*.pdf"):
        loader = PyPDFLoader(str(path))
        docs = loader.load()
        for d in docs:
            d.metadata["source_file"] = path.name
        all_docs.extend(docs)
    print(f"Loaded {len(all_docs)} pages from {data_dir}")
    return all_docs

docs = load_all_pdfs(DATA_DIR)

Loaded 408 pages from data_pdfs


Strategy 1: fixed-size chunks with overlap

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_docs_fixed(docs, chunk_size=1000, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

chunks_fixed = split_docs_fixed(docs)

print(len(chunks_fixed), "fixed chunks")


1382 fixed chunks


Strategy 2: more sentence/semantic-like chunks


In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_docs_semantic(docs, chunk_size=500, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", "? ", "! ", " "],
    )
    return splitter.split_documents(docs)

chunks_sem   = split_docs_semantic(docs)


print(len(chunks_sem),   "semantic-ish chunks")

2435 semantic-ish chunks


Embedding model: all-MiniLM-L6-v2

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

3. Vector Store Indexing: Build FAISS indexes for each chunking strategy

In [11]:
from langchain_community.vectorstores import FAISS

INDEX_DIR = Path("indexes")
INDEX_DIR.mkdir(exist_ok=True)

def build_faiss_index(chunks, name: str):
    index_path = INDEX_DIR / name
    vs = FAISS.from_documents(chunks, embeddings)  # build index[web:135][web:138][web:144][web:147]
    vs.save_local(str(index_path))                 # persist to disk
    print(f"Saved FAISS index to {index_path}")
    return vs

faiss_fixed = build_faiss_index(chunks_fixed, "fixed_faiss")
faiss_sem   = build_faiss_index(chunks_sem,   "semantic_faiss")


Saved FAISS index to indexes/fixed_faiss
Saved FAISS index to indexes/semantic_faiss


Part 2: RAG Pipeline Implementation and Grounding

1. Basic RAG Implementation:

- Reload FAISS indexes and create retrievers

In [12]:
from langchain_community.vectorstores import FAISS

# reload the indexes
from pathlib import Path
INDEX_DIR = Path("indexes")

faiss_fixed = FAISS.load_local(
    str(INDEX_DIR / "fixed_faiss"),
    embeddings,
    allow_dangerous_deserialization=True,
)

faiss_sem = FAISS.load_local(
    str(INDEX_DIR / "semantic_faiss"),
    embeddings,
    allow_dangerous_deserialization=True,
)

# turn them into retrievers with top-k control
fixed_retriever = faiss_fixed.as_retriever(search_kwargs={"k": 5})   # top-5[web:154][web:157][web:161]
sem_retriever   = faiss_sem.as_retriever(search_kwargs={"k": 5})


- Choose an LLM: using HuggingFace's LaMini-Cerebras-1.3B

In [28]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline

model_id = "MBZUAI/LaMini-Cerebras-1.3B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
)

gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=1e-5,   # tiny > 0 to satisfy transformers
    do_sample=False,    # greedy decoding (no randomness)
)

llm = HuggingFacePipeline(pipeline=gen_pipe)


Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Top-k vector search for 2 types of chunking strategy

2. Prompt Engineering for Grounding:

In [29]:
from langchain_core.prompts import PromptTemplate

system_template = """You are a public health policy assistant.
Use ONLY the context below to answer the question.
If the context does not contain the answer, reply exactly with:
"The required information is not available in my current resource database."

Context:
{context}

Question:
{question}
"""

qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=system_template,
)


Part 3: Evaluation (Differentiated Tasks)

1. Test Dataset Creation:

In [30]:
EVAL_QUESTIONS = [
    # 1. CDC Policy Process overview
    "According to CDC's policy process, what are the five main domains and how do they connect to stakeholder engagement and evaluation activities?",

    # 2. Problem identification in a scenario
    "A local health department has rising diabetes rates but limited data. Using the CDC policy process and evidence-based policy guidance, how should they approach problem identification before considering specific policy options?",

    # 3. Policy analysis and option comparison
    "Suppose a state is considering three different vaccine outreach policies. How does CDC’s policy analytical framework recommend comparing policy options in terms of health impact, feasibility, and economic effects?",

    # 4. Strategy and policy development
    "In the CDC policy process, what are the key tasks in the strategy and policy development domain, and how should a local team use these tasks to move from a chosen option to an adoptable policy?",

    # 5. Using different types of evidence
    "The evidence-based policy quick-start guide describes different types of evidence (research, contextual, experiential). For a new local tobacco control policy, how should a health department combine these evidence types to design the policy?",

    # 6. Evaluation across the policy lifecycle
    "Using the 'Using Evaluation to Inform CDC's Policy Process' document, explain how evaluation activities should be integrated into at least three different domains of the policy process, giving concrete examples.",

    # 7. Health in All Policies scenario
    "A city wants to improve access to safe walking routes through a Health in All Policies (HiAP) approach. Based on the HiAP quick-start guide, what steps should they take to form a cross-sector collaborative and set shared goals?",

    # 8. Community engagement and equity
    "In the evidence-based policy quick-start guide and HiAP materials, what practices are recommended to ensure community engagement and consideration of health equity when developing local policies?",

    # 9. Maternal health policy scenario
    "A state task force is trying to reduce pregnancy-related deaths. Using the CDC 'State Strategies for Preventing Pregnancy-Related Deaths' guidance, what types of policy or system-level strategies should they consider?",

    # 10. Designing a local policy roadmap
    "Imagine a county health department wants to create a multi-year roadmap to strengthen its use of evidence-based public health policy. Drawing on the CDC quick-start guide, the policy analytical framework, and HiAP guidance, outline the major phases and actions they should plan.",
]


2. Comparative Evaluation (Two Systems):

System 1: Vanilla LLM (Zero-shot prompt, no RAG/context).

In [31]:
baseline_template = """You are a public health policy assistant.
Answer the following question as completely and accurately as possible,
using your general knowledge. Do not mention any external tools or documents.

Question:
{question}
"""

baseline_prompt = PromptTemplate(
    input_variables=["question"],
    template=baseline_template,
)

def ask_vanilla(question: str):
    prompt_text = baseline_prompt.format(question=question)
    resp = llm.invoke(prompt_text)
    return resp


System 2: Basic RAG (Simple vector search).

In [36]:
fixed_retriever = faiss_fixed.as_retriever(search_kwargs={"k": 3})
sem_retriever   = faiss_sem.as_retriever(search_kwargs={"k": 3})

MAX_CONTEXT_CHARS = 1500

def ask_rag(retriever, question: str):
    docs = retriever.invoke(question)
    context = "\n\n".join(d.page_content for d in docs)
    context = context[:MAX_CONTEXT_CHARS]

    prompt_text = qa_prompt.format(context=context, question=question)
    resp = llm.invoke(prompt_text)
    return resp, docs


def ask_rag(retriever, question: str):
    # 1) retrieve top-k docs (embedding + FAISS search under the hood)
    docs = retriever.invoke(question)

    # 2) build context string
    context = "\n\n".join(d.page_content for d in docs)

    # 3) inject into grounded prompt
    prompt_text = qa_prompt.format(context=context, question=question)

    # 4) call LLM
    resp = llm.invoke(prompt_text)

    return resp, docs

def ask_rag_fixed(question: str):
    return ask_rag(fixed_retriever, question)

def ask_rag_sem(question: str):
    return ask_rag(sem_retriever, question)


3. Metrics Calculation

In [37]:
def run_two_systems(questions):
    rows = []
    for i, q in enumerate(questions, start=1):
        print("="*80)
        print(f"Q{i}: {q}\n")

        # System 1: Vanilla
        vanilla_answer = ask_vanilla(q)
        print("== Vanilla LLM ==")
        print(vanilla_answer, "\n")

        # System 2: Basic RAG (e.g., fixed)
        rag_answer, rag_sources = ask_rag_fixed(q)
        print("== Basic RAG (fixed chunks) ==")
        print(rag_answer, "\n")

        rows.append({
            "id": i,
            "question": q,
            "vanilla_answer": vanilla_answer,
            "rag_answer": rag_answer,
            "rag_sources": [d.metadata for d in rag_sources],
        })
    return rows

results = run_two_systems(EVAL_QUESTIONS)


Q1: According to CDC's policy process, what are the five main domains and how do they connect to stakeholder engagement and evaluation activities?

== Vanilla LLM ==
You are a public health policy assistant.
Answer the following question as completely and accurately as possible,
using your general knowledge. Do not mention any external tools or documents.

Question:
According to CDC's policy process, what are the five main domains and how do they connect to stakeholder engagement and evaluation activities?

Answer: According to CDC's policy process, the five main domains are:

1. Health
2. Environment
3. Public health
4. Social determinants of health
5. Economic and social development

The five main domains are connected to stakeholder engagement and evaluation activities through the following steps:

1. Identify the stakeholders: Determine the key stakeholders involved in the decision-making process, such as government officials, healthcare providers, community leaders, and community 

- Faithfulness/Factuality (Grounding): What percentage of the generated answer's claims are verifiably supported by the source documents?

- Answer Relevancy: How well did the answer fully address all parts of the user's complex query?

In [None]:
scores = []
for row in results:
    scores.append({
        "id": row["id"],
        "question": row["question"],
        "vanilla_faith": None,   # to fill manually, 0.0–1.0
        "vanilla_relev": None,   # 0.0–1.0
        "rag_faith": None,       # 0.0–1.0
        "rag_relev": None,       # 0.0–1.0
    })


In [None]:
for i in range(1, 11):
    print(f"\n=== Q{i} ===")
    print(f"Question: {scores[i-1]['question'][:100]}...")

    vf = float(input("Vanilla Faithfulness (0.0-1.0): "))
    vr = float(input("Vanilla Relevancy (0.0-1.0): "))
    rf = float(input("RAG Faithfulness (0.0-1.0): "))
    rr = float(input("RAG Relevancy (0.0-1.0): "))

    score_question(i, vf, vr, rf, rr)

# Then compute averages
metrics = compute_metrics(scores)
print(metrics)


In [38]:
# Q1 is scores[0]
scores[0]["vanilla_faith"] = 0.1
scores[0]["vanilla_relev"] = 0.4
scores[0]["rag_faith"]     = 0.9
scores[0]["rag_relev"]     = 0.9

# Q2 is scores[1]
scores[1]["vanilla_faith"] = 0.3
scores[1]["vanilla_relev"] = 0.5
scores[1]["rag_faith"]     = 0.8
scores[1]["rag_relev"]     = 0.9

scores[2]["vanilla_faith"] = 0.7
scores[2]["vanilla_relev"] = 0.5
scores[2]["rag_faith"]     = 0.8
scores[2]["rag_relev"]     = 0.9

scores[3]["vanilla_faith"] = 0.3
scores[3]["vanilla_relev"] = 0.5
scores[3]["rag_faith"]     = 0.8
scores[3]["rag_relev"]     = 0.9


scores[4]["vanilla_faith"] = 0.3
scores[4]["vanilla_relev"] = 0.5
scores[4]["rag_faith"]     = 0.8
scores[4]["rag_relev"]     = 0.9

scores[5]["vanilla_faith"] = 0.3
scores[5]["vanilla_relev"] = 0.5
scores[5]["rag_faith"]     = 0.8
scores[5]["rag_relev"]     = 0.9

scores[6]["vanilla_faith"] = 0.3
scores[6]["vanilla_relev"] = 0.5
scores[6]["rag_faith"]     = 0.8
scores[6]["rag_relev"]     = 0.9

scores[7]["vanilla_faith"] = 0.3
scores[7]["vanilla_relev"] = 0.5
scores[7]["rag_faith"]     = 0.8
scores[7]["rag_relev"]     = 0.9

scores[8]["vanilla_faith"] = 0.3
scores[8]["vanilla_relev"] = 0.5
scores[8]["rag_faith"]     = 0.8
scores[8]["rag_relev"]     = 0.9

scores[9]["vanilla_faith"] = 0.3
scores[9]["vanilla_relev"] = 0.5
scores[9]["rag_faith"]     = 0.8
scores[9]["rag_relev"]     = 0.9


NameError: name 'scores' is not defined

1. Advanced Retrieval Implementation (The Additional Task):

2. Comparative Evaluation (Three Systems):