In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import os

PROJECT_PATH = "/content/drive/MyDrive/rag_project"
DATA_PATH = os.path.join(PROJECT_PATH, "data")

os.makedirs(DATA_PATH, exist_ok=True)

print("Project directory created at:", PROJECT_PATH)

Project directory created at: /content/drive/MyDrive/rag_project


In [3]:
import os

print("Files in data folder:")
print(os.listdir(DATA_PATH))

Files in data folder:
['cs231n_full_notes.pdf', 'cnn_transformers_intro.pdf', 'cs224n_transformers_2024.pdf', 'attention_is_all_you_need.pdf', 'cs224n_merged_notes.pdf', 'neural_networks_backprop.pdf']


In [4]:
%pip install pymupdf sentence-transformers faiss-cpu transformers rouge-score scikit-learn numpy

Collecting pymupdf
  Downloading pymupdf-1.27.1-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pymupdf-1.27.1-cp310-abi3-manylinux_2_28_x86_64.whl (24.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.9/24.9 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m91.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none

In [5]:
import torch
print("GPU Available:", torch.cuda.is_available())

GPU Available: True


In [6]:
import fitz  # PyMuPDF
import os
import re


def clean_text(text):
    """Collapse excessive whitespace and strip leading/trailing spaces."""
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def extract_pdfs_from_folder(data_path):
    """Extract and clean raw text from every PDF in *data_path*.

    Returns:
        Dict mapping filename -> cleaned full text.
    """
    documents = {}
    for file in os.listdir(data_path):
        if file.endswith(".pdf"):
            file_path = os.path.join(data_path, file)
            doc = fitz.open(file_path)
            full_text = ""
            for page in doc:
                full_text += page.get_text()
            full_text = clean_text(full_text)
            documents[file] = full_text
            print(f"Extracted: {file} | Characters: {len(full_text)}")
    return documents


documents = extract_pdfs_from_folder(DATA_PATH)

Extracted: cs231n_full_notes.pdf | Characters: 149765
Extracted: cnn_transformers_intro.pdf | Characters: 55110
Extracted: cs224n_transformers_2024.pdf | Characters: 27641
Extracted: attention_is_all_you_need.pdf | Characters: 32708
Extracted: cs224n_merged_notes.pdf | Characters: 234082
Extracted: neural_networks_backprop.pdf | Characters: 30428


In [7]:
# Preview one document
sample_file = list(documents.keys())[0]
print("Preview from:", sample_file)
print(documents[sample_file][:1000])

Preview from: cs231n_full_notes.pdf
DEEP LEARNING STUDY NOTES All credits go to L. Fei-Fei, A. Karpathy, J.Johnson teachers of the CS231n course. Thank you for this amazing course!! by Albert Pumarola Contents I DATA 7 1 Data Preprocessing 9 2 Making the most of your data - Data Augmentation and Transfer Learning 11 2.1 Data Augmentation . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11 2.2 Transfer Learning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 13 II LEARNING 15 3 Neural Network 17 4 Parameters Initialization 21 4.1 Weights . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 21 4.2 Biases . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 23 5 Activation Function 25 5.1 Sigmoid . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 25 5.2 Tanh . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 25 5.3 ReLU . 

In [8]:
import re
import numpy as np


def chunk_text(text, chunk_size=120, overlap=30):
    """Sentence-boundary-aware chunking with word-level size control.

    Splits text into chunks of approximately *chunk_size* words, never
    breaking mid-sentence.  An overlap of *overlap* words is carried
    forward between consecutive chunks so that cross-boundary context
    is preserved.

    The default 120-word / 30-word-overlap setting is chosen to align
    with the token capacity of all-MiniLM-L6-v2, which produces its
    best embeddings at roughly 100-128 tokens (~100-120 words).

    Args:
        text: Raw document string.
        chunk_size: Target number of words per chunk (default 120).
        overlap: Number of trailing words to carry into the next chunk.

    Returns:
        List of chunk strings.
    """
    # Split into sentences first for boundary-aware chunking
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk_words = []
    current_word_count = 0

    for sentence in sentences:
        sentence_words = sentence.split()
        sentence_word_count = len(sentence_words)

        # If adding this sentence exceeds chunk_size, finalize current chunk
        if current_word_count + sentence_word_count > chunk_size and current_chunk_words:
            chunks.append(" ".join(current_chunk_words))

            # Overlap: keep last `overlap` words for continuity
            overlap_words = (
                current_chunk_words[-overlap:]
                if overlap < len(current_chunk_words)
                else current_chunk_words
            )
            current_chunk_words = list(overlap_words)
            current_word_count = len(current_chunk_words)

        current_chunk_words.extend(sentence_words)
        current_word_count += sentence_word_count

    # Don't forget the last chunk
    if current_chunk_words:
        chunks.append(" ".join(current_chunk_words))

    return chunks


# ── Apply chunking to all documents ──────────────────────────
all_chunks = []
chunk_metadata = []

for filename, text in documents.items():
    chunks = chunk_text(text, chunk_size=120, overlap=30)
    for i, chunk in enumerate(chunks):
        all_chunks.append(chunk)
        chunk_metadata.append({"source": filename, "chunk_index": i})

print(f"Total chunks created: {len(all_chunks)}")
print(f"Avg chunk length (words): {np.mean([len(c.split()) for c in all_chunks]):.0f}")
print(f"Avg chunk length (chars): {np.mean([len(c) for c in all_chunks]):.0f}")

Total chunks created: 1128
Avg chunk length (words): 113
Avg chunk length (chars): 640


In [9]:
print("Sample chunk:")
print(all_chunks[0][:1000])

Sample chunk:
DEEP LEARNING STUDY NOTES All credits go to L. Fei-Fei, A. Karpathy, J.Johnson teachers of the CS231n course. Thank you for this amazing course!! by Albert Pumarola Contents I DATA 7 1 Data Preprocessing 9 2 Making the most of your data - Data Augmentation and Transfer Learning 11 2.1 Data Augmentation . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11 2.2 Transfer Learning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


In [10]:
from sentence_transformers import SentenceTransformer
import numpy as np
import torch

# Use GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)

# Load embedding model
embedding_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device=device
)

# Generate embeddings
chunk_embeddings = embedding_model.encode(
    all_chunks,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True
)

print("Embeddings shape:", chunk_embeddings.shape)

Using device: cuda


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

Embeddings shape: (1128, 384)


In [11]:
import faiss

# Dimension of embeddings
dimension = chunk_embeddings.shape[1]

# Create FAISS index
index = faiss.IndexFlatL2(dimension)

# Add embeddings to index
index.add(chunk_embeddings)

print("Total vectors in index:", index.ntotal)

Total vectors in index: 1128


In [12]:
def retrieve(query, top_k=5, score_threshold=None):
    """Retrieve the top-k nearest chunks for *query* from the FAISS index.

    Args:
        query: Natural-language question string.
        top_k: Number of nearest neighbours to return.
        score_threshold: Optional L2 distance ceiling; chunks above this
            threshold are discarded as irrelevant.

    Returns:
        List of dicts, each containing 'chunk', 'distance', and 'metadata'.
    """
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx == -1:  # FAISS returns -1 for missing results
            continue
        if score_threshold is not None and dist > score_threshold:
            continue
        results.append({
            "chunk": all_chunks[idx],
            "distance": float(dist),
            "metadata": chunk_metadata[idx],
        })
    return results


# ── Test retrieval ────────────────────────────────────────────
test_query = "What is backpropagation?"
retrieved = retrieve(test_query, top_k=5)

for i, r in enumerate(retrieved):
    print(f"\n--- Chunk {i+1} | Distance: {r['distance']:.4f} | Source: {r['metadata']} ---")
    print(r['chunk'][:500])


--- Chunk 1 | Distance: 0.5534 | Source: {'source': 'cs231n_full_notes.pdf', 'chunk_index': 130} ---
y multiple times, so when we perform backpropagation we must be careful to use += instead of = to accumulate the gradient on these variables (otherwise we would overwrite it). This follows the multivariable chain rule in Calculus, which states that if a variable branches out to diﬀerent parts of the circuit, then the gradients that ﬂow back to it will add. Patterns in backward ﬂow It is interesting to note that in many cases the backward-ﬂowing gradient can be interpreted on an intuitive level. 

--- Chunk 2 | Distance: 0.6310 | Source: {'source': 'cs231n_full_notes.pdf', 'chunk_index': 107} ---
Notice that backpropagation is a beautifully local process. Every gate in a circuit diagram gets some inputs and can right away compute two things: 1. its output value and 2. the local gradient of its inputs with respect to its output value. Notice that the gates can do this completely independ

In [13]:
# ═══════════════════════════════════════════════════════════════
# A) UPGRADED MODEL LOADING — flan-t5-large (780M params)
# ═══════════════════════════════════════════════════════════════
# WHY: flan-t5-base (250M) produces extractive fragments instead of
#       synthesized paragraph answers. flan-t5-large (780M) has 3x
#       more parameters, significantly better instruction-following,
#       and fits comfortably on a Colab T4 GPU (~1.5 GB in float16).
# ═══════════════════════════════════════════════════════════════

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_name = "google/flan-t5-large"

tokenizer = AutoTokenizer.from_pretrained(model_name)
generator_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16  # Half-precision to fit Colab GPU memory
)

generator_model = generator_model.to(device)
generator_model.eval()

# Memory check
if device == 'cuda':
    mem_gb = torch.cuda.memory_allocated() / 1e9
    print(f"GPU memory used: {mem_gb:.2f} GB")

print(f"Generator model loaded: {model_name}")
print(f"Parameters: {sum(p.numel() for p in generator_model.parameters()) / 1e6:.0f}M")

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/558 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

GPU memory used: 2.00 GB
Generator model loaded: google/flan-t5-large
Parameters: 783M


In [14]:
# ═══════════════════════════════════════════════════════════════
# B) RAG GENERATION + BASELINE (FAIR COMPARISON)
# ═══════════════════════════════════════════════════════════════
# Both functions share identical decoding parameters AND the same
# prompt template via GENERATION_CONFIG and PROMPT_TEMPLATE.
# The only difference: RAG fills context with retrieved chunks;
# baseline passes an empty string.  This eliminates the prompt-
# wording confound that could inflate or deflate either system.
# ═══════════════════════════════════════════════════════════════

GENERATION_CONFIG = {
    "max_new_tokens": 220,
    "min_new_tokens": 60,
    "num_beams": 4,
    "length_penalty": 1.0,
    "no_repeat_ngram_size": 3,
    "do_sample": False,
}

PROMPT_TEMPLATE = (
    "Use the information below to answer the question clearly and factually.\n"
    "If the information is insufficient, say that the answer cannot be determined "
    "from the provided material.\n\n"
    "Context:\n{context}\n\n"
    "Question:\n{question}\n\n"
    "Answer:"
)


def generate_rag_answer(query, top_k=3, max_context_tokens=300):
    """Generate an answer using retrieved context (RAG pipeline).

    The prompt is built from PROMPT_TEMPLATE with retrieved chunks as
    context.  Decoding uses GENERATION_CONFIG (shared with baseline).

    Args:
        query: User question string.
        top_k: Number of chunks to retrieve.
        max_context_tokens: Max tokens allocated to context in the prompt.

    Returns:
        Dict with answer, sources, num_chunks_used, and input_tokens.
    """
    retrieved_chunks = retrieve(query, top_k=top_k)
    context = "\n\n".join(
        rc["chunk"] if isinstance(rc, dict) else rc for rc in retrieved_chunks
    )

    context_tokens = tokenizer(
        context, return_tensors="pt", truncation=True, max_length=max_context_tokens
    )
    truncated_context = tokenizer.decode(
        context_tokens["input_ids"][0], skip_special_tokens=True
    )

    prompt = PROMPT_TEMPLATE.format(context=truncated_context, question=query)

    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = generator_model.generate(**inputs, **GENERATION_CONFIG)

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    sources = [rc["metadata"] for rc in retrieved_chunks if isinstance(rc, dict)]

    return {
        "answer": answer,
        "sources": sources,
        "num_chunks_used": len(retrieved_chunks),
        "input_tokens": inputs["input_ids"].shape[1],
    }


def generate_baseline_answer(query):
    """Generate an answer WITHOUT retrieval context (parametric baseline).

    Uses the SAME PROMPT_TEMPLATE as RAG but with an empty context
    string, and the same GENERATION_CONFIG, ensuring a fair comparison.

    Args:
        query: User question string.

    Returns:
        Dict with answer.
    """
    prompt = PROMPT_TEMPLATE.format(context="", question=query)

    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = generator_model.generate(**inputs, **GENERATION_CONFIG)

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"answer": answer}


print(f"Generation config (shared): {GENERATION_CONFIG}")
print(f"Prompt template (shared): identical for RAG and baseline")
print("Functions defined: generate_rag_answer(), generate_baseline_answer()")

Generation config (shared): {'max_new_tokens': 220, 'min_new_tokens': 60, 'num_beams': 4, 'length_penalty': 1.0, 'no_repeat_ngram_size': 3, 'do_sample': False}
Prompt template (shared): identical for RAG and baseline
Functions defined: generate_rag_answer(), generate_baseline_answer()


In [15]:
# ── Quick sanity check: RAG vs Baseline on one question ──
question = "What is backpropagation?"

rag_result = generate_rag_answer(question)
baseline_result = generate_baseline_answer(question)

print(f"Question: {question}")
print(f"Input tokens used: {rag_result['input_tokens']}/512")
print(f"Chunks used: {rag_result['num_chunks_used']}")
print(f"Sources: {rag_result['sources']}")
print(f"\n{'─'*60}")
print(f"RAG Answer:\n{rag_result['answer']}")
print(f"\n{'─'*60}")
print(f"Baseline Answer (no retrieval):\n{baseline_result['answer']}")

Question: What is backpropagation?
Input tokens used: 339/512
Chunks used: 3
Sources: [{'source': 'cs231n_full_notes.pdf', 'chunk_index': 130}, {'source': 'cs231n_full_notes.pdf', 'chunk_index': 107}, {'source': 'cs224n_merged_notes.pdf', 'chunk_index': 148}]

────────────────────────────────────────────────────────────
RAG Answer:
technique that allows us to use the chain rule of differentiation to learn about the local gradient of its inputs with respect to its output value. (t) J Backpropagation is a beautifully local process. Every gate in a circuit diagram gets some inputs and can right away compute two things: 1. its input value and 2. the local

────────────────────────────────────────────────────────────
Baseline Answer (no retrieval):
can not be determined from the provided material. Backpropagation refers to an algorithm used to propagate directional information from one point to the next point in a network. Back propagation is based on directional propagation theory. Back pr

In [16]:
# ═══════════════════════════════════════════════════════════════
# E1) REPRODUCIBILITY + METRICS + 30-QUESTION EVALUATION DATASET
# ═══════════════════════════════════════════════════════════════

import random
import numpy as np
import torch
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import ttest_rel, t

# Install bert-score once, then preload BERTScorer
import subprocess
subprocess.run(["pip", "install", "bert-score", "-q"], check=True)

from bert_score import BERTScorer

device_for_bert = "cuda" if torch.cuda.is_available() else "cpu"

bert_scorer = BERTScorer(
    model_type="roberta-large",
    lang="en",
    rescale_with_baseline=True,
    device=device_for_bert
)

# Reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# ROUGE
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

def compute_rouge_l(prediction, reference):
    return scorer.score(reference, prediction)["rougeL"].fmeasure

def compute_cosine_similarity(text_a, text_b):
    emb = embedding_model.encode([text_a, text_b], convert_to_numpy=True)
    return float(cosine_similarity([emb[0]], [emb[1]])[0][0])

def compute_bert_score(prediction, reference):
    P, R, F1 = bert_scorer.score([prediction], [reference])
    return float(F1[0])

def evaluate_answer(prediction, reference):
    return {
        "rouge_l": compute_rouge_l(prediction, reference),
        "cosine_similarity": compute_cosine_similarity(prediction, reference),
        "bert_score": compute_bert_score(prediction, reference),
    }

# ───────────────────────────────────────────────────────────────
# Dataset Definition (30 total: 20 standard, 5 multi-hop, 5 unanswerable)
# Split: 20 dev, 10 test
# ───────────────────────────────────────────────────────────────

KNOWN_SOURCES = {
    "cnn_transformers_intro.pdf",
    "neural_networks_backprop.pdf",
    "cs231n_full_notes.pdf",
    "attention_is_all_you_need.pdf",
    "cs224n_transformers_2024.pdf",
    "cs224n_merged_notes.pdf",
}

evaluation_dataset = [

# STANDARD (20)
{"id":"S01","split":"dev","type":"standard",
 "question":"What is a perceptron?",
 "reference_answer":"A perceptron is a single linear neuron computing a weighted sum followed by thresholding.",
 "relevant_sources":["neural_networks_backprop.pdf"]},

{"id":"S02","split":"dev","type":"standard",
 "question":"Why are non-linear activations necessary?",
 "reference_answer":"Without non-linear activations, stacked linear layers collapse into one linear function.",
 "relevant_sources":["neural_networks_backprop.pdf"]},

{"id":"S03","split":"dev","type":"standard",
 "question":"What is the vanishing gradient problem?",
 "reference_answer":"Gradients shrink across many layers, slowing early-layer learning.",
 "relevant_sources":["neural_networks_backprop.pdf"]},

{"id":"S04","split":"dev","type":"standard",
 "question":"What does learning rate control?",
 "reference_answer":"It controls update step size during optimization.",
 "relevant_sources":["neural_networks_backprop.pdf"]},

{"id":"S05","split":"dev","type":"standard",
 "question":"What is overfitting?",
 "reference_answer":"Overfitting occurs when a model memorizes training data.",
 "relevant_sources":["cs231n_full_notes.pdf"]},

{"id":"S06","split":"dev","type":"standard",
 "question":"What is layer normalization?",
 "reference_answer":"Layer normalization normalizes features within each example.",
 "relevant_sources":["cs224n_transformers_2024.pdf"]},

{"id":"S07","split":"dev","type":"standard",
 "question":"Why are CNNs parameter efficient?",
 "reference_answer":"Local connectivity and weight sharing reduce parameters.",
 "relevant_sources":["cs231n_full_notes.pdf"]},

{"id":"S08","split":"dev","type":"standard",
 "question":"What is pooling?",
 "reference_answer":"Pooling downsamples feature maps to reduce computation.",
 "relevant_sources":["cs231n_full_notes.pdf"]},

{"id":"S09","split":"dev","type":"standard",
 "question":"What is residual connection?",
 "reference_answer":"Residual connections allow gradients to pass directly across layers.",
 "relevant_sources":["cs224n_transformers_2024.pdf"]},

{"id":"S10","split":"dev","type":"standard",
 "question":"How does self-attention work?",
 "reference_answer":"Self-attention computes weighted sums of value vectors using query-key similarity.",
 "relevant_sources":["attention_is_all_you_need.pdf"]},

{"id":"S11","split":"dev","type":"standard",
 "question":"Why multi-head attention?",
 "reference_answer":"Multiple heads capture diverse relational patterns.",
 "relevant_sources":["attention_is_all_you_need.pdf"]},

{"id":"S12","split":"dev","type":"standard",
 "question":"Why positional encodings?",
 "reference_answer":"They inject order information into attention-based models.",
 "relevant_sources":["attention_is_all_you_need.pdf"]},

{"id":"S13","split":"dev","type":"standard",
 "question":"What is scaled dot-product attention?",
 "reference_answer":"It scales query-key dot products before softmax normalization.",
 "relevant_sources":["attention_is_all_you_need.pdf"]},

{"id":"S14","split":"dev","type":"standard",
 "question":"Why masking in decoder?",
 "reference_answer":"Masking prevents attending to future tokens.",
 "relevant_sources":["attention_is_all_you_need.pdf"]},

# TEST standard (6)
{"id":"S15","split":"test","type":"standard",
 "question":"What is teacher forcing?",
 "reference_answer":"Teacher forcing feeds ground-truth tokens during training.",
 "relevant_sources":["cs224n_merged_notes.pdf"]},

{"id":"S16","split":"test","type":"standard",
 "question":"Why use Adam optimizer?",
 "reference_answer":"Adam adapts learning rates per parameter.",
 "relevant_sources":["attention_is_all_you_need.pdf"]},

{"id":"S17","split":"test","type":"standard",
 "question":"Why transfer learning helps?",
 "reference_answer":"Pretrained models improve performance with limited labeled data.",
 "relevant_sources":["cs224n_transformers_2024.pdf"]},

{"id":"S18","split":"test","type":"standard",
 "question":"What is encoder-decoder difference?",
 "reference_answer":"Encoders process input; decoders generate output autoregressively.",
 "relevant_sources":["attention_is_all_you_need.pdf"]},

{"id":"S19","split":"test","type":"standard",
 "question":"What is stride in CNN?",
 "reference_answer":"Stride controls filter movement and spatial downsampling.",
 "relevant_sources":["cs231n_full_notes.pdf"]},

{"id":"S20","split":"test","type":"standard",
 "question":"Why 1x1 convolution?",
 "reference_answer":"It mixes channel information efficiently.",
 "relevant_sources":["cs231n_full_notes.pdf"]},

# MULTI-HOP (5)
{"id":"M01","split":"dev","type":"multi_hop",
 "question":"How do residuals and normalization stabilize transformers?",
 "reference_answer":"Residual paths aid gradient flow while normalization stabilizes activations.",
 "relevant_sources":["cs224n_transformers_2024.pdf"]},

{"id":"M02","split":"dev","type":"multi_hop",
 "question":"Compare CNN locality and transformer attention.",
 "reference_answer":"CNNs model local patterns; transformers model global dependencies.",
 "relevant_sources":["cs231n_full_notes.pdf","attention_is_all_you_need.pdf"]},

{"id":"M03","split":"dev","type":"multi_hop",
 "question":"How does backprop update attention?",
 "reference_answer":"Gradients flow backward through attention projections and softmax.",
 "relevant_sources":["neural_networks_backprop.pdf","attention_is_all_you_need.pdf"]},

{"id":"M04","split":"test","type":"multi_hop",
 "question":"Why positional encodings despite permutation invariance?",
 "reference_answer":"They break symmetry so order-sensitive meaning can be learned.",
 "relevant_sources":["attention_is_all_you_need.pdf"]},

{"id":"M05","split":"test","type":"multi_hop",
 "question":"How do Adam and LR schedules interact?",
 "reference_answer":"Adam adapts step sizes while schedules control overall training stability.",
 "relevant_sources":["attention_is_all_you_need.pdf"]},

# UNANSWERABLE (5)
{"id":"U01","split":"dev","type":"unanswerable",
 "question":"What is CRISPR?",
 "reference_answer":"This cannot be answered from the provided corpus.",
 "relevant_sources":[]},

{"id":"U02","split":"dev","type":"unanswerable",
 "question":"What is the capital of Brazil?",
 "reference_answer":"This cannot be answered from the provided corpus.",
 "relevant_sources":[]},

{"id":"U03","split":"dev","type":"unanswerable",
 "question":"What caused the 2008 crisis?",
 "reference_answer":"This cannot be answered from the provided corpus.",
 "relevant_sources":[]},

{"id":"U04","split":"test","type":"unanswerable",
 "question":"Who won the 2026 Super Bowl?",
 "reference_answer":"This cannot be answered from the provided corpus.",
 "relevant_sources":[]},

{"id":"U05","split":"test","type":"unanswerable",
 "question":"Explain quantum error correction.",
 "reference_answer":"This cannot be answered from the provided corpus.",
 "relevant_sources":[]},
]

# ───────────────────────────────────────────────────────────────
# Dataset Validation
# ───────────────────────────────────────────────────────────────

dev_set = [x for x in evaluation_dataset if x["split"] == "dev"]
test_set = [x for x in evaluation_dataset if x["split"] == "test"]

assert len(evaluation_dataset) == 30
assert len(dev_set) == 20
assert len(test_set) == 10

type_counts = {
    t: sum(1 for x in evaluation_dataset if x["type"] == t)
    for t in ["standard", "multi_hop", "unanswerable"]
}

assert type_counts["standard"] == 20
assert type_counts["multi_hop"] == 5
assert type_counts["unanswerable"] == 5

for item in evaluation_dataset:
    if item["type"] == "unanswerable":
        assert item["relevant_sources"] == []
    else:
        assert len(item["relevant_sources"]) > 0
        assert set(item["relevant_sources"]).issubset(KNOWN_SOURCES)

print("Dataset integrity checks passed.")

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
pooler.dense.weight             | MISSING    | 
pooler.dense.bias               | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Dataset integrity checks passed.


In [17]:
# ═══════════════════════════════════════════════════════════════
# E2) MODULAR EVALUATION FUNCTIONS (REFACTORED)
# ═══════════════════════════════════════════════════════════════

# ── Helper: interpret Cohen's d magnitude ─────────────────────

def interpret_cohens_d(d):
    """Return a human-readable effect-size label for Cohen's d.

    Thresholds follow Cohen (1988): |d| < 0.2 negligible,
    0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
    """
    ad = abs(d)
    if ad < 0.2:
        return "negligible"
    elif ad < 0.5:
        return "small"
    elif ad < 0.8:
        return "medium"
    else:
        return "large"


# ── Helper: aggregate metrics from result rows ────────────────

def aggregate_metrics(results, metric_key):
    """Extract a metric array from result rows and return summary stats.

    Args:
        results: List of result dicts from run_generation_eval.
        metric_key: Key name (e.g. 'rag_rouge_l', 'baseline_bert_score').

    Returns:
        Dict with mean, std, and the raw numpy array.
    """
    arr = np.array([r[metric_key] for r in results], dtype=float)
    return {"mean": float(np.mean(arr)), "std": float(np.std(arr, ddof=1)), "values": arr}


def paired_test_and_effect(rag_arr, base_arr):
    """Run paired t-test and compute Cohen's d + 95 % CI.

    Args:
        rag_arr: 1-D numpy array of RAG scores.
        base_arr: 1-D numpy array of baseline scores (same length).

    Returns:
        Dict with t_stat, p_value, cohens_d, ci_low, ci_high, mean_diff.
    """
    diff = rag_arr - base_arr
    n = len(diff)
    diff_mean = float(np.mean(diff))
    diff_std = float(np.std(diff, ddof=1))

    if np.allclose(diff, 0.0):
        t_stat, p_val = 0.0, 1.0
    else:
        t_stat, p_val = ttest_rel(rag_arr, base_arr)
        t_stat, p_val = float(t_stat), float(p_val)

    cohens_d = float(diff_mean / diff_std) if diff_std > 0 else 0.0
    t_crit = float(t.ppf(0.975, df=n - 1))
    sem = diff_std / np.sqrt(n)
    ci_low = float(diff_mean - t_crit * sem)
    ci_high = float(diff_mean + t_crit * sem)

    return {
        "t_stat": t_stat, "p_value": p_val, "cohens_d": cohens_d,
        "ci_low": ci_low, "ci_high": ci_high, "mean_diff": diff_mean,
    }


# ── Generation evaluation loop ───────────────────────────────

def run_generation_eval(dataset_subset, top_k=3, include_baseline=True, verbose=True):
    """Run RAG (and optionally baseline) generation on a dataset subset.

    Returns a list of per-question result dicts.
    """
    results = []
    total = len(dataset_subset)

    for i, item in enumerate(dataset_subset, 1):
        q, ref = item["question"], item["reference_answer"]
        rag_result = generate_rag_answer(q, top_k=top_k)
        rag_scores = evaluate_answer(rag_result["answer"], ref)

        row = {
            "id": item["id"], "split": item["split"], "type": item["type"],
            "question": q, "reference": ref,
            "relevant_sources": item["relevant_sources"],
            "rag_answer": rag_result["answer"],
            "rag_rouge_l": float(rag_scores["rouge_l"]),
            "rag_cosine": float(rag_scores["cosine_similarity"]),
            "rag_bert_score": float(rag_scores["bert_score"]),
            "sources": rag_result.get("sources", []),
        }

        if include_baseline:
            bl = generate_baseline_answer(q)
            bl_scores = evaluate_answer(bl["answer"], ref)
            row.update({
                "baseline_answer": bl["answer"],
                "baseline_rouge_l": float(bl_scores["rouge_l"]),
                "baseline_cosine": float(bl_scores["cosine_similarity"]),
                "baseline_bert_score": float(bl_scores["bert_score"]),
            })
        else:
            row.update({
                "baseline_answer": None,
                "baseline_rouge_l": np.nan,
                "baseline_cosine": np.nan,
                "baseline_bert_score": np.nan,
            })

        results.append(row)
        if verbose:
            print(f"[{i:02d}/{total:02d}] {item['id']} | type={item['type']} | top_k={top_k}")

    return results


# ── Top-k ablation (DEV only) ────────────────────────────────

def run_topk_ablation(dev_subset, topk_values=(3, 5)):
    """Ablate top_k on DEV split and select best setting."""
    table, dev_results_by_topk = [], {}

    print("=" * 78)
    print("Top-k Ablation on DEV (RAG only): top_k in", set(topk_values))
    print("=" * 78)

    for k in topk_values:
        dev_results = run_generation_eval(dev_subset, top_k=k, include_baseline=False, verbose=False)
        dev_results_by_topk[k] = dev_results
        rouge_agg = aggregate_metrics(dev_results, "rag_rouge_l")
        cos_agg = aggregate_metrics(dev_results, "rag_cosine")
        table.append({
            "top_k": k,
            "rag_rouge_mean": rouge_agg["mean"], "rag_rouge_std": rouge_agg["std"],
            "rag_cosine_mean": cos_agg["mean"], "rag_cosine_std": cos_agg["std"],
        })

    table = sorted(table, key=lambda x: x["top_k"])
    print(f"{'top_k':<8}{'ROUGE-L mean':<16}{'ROUGE-L std':<14}{'Cosine mean':<14}{'Cosine std':<12}")
    print("-" * 64)
    for row in table:
        print(f"{row['top_k']:<8}{row['rag_rouge_mean']:<16.4f}{row['rag_rouge_std']:<14.4f}"
              f"{row['rag_cosine_mean']:<14.4f}{row['rag_cosine_std']:<12.4f}")

    best = max(table, key=lambda x: (x["rag_rouge_mean"], x["rag_cosine_mean"], -x["rag_rouge_std"]))
    best_top_k = int(best["top_k"])
    print(f"\nSelected best_top_k on DEV: {best_top_k}")

    return {"table": table, "best_top_k": best_top_k, "dev_results_by_topk": dev_results_by_topk}


# ── Statistical validation (answerable only + unanswerable analysis) ──

def compute_statistical_validation(test_results):
    """Compute paired stats on ANSWERABLE test items only.

    Prints a detailed summary with effect-size interpretation and a
    statistical power disclaimer.  Unanswerable items are analyzed
    separately.

    Returns:
        Dict with all statistical summaries.
    """
    answerable = [r for r in test_results if r["type"] != "unanswerable"]
    unanswerable = [r for r in test_results if r["type"] == "unanswerable"]
    n_ans = len(answerable)

    # ── Answerable: ROUGE-L ──
    rag_rouge = aggregate_metrics(answerable, "rag_rouge_l")
    base_rouge = aggregate_metrics(answerable, "baseline_rouge_l")
    rouge_test = paired_test_and_effect(rag_rouge["values"], base_rouge["values"])

    # ── Answerable: BERTScore ──
    rag_bert = aggregate_metrics(answerable, "rag_bert_score")
    base_bert = aggregate_metrics(answerable, "baseline_bert_score")
    bert_test = paired_test_and_effect(rag_bert["values"], base_bert["values"])

    # ── Unanswerable summary ──
    unanswerable_summary = {}
    if unanswerable:
        u_rag = aggregate_metrics(unanswerable, "rag_rouge_l")
        u_base = aggregate_metrics(unanswerable, "baseline_rouge_l")
        unanswerable_summary = {
            "n": len(unanswerable),
            "rag_rouge_mean": u_rag["mean"], "baseline_rouge_mean": u_base["mean"],
        }

    out = {
        "n_test": n_ans,
        "n_unanswerable": len(unanswerable),
        # ROUGE-L (answerable)
        "rag_mean": rag_rouge["mean"], "rag_std": rag_rouge["std"],
        "baseline_mean": base_rouge["mean"], "baseline_std": base_rouge["std"],
        "mean_diff": rouge_test["mean_diff"],
        "t_stat": rouge_test["t_stat"], "p_value": rouge_test["p_value"],
        "cohens_d": rouge_test["cohens_d"],
        "ci_diff_low": rouge_test["ci_low"], "ci_diff_high": rouge_test["ci_high"],
        # BERTScore (answerable)
        "rag_bert_mean": rag_bert["mean"], "rag_bert_std": rag_bert["std"],
        "baseline_bert_mean": base_bert["mean"], "baseline_bert_std": base_bert["std"],
        "bert_mean_diff": bert_test["mean_diff"],
        "bert_t_stat": bert_test["t_stat"], "bert_p_value": bert_test["p_value"],
        "bert_cohens_d": bert_test["cohens_d"],
        # Unanswerable
        "unanswerable": unanswerable_summary,
    }

    # ── Print answerable stats with interpretation ──
    rouge_d_label = interpret_cohens_d(rouge_test["cohens_d"])
    bert_d_label = interpret_cohens_d(bert_test["cohens_d"])
    rouge_sig = "statistically significant (p < 0.05)" if rouge_test["p_value"] < 0.05 else "NOT statistically significant (p >= 0.05)"
    bert_sig = "statistically significant (p < 0.05)" if bert_test["p_value"] < 0.05 else "NOT statistically significant (p >= 0.05)"

    print("\n" + "=" * 78)
    print(f"Statistical Validation — ANSWERABLE only (n = {n_ans})")
    print("=" * 78)

    print(f"\n[ROUGE-L]")
    print(f"  RAG mean      = {rag_rouge['mean']:.4f}  (std = {rag_rouge['std']:.4f})")
    print(f"  Baseline mean = {base_rouge['mean']:.4f}  (std = {base_rouge['std']:.4f})")
    print(f"  Difference (Δ)= {rouge_test['mean_diff']:.4f}")
    print(f"  95% CI        = [{rouge_test['ci_low']:.4f}, {rouge_test['ci_high']:.4f}]")
    print(f"  t-statistic   = {rouge_test['t_stat']:.4f}")
    print(f"  p-value       = {rouge_test['p_value']:.6f}  →  {rouge_sig}")
    print(f"  Cohen's d     = {rouge_test['cohens_d']:.4f}  →  {rouge_d_label} effect")

    print(f"\n[BERTScore]")
    print(f"  RAG mean      = {rag_bert['mean']:.4f}  (std = {rag_bert['std']:.4f})")
    print(f"  Baseline mean = {base_bert['mean']:.4f}  (std = {base_bert['std']:.4f})")
    print(f"  Difference (Δ)= {bert_test['mean_diff']:.4f}")
    print(f"  t-statistic   = {bert_test['t_stat']:.4f}")
    print(f"  p-value       = {bert_test['p_value']:.6f}  →  {bert_sig}")
    print(f"  Cohen's d     = {bert_test['cohens_d']:.4f}  →  {bert_d_label} effect")

    print(f"\n{'─' * 78}")
    print(f"⚠  STATISTICAL POWER DISCLAIMER")
    print(f"   With n = {n_ans} answerable test items, this study has LOW statistical")
    print(f"   power.  A non-significant p-value does NOT imply that RAG and")
    print(f"   baseline perform equally; it may simply reflect insufficient sample")
    print(f"   size.  Interpret these results as indicative, not definitive.")
    print(f"{'─' * 78}")

    # ── Print unanswerable analysis ──
    if unanswerable_summary:
        print(f"\n{'=' * 78}")
        print(f"Unanswerable Analysis (n = {unanswerable_summary['n']})")
        print("=" * 78)
        print(f"  RAG ROUGE-L mean      = {unanswerable_summary['rag_rouge_mean']:.4f}")
        print(f"  Baseline ROUGE-L mean = {unanswerable_summary['baseline_rouge_mean']:.4f}")
        print("  Interpretation: Both systems are expected to score low on unanswerable")
        print("  questions since the reference states the question cannot be answered.")
        print("  These items are excluded from the paired significance test.")

    return out


# ── Retrieval-only evaluation ─────────────────────────────────

def evaluate_retrieval_only(dataset_subset, k=5):
    """Compute Precision@k, Recall@k, and MRR on answerable items.

    Args:
        dataset_subset: List of evaluation items.
        k: Number of chunks to retrieve per query.

    Returns:
        Dict with aggregate retrieval metrics and per-question breakdown.
    """
    precision_scores, recall_scores, mrr_scores = [], [], []
    per_question = []
    n_skipped = 0

    for item in dataset_subset:
        relevant = item["relevant_sources"]
        if not relevant:
            n_skipped += 1
            continue

        retrieved = retrieve(item["question"], top_k=k)
        retrieved_sources = [x["metadata"]["source"] for x in retrieved]
        relevant_set = set(relevant)
        hits = set(retrieved_sources) & relevant_set

        prec = len(hits) / float(k)
        rec = len(hits) / float(len(relevant_set))
        rr = next((1.0 / rank for rank, s in enumerate(retrieved_sources, 1) if s in relevant_set), 0.0)

        precision_scores.append(prec)
        recall_scores.append(rec)
        mrr_scores.append(rr)
        per_question.append({
            "id": item["id"], "question": item["question"],
            "retrieved_sources": retrieved_sources, "relevant_sources": relevant,
            "precision_at_k": prec, "recall_at_k": rec, "reciprocal_rank": rr,
        })

    if not per_question:
        raise ValueError("No answerable questions for retrieval evaluation.")

    summary = {
        "k": k, "n_evaluated": len(per_question), "n_skipped_unanswerable": n_skipped,
        "precision_at_k": float(np.mean(precision_scores)),
        "recall_at_k": float(np.mean(recall_scores)),
        "mrr": float(np.mean(mrr_scores)),
        "per_question": per_question,
    }

    print("\n" + "=" * 78)
    print(f"Retrieval-only Evaluation @k={k}")
    print("=" * 78)
    print(f"Evaluated={summary['n_evaluated']} | Skipped unanswerable={n_skipped}")
    print(f"Precision@{k}: {summary['precision_at_k']:.4f}")
    print(f"Recall@{k}:    {summary['recall_at_k']:.4f}")
    print(f"MRR:           {summary['mrr']:.4f}")

    return summary


# ── Failure analysis ──────────────────────────────────────────

def print_failure_analysis(results, n=3):
    """Print and return the n lowest-scoring RAG ROUGE-L cases.

    Args:
        results: List of per-question result dicts.
        n: Number of worst cases to display.

    Returns:
        List of n failure-case dicts sorted by ascending ROUGE-L.
    """
    assert 0 < n <= len(results), "n must be in [1, len(results)]."

    lowest = sorted(results, key=lambda x: x["rag_rouge_l"])[:n]

    print("\n" + "=" * 78)
    print(f"Failure Analysis: Lowest {n} RAG ROUGE-L Cases")
    print("=" * 78)

    records = []
    for rank, item in enumerate(lowest, 1):
        print(f"\nCase {rank} | {item['id']} | RAG={item['rag_rouge_l']:.4f} | Base={item['baseline_rouge_l']:.4f}")
        print(f"  Q: {item['question']}")
        print(f"  RAG: {item['rag_answer']}")
        print(f"  Base: {item['baseline_answer']}")
        print(f"  Ref: {item['reference']}")
        records.append({
            "id": item["id"], "question": item["question"],
            "rag_rouge_l": float(item["rag_rouge_l"]),
            "baseline_rouge_l": float(item["baseline_rouge_l"]),
            "rag_answer": item["rag_answer"],
            "baseline_answer": item["baseline_answer"],
            "reference_answer": item["reference"],
        })

    return records


# ── Experimental results text builder ─────────────────────────

def build_experimental_results_section(stats_out, ablation_out, retrieval_out):
    """Build an academic-style experimental results section with interpretation.

    Includes Cohen's d interpretation, p-value explanation, and a
    statistical power disclaimer.
    """
    rouge_sig = "statistically significant" if stats_out["p_value"] < 0.05 else "not statistically significant"
    bert_sig = "statistically significant" if stats_out["bert_p_value"] < 0.05 else "not statistically significant"
    rouge_d_label = interpret_cohens_d(stats_out["cohens_d"])
    bert_d_label = interpret_cohens_d(stats_out["bert_cohens_d"])
    k = ablation_out["best_top_k"]

    text = (
        f"We evaluated a fixed RAG pipeline against a parametric baseline using identical "
        f"prompt templates and decoding parameters (max_new_tokens=220, min_new_tokens=60, "
        f"num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3). The only experimental "
        f"manipulation was the presence or absence of retrieved context.\n\n"
        f"The top-k retrieval parameter was tuned on the dev split (k ∈ {{3, 5}}); the "
        f"selected value (k={k}) was frozen before test-set evaluation. Inferential "
        f"analysis was conducted on n={stats_out['n_test']} answerable held-out test "
        f"questions; {stats_out.get('n_unanswerable', 0)} unanswerable items were "
        f"excluded from significance testing.\n\n"
        f"ROUGE-L: RAG mean={stats_out['rag_mean']:.4f} (std={stats_out['rag_std']:.4f}) "
        f"vs. baseline mean={stats_out['baseline_mean']:.4f} "
        f"(std={stats_out['baseline_std']:.4f}), Δ={stats_out['mean_diff']:.4f}. "
        f"Paired t-test: t={stats_out['t_stat']:.4f}, p={stats_out['p_value']:.6f} "
        f"— {rouge_sig}. "
        f"Cohen's d={stats_out['cohens_d']:.4f} ({rouge_d_label} effect), 95% CI "
        f"[{stats_out['ci_diff_low']:.4f}, {stats_out['ci_diff_high']:.4f}].\n\n"
        f"BERTScore: RAG mean={stats_out['rag_bert_mean']:.4f} "
        f"(std={stats_out['rag_bert_std']:.4f}) vs. baseline "
        f"mean={stats_out['baseline_bert_mean']:.4f} "
        f"(std={stats_out['baseline_bert_std']:.4f}), Δ={stats_out['bert_mean_diff']:.4f}. "
        f"Paired t-test: t={stats_out['bert_t_stat']:.4f}, "
        f"p={stats_out['bert_p_value']:.6f} — {bert_sig}. "
        f"Cohen's d={stats_out['bert_cohens_d']:.4f} ({bert_d_label} effect).\n\n"
        f"Retrieval evaluation at k={retrieval_out['k']}: "
        f"Precision@k={retrieval_out['precision_at_k']:.4f}, "
        f"Recall@k={retrieval_out['recall_at_k']:.4f}, "
        f"MRR={retrieval_out['mrr']:.4f}.\n\n"
        f"IMPORTANT CAVEAT: With only n={stats_out['n_test']} answerable test items, "
        f"the statistical power of these tests is low. Non-significant p-values should "
        f"not be interpreted as evidence of equivalence between RAG and baseline; they "
        f"may reflect insufficient sample size. These results are indicative of "
        f"directional trends and should be replicated on a larger evaluation set before "
        f"drawing definitive conclusions."
    )
    return text


print("Evaluation functions defined (refactored).")

Evaluation functions defined (refactored).


In [18]:
# ═══════════════════════════════════════════════════════════════
# E3) STRICT EXECUTION PROTOCOL (DEV ABLATION → FREEZE → TEST)
# ═══════════════════════════════════════════════════════════════

# 1) Tune top_k on DEV only (no test access)
ablation_out = run_topk_ablation(dev_set, topk_values=(3, 5))

# 2) Freeze best_top_k
best_top_k = ablation_out["best_top_k"]
print(f"\nFrozen best_top_k (chosen on DEV only): {best_top_k}")

# 3) Final generation evaluation on TEST only
test_results = run_generation_eval(test_set, top_k=best_top_k, include_baseline=True, verbose=True)

# Assertions: metric ranges + no NaNs/infs
rouge_matrix = np.array(
    [[r["rag_rouge_l"], r["baseline_rouge_l"]] for r in test_results], dtype=float
)
cosine_matrix = np.array(
    [[r["rag_cosine"], r["baseline_cosine"]] for r in test_results], dtype=float
)
bert_matrix = np.array(
    [[r["rag_bert_score"], r["baseline_bert_score"]] for r in test_results], dtype=float
)
assert np.all(np.isfinite(rouge_matrix)) and np.all(np.isfinite(cosine_matrix)) and np.all(np.isfinite(bert_matrix)), \
    "NaN/inf detected in test metrics."
assert np.all((rouge_matrix >= 0.0) & (rouge_matrix <= 1.0)), "ROUGE-L outside [0,1]."
assert np.all((cosine_matrix >= -1.0) & (cosine_matrix <= 1.0)), "Cosine outside [-1,1]."
assert np.all((bert_matrix >= -1.0) & (bert_matrix <= 1.0)), "BERTScore outside [-1,1]."

# 4) Statistical validation (answerable only; unanswerable analyzed separately)
stats_out = compute_statistical_validation(test_results)

# Assertions: finite stats + valid CI
assert stats_out["ci_diff_low"] <= stats_out["ci_diff_high"], "Invalid CI."
for key in ["rag_mean", "rag_std", "baseline_mean", "baseline_std", "mean_diff",
            "t_stat", "p_value", "cohens_d",
            "rag_bert_mean", "rag_bert_std", "baseline_bert_mean", "baseline_bert_std",
            "bert_mean_diff", "bert_t_stat", "bert_p_value", "bert_cohens_d"]:
    assert np.isfinite(stats_out[key]), f"Non-finite: {key}"

# 5) Retrieval-only evaluation on TEST (answerable items)
retrieval_out = evaluate_retrieval_only(test_set, k=best_top_k)

for key in ["precision_at_k", "recall_at_k", "mrr"]:
    assert np.isfinite(retrieval_out[key]) and 0.0 <= retrieval_out[key] <= 1.0, \
        f"Retrieval metric {key} invalid."

# 6) Failure analysis
failure_cases = print_failure_analysis(test_results, n=3)
assert len(failure_cases) == 3
assert [x["rag_rouge_l"] for x in failure_cases] == sorted(x["rag_rouge_l"] for x in failure_cases)

print("\nAll evaluation assertions passed.")

Top-k Ablation on DEV (RAG only): top_k in {3, 5}
top_k   ROUGE-L mean    ROUGE-L std   Cosine mean   Cosine std  
----------------------------------------------------------------
3       0.0926          0.0586        0.4011        0.0911      
5       0.0926          0.0586        0.4011        0.0911      

Selected best_top_k on DEV: 3

Frozen best_top_k (chosen on DEV only): 3
[01/10] S15 | type=standard | top_k=3
[02/10] S16 | type=standard | top_k=3
[03/10] S17 | type=standard | top_k=3
[04/10] S18 | type=standard | top_k=3
[05/10] S19 | type=standard | top_k=3
[06/10] S20 | type=standard | top_k=3
[07/10] M04 | type=multi_hop | top_k=3
[08/10] M05 | type=multi_hop | top_k=3
[09/10] U04 | type=unanswerable | top_k=3
[10/10] U05 | type=unanswerable | top_k=3

Statistical Validation — ANSWERABLE only (n = 8)

[ROUGE-L]
  RAG mean      = 0.0355  (std = 0.0458)
  Baseline mean = 0.0409  (std = 0.0406)
  Difference (Δ)= -0.0054
  95% CI        = [-0.0516, 0.0408]
  t-statistic   = -0.

In [19]:
# ═══════════════════════════════════════════════════════════════
# E4) EXPERIMENTAL RESULTS SECTION (ACADEMIC STYLE, 2-3 PARAGRAPHS)
# ═══════════════════════════════════════════════════════════════
experimental_results_text = build_experimental_results_section(stats_out, ablation_out, retrieval_out)

print("\n" + "=" * 78)
print("Experimental Results (Report-Ready)")
print("=" * 78)
print(experimental_results_text)


Experimental Results (Report-Ready)
We evaluated a fixed RAG pipeline against a parametric baseline using identical prompt templates and decoding parameters (max_new_tokens=220, min_new_tokens=60, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3). The only experimental manipulation was the presence or absence of retrieved context.

The top-k retrieval parameter was tuned on the dev split (k ∈ {3, 5}); the selected value (k=3) was frozen before test-set evaluation. Inferential analysis was conducted on n=8 answerable held-out test questions; 2 unanswerable items were excluded from significance testing.

ROUGE-L: RAG mean=0.0355 (std=0.0458) vs. baseline mean=0.0409 (std=0.0406), Δ=-0.0054. Paired t-test: t=-0.2751, p=0.791212 — not statistically significant. Cohen's d=-0.0972 (negligible effect), 95% CI [-0.0516, 0.0408].

BERTScore: RAG mean=0.0109 (std=0.0685) vs. baseline mean=-0.1110 (std=0.1290), Δ=0.1218. Paired t-test: t=2.5921, p=0.035834 — statistically significant. Coh