<h1>Step1: Data Processing</h1>

In [4]:
import json
from pathlib import Path
from typing import List, Tuple, Dict, Any

import fitz  # PyMuPDF
import pandas as pd

# Project paths
ROOT = Path("/content/drive/MyDrive/RAG-FT-DATA")
RAW_DIR   = ROOT / "raw"
CLEAN_DIR = ROOT / "cleaned_text"
PROC_DIR  = ROOT / "processed"

CLEAN_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

list(RAW_DIR.glob("*.pdf"))

[PosixPath('/content/drive/MyDrive/RAG-FT-DATA/raw/annual-report-2024.pdf'),
 PosixPath('/content/drive/MyDrive/RAG-FT-DATA/raw/infosys-ar-25.pdf')]

In [3]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


<h3>Utility Helpers</h3>

In [5]:
HEADER_FOOTER_HINTS = [
    r"^page\s*\d+(\s*of\s*\d+)?$",   # Page 1 of 100
    r"^\d+$",                        # just a number (often a page number)
]

def clean_lines(lines: List[str]) -> List[str]:
    """
    Basic line-level cleanup:
    - strip
    - drop obvious headers/footers/page numbers
    - drop very short all-caps tokens
    - collapse multiple spaces
    """
    cleaned = []
    for ln in lines:
        s = ln.strip()
        if not s:
            continue
        drop = False
        for pat in HEADER_FOOTER_HINTS:
            if re.fullmatch(pat, s, flags=re.IGNORECASE):
                drop = True
                break
        if drop:
            continue
        if len(s) <= 3 and s.isupper():
            continue
        s = re.sub(r"\s+", " ", s)
        cleaned.append(s)
    return cleaned


<h3>Metric Patterns & Extraction</h3>

In [6]:
def extract_pdf_text(pdf_path: Path) -> Tuple[str, List[Tuple[int, str]]]:
    """
    Extract text per page using PyMuPDF and clean it.
    Returns: (full_clean_text, [(page_no, page_clean_text), ...])
    """
    doc = fitz.open(pdf_path)
    pages = []
    for pno in range(len(doc)):
        page = doc[pno]
        raw = page.get_text("text") or ""
        clines = clean_lines(raw.splitlines())
        pages.append((pno + 1, "\n".join(clines)))
    doc.close()

    full = "\n\n".join(t for _, t in pages if t.strip())
    return full, pages

def write_clean_text(doc_name: str, full_text: str, pages: List[Tuple[int, str]]):
    (CLEAN_DIR / f"{doc_name}.txt").write_text(full_text, encoding="utf-8")
    per_page = [{"page": p, "text": t} for p, t in pages]
    (CLEAN_DIR / f"{doc_name}.pages.json").write_text(
        json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8"
    )


In [7]:
SECTION_PATTERNS = {
    "balance_sheet": r"\b(balance\s+sheet|statement\s+of\s+financial\s+position)\b",
    "income_statement": r"\b(income\s+statement|profit\s+and\s+loss|statement\s+of\s+operations)\b",
    "cash_flow": r"\b(cash\s+flow|statement\s+of\s+cash\s+flows)\b",
    "mdna": r"\b(management\s+discussion\s+and\s+analysis|md&a)\b",
    "notes": r"\b(notes\s+to\s+the\s+financial\s+statements|notes\s+to\s+accounts)\b",
}

def rough_section_indices(text: str) -> Dict[str, List[int]]:
    idx = {}
    low = text.lower()
    for name, pat in SECTION_PATTERNS.items():
        idx[name] = [m.start() for m in re.finditer(pat, low)]
    return idx

def segment_sections(text: str) -> Dict[str, str]:
    indices = rough_section_indices(text)
    all_starts = []
    for sec, starts in indices.items():
        for s in starts:
            all_starts.append((s, sec))
    if not all_starts:
        return {"full_report": text}

    all_starts.sort(key=lambda x: x[0])
    result = {}
    for i, (start, sec) in enumerate(all_starts):
        end = all_starts[i+1][0] if i+1 < len(all_starts) else len(text)
        if sec not in result:  # keep first occurrence only
            result[sec] = text[start:end].strip()
    if not result:
        result["full_report"] = text
    return result


<h3>Chunking (100-word & 400-word windows with overlap)</h3>

In [8]:
def chunk_words(words, chunk_size, overlap=20):
    i = 0
    n = len(words)
    while i < n:
        j = min(i + chunk_size, n)
        yield (i, j, " ".join(words[i:j]))
        if j == n:
            break
        i = max(j - overlap, i + 1)

def make_chunks(doc_name: str, full_text: str, pages: List[Tuple[int, str]], sizes=(100, 400)):
    # Map approx page spans for traceability
    page_word_spans = []
    cum = 0
    for p, t in pages:
        wc = len(t.split())
        page_word_spans.append((p, cum, cum + wc))
        cum += wc

    def span_to_pages(start_w: int, end_w: int):
        touched = []
        for p, a, b in page_word_spans:
            if end_w <= a:
                break
            if start_w >= b:
                continue
            touched.append(p)
        if not touched:
            return []
        return [min(touched), max(touched)] if len(touched) > 1 else [touched[0], touched[0]]

    words = full_text.split()
    outputs = {}
    for sz in sizes:
        chunks = []
        for idx, (start, end, text) in enumerate(chunk_words(words, chunk_size=sz, overlap=20)):
            chunks.append({
                "chunk_id": f"{doc_name}_{sz}_{idx:05d}",
                "doc_name": doc_name,
                "chunk_size_words": sz,
                "start_word_index": start,
                "end_word_index": end,
                "pages_approx": span_to_pages(start, end),
                "text": text
            })
        outputs[sz] = chunks
    return outputs

def save_chunks(all_chunks: Dict[int, list]):
    PROC_DIR.mkdir(parents=True, exist_ok=True)
    for sz, chs in all_chunks.items():
        # JSONL
        jsonl_path = PROC_DIR / f"chunks_{sz}.jsonl"
        with open(jsonl_path, "w", encoding="utf-8") as f:
            for c in chs:
                f.write(json.dumps(c, ensure_ascii=False) + "\n")
        # JSON (convenience)
        (PROC_DIR / f"chunks_{sz}.json").write_text(
            json.dumps(chs, ensure_ascii=False, indent=2), encoding="utf-8"
        )


<h3>Run on All Raw PDFs</h3>

In [9]:
import re

pdfs = sorted(RAW_DIR.glob("*.pdf"))
assert len(pdfs) > 0, "No PDFs found in data/raw/. Please add your annual reports."

all_chunks_100, all_chunks_400 = [], []

for pdf in pdfs:
    doc_name = pdf.stem
    print(f"[INFO] Processing {pdf.name} ...")
    full_text, pages = extract_pdf_text(pdf)
    # Save cleaned text + per-page JSON
    write_clean_text(doc_name, full_text, pages)
    # Save rough section splits for reference
    sections = segment_sections(full_text)
    (CLEAN_DIR / f"{doc_name}.sections.json").write_text(
        json.dumps(sections, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    # Build chunks (100 & 400)
    chunks_by_size = make_chunks(doc_name, full_text, pages, sizes=(100, 400))
    all_chunks_100.extend(chunks_by_size[100])
    all_chunks_400.extend(chunks_by_size[400])

# Save combined chunk files
save_chunks({100: all_chunks_100, 400: all_chunks_400})

print("\n[STATS] Documents processed:", [p.name for p in pdfs])
print("[STATS] 100-word chunks:", len(all_chunks_100))
print("[STATS] 400-word chunks:", len(all_chunks_400))
print("[OK] Outputs saved to:", PROC_DIR.resolve())


[INFO] Processing annual-report-2024.pdf ...
[INFO] Processing infosys-ar-25.pdf ...

[STATS] Documents processed: ['annual-report-2024.pdf', 'infosys-ar-25.pdf']
[STATS] 100-word chunks: 4212
[STATS] 400-word chunks: 888
[OK] Outputs saved to: /content/drive/MyDrive/RAG-FT-DATA/processed


<h3>Quick Peek / Sanity Check</h3>

In [10]:
# Show a few cleaned lines and a couple of chunks for verification
txt_files = sorted(CLEAN_DIR.glob("*.txt"))
print("Cleaned text files:", [p.name for p in txt_files][:5])

sample_txt = txt_files[0].read_text(encoding="utf-8").splitlines()[:15]
print("\nSample cleaned lines:\n", "\n".join(sample_txt))

import json
sample_100 = json.loads((PROC_DIR / "chunks_100.json").read_text(encoding="utf-8"))[:10]
pd.DataFrame(sample_100)[["chunk_id","pages_approx","text"]].head(10)


Cleaned text files: ['annual-report-2024.txt', 'infosys-ar-25.txt']

Sample cleaned lines:
 Generative AI and You
Integrated Annual Report 2023-24

Infosys Integrated Annual Report 2023-24
We barely saw it happen. AI walking into our lives. Through
the ads that follow us on social media. The personalized
pick of movies and shows. Our cars. The maps helping us
navigate. Right there in our hands – our super-powerful
phones. And now, it’s happening again. This time with
generative AI. In the form of handy tools – like ChatGPT,
MetaAI and Stable Diffusion – that pique our imagination,
and stoke our curiosity.
Generative AI technology’s path into enterprises too has
been just as accelerated and enthusiastic, supported by
an exponential increase in investments. While almost


Unnamed: 0,chunk_id,pages_approx,text
0,annual-report-2024_100_00000,"[1, 2]",Generative AI and You Integrated Annual Report...
1,annual-report-2024_100_00001,"[2, 2]","pique our imagination, and stoke our curiosity..."
2,annual-report-2024_100_00002,"[2, 2]","believe, some ongoing AI pilots will scale to ..."
3,annual-report-2024_100_00003,"[2, 2]","with potential for more pervasive automation, ..."
4,annual-report-2024_100_00004,"[2, 2]",continuously reviewed and enhanced to cover mo...
5,annual-report-2024_100_00005,"[2, 3]","us all, and how Infosys can be the trusted par..."
6,annual-report-2024_100_00006,"[3, 3]",2023-24 Microsoft Corporation is a technology ...
7,annual-report-2024_100_00007,"[3, 3]",support to manage their business operations ac...
8,annual-report-2024_100_00008,"[3, 3]",efficiency projections. The aim was to transfo...
9,annual-report-2024_100_00009,"[3, 3]",operational insights that the business counts ...


In [11]:
OUT_QA_JSONL = ROOT / "qa_pairs.jsonl"
OUT_QA_CSV   = ROOT / "qa_pairs.csv"

In [12]:
def load_clean_texts(clean_dir: Path) -> Dict[str, str]:
    texts = {}
    for p in clean_dir.glob("*.txt"):
        texts[p.stem] = p.read_text(encoding="utf-8", errors="ignore")
    return texts

def load_sections(clean_dir: Path) -> Dict[str, Dict[str, str]]:
    sections = {}
    for p in clean_dir.glob("*.sections.json"):
        try:
            sections[p.stem.replace(".sections","")] = json.loads(p.read_text(encoding="utf-8"))
        except Exception:
            pass
    return sections

def normalize_spaces(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def guess_company_from_text(doc_name: str, text: str) -> str:
    first = "\n".join(text.splitlines()[:20])
    m = re.search(r"\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+){0,3})\b", first)
    return m.group(1) if m else doc_name

def find_years(text: str):
    return sorted({int(y) for y in re.findall(r"\b(20\d{2})\b", text)})


<h3>Patterns & Extraction</h3>

In [13]:
# Amount/units patterns
CURRENCY = r"(?:₹|\$|USD|EUR|INR|Rs\.?)?"
NUM      = r"(?:\d{1,3}(?:[,\s]\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?)"
UNITS    = r"(?:\s?(?:million|billion|mn|bn|crore|lakh|cr|m|bn))?"
AMOUNT   = CURRENCY + r"\s*" + NUM + r"\s*" + UNITS

METRIC_PATTERNS = {
    "revenue":      r"\b(revenue|total\s+revenue|net\s+sales|sales)\b[:\s\-–]*" + AMOUNT,
    "net_income":   r"\b(net\s+(?:income|profit)|profit\s+after\s+tax|PAT)\b[:\s\-–]*" + AMOUNT,
    "ebitda":       r"\b(EBITDA)\b[:\s\-–]*" + AMOUNT,
    "eps":          r"\b(EPS|earnings\s+per\s+share)\b[:\s\-–]*" + AMOUNT,
    "cash_flow":    r"\b(net\s+cash\s+from\s+operating\s+activities|operating\s+cash\s+flow)\b[:\s\-–]*" + AMOUNT,
    "assets":       r"\b(total\s+assets)\b[:\s\-–]*" + AMOUNT,
    "liabilities":  r"\b(total\s+liabilities)\b[:\s\-–]*" + AMOUNT,
}

def clean_amount(val: str) -> str:
    v = normalize_spaces(val)
    v = v.replace("USD", "USD ").replace("INR", "INR ").replace("Rs.", "Rs ")
    v = re.sub(r"\s+", " ", v)
    return v.strip()

def extract_metric_sentences(text: str, metric: str, pattern: str) -> List[Dict[str, Any]]:
    out = []
    for m in re.finditer(pattern, text, flags=re.IGNORECASE):
        span = m.span()
        start = max(0, span[0] - 160)
        end   = min(len(text), span[1] + 160)
        context = normalize_spaces(text[start:end])
        raw = normalize_spaces(m.group(0))
        amt_m = re.search(AMOUNT, raw, flags=re.IGNORECASE)
        amount = clean_amount(amt_m.group(0)) if amt_m else raw
        near = text[max(0, span[0]-80): min(len(text), span[1]+80)]
        year_m = re.search(r"\b(20\d{2})\b", near)
        year = int(year_m.group(1)) if year_m else None
        out.append({"metric": metric, "amount": amount, "year": year, "raw": raw, "context": context})
    return out

def extract_all_metrics(text: str) -> List[Dict[str, Any]]:
    results = []
    for metric, pat in METRIC_PATTERNS.items():
        results.extend(extract_metric_sentences(text, metric, pat))
    return results


<h3>Broaden metric patterns (percentages, headcount, dividends, etc.)</h3>

In [14]:
# Extend your existing patterns (keep the original METRIC_PATTERNS; just update/merge)
EXTRA_PATTERNS = {
    "operating_margin":  r"\b(operating\s+margin|EBIT\s*margin)\b[:\s\-–]*([0-9]+(?:\.[0-9]+)?\s?%)",
    "net_margin":        r"\b(net\s+margin)\b[:\s\-–]*([0-9]+(?:\.[0-9]+)?\s?%)",
    "headcount":         r"\b(headcount|number\s+of\s+employees|employees)\b[:\s\-–]*" + AMOUNT,
    "dividend":          r"\b(dividend\s+(?:per\s+share)?|dps)\b[:\s\-–]*" + AMOUNT,
    "equity":            r"\b(total\s+equity|shareholders'\s*funds|net\s+worth)\b[:\s\-–]*" + AMOUNT,
    "cash_and_cash_eq":  r"\b(cash\s+and\s+cash\s+equivalents|cash\s+&\s+cash\s+equivalents)\b[:\s\-–]*" + AMOUNT,
    "rd_expense":        r"\b(research\s+and\s+development\s+expenses|R&D\s+expenses?)\b[:\s\-–]*" + AMOUNT,
    "capex":             r"\b(capex|capital\s+expenditure)\b[:\s\-–]*" + AMOUNT,
    "opex":              r"\b(operating\s+expenses|opex)\b[:\s\-–]*" + AMOUNT
}

METRIC_PATTERNS.update(EXTRA_PATTERNS)

# Allow Indian units & symbols more robustly
INDIAN_UNITS = r"(?:crore|lakh|lakhs|cr|cr\.|mn|bn|million|billion|thousand|k)"
AMOUNT = CURRENCY + r"\s*" + NUM + r"(?:\s*" + INDIAN_UNITS + r")?"


<h3>Add textual extractors (CEO, CFO, Auditor, HQ, segments)</h3>

In [15]:
def extract_textual_facts(text: str) -> List[Dict]:
    facts = []
    # CEO/CFO
    for title, pat in [
        ("ceo", r"\b(Chief\s+Executive\s+Officer|CEO)\b[:\-–]?\s*([A-Z][a-zA-Z\.\-']+(?:\s+[A-Z][a-zA-Z\.\-']+){0,3})"),
        ("cfo", r"\b(Chief\s+Financial\s+Officer|CFO)\b[:\-–]?\s*([A-Z][a-zA-Z\.\-']+(?:\s+[A-Z][a-zA-Z\.\-']+){0,3})"),
    ]:
        for m in re.finditer(pat, text):
            person = m.group(2).strip()
            ctx = normalize_spaces(text[max(0, m.start()-120): m.end()+120])
            facts.append({"type": title, "value": person, "context": ctx})

    # Auditor
    for m in re.finditer(r"\b(Statutory\s+Auditors?|Auditor)\b[:\-–]?\s*([A-Z&][A-Za-z&\s\.,'-]{3,80})", text):
        val = m.group(2).strip()
        ctx = normalize_spaces(text[max(0, m.start()-120): m.end()+120])
        facts.append({"type": "auditor", "value": val, "context": ctx})

    # Headquarters
    for m in re.finditer(r"\b(Registered\s+Office|Headquarters?|Corporate\s+Office)\b[:\-–]?\s*([A-Za-z0-9,\.\-\(\) ]{10,120})", text, re.IGNORECASE):
        val = normalize_spaces(m.group(2))
        facts.append({"type": "hq", "value": val, "context": val})

    # Business segments (collect a comma/semicolon list after keywords)
    seg_match = re.search(r"\b(business\s+segments?|reportable\s+segments?)\b[:\-–]?\s*([A-Za-z0-9&/\-\s,;]{10,200})", text, re.IGNORECASE)
    if seg_match:
        segs = [s.strip(" ;,") for s in re.split(r"[;,]", seg_match.group(2)) if len(s.strip()) > 1]
        if segs:
            facts.append({"type": "segments", "value": ", ".join(sorted(set(segs))), "context": seg_match.group(0)})

    return facts


<h3>Build more questions from textual facts</h3>

In [16]:
def textual_facts_to_qas(doc_name: str, company: str, text: str) -> List[Dict]:
    out = []
    facts = extract_textual_facts(text)
    for f in facts:
        t = f["type"]; v = f["value"]; ctx = f["context"]
        if t == "ceo":
            q = f"Who was the CEO of {company}?"
        elif t == "cfo":
            q = f"Who was the CFO of {company}?"
        elif t == "auditor":
            q = f"Who is the statutory auditor of {company}?"
        elif t == "hq":
            q = f"What is the registered office address of {company}?"
        elif t == "segments":
            q = f"What are the reportable business segments of {company}?"
        else:
            continue
        out.append({
            "question": q,
            "answer": v,
            "metric": t,
            "year": None,
            "company": company,
            "source_doc": doc_name,
            "context_snippet": ctx,
            "confidence_heuristic": 0.8
        })
    return out


<h3>Generate more YoY comparisons (for many metrics)</h3>

In [17]:
def build_yoy_pairs_multi(items: List[Dict[str, Any]], metrics: List[str], company: str) -> List[Dict]:
    out = []
    for metric in metrics:
        by_year = {}
        for it in items:
            if it["metric"] == metric and it["company"] == company and it.get("year"):
                by_year[it["year"]] = it["answer"]
        years_sorted = sorted(by_year.keys(), reverse=True)
        if len(years_sorted) >= 2:
            y1, y2 = years_sorted[0], years_sorted[1]
            q = f"Compare {company}'s {metric.replace('_',' ')} in {y2} vs {y1}."
            a = f"{y2}: {by_year[y2]}; {y1}: {by_year[y1]}."
            out.append({
                "question": q, "answer": a,
                "metric": f"{metric}_comparison",
                "year": f"{y2} vs {y1}",
                "company": company, "source_doc": "multiple",
                "context_snippet": "",
                "confidence_heuristic": 0.72
            })
    return out


<h3>Build Q/A Candidates</h3>

In [18]:
# Rebuild candidates with extra metrics + textual facts
texts = load_clean_texts(CLEAN_DIR)
sections = load_sections(CLEAN_DIR)

candidates = []
for doc_name, txt in texts.items():
    company = guess_company_from_text(doc_name, txt)
    # numeric/amount-like facts
    facts = extract_all_metrics(txt)
    for f in facts:
        metric, amount, year, ctx = f["metric"], f["amount"], f["year"], f["context"]
        if metric == "revenue":
            q = f"What was {company}'s revenue in {year}?"
        elif metric == "net_income":
            q = f"What was {company}'s net income in {year}?"
        elif metric == "ebitda":
            q = f"What was {company}'s EBITDA in {year}?"
        elif metric == "eps":
            q = f"What was {company}'s EPS in {year}?"
        elif metric == "cash_flow":
            q = f"What was {company}'s operating cash flow in {year}?"
        elif metric == "assets":
            q = f"What were {company}'s total assets in {year}?"
        elif metric == "liabilities":
            q = f"What were {company}'s total liabilities in {year}?"
        elif metric == "operating_margin":
            q = f"What was {company}'s operating margin in {year}?"
        elif metric == "net_margin":
            q = f"What was {company}'s net margin in {year}?"
        elif metric == "headcount":
            q = f"What was {company}'s total employee headcount in {year}?"
        elif metric == "dividend":
            q = f"What was the dividend for {company} in {year}?"
        elif metric == "equity":
            q = f"What was {company}'s total equity in {year}?"
        elif metric == "cash_and_cash_eq":
            q = f"What were {company}'s cash and cash equivalents in {year}?"
        elif metric == "rd_expense":
            q = f"What was {company}'s R&D expense in {year}?"
        elif metric == "capex":
            q = f"What was {company}'s capital expenditure (CapEx) in {year}?"
        elif metric == "opex":
            q = f"What were {company}'s operating expenses (OpEx) in {year}?"
        else:
            q = f"What was {metric.replace('_',' ')} in {year} for {company}?"

        conf = 0.9 if (year is not None and re.search(NUM, amount)) else 0.65
        candidates.append({
            "question": q,
            "answer": amount,
            "metric": metric,
            "year": year,
            "company": company,
            "source_doc": doc_name,
            "context_snippet": ctx,
            "confidence_heuristic": conf
        })

    # textual facts
    candidates += textual_facts_to_qas(doc_name, company, txt)

print("Candidates so far:", len(candidates))


Candidates so far: 139


<h3>Deduplicate & Add YoY Comparison Q/As</h3>

In [19]:
# Deduplicate
seen = set(); uniq = []
for it in candidates:
    key = (it["question"].lower(), normalize_spaces(it["answer"]).lower())
    if key not in seen:
        seen.add(key); uniq.append(it)

companies = sorted({it["company"] for it in uniq})
metrics_for_yoy = ["revenue","net_income","ebitda","eps","operating_margin","net_margin","cash_flow"]

yoy_more = []
for comp in companies:
    yoy_more += build_yoy_pairs_multi(uniq, metrics_for_yoy, comp)

# Controls: ambiguous + irrelevant
controls = [
    {"question": "What was the revenue?", "answer": "Not in scope", "metric":"control_ambiguous", "year": None, "company":"N/A", "source_doc":"N/A", "context_snippet":"", "confidence_heuristic":0.5},
    {"question": "Tell me about future mergers?", "answer": "Not in scope", "metric":"control_irrelevant", "year": None, "company":"N/A", "source_doc":"N/A", "context_snippet":"", "confidence_heuristic":0.5},
    {"question": "What is the capital of France?", "answer": "Not in scope", "metric":"control_irrelevant", "year": None, "company":"N/A", "source_doc":"N/A", "context_snippet":"", "confidence_heuristic":0.5}
]

qa_dataset = uniq + yoy_more + controls
print("Total after YoY+controls:", len(qa_dataset))

# Balance to ~50
def balanced_sample(items: List[Dict[str, Any]], max_total: int = 50):
    # simple stratify by coarse category
    buckets = {"numeric": [], "textual": [], "comparison": [], "control": []}
    for it in items:
        m = it["metric"]
        if m.endswith("_comparison"):
            buckets["comparison"].append(it)
        elif m.startswith("control_"):
            buckets["control"].append(it)
        elif m in {"ceo","cfo","auditor","hq","segments"}:
            buckets["textual"].append(it)
        else:
            buckets["numeric"].append(it)

    take = []
    take += sorted(buckets["numeric"], key=lambda x: x.get("confidence_heuristic",0), reverse=True)[:22]
    take += sorted(buckets["textual"], reverse=True, key=lambda x: x.get("confidence_heuristic",0))[:10]
    take += sorted(buckets["comparison"], reverse=True, key=lambda x: x.get("confidence_heuristic",0))[:12]
    take += buckets["control"][:6]

    if len(take) > max_total:
        take = take[:max_total]
    return take

qa_balanced = balanced_sample(qa_dataset, max_total=50)
print("Balanced size:", len(qa_balanced))
pd.DataFrame(qa_balanced)[:10]


Total after YoY+controls: 106
Balanced size: 35


Unnamed: 0,question,answer,metric,year,company,source_doc,context_snippet,confidence_heuristic
0,What were Generative AI's total assets in 2023?,1.0,assets,2023.0,Generative AI,annual-report-2024,"oans 2.6 Other financial assets 2.7 10,129 9,0...",0.9
1,What was Generative AI's operating margin in 2...,20.7,operating_margin,2020.0,Generative AI,annual-report-2024,ry Business highlights Performance overview Di...,0.9
2,What was Generative AI's operating margin in 2...,85.0,operating_margin,2020.0,Generative AI,annual-report-2024,mance overview Dividend per share(2) (in ₹) 46...,0.9
3,What was Generative AI's total employee headco...,1882.0,headcount,2024.0,Generative AI,annual-report-2024,hilippines Malaysia Singapore Japan South Kore...,0.9
4,What was Generative AI's total employee headco...,20.0,headcount,2023.0,Generative AI,annual-report-2024,n to India c. A brief on types of customers Bu...,0.9
5,What was Generative AI's total employee headco...,12.6,headcount,2022.0,Generative AI,annual-report-2024,e in fiscal 2024 (In %) Turnover rate in fisca...,0.9
6,What were Integrated Annual Report's total ass...,1.0,assets,2024.0,Integrated Annual Report,infosys-ar-25,"2.6 Other financial assets 2.7 12,569 10,129 I...",0.9
7,What was Integrated Annual Report's total empl...,1869.0,headcount,2024.0,Integrated Annual Report,infosys-ar-25,and outcomes for all stakeholders. Infosys Int...,0.9
8,What was Integrated Annual Report's total empl...,20.0,headcount,2024.0,Integrated Annual Report,infosys-ar-25,"ices, healthcare, high technology, insurance, ...",0.9
9,What was Integrated Annual Report's total empl...,14.5,headcount,2023.0,Integrated Annual Report,infosys-ar-25,e in fiscal 2025 (In %) Turnover rate in fisca...,0.9


<h3>Balance to ~50 Q/As</h3>

In [20]:
# If you want to cap to ~50 for FT while keeping variety
def balanced_sample(items: List[Dict[str, Any]], per_metric: int = 8, max_total: int = 50):
    by_metric = {}
    for it in items:
        by_metric.setdefault(it["metric"], []).append(it)
    sampled = []
    # sort by confidence within each metric
    for m, arr in by_metric.items():
        arr_sorted = sorted(arr, key=lambda x: x.get("confidence_heuristic", 0), reverse=True)
        sampled.extend(arr_sorted[:per_metric])
    # if more than max_total, keep highest-confidence globally
    if len(sampled) > max_total:
        sampled = sorted(sampled, key=lambda x: x.get("confidence_heuristic", 0), reverse=True)[:max_total]
    return sampled

qa_balanced = balanced_sample(qa_dataset, per_metric=8, max_total=50)
print("Total candidates:", len(qa_dataset))
print("Balanced sample:", len(qa_balanced))
df_preview = pd.DataFrame(qa_balanced)[["question","answer","metric","year","company","source_doc","confidence_heuristic"]]
# df_preview.head(15)
df_preview


Total candidates: 106
Balanced sample: 50


Unnamed: 0,question,answer,metric,year,company,source_doc,confidence_heuristic
0,What were Generative AI's total assets in 2023?,1,assets,2023.0,Generative AI,annual-report-2024,0.9
1,What were Integrated Annual Report's total ass...,1,assets,2024.0,Integrated Annual Report,infosys-ar-25,0.9
2,What was Generative AI's operating margin in 2...,20.7,operating_margin,2020.0,Generative AI,annual-report-2024,0.9
3,What was Generative AI's operating margin in 2...,85,operating_margin,2020.0,Generative AI,annual-report-2024,0.9
4,What was Generative AI's total employee headco...,1882,headcount,2024.0,Generative AI,annual-report-2024,0.9
5,What was Generative AI's total employee headco...,20,headcount,2023.0,Generative AI,annual-report-2024,0.9
6,What was Generative AI's total employee headco...,12.6,headcount,2022.0,Generative AI,annual-report-2024,0.9
7,What was Integrated Annual Report's total empl...,1869,headcount,2024.0,Integrated Annual Report,infosys-ar-25,0.9
8,What was Integrated Annual Report's total empl...,20,headcount,2024.0,Integrated Annual Report,infosys-ar-25,0.9
9,What was Integrated Annual Report's total empl...,14.5,headcount,2023.0,Integrated Annual Report,infosys-ar-25,0.9


<h3>Save Q/As (JSONL + CSV)</h3>

In [21]:
to_save = qa_balanced if len(qa_balanced) >= 40 else qa_dataset  # fall back if not enough
OUT_QA_JSONL.parent.mkdir(parents=True, exist_ok=True)

with open(OUT_QA_JSONL, "w", encoding="utf-8") as f:
    for row in to_save:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

pd.DataFrame(to_save).to_csv(OUT_QA_CSV, index=False)

print("Saved:")
print(" -", OUT_QA_JSONL.resolve())
print(" -", OUT_QA_CSV.resolve())
print("Total Q/A pairs saved:", len(to_save))


Saved:
 - /content/drive/MyDrive/RAG-FT-DATA/qa_pairs.jsonl
 - /content/drive/MyDrive/RAG-FT-DATA/qa_pairs.csv
Total Q/A pairs saved: 50


In [22]:
df = pd.read_csv(OUT_QA_CSV)
display_cols = ["question","answer","metric","year","company","source_doc","confidence_heuristic"]
df.sort_values(by="confidence_heuristic", ascending=False)[display_cols].head(10)


Unnamed: 0,question,answer,metric,year,company,source_doc,confidence_heuristic
0,What were Generative AI's total assets in 2023?,1.0,assets,2023.0,Generative AI,annual-report-2024,0.9
1,What were Integrated Annual Report's total ass...,1.0,assets,2024.0,Integrated Annual Report,infosys-ar-25,0.9
2,What was Generative AI's operating margin in 2...,20.7,operating_margin,2020.0,Generative AI,annual-report-2024,0.9
3,What was Generative AI's operating margin in 2...,85.0,operating_margin,2020.0,Generative AI,annual-report-2024,0.9
4,What was Generative AI's total employee headco...,1882.0,headcount,2024.0,Generative AI,annual-report-2024,0.9
5,What was Generative AI's total employee headco...,20.0,headcount,2023.0,Generative AI,annual-report-2024,0.9
6,What was Generative AI's total employee headco...,12.6,headcount,2022.0,Generative AI,annual-report-2024,0.9
7,What was Integrated Annual Report's total empl...,1869.0,headcount,2024.0,Integrated Annual Report,infosys-ar-25,0.9
8,What was Integrated Annual Report's total empl...,20.0,headcount,2024.0,Integrated Annual Report,infosys-ar-25,0.9
9,What was Integrated Annual Report's total empl...,14.5,headcount,2023.0,Integrated Annual Report,infosys-ar-25,0.9


<h1>Step 2 (RAG)</h1>

In [28]:
# If needed:
# !pip install sentence-transformers faiss-cpu rank-bm25 scikit-learn pandas numpy tqdm

from pathlib import Path
import json
import re
import pickle
import time
from typing import List, Dict, Tuple
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi

import faiss  # CPU index

# Project paths (relative to repo root)
EMB_DIR    = ROOT / "embeddings"
FAISS_DIR  = EMB_DIR / "faiss_index"
BM25_DIR   = EMB_DIR / "bm25_index"

for p in [EMB_DIR, FAISS_DIR, BM25_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Choose which chunk set to index (100 or 400-word chunks)
CHUNK_SIZE_TO_USE = 400
CHUNKS_PATH = PROC_DIR / f"chunks_{CHUNK_SIZE_TO_USE}.jsonl"

assert CHUNKS_PATH.exists(), f"Missing {CHUNKS_PATH}. Run Step 1 to generate chunk files."
print("Using chunks file:", CHUNKS_PATH)


Using chunks file: /content/drive/MyDrive/RAG-FT-DATA/processed/chunks_400.jsonl


In [27]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


<h3>Load Chunks & Simple Preprocessing</h3>

In [29]:
def load_chunks(jsonl_path: Path) -> List[Dict]:
    chunks = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            chunks.append(json.loads(line))
    return chunks

chunks = load_chunks(CHUNKS_PATH)
len(chunks), chunks[0].keys(), chunks[0]["chunk_id"][:60]


(888,
 dict_keys(['chunk_id', 'doc_name', 'chunk_size_words', 'start_word_index', 'end_word_index', 'pages_approx', 'text']),
 'annual-report-2024_400_00000')

<h3>Text Normalization & Tokenizer (for BM25)</h3>

In [30]:
# Lightweight stopword list (no external downloads)
STOPWORDS = set("""
a an the and or of to in for on at from by with about as is are was were be been being this that these those
it its itself they them their we us our he she his her you your i me my mine ours yours theirs
""".strip().split())

TOKEN_RE = re.compile(r"[A-Za-z0-9]+")  # alphanum tokens

def normalize_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"\s+", " ", s).strip()
    return s

def tokenize_for_bow(s: str) -> List[str]:
    s = normalize_text(s)
    toks = TOKEN_RE.findall(s)
    return [t for t in toks if t not in STOPWORDS and len(t) > 1]


<h3>Build BM25 (Sparse Index)</h3>

In [31]:
# Prepare corpus for BM25
corpus_tokens = [tokenize_for_bow(c["text"]) for c in chunks]
bm25 = BM25Okapi(corpus_tokens)

# Persist BM25 (tokens) + chunk metadata for reuse
with open(BM25_DIR / "bm25.pkl", "wb") as f:
    pickle.dump({"bm25": bm25}, f)

meta_df = pd.DataFrame([{
    "chunk_id": c["chunk_id"],
    "doc_name": c["doc_name"],
    "pages_approx": c["pages_approx"]
} for c in chunks])
meta_df.to_csv(BM25_DIR / "bm25_metadata.csv", index=False)

print("BM25 built. Corpus size:", len(corpus_tokens))


BM25 built. Corpus size: 888


<h3>Build Dense Embeddings + FAISS (Cosine Similarity)</h3>

In [32]:
# Choose a small open-source embedder (both are good):
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"   # 384-d
# EMBED_MODEL_NAME = "intfloat/e5-small-v2"                    # 384-d (requires query format: "query: ...")

embedder = SentenceTransformer(EMBED_MODEL_NAME)

# Encode all chunks (batched)
texts = [c["text"] for c in chunks]
emb = embedder.encode(texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
emb.shape


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

(888, 384)

In [33]:
# Build FAISS index with Inner Product (cosine since we normalized)
dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb)  # add vectors

# Save artifacts: index + id mapping
faiss.write_index(index, str(FAISS_DIR / "faiss.index"))

id_map = {
    "chunk_ids": [c["chunk_id"] for c in chunks],
    "doc_names": [c["doc_name"] for c in chunks],
    "pages_approx": [c["pages_approx"] for c in chunks],
}
with open(FAISS_DIR / "id_map.json", "w", encoding="utf-8") as f:
    json.dump(id_map, f, ensure_ascii=False, indent=2)

np.save(FAISS_DIR / "embeddings.npy", emb)
print("FAISS index saved. Vectors:", emb.shape[0], "dim:", dim)


FAISS index saved. Vectors: 888 dim: 384


<h3>Retrieval Helpers (Dense, Sparse, Fusion)</h3>

In [34]:
# Load/reuse artifacts (if needed later)
def load_faiss_and_map():
    idx = faiss.read_index(str(FAISS_DIR / "faiss.index"))
    with open(FAISS_DIR / "id_map.json", "r", encoding="utf-8") as f:
        id_map = json.load(f)
    return idx, id_map

def dense_search(query: str, top_k: int = 10) -> List[Tuple[int, float]]:
    """Returns list of (row_index_in_chunks, score) for top_k."""
    q = query
    # If using E5, format as: q = f"query: {query}"
    q_emb = embedder.encode([q], convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(q_emb, top_k)  # inner product scores
    return list(zip(I[0].tolist(), D[0].tolist()))

def sparse_search(query: str, top_k: int = 10) -> List[Tuple[int, float]]:
    """Returns list of (row_index_in_chunks, score) for top_k (BM25)."""
    toks = tokenize_for_bow(query)
    scores = bm25.get_scores(toks)
    # get top_k indices
    idxs = np.argsort(scores)[::-1][:top_k]
    return [(int(i), float(scores[i])) for i in idxs]

def reciprocal_rank_fusion(dense_res, sparse_res, k: int = 60, top_k: int = 20):
    """
    Combine rankings via Reciprocal Rank Fusion (robust to scale differences).
    dense_res/sparse_res: list[(idx, score)] sorted by descending score
    Return: list[(idx, fused_score)]
    """
    ranks = defaultdict(lambda: {"dense": None, "sparse": None})
    for rank, (i, _) in enumerate(dense_res, start=1):
        ranks[i]["dense"] = rank
    for rank, (i, _) in enumerate(sparse_res, start=1):
        ranks[i]["sparse"] = rank

    fused = []
    for i, rs in ranks.items():
        r_dense = rs["dense"] if rs["dense"] is not None else 10**9
        r_sparse = rs["sparse"] if rs["sparse"] is not None else 10**9
        score = (1.0 / (k + r_dense)) + (1.0 / (k + r_sparse))
        fused.append((i, score))

    fused.sort(key=lambda x: x[1], reverse=True)
    return fused[:top_k]

def pretty_hit(row_idx: int, score: float) -> Dict:
    c = chunks[row_idx]
    return {
        "chunk_id": c["chunk_id"],
        "doc_name": c["doc_name"],
        "pages_approx": c["pages_approx"],
        "score": round(score, 4),
        "preview": (c["text"][:220] + " ...") if len(c["text"]) > 220 else c["text"]
    }


<h3>Cross-Encoder Re-Ranking (Advanced RAG)</h3>

In [35]:
# Cross-encoder for re-ranking (query, passage) pairs
# This is small, fast, and effective for top-20 candidates:
CROSS_ENCODER_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
cross_encoder = CrossEncoder(CROSS_ENCODER_NAME)

def rerank_with_cross_encoder(query: str, fused_hits: List[Tuple[int, float]], top_k: int = 5):
    """
    Re-rank fused hits using a cross-encoder. Returns top_k in new order with CE scores.
    """
    pairs = [(query, chunks[i]["text"]) for i, _ in fused_hits]
    ce_scores = cross_encoder.predict(pairs)  # higher is better
    reranked = []
    for (i, _), s in zip(fused_hits, ce_scores):
        reranked.append((i, float(s)))
    reranked.sort(key=lambda x: x[1], reverse=True)
    return reranked[:top_k]


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

<h3>End-to-End: Hybrid Retrieval + Re-Ranking (Demo)</h3>

In [36]:
def hybrid_retrieve(query: str, k_dense=15, k_sparse=15, k_fused=20, k_final=5):
    t0 = time.time()
    d_hits = dense_search(query, top_k=k_dense)
    s_hits = sparse_search(query, top_k=k_sparse)
    fused  = reciprocal_rank_fusion(d_hits, s_hits, k=60, top_k=k_fused)
    rerank = rerank_with_cross_encoder(query, fused, top_k=k_final)
    t1 = time.time()
    return {
        "query": query,
        "dense_top": [pretty_hit(i, sc) for i, sc in d_hits[:5]],
        "sparse_top": [pretty_hit(i, sc) for i, sc in s_hits[:5]],
        "fused_top": [pretty_hit(i, sc) for i, sc in fused[:5]],
        "reranked_top": [pretty_hit(i, sc) for i, sc in rerank],
        "latency_sec": round(t1 - t0, 3)
    }

# Try a few queries (adjust to your reports)
queries = [
    "What was the company's revenue in 2023?",
    "What is the net profit for the year 2024?",
    "What were the total assets last year?",
]

for q in queries:
    out = hybrid_retrieve(q, k_dense=15, k_sparse=15, k_fused=20, k_final=5)
    print("\n=== QUERY:", q)
    print("Latency:", out["latency_sec"], "s")
    print("Top (re-ranked):")
    for h in out["reranked_top"]:
        print(f"  • {h['doc_name']} {h['pages_approx']} | score={h['score']}\n    {h['preview']}\n")



=== QUERY: What was the company's revenue in 2023?
Latency: 7.785 s
Top (re-ranked):
  • annual-report-2024 [239, 240] | score=5.7769
    receivables and unbilled revenues are presented net of impairment in the Balance Sheet. During the year ended March 31, 2024 and March 31, 2023, the Company recognized revenue of ₹4,189 crore and ₹4,391 crore arising fro ...

  • annual-report-2024 [71, 71] | score=5.1145
    currency growth by comparing current-period revenues in respective local currencies converted to INR using prior-period exchange rates and comparing the same to our prior-period reported revenues. Our revenues in reporte ...

  • infosys-ar-25 [242, 242] | score=4.3411
    & platforms The Company derives revenues from the sale of products and platforms including Infosys Applied AI which applies next- generation AI and machine learning. The percentage of revenue from fixed-price contracts f ...

  • infosys-ar-25 [322, 322] | score=3.1814
    The Group believes that this disaggr

<h3>Load a Small, Open-Source Generator (FLAN-T5)</h3>

In [37]:
# If needed:
# !pip install transformers accelerate sentencepiece

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# You can switch to "google/flan-t5-small" if you have very limited compute
GEN_MODEL_NAME = "google/flan-t5-base"

device = "cuda" if torch.cuda.is_available() else "cpu"
gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME).to(device)

print("Loaded generator:", GEN_MODEL_NAME, "| device:", device)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loaded generator: google/flan-t5-base | device: cpu


<h3>Utilities: Trim Context to Fit the Model</h3>

In [38]:
def trim_context_to_tokens(texts, tokenizer, max_tokens: int = 768):
    """
    Concatenate passages and trim to a max token length for the generator input.
    Returns a single string containing as many passages as fit.
    """
    sep = "\n\n"
    joined = ""
    token_count = 0

    for t in texts:
        candidate = (joined + sep + t) if joined else t
        ids = tokenizer(candidate, return_tensors="pt", add_special_tokens=False).input_ids
        if ids.shape[-1] <= max_tokens:
            joined = candidate
            token_count = ids.shape[-1]
        else:
            break
    return joined, token_count


<h3>Compose Prompt + Generate Answer</h3>

In [39]:
import time
import torch
from typing import List, Dict, Any

# ---------- Generation helpers (token-budgeted) ----------

MAX_INPUT_TOKENS_HARD = 512  # T5-like models hard cap

def _build_prompt(query: str, passages: List[str]) -> str:
    context_block = "\n\n".join(f"- {p}" for p in passages)
    return (
        "You are a financial assistant. Answer the question using ONLY the given context. "
        "If the answer is not present, say 'Not in scope'. "
        "Be concise and report numbers exactly as in the context.\n\n"
        f"Context:\n{context_block}\n\n"
        f"Question: {query}\nAnswer:"
    )

def _fit_prompt_to_budget(query: str, passages: List[str], tokenizer, max_tokens: int) -> str:
    """Greedily add passages until full prompt would exceed max_tokens."""
    kept = []
    for p in passages:
        candidate = _build_prompt(query, kept + [p])
        input_ids = tokenizer(candidate, return_tensors="pt", add_special_tokens=True, truncation=False).input_ids
        if input_ids.shape[-1] <= max_tokens:
            kept.append(p)
        else:
            break
    return _build_prompt(query, kept)

@torch.no_grad()
def generate_answer(query: str,
                    passages: List[str],
                    max_input_tokens: int = MAX_INPUT_TOKENS_HARD,
                    max_new_tokens: int = 64,
                    temperature: float = 0.0,
                    top_p: float = 1.0,
                    num_beams: int = 1) -> str:
    """
    Builds a prompt that fits within `max_input_tokens` (<=512 for T5), then generates.
    Only passes sampling kwargs when sampling is enabled.
    """
    # Cap to model hard limit to avoid warnings/errors
    max_input_tokens = min(max_input_tokens, MAX_INPUT_TOKENS_HARD)

    prompt = _fit_prompt_to_budget(query, passages, gen_tokenizer, max_tokens=max_input_tokens)
    enc = gen_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens).to(device)

    do_sample = (temperature > 0.0 and num_beams == 1)
    gen_kwargs = dict(max_new_tokens=max_new_tokens, num_beams=num_beams, do_sample=do_sample)
    if do_sample:
        gen_kwargs.update(dict(temperature=temperature, top_p=top_p))

    out = gen_model.generate(**enc, **gen_kwargs)
    ans = gen_tokenizer.decode(out[0], skip_special_tokens=True).strip()
    return ans

<h3>Guardrails (Input + Output)</h3>
<p>We’ll implement two simple guardrails:

Input-side domain filter → if the question is clearly not finance/report related, we short-circuit with “Out of scope”.

Output-side factuality check → if the generated answer contains a number not present in the retrieved context, flag as a possible hallucination (and optionally replace with “Not in scope”).</p>

In [40]:
import re
from typing import List, Dict, Any, Tuple

# ---------- 1) Finance intent detection ----------

_FINANCE_KEYWORDS = [
    r"revenue", r"sales", r"net income", r"profit", r"ebitda", r"eps",
    r"cash\s*flow", r"operating\s*cash\s*flow", r"free\s*cash\s*flow",
    r"assets?", r"liabilit(?:y|ies)", r"equity", r"debt", r"margin", r"ebit",
    r"pbt", r"pat", r"dividend", r"capex", r"opex", r"receivables", r"payables",
    r"working\s*capital", r"guidance", r"segment", r"report", r"notes?",
    r"annual", r"balance\s*sheet", r"income\s*statement", r"cash\s*flow\s*statement",
    r"md&?a", r"management\s+discussion", r"ceo", r"cfo", r"auditor",
    r"headcount", r"employee[s]?", r"yoy", r"qoq", r"fx", r"forex"
]
FINANCE_RE = re.compile(r"\b(?:" + "|".join(_FINANCE_KEYWORDS) + r")\b", flags=re.IGNORECASE)

FY_RE = re.compile(r"\bfy\s*'?(\d{2,4})\b", flags=re.IGNORECASE)  # FY24 / FY2024

def is_finance_query(query: str, min_hits: int = 1) -> bool:
    q = query.lower()
    hits = len(FINANCE_RE.findall(q)) + len(FY_RE.findall(q))
    return hits >= min_hits

# ---------- 2) Number extraction (richer) ----------

# Currency symbols & units
_CCY = r"(?:₹|rs\.?|inr|usd|\$|eur|€|gbp|£)"
_UNITS = r"(?:percent|%|crore|cr\.?|cr|lakh|lakhs|million|mn|billion|bn|thousand|k)"
# Numbers incl. negatives, (accounting), decimals, grouped
_NUM_CORE = r"(?:\(?-?\d{1,3}(?:,\d{3})+|\(?-?\d+(?:\.\d+)?\)?)"

# Capture optional currency before/after & units after
NUM_WITH_META_RE = re.compile(
    rf"(?:{_CCY}\s*)?{_NUM_CORE}(?:\s*{_UNITS})?(?:\s*{_CCY})?",
    flags=re.IGNORECASE
)

# Simple numeric core (for strict compare fallback)
NUM_STRICT_RE = re.compile(r"\b-?(?:\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\b")

# Years to ignore in hallucination checks unless the query is about "year"
YEAR_RE = re.compile(r"\b(19|20)\d{2}\b")

def _strip_commas_parens(x: str) -> str:
    x = x.replace(",", "")
    if x.startswith("(") and x.endswith(")"):  # accounting negatives
        x = "-" + x[1:-1]
    return x

def extract_numbers_with_meta(s: str) -> List[Tuple[str, str]]:
    """
    Returns list of (value_str, unit_str) pairs.
    value_str is normalized (commas removed, accounting () → -).
    unit_str is one of %, crore, million, bn, etc. (lowercased); '' if none.
    """
    out = []
    for m in NUM_WITH_META_RE.finditer(s):
        span = m.group(0)
        # pull out unit (last occurrence of unit word/symbol in span)
        unit = ""
        um = re.search(_UNITS, span, flags=re.IGNORECASE)
        if um:
            unit = um.group(0).lower().strip(".")
            if unit == "percent":
                unit = "%"
        # pull out numeric core
        nm = NUM_STRICT_RE.search(span)
        if not nm:
            continue
        val = _strip_commas_parens(nm.group(0))
        out.append((val, unit))
    return out

def extract_numbers_strict(s: str) -> List[str]:
    return [ _strip_commas_parens(x) for x in NUM_STRICT_RE.findall(s) ]

def _looks_like_year(x: str) -> bool:
    return bool(YEAR_RE.fullmatch(x))

# ---------- 3) Factuality / consistency check ----------

def _float_or_none(x: str) -> float:
    try:
        return float(x)
    except Exception:
        return None

def _units_equivalent(a_unit: str, b_unit: str) -> bool:
    # Treat "percent" and "%" as same; treat mn/million; bn/billion; cr/crore; k/thousand
    canon = {
        "%": {"%", "percent"},
        "million": {"mn", "million"},
        "billion": {"bn", "billion"},
        "crore": {"cr", "cr.", "crore"},
        "thousand": {"k", "thousand"}
    }
    def norm(u: str) -> str:
        u = u.lower()
        for k, vs in canon.items():
            if u in vs:
                return k
        return u
    return norm(a_unit) == norm(b_unit) or (not a_unit and not b_unit)

def _roughly_equal(a: str, b: str, tol: float = 0.005) -> bool:
    """
    Numbers match if identical as strings OR float-close within ±0.5% (default).
    """
    if a == b:
        return True
    fa, fb = _float_or_none(a), _float_or_none(b)
    if fa is None or fb is None:
        return False
    if fb == 0:
        return abs(fa) < 1e-9
    return abs(fa - fb) / max(1e-9, abs(fb)) <= tol

def output_factuality_check(answer: str, contexts: List[str], query: str = "") -> Dict[str, Any]:
    """
    Two-tier check:
      1) strict: numeric strings in answer must appear verbatim in contexts
      2) lenient: pairwise compare (number, unit) with rounding tolerance and unit equivalence
    Ignores year-like tokens unless the query mentions 'year' explicitly.
    """
    # Extract answer numbers (strict) & with metadata
    ans_nums_strict = [x for x in extract_numbers_strict(answer) if not _looks_like_year(x) or "year" in query.lower()]
    ans_pairs = extract_numbers_with_meta(answer)

    # Build context pools
    ctx_all_text = " ".join(contexts)
    ctx_nums_strict = set(extract_numbers_strict(ctx_all_text))
    ctx_pairs = extract_numbers_with_meta(ctx_all_text)

    # 1) Strict pass
    strict_missing = sorted({x for x in ans_nums_strict if x not in ctx_nums_strict})

    # 2) Lenient pass: try to justify missing numbers by unit/tolerance pairing
    unjustified = []
    for (aval, aunit) in ans_pairs:
        if _looks_like_year(aval) and "year" not in query.lower():
            continue
        # If strict already contains aval, it's fine
        if aval in ctx_nums_strict:
            continue
        # Look for a context pair with same/compatible units and close value
        found_close = False
        for (cval, cunit) in ctx_pairs:
            if not _units_equivalent(aunit, cunit):
                continue
            if _roughly_equal(aval, cval, tol=0.005):  # 0.5% tolerance
                found_close = True
                break
        if not found_close:
            unjustified.append((aval, aunit))

    # Suspicious if any strict missing that also could not be justified leniently
    suspicious_values = [v for v in strict_missing if all(v != uv for (uv, _) in unjustified)]

    return {
        "answer_numbers_strict": sorted(ans_nums_strict),
        "context_numbers_strict": sorted(ctx_nums_strict),
        "unjustified_pairs": unjustified,  # list of (value, unit) still not supported
        "suspicious_numbers": suspicious_values,
        "is_potential_hallucination": len(suspicious_values) > 0 or len(unjustified) > 0
    }


<h3>End-to-End rag_answer (Hybrid Retrieval → Re-rank → Generate → Guardrails)</h3>

In [41]:
import time

def rag_answer(query: str,
               k_dense: int = 15,
               k_sparse: int = 15,
               k_fused: int = 20,
               k_final: int = 5,
               max_input_tokens: int = MAX_INPUT_TOKENS_HARD,
               generator_max_new_tokens: int = 64) -> Dict[str, Any]:
    t0 = time.time()

    # 1) INPUT GUARDRAIL (domain filter)
    if not is_finance_query(query):
        return {
            "query": query,
            "method": "RAG",
            "answer": "Out of scope (non-financial query).",
            "confidence": 0.3,
            "retrieved_contexts": [],
            "latency_sec": round(time.time() - t0, 3),
            "guardrail_triggered": "input_out_of_scope"
        }

    # 2) RETRIEVAL + FUSION + RERANK
    d_hits = dense_search(query, top_k=k_dense)           # [(idx, score)]
    s_hits = sparse_search(query, top_k=k_sparse)         # [(idx, score)]
    fused  = reciprocal_rank_fusion(d_hits, s_hits, k=60, top_k=k_fused)  # [(idx, rrf)]
    rerank = rerank_with_cross_encoder(query, fused, top_k=k_final)       # [(idx, ce_score)]

    top_idxs = [i for i, _ in rerank]
    contexts = [chunks[i]["text"] for i in top_idxs]  # FULL texts for factuality check

    # 3) GENERATION (token-budgeted)
    gen_t0 = time.time()
    answer = generate_answer(
        query,
        contexts,
        max_input_tokens=max_input_tokens,
        max_new_tokens=generator_max_new_tokens,
        temperature=0.0,   # deterministic by default
        top_p=1.0,
        num_beams=1
    )
    gen_t1 = time.time()

    # 4) OUTPUT GUARDRAIL (factuality/number-consistency vs contexts)
    fact_check = output_factuality_check(answer, contexts, query=query)  # should use full contexts internally
    guardrail_flag = None
    final_answer = answer

    # Confidence from CE scores (min-max → [0.5, 1.0])
    ce_scores = [sc for _, sc in rerank]
    if ce_scores:
        mn, mx = min(ce_scores), max(ce_scores)
        conf = 0.5 if mx == mn else (ce_scores[0] - mn) / (mx - mn)
        conf = float(0.5 + 0.5 * conf)
    else:
        conf = 0.4

    if fact_check.get("is_potential_hallucination", False):
        guardrail_flag = "output_potential_hallucination"
        final_answer = "Not in scope (insufficient supporting context)."
        conf = 0.35  # downgrade on intervention

    # 5) Pretty payload of contexts (previews shown to user)
    pretty_contexts = []
    for (i, sc) in rerank:
        c = chunks[i]
        txt = c["text"]
        preview = txt if len(txt) <= 280 else (txt[:280] + " ...")
        pretty_contexts.append({
            "chunk_id": c["chunk_id"],
            "doc_name": c["doc_name"],
            "pages_approx": c["pages_approx"],
            "ce_score": round(sc, 4),
            "preview": preview
        })

    t1 = time.time()
    return {
        "query": query,
        "method": "RAG",
        "answer": final_answer,
        "raw_answer": answer,
        "confidence": round(conf, 3),
        "retrieved_contexts": pretty_contexts,
        "latency_sec": round(t1 - t0, 3),
        "gen_time_sec": round(gen_t1 - gen_t0, 3),
        "guardrail_triggered": guardrail_flag,
        "fact_check": fact_check
    }

<h3>Try a Few End-to-End Queries</h3>

In [42]:
tests = [
    "What was the company's revenue in 2023?",
    "What was the net profit in 2024?",
    "Capital of France?"
]
for q in tests:
    out = rag_answer(q, k_dense=15, k_sparse=15, k_fused=20, k_final=5, max_input_tokens=512)
    print("\n=== QUERY:", q)
    print("Answer:", out["answer"])
    print("Confidence:", out["confidence"], "| Total latency:", out["latency_sec"], "s")
    print("Top contexts:")
    for ctx in out["retrieved_contexts"][:2]:
        print(f"  - {ctx['doc_name']} {ctx['pages_approx']} | ce={ctx['ce_score']}")
        print("    ", ctx["preview"][:160], "...")


Token indices sequence length is longer than the specified maximum sequence length for this model (608 > 512). Running this sequence through the model will result in indexing errors



=== QUERY: What was the company's revenue in 2023?
Answer: Not in scope
Confidence: 1.0 | Total latency: 7.823 s
Top contexts:
  - annual-report-2024 [239, 240] | ce=5.7769
     receivables and unbilled revenues are presented net of impairment in the Balance Sheet. During the year ended March 31, 2024 and March 31, 2023, the Company rec ...
  - annual-report-2024 [71, 71] | ce=5.1145
     currency growth by comparing current-period revenues in respective local currencies converted to INR using prior-period exchange rates and comparing the same to ...

=== QUERY: What was the net profit in 2024?
Answer: Not in scope
Confidence: 1.0 | Total latency: 6.596 s
Top contexts:
  - annual-report-2024 [71, 71] | ce=2.2456
     currency growth by comparing current-period revenues in respective local currencies converted to INR using prior-period exchange rates and comparing the same to ...
  - infosys-ar-25 [261, 263] | ce=1.2009
     with the CSR Amendment Rules. Infosys Integrated Annual Repor

<h3>The generator is defaulting to “Not in scope” even when the contexts are relevant.

Confidence stays high because it’s computed from CE scores (good retrieval) while the answer is a fallback.

Below is a surgical set of improvements.</h3>

In [43]:
# =========================
# RAG: Retrieval → Rerank → Generate → Guardrails (Improved)
# =========================
import re, time, torch
from typing import List, Dict, Any

# ---------- Retrieval query boosting (year/FY & finance hints) ----------
def _expand_retrieval_query(q: str) -> str:
    ql = q.lower()
    boost = " revenue profit net income ebitda eps crore billion inr ₹ usd $ fy"
    years = re.findall(r"\b(20\d{2})\b", ql)
    fy_tokens = []
    for y in years:
        yy = int(y) % 100
        fy_tokens.extend([f"FY{y}", f"FY {y}", f"FY{yy}", f"FY {yy}"])
    return q + " " + boost + " " + " ".join(fy_tokens)

def dense_search(query: str, top_k: int = 10):
    q = _expand_retrieval_query(query)
    q_emb = embedder.encode([q], convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(q_emb, top_k)
    return list(zip(I[0].tolist(), D[0].tolist()))  # [(idx, score)]

def sparse_search(query: str, top_k: int = 10):
    q = _expand_retrieval_query(query)
    toks = tokenize_for_bow(q)
    scores = bm25.get_scores(toks)
    idxs = np.argsort(scores)[::-1][:top_k]
    return [(int(i), float(scores[i])) for i in idxs]

# ---------- Prompt builder with strict input token budget ----------
MAX_INPUT_TOKENS_HARD = 512  # T5-like cap

def _build_prompt(query: str, passages: List[str]) -> str:
    context_block = "\n\n".join(f"- {p}" for p in passages)
    return (
        "You are a financial assistant. Use ONLY the context to answer. "
        "If the answer is truly unavailable, say 'Not in scope'. "
        "Do NOT say 'Not in scope' if any relevant figure appears in the context. "
        "Report numbers exactly as written (keep currency and units).\n\n"
        f"Context:\n{context_block}\n\n"
        f"Question: {query}\nAnswer:"
    )

def _fit_prompt_to_budget(query: str, passages: List[str], tokenizer, max_tokens: int) -> str:
    kept = []
    for p in passages:
        candidate = _build_prompt(query, kept + [p])
        ids = tokenizer(candidate, return_tensors="pt", add_special_tokens=True, truncation=False).input_ids
        if ids.shape[-1] <= max_tokens:
            kept.append(p)
        else:
            break
    return _build_prompt(query, kept)

@torch.no_grad()
def generate_answer(query: str,
                    passages: List[str],
                    max_input_tokens: int = MAX_INPUT_TOKENS_HARD,
                    max_new_tokens: int = 64,
                    temperature: float = 0.0,
                    top_p: float = 1.0,
                    num_beams: int = 1) -> str:
    max_input_tokens = min(max_input_tokens, MAX_INPUT_TOKENS_HARD)
    prompt = _fit_prompt_to_budget(query, passages, gen_tokenizer, max_tokens=max_input_tokens)
    enc = gen_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens).to(device)

    do_sample = (temperature > 0.0 and num_beams == 1)
    gen_kwargs = dict(max_new_tokens=max_new_tokens, num_beams=num_beams, do_sample=do_sample)
    if do_sample:
        gen_kwargs.update(dict(temperature=temperature, top_p=top_p))

    out = gen_model.generate(**enc, **gen_kwargs)
    return gen_tokenizer.decode(out[0], skip_special_tokens=True).strip()

# ---------- Minimal extractor fallback for common metrics ----------
_FINANCE_KEY_TO_REGEX = {
    "revenue":    r"\b(revenue|income\s+from\s+operations|total\s+income)\b",
    "net profit": r"\b(net\s+profit|profit\s+after\s+tax|pat)\b",
    "net income": r"\b(net\s+income|profit\s+after\s+tax|pat)\b",
    "ebitda":     r"\b(ebitda)\b",
    "eps":        r"\b(eps|earnings\s+per\s+share)\b",
}

def _extract_candidate_from_contexts(query: str, contexts: List[str]) -> str | None:
    ql = query.lower()
    metric = None
    for k in _FINANCE_KEY_TO_REGEX:
        if k in ql:
            metric = k
            break
    if metric is None:
        return None

    years = re.findall(r"\b(20\d{2})\b", ql)
    year_pat = None
    if years:
        y = years[0]
        yy = int(y) % 100
        year_pat = re.compile(rf"(FY\s*{y}|FY\s*{yy}|{y})", re.IGNORECASE)

    met_pat = re.compile(_FINANCE_KEY_TO_REGEX[metric], re.IGNORECASE)

    for ctx in contexts:
        # scan sentence-by-sentence near metric keywords
        for sent in re.split(r"(?<=[\.\:\;])\s+", ctx):
            if not met_pat.search(sent):
                continue
            if year_pat and not year_pat.search(sent):
                # (optional) allow adjacency check across sentences here if needed
                pass
            # choose a reasonable number/unit pair from this sentence
            pairs = extract_numbers_with_meta(sent)  # requires your earlier helper
            if not pairs:
                continue

            def _score_pair(p):
                val, unit = p
                if metric in {"revenue", "net profit", "net income", "ebitda"} and unit == "%":
                    return -2
                if metric == "eps" and unit == "%":
                    return -1
                pref = 0
                if unit in {"million", "billion", "crore"}:
                    pref += 2
                return pref

            pairs_sorted = sorted(pairs, key=_score_pair, reverse=True)
            cand_val, cand_unit = pairs_sorted[0]
            if cand_unit and cand_unit != "%":
                return f"{cand_val} {cand_unit}"
            elif cand_unit == "%":
                return f"{cand_val}%"
            else:
                return cand_val

    return None

# ---------- End-to-end RAG ----------
def rag_answer(query: str,
               k_dense: int = 15,
               k_sparse: int = 15,
               k_fused: int = 20,
               k_final: int | None = None,
               max_input_tokens: int = MAX_INPUT_TOKENS_HARD,
               generator_max_new_tokens: int = 64) -> Dict[str, Any]:
    t0 = time.time()

    # Input guardrail (domain)
    if not is_finance_query(query):
        return {
            "query": query, "method": "RAG",
            "answer": "Out of scope (non-financial query).",
            "confidence": 0.3, "retrieved_contexts": [],
            "latency_sec": round(time.time() - t0, 3),
            "guardrail_triggered": "input_out_of_scope"
        }

    # Dynamic k_final: slightly higher for numeric factoids
    if k_final is None:
        k_final = 8 if any(w in query.lower() for w in ("revenue","net profit","net income","eps","ebitda")) else 5

    # Retrieve → fuse → rerank
    d_hits = dense_search(query, top_k=k_dense)
    s_hits = sparse_search(query, top_k=k_sparse)
    fused  = reciprocal_rank_fusion(d_hits, s_hits, k=60, top_k=k_fused)
    rerank = rerank_with_cross_encoder(query, fused, top_k=k_final)  # [(idx, ce_score)]

    top_idxs = [i for i, _ in rerank]
    contexts = [chunks[i]["text"] for i in top_idxs]  # FULL text

    # Generate (token-budgeted)
    gen_t0 = time.time()
    answer = generate_answer(
        query, contexts,
        max_input_tokens=max_input_tokens,
        max_new_tokens=generator_max_new_tokens,
        temperature=0.0, top_p=1.0, num_beams=1
    )
    gen_t1 = time.time()

    # Confidence from CE scores (min-max → [0.5, 1.0])
    ce_scores = [sc for _, sc in rerank]
    if ce_scores:
        mn, mx = min(ce_scores), max(ce_scores)
        conf = 0.5 if mx == mn else (ce_scores[0] - mn) / (mx - mn)
        conf = float(0.5 + 0.5 * conf)
    else:
        conf = 0.4

    guardrail_flag = None
    final_answer = answer
    ans_lower = answer.strip().lower()

    # If model said Not in scope, try extractor recovery
    if ans_lower.startswith("not in scope"):
        candidate = _extract_candidate_from_contexts(query, contexts)
        if candidate:
            final_answer = candidate
            guardrail_flag = "generator_conservative_auto_fill"
            conf = max(conf, 0.6)
        else:
            conf = min(conf, 0.35)

    # Factuality check (numbers vs contexts) — uses your improved function
    fact_check = output_factuality_check(final_answer, contexts, query=query)
    if fact_check.get("is_potential_hallucination", False):
        guardrail_flag = (guardrail_flag + "|output_potential_hallucination") if guardrail_flag else "output_potential_hallucination"
        final_answer = "Not in scope (insufficient supporting context)."
        conf = 0.35

    # Pretty contexts (previews for UI)
    pretty_contexts = []
    for (i, sc) in rerank:
        c = chunks[i]
        txt = c["text"]
        preview = txt if len(txt) <= 280 else (txt[:280] + " ...")
        pretty_contexts.append({
            "chunk_id": c["chunk_id"],
            "doc_name": c["doc_name"],
            "pages_approx": c["pages_approx"],
            "ce_score": round(sc, 4),
            "preview": preview
        })

    t1 = time.time()
    return {
        "query": query, "method": "RAG",
        "answer": final_answer, "raw_answer": answer,
        "confidence": round(conf, 3),
        "retrieved_contexts": pretty_contexts,
        "latency_sec": round(t1 - t0, 3),
        "gen_time_sec": round(gen_t1 - gen_t0, 3),
        "guardrail_triggered": guardrail_flag,
        "fact_check": fact_check
    }


In [44]:
tests = [
    "What was the company's revenue in 2023?",
    "What was the net profit in 2024?",
    "Capital of France?"
]
for q in tests:
    out = rag_answer(q, k_dense=15, k_sparse=15, k_fused=20, k_final=5, max_input_tokens=512)
    print("\n=== QUERY:", q)
    print("Answer:", out["answer"])
    print("Confidence:", out["confidence"], "| Total latency:", out["latency_sec"], "s")
    print("Top contexts:")
    for ctx in out["retrieved_contexts"][:2]:
        print(f"  - {ctx['doc_name']} {ctx['pages_approx']} | ce={ctx['ce_score']}")
        print("    ", ctx["preview"][:160], "...")



=== QUERY: What was the company's revenue in 2023?
Answer: 31
Confidence: 1.0 | Total latency: 8.069 s
Top contexts:
  - infosys-ar-25 [322, 322] | ce=3.1814
     The Group believes that this disaggregation best depicts how the nature, amount, timing and uncertainty of revenues and cash flows are affected by industry, mar ...
  - infosys-ar-25 [344, 346] | ce=2.3769
     of revenue by geographic locations is given in Note 2.18, Revenue from operations. Infosys Integrated Annual Report 2024-25 Business segments Year ended March 3 ...

=== QUERY: What was the net profit in 2024?
Answer: 4.0
Confidence: 1.0 | Total latency: 6.612 s
Top contexts:
  - annual-report-2024 [71, 71] | ce=2.2456
     currency growth by comparing current-period revenues in respective local currencies converted to INR using prior-period exchange rates and comparing the same to ...
  - annual-report-2024 [73, 73] | ce=0.8969
     investments of ₹39,005 crore comprise cash and cash equivalents, current and non- cur

<h1>Step 3 (Fine-Tuning)</h1>
<p>We’ll use FLAN-T5 (open-source, instruction-tuned) for a generative Q&A model, and we’ll:

Load your Q/A dataset

Run a baseline (pre-fine-tuning) evaluation

Fine-tune FLAN-T5 with HuggingFace Trainer

Implement an advanced method: Adapter-based Mixture-of-Experts (AdapterFusion)

Provide an ft_answer() helper with a simple guardrail</p>

<h3>Setup & Imports</h3>

In [47]:
# If needed:
# !pip install transformers accelerate datasets sentencepiece evaluate scikit-learn adapter-transformers

from pathlib import Path
import json, time, re, random
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
    TrainingArguments, Trainer
)
import evaluate

QA_JSONL = ROOT / "qa_pairs.jsonl"   # produced in Step 1B
FT_OUT   = ROOT / "fine_tuned_model"          # output dir for fine-tuned model
FT_OUT.mkdir(parents=True, exist_ok=True)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Using device: cpu


In [46]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


<h3>Load Q/A Dataset</h3>

In [48]:
def load_qa_jsonl(path: Path) -> List[Dict[str, Any]]:
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            rows.append(json.loads(line))
    return rows

# Expect columns: question, answer, metric, year, company, ...
qa_rows = load_qa_jsonl(QA_JSONL)
df = pd.DataFrame(qa_rows)
print("Total Q/A loaded:", len(df))
df.head(5)


Total Q/A loaded: 50


Unnamed: 0,question,answer,metric,year,company,source_doc,context_snippet,confidence_heuristic
0,What were Generative AI's total assets in 2023?,1.0,assets,2023.0,Generative AI,annual-report-2024,"oans 2.6 Other financial assets 2.7 10,129 9,0...",0.9
1,What were Integrated Annual Report's total ass...,1.0,assets,2024.0,Integrated Annual Report,infosys-ar-25,"2.6 Other financial assets 2.7 12,569 10,129 I...",0.9
2,What was Generative AI's operating margin in 2...,20.7,operating_margin,2020.0,Generative AI,annual-report-2024,ry Business highlights Performance overview Di...,0.9
3,What was Generative AI's operating margin in 2...,85.0,operating_margin,2020.0,Generative AI,annual-report-2024,mance overview Dividend per share(2) (in ₹) 46...,0.9
4,What was Generative AI's total employee headco...,1882.0,headcount,2024.0,Generative AI,annual-report-2024,hilippines Malaysia Singapore Japan South Kore...,0.9


<h3>Train/Val/Test Split & Formatting</h3>

In [49]:
# We'll do 80/10/10 split
df = df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)
n = len(df)
n_train = int(0.8*n); n_val = int(0.1*n)
train_df = df.iloc[:n_train].copy()
val_df   = df.iloc[n_train:n_train+n_val].copy()
test_df  = df.iloc[n_train+n_val:].copy()

def to_sft_format(df: pd.DataFrame) -> Dataset:
    # Prompt style: "question: <Q>"
    prompts = [f"question: {q}" for q in df["question"].tolist()]
    targets = [str(a) for a in df["answer"].tolist()]
    return Dataset.from_dict({"prompt": prompts, "target": targets})

ds_train = to_sft_format(train_df)
ds_val   = to_sft_format(val_df)
ds_test  = to_sft_format(test_df)

raw_ds = DatasetDict(train=ds_train, validation=ds_val, test=ds_test)
raw_ds


DatasetDict({
    train: Dataset({
        features: ['prompt', 'target'],
        num_rows: 40
    })
    validation: Dataset({
        features: ['prompt', 'target'],
        num_rows: 5
    })
    test: Dataset({
        features: ['prompt', 'target'],
        num_rows: 5
    })
})

<h3>Tokenization</h3>

In [50]:
BASE_MODEL = "google/flan-t5-small"  # small = faster training; you can switch to flan-t5-base if you have GPU

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

MAX_INPUT_TOKENS  = 256
MAX_TARGET_TOKENS = 64

def preprocess(batch):
    model_inputs = tokenizer(
        batch["prompt"],
        max_length=MAX_INPUT_TOKENS,
        truncation=True,
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["target"],
            max_length=MAX_TARGET_TOKENS,
            truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = raw_ds.map(preprocess, batched=True, remove_columns=raw_ds["train"].column_names)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=BASE_MODEL)

tokenized


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 40
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
})

<h3>Baseline (Pre-Fine-Tuning) Evaluation</h3>

In [51]:
# Load base model (no fine-tuning yet)
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

def generate_answers(model, questions: List[str], max_new_tokens=64, temp=0.0):
    outs = []
    for q in questions:
        prompt = f"question: {q}"
        enc = tokenizer(prompt, return_tensors="pt").to(device)
        t0 = time.time()
        out = model.generate(
            **enc, max_new_tokens=max_new_tokens,
            temperature=temp, do_sample=(temp>0.0)
        )
        dt = time.time()-t0
        ans = tokenizer.decode(out[0], skip_special_tokens=True).strip()
        outs.append((ans, dt))
    return outs

# Take up to 10 test questions for baseline
sample_test = test_df.head(min(10, len(test_df)))
base_preds = generate_answers(base_model, sample_test["question"].tolist())

# Simple normalization for EM/F1
def normalize_text(s):
    s = s.lower().strip()
    s = re.sub(r"[\s]+", " ", s)
    s = re.sub(r"[^a-z0-9\.\-\$₹% ]", "", s)
    return s

def exact_match(pred, gold):
    return int(normalize_text(pred) == normalize_text(gold))

def f1_score(pred, gold):
    # token-level F1 on normalized strings
    p = normalize_text(pred).split()
    g = normalize_text(gold).split()
    if not p and not g:
        return 1.0
    if not p or not g:
        return 0.0
    common = {}
    for tok in p:
        common[tok] = min(p.count(tok), g.count(tok))
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(p)
    recall = num_same / len(g)
    return 2*precision*recall/(precision+recall)

em_list, f1_list, times = [], [], []
for (pred, dt), gold in zip(base_preds, sample_test["answer"].tolist()):
    em_list.append(exact_match(pred, gold))
    f1_list.append(f1_score(pred, gold))
    times.append(dt)

print("Baseline | EM:", np.mean(em_list), "F1:", np.mean(f1_list), "Avg latency (s):", np.mean(times))
pd.DataFrame({
    "question": sample_test["question"].tolist(),
    "gold": sample_test["answer"].tolist(),
    "pred": [p for p,_ in base_preds],
    "latency_s": times,
    "EM": em_list,
    "F1": f1_list
})


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Baseline | EM: 0.0 F1: 0.0 Avg latency (s): 0.7760354995727539


Unnamed: 0,question,gold,pred,latency_s,EM,F1
0,What was Integrated Annual Report's total empl...,1869,87039,0.319089,0,0.0
1,What were Generative AI's total assets in None?,1,adolescent,0.315493,0,0.0
2,Who was the CFO of Generative AI?,Certification,john s. s. s. s. s. s. s. s. s. s. s. s. s. s....,2.774303,0,0.0
3,What are the reportable business segments of G...,"Consumer Packaged Goods and Logistics, Uti, en...",ad-hoc,0.272189,0,0.0
4,What was Generative AI's net income in None?,27234,$1.1 billion,0.199104,0,0.0


<h3>Fine-Tuning with HuggingFace Trainer</h3>

In [52]:
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

lr = 5e-5
bs = 8
epochs = 5
logging_steps = 20

args = TrainingArguments(
    output_dir=str(FT_OUT / "hf_runs"),
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    learning_rate=lr,
    num_train_epochs=epochs,
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    bf16=torch.cuda.is_available(),  # use bf16 if on modern GPU
    logging_steps=logging_steps,
    # load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    seed=SEED
)

# Metric function (optional; we track EM/F1 offline)
def compute_metrics(eval_pred):
    return {}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model(str(FT_OUT / "flan_t5_small_finetuned"))
tokenizer.save_pretrained(str(FT_OUT / "flan_t5_small_finetuned"))

print("Saved fine-tuned model to:", (FT_OUT / "flan_t5_small_finetuned").resolve())


  trainer = Trainer(


Step,Training Loss
20,4.563


Saved fine-tuned model to: /content/drive/MyDrive/RAG-FT-DATA/fine_tuned_model/flan_t5_small_finetuned


<h3>Evaluate Fine-Tuned Model</h3>

In [53]:
ft_model = AutoModelForSeq2SeqLM.from_pretrained(str(FT_OUT / "flan_t5_small_finetuned")).to(device)

ft_preds = generate_answers(ft_model, sample_test["question"].tolist(), max_new_tokens=64, temp=0.0)

em_list, f1_list, times = [], [], []
for (pred, dt), gold in zip(ft_preds, sample_test["answer"].tolist()):
    em_list.append(exact_match(pred, gold))
    f1_list.append(f1_score(pred, gold))
    times.append(dt)

print("Fine-Tuned | EM:", np.mean(em_list), "F1:", np.mean(f1_list), "Avg latency (s):", np.mean(times))
pd.DataFrame({
    "question": sample_test["question"].tolist(),
    "gold": sample_test["answer"].tolist(),
    "pred": [p for p,_ in ft_preds],
    "latency_s": times,
    "EM": em_list,
    "F1": f1_list
})


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Fine-Tuned | EM: 0.0 F1: 0.012121212121212121 Avg latency (s): 3.465559244155884


Unnamed: 0,question,gold,pred,latency_s,EM,F1
0,What was Integrated Annual Report's total empl...,1869,59,0.458801,0,0.0
1,What were Generative AI's total assets in None?,1,59,0.169263,0,0.0
2,Who was the CFO of Generative AI?,Certification,adolescent,0.396449,0,0.0
3,What are the reportable business segments of G...,"Consumer Packaged Goods and Logistics, Uti, en...",a symbiotic relationship between the two,16.04145,0,0.060606
4,What was Generative AI's net income in None?,27234,87.8,0.261834,0,0.0


<h3>Advanced Fine-Tuning Technique → Mixture-of-Experts (MoE) - Two-expert LoRA approach with a lightweight router</h3>
<p>Two LoRA experts (numeric-focused and textual-focused) trained with PEFT on top of FLAN-T5.

A tiny router (scikit-learn LogisticRegression) that chooses which expert to use per question.

(Optional) Soft routing: generate with both experts and pick the answer with higher router probability or higher token-logprob.</p>

In [54]:
# If needed:
# !pip install transformers accelerate sentencepiece peft datasets scikit-learn

import os, re, time, json, random
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Any

import torch
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq, TrainingArguments, Trainer)
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from joblib import dump, load

device = "cuda" if torch.cuda.is_available() else "cpu"
BASE_MODEL = "google/flan-t5-small"
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

FT_DIR = ROOT / "fine_tuned_model"
FT_DIR.mkdir(parents=True, exist_ok=True)

print("Device:", device)


Device: cpu


<h3>Prepare Splits (Numeric vs Textual)</h3>

In [55]:
# Assumes df exists from Step 3 (columns: question, answer, ...)
assert "df" in globals(), "Run your Step 3 loading to create `df` (Q/A dataframe)."

def is_numeric_answer(s: str) -> bool:
    return bool(re.search(r"\d", str(s)))

numeric_df = df[df["answer"].apply(is_numeric_answer)].copy()
textual_df = df[~df["answer"].apply(is_numeric_answer)].copy()

print("Numeric QAs:", len(numeric_df), "Textual QAs:", len(textual_df))

# Train/val splits (80/20) per expert
num_train = numeric_df.sample(frac=0.8, random_state=SEED)
num_val   = numeric_df.drop(num_train.index)
txt_train = textual_df.sample(frac=0.8, random_state=SEED)
txt_val   = textual_df.drop(txt_train.index)

def to_sft(ds_df: pd.DataFrame) -> Dataset:
    return Dataset.from_dict({
        "prompt": [f"question: {q}" for q in ds_df["question"].tolist()],
        "target": [str(a) for a in ds_df["answer"].tolist()]
    })

ds_num_train = to_sft(num_train)
ds_num_val   = to_sft(num_val)
ds_txt_train = to_sft(txt_train)
ds_txt_val   = to_sft(txt_val)


Numeric QAs: 31 Textual QAs: 19


<h3>Tokenization & Collator</h3>

In [56]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
MAX_INPUT_TOKENS  = 256
MAX_TARGET_TOKENS = 64

def preprocess(batch):
    model_inputs = tokenizer(batch["prompt"], max_length=MAX_INPUT_TOKENS, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["target"], max_length=MAX_TARGET_TOKENS, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

num_train_tk = ds_num_train.map(preprocess, batched=True, remove_columns=ds_num_train.column_names)
num_val_tk   = ds_num_val.map(preprocess, batched=True, remove_columns=ds_num_val.column_names)
txt_train_tk = ds_txt_train.map(preprocess, batched=True, remove_columns=ds_txt_train.column_names)
txt_val_tk   = ds_txt_val.map(preprocess, batched=True, remove_columns=ds_txt_val.column_names)

collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=BASE_MODEL)


Map:   0%|          | 0/25 [00:00<?, ? examples/s]



Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

<h3>LoRA Expert A (Numeric)</h3>

In [57]:
base_numeric = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

lora_cfg = LoraConfig(
    r=8, lora_alpha=16, target_modules=["q", "v", "k", "o", "wi", "wo"],
    lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
)
num_model = get_peft_model(base_numeric, lora_cfg)

args_num = TrainingArguments(
    output_dir=str(FT_DIR / "lora_numeric"),
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    report_to="none",
    seed=SEED,
    bf16=torch.cuda.is_available()
)

trainer_num = Trainer(
    model=num_model, args=args_num,
    train_dataset=num_train_tk, eval_dataset=num_val_tk,
    data_collator=collator, tokenizer=tokenizer
)
trainer_num.train()

num_model.save_pretrained(str(FT_DIR / "lora_numeric"))
tokenizer.save_pretrained(str(FT_DIR / "lora_numeric"))
print("Saved LoRA numeric expert at:", (FT_DIR / "lora_numeric").resolve())


  trainer_num = Trainer(


Step,Training Loss




Saved LoRA numeric expert at: /content/drive/MyDrive/RAG-FT-DATA/fine_tuned_model/lora_numeric


<h3>LoRA Expert B (Textual)<h3>

In [58]:
base_textual = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)
txt_model = get_peft_model(base_textual, lora_cfg)

args_txt = TrainingArguments(
    output_dir=str(FT_DIR / "lora_textual"),
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    report_to="none",
    seed=SEED,
    bf16=torch.cuda.is_available()
)

trainer_txt = Trainer(
    model=txt_model, args=args_txt,
    train_dataset=txt_train_tk, eval_dataset=txt_val_tk,
    data_collator=collator, tokenizer=tokenizer
)
trainer_txt.train()

txt_model.save_pretrained(str(FT_DIR / "lora_textual"))
tokenizer.save_pretrained(str(FT_DIR / "lora_textual"))
print("Saved LoRA textual expert at:", (FT_DIR / "lora_textual").resolve())


  trainer_txt = Trainer(


Step,Training Loss




Saved LoRA textual expert at: /content/drive/MyDrive/RAG-FT-DATA/fine_tuned_model/lora_textual


<h3>Train a Tiny Router (LogReg)</h3>

In [59]:
router_df = df[["question", "answer"]].copy()
router_df["label_numeric"] = router_df["answer"].apply(is_numeric_answer).astype(int)

vec = TfidfVectorizer(ngram_range=(1,2), max_features=8000, lowercase=True)
X = vec.fit_transform(router_df["question"])
y = router_df["label_numeric"].values

clf = LogisticRegression(max_iter=200)
clf.fit(X, y)

dump(vec, FT_DIR / "router_vectorizer.joblib")
dump(clf, FT_DIR / "router_clf.joblib")

# Quick sanity check
print("Router class balance:", np.mean(y))
print("Router sample preds:", clf.predict(vec.transform(router_df["question"].head(5))))


Router class balance: 0.62
Router sample preds: [0 1 1 1 0]


<h3>Inference with Hard Routing (MoE)</h3>

In [60]:
# Load base once for inference; attach LoRA adapters dynamically
base_for_infer = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

# Load each LoRA adapter as PeftModel when needed
NUM_PATH = str(FT_DIR / "lora_numeric")
TXT_PATH = str(FT_DIR / "lora_textual")

vectorizer = load(FT_DIR / "router_vectorizer.joblib")
router_clf = load(FT_DIR / "router_clf.joblib")

@torch.no_grad()
def moe_lora_answer(question: str, max_new_tokens=64, temperature=0.0, soft=False) -> Dict[str, Any]:
    """
    Hard routing (default): choose one expert via router.
    soft=True: generate with both experts and pick numeric if router prob>0.5 else textual.
    (You can enhance soft mode by scoring logprobs, but here we keep it simple & fast.)
    """
    # Router predict
    Xq = vectorizer.transform([question])
    prob_numeric = float(router_clf.predict_proba(Xq)[0,1])
    choose_numeric = (prob_numeric >= 0.5)

    # Load chosen adapter
    if choose_numeric:
        peft_model = PeftModel.from_pretrained(base_for_infer, NUM_PATH).to(device)
        expert = "numeric"
    else:
        peft_model = PeftModel.from_pretrained(base_for_infer, TXT_PATH).to(device)
        expert = "textual"

    prompt = f"question: {question}"
    enc = tokenizer(prompt, return_tensors="pt").to(device)

    t0 = time.time()
    out = peft_model.generate(
        **enc,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=(temperature>0.0)
    )
    dt = time.time()-t0
    ans = tokenizer.decode(out[0], skip_special_tokens=True).strip()

    return {
        "answer": ans,
        "expert_used": expert,
        "p_numeric": round(prob_numeric, 3),
        "latency_sec": round(dt, 3)
    }

# Try it
examples = [
    "What was the revenue in 2023?",
    "Name the key business segments reported.",
]
for q in examples:
    print(q, "->", moe_lora_answer(q))


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


What was the revenue in 2023? -> {'answer': '$500,000', 'expert_used': 'numeric', 'p_numeric': 0.719, 'latency_sec': 0.204}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Name the key business segments reported. -> {'answer': 'ad-hoc business', 'expert_used': 'textual', 'p_numeric': 0.486, 'latency_sec': 0.401}


<h3>Compare FT (single) vs MoE-LoRA</h3>

In [61]:
# Helper metrics
def normalize_text(s):
    s = s.lower().strip()
    s = re.sub(r"[\s]+", " ", s)
    s = re.sub(r"[^a-z0-9\.\-\$₹% ]", "", s)
    return s

def exact_match(pred, gold): return int(normalize_text(pred) == normalize_text(gold))

def f1_score(pred, gold):
    p = normalize_text(pred).split(); g = normalize_text(gold).split()
    if not p and not g: return 1.0
    if not p or not g: return 0.0
    common = {}
    for tok in p: common[tok] = min(p.count(tok), g.count(tok))
    num_same = sum(common.values())
    if num_same == 0: return 0.0
    precision = num_same / len(p); recall = num_same / len(g)
    return 2*precision*recall/(precision+recall)

# Load the single fine-tuned model (from Step 3) or fall back to base
if "ft_model" not in globals():
    ft_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

@torch.no_grad()
def generate_answers(model, questions: List[str], max_new_tokens=64):
    outs = []
    for q in questions:
        enc = tokenizer(f"question: {q}", return_tensors="pt").to(device)
        t0 = time.time()
        out = model.generate(**enc, max_new_tokens=max_new_tokens)
        dt = time.time() - t0
        ans = tokenizer.decode(out[0], skip_special_tokens=True)
        outs.append((ans.strip(), dt))
    return outs

# Sample test
sample = df.sample(n=min(10, len(df)), random_state=123)
qs = sample["question"].tolist()
golds = sample["answer"].tolist()

# Single FT model (if available)
ft_outs = generate_answers(ft_model, qs)
ft_preds = [a for (a, _) in ft_outs]
ft_times = [t for (_, t) in ft_outs]

# MoE-LoRA
moe_preds, moe_times = [], []
for q in qs:
    out = moe_lora_answer(q)
    moe_preds.append(out["answer"])
    moe_times.append(out["latency_sec"])

def eval_run(preds, times, golds):
    em = np.mean([exact_match(p,g) for p,g in zip(preds, golds)])
    f1 = np.mean([f1_score(p,g) for p,g in zip(preds, golds)])
    lat = float(np.mean(times))
    return em, f1, lat

# ft_em, ft_f1, ft_lat   = eval_run([p for p,_ in ft_preds], [t for _,t in ft_preds], golds)
# moe_em, moe_f1, moe_lat= eval_run(moe_preds, moe_times, golds)
# Evaluate
ft_em, ft_f1, ft_lat   = eval_run(ft_preds, ft_times, golds)
moe_em, moe_f1, moe_lat= eval_run(moe_preds, moe_times, golds)

pd.DataFrame({
    "metric": ["EM","F1","Latency(s)"],
    "Single FT": [ft_em, ft_f1, ft_lat],
    "MoE-LoRA (router)": [moe_em, moe_f1, moe_lat]
})


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

Unnamed: 0,metric,Single FT,MoE-LoRA (router)
0,EM,0.0,0.0
1,F1,0.0,0.0
2,Latency(s),0.208029,1.119


<h3>Guardrail config & helpers (input + output expectations)</h3>

In [62]:
import re
import time
from typing import List, Dict, Any

# --- Domain keywords for finance scope ---
FINANCE_KEYWORDS = set("""
revenue sales net income profit ebitda eps cash flow operating cash flow free cash flow assets liabilities
equity debt margin ebit ebitda pbt pat dividend capex opex receivables payables working capital
guidance segment report notes annual balance sheet income statement cashflow md&a fiscal fy yoy
""".split())

# --- Very light PII patterns (block obvious requests) ---
PII_PATTERNS = [
    r"\bpan\s*number\b", r"\baadhaar\b", r"\bssn\b", r"\bsocial\s+security\b",
    r"\bcredit\s*card\b", r"\bcvv\b", r"\bbank\s*account\b", r"\bifsc\b",
    r"\bphone\s*(number)?\b", r"\bemail\b"
]

# --- Numeric expectation keywords (if present in the query, we expect numbers in the answer) ---
NUMERIC_CUES = set("""
revenue sales net income profit ebitda eps cash flow operating cash flow free cash flow assets liabilities
equity debt margin growth % percent yoy qoq guidance dividend capex opex
amount total figure number how much what is the value rupees dollars usd inr ₹ $
""".split())

NUM_RE = re.compile(r"\b(?:\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\b")

def is_finance_query(query: str) -> bool:
    q = query.lower()
    return any(k in q for k in FINANCE_KEYWORDS)

def contains_pii_request(query: str) -> bool:
    q = query.lower()
    return any(re.search(p, q) for p in PII_PATTERNS)

def expects_numeric_answer(query: str) -> bool:
    q = query.lower()
    return any(cue in q for cue in NUMERIC_CUES)

def answer_has_number(ans: str) -> bool:
    return bool(NUM_RE.search(ans.replace(",", "")))


<h3>Generation with log-prob confidence (for base/finetuned T5)</h3>

In [63]:
import torch

# expects global: tokenizer, base_model, ft_model (from Step 3)
def generate_with_scores(model, prompt: str, max_new_tokens=64, temperature=0.0):
    enc = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **enc,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=(temperature > 0.0),
        return_dict_in_generate=True,
        output_scores=True
    )
    # Decode answer
    seq_ids = out.sequences[0]
    ans = tokenizer.decode(seq_ids, skip_special_tokens=True).strip()

    # Compute average log-prob for generated tokens (decoder steps only)
    # Align scores to generated tokens (ignore prompt length)
    scores = out.scores  # list[tensor vocab_logits] length = generated_len
    gen_token_ids = seq_ids[len(enc.input_ids[0]):]  # only newly generated token ids
    logprobs = []
    for step_logits, tok_id in zip(scores, gen_token_ids):
        lp = torch.log_softmax(step_logits, dim=-1)[0, tok_id.item()].item()
        logprobs.append(lp)
    avg_logprob = float(sum(logprobs) / max(1, len(logprobs)))

    return ans, avg_logprob, len(gen_token_ids)


<h3>Optional: reuse RAG’s number-consistency check (if available)</h3>

In [64]:
# Build a map from chunk_id -> full text once (global)
CHUNK_TEXT_BY_ID = {c["chunk_id"]: c["text"] for c in chunks}

def rag_number_consistency_check(answer: str, query: str) -> dict:
    """
    Compare numbers in the answer with numbers found in the *full* retrieved chunks.
    """
    if "hybrid_retrieve" not in globals():
        return {"used": False, "is_suspicious": False, "suspicious_numbers": [], "contexts": []}

    hits = hybrid_retrieve(query, k_dense=10, k_sparse=10, k_fused=15, k_final=5)
    # Use full texts
    contexts_full = []
    for row in hits.get("reranked_top", []):
        cid = row.get("chunk_id")
        full = CHUNK_TEXT_BY_ID.get(cid, row.get("preview",""))
        contexts_full.append(full)

    # Extract numbers
    ans_nums = set(NUM_RE.findall(answer.replace(",", "")))
    ctx_nums = set()
    for c in contexts_full:
        for n in NUM_RE.findall(c.replace(",", "")):
            ctx_nums.add(n)

    suspicious = sorted(list(ans_nums - ctx_nums))
    return {
        "used": True,
        "is_suspicious": len(suspicious) > 0,
        "suspicious_numbers": suspicious,
        "contexts_checked": len(contexts_full)
    }



<h3>Guardrailed FT answer wrapper (works for base / finetuned / MoE-LoRA)</h3>

In [65]:
def ft_guardrailed_answer(query: str, mode: str = "finetuned",
                          max_new_tokens: int = 64, temperature: float = 0.0,
                          use_rag_consistency: bool = True) -> Dict[str, Any]:
    t0 = time.time()
    flags = []

    # 1) INPUT GUARDRAIL
    if contains_pii_request(query):
        return {
            "method": f"FT:{mode}",
            "answer": "Out of scope (PII request is not allowed).",
            "confidence": 0.2,
            "latency_sec": round(time.time()-t0, 3),
            "flags": ["input_pii_block"]
        }
    if not is_finance_query(query):
        return {
            "method": f"FT:{mode}",
            "answer": "Out of scope (non-financial query).",
            "confidence": 0.3,
            "latency_sec": round(time.time()-t0, 3),
            "flags": ["input_out_of_scope"]
        }

    # 2) INFERENCE
    expectation_numeric = expects_numeric_answer(query)
    if mode == "base":
        assert "base_model" in globals(), "Load base_model first (Step 3)."
        prompt = f"question: {query}"
        ans, avg_lp, gen_len = generate_with_scores(base_model, prompt, max_new_tokens, temperature)
        conf_model = avg_lp
    elif mode == "finetuned":
        assert "ft_model" in globals(), "Load ft_model first (Step 3)."
        prompt = f"question: {query}"
        ans, avg_lp, gen_len = generate_with_scores(ft_model, prompt, max_new_tokens, temperature)
        conf_model = avg_lp
    elif mode == "moe":
        # Use MoE-LoRA router (from Step 3 alternative MoE)
        assert "moe_lora_answer" in globals(), "Define moe_lora_answer first."
        out = moe_lora_answer(query, max_new_tokens=max_new_tokens, temperature=temperature)
        ans = out["answer"]
        conf_model = 0.6 + 0.3 * (out.get("p_numeric", 0.5))  # heuristic from router prob
        flags.append(f"expert:{out.get('expert_used','?')}")
    else:
        raise ValueError("mode must be one of {'base','finetuned','moe'}")

    # 3) OUTPUT GUARDRAIL — NUMERIC EXPECTATION
    if expectation_numeric and not answer_has_number(ans):
        flags.append("output_missing_number_for_numeric_expectation")
        # downgrade or replace
        safe_ans = "Not in scope (answer not confidently numeric)."
        confidence = 0.35
        return {
            "method": f"FT:{mode}",
            "answer": safe_ans,
            "raw_answer": ans,
            "confidence": round(confidence, 3),
            "latency_sec": round(time.time()-t0, 3),
            "flags": flags
        }

    # 4) OPTIONAL: RAG-AIDED CONSISTENCY CHECK (numbers should appear in retrieved context)
    rag_check = {"used": False}
    if use_rag_consistency and answer_has_number(ans):
        rag_check = rag_number_consistency_check(ans, query)
        if rag_check["used"] and rag_check["is_suspicious"]:
            flags.append("output_potential_hallucination_numbers")
            # conservative fallback:
            ans_safe = "Not in scope (insufficient supporting evidence)."
            return {
                "method": f"FT:{mode}",
                "answer": ans_safe,
                "raw_answer": ans,
                "confidence": 0.4,
                "latency_sec": round(time.time()-t0, 3),
                "flags": flags,
                "rag_check": {k: v for k, v in rag_check.items() if k != "contexts"}
            }

    # 5) CONFIDENCE SCALING (log-prob heuristic for base/finetuned; router-based for MoE)
    if mode in ("base","finetuned"):
        # Map avg log-prob (~[-20,0]) to [0,1]
        # softplus normalization
        confidence = float(1 / (1 + pow(2.71828, - (conf_model + 5))))  # shift roughly
        # Reward brevity slightly
        if len(ans) > 90:
            confidence = max(0.0, confidence - 0.1)
    else:
        confidence = min(1.0, max(0.0, conf_model))

    return {
        "method": f"FT:{mode}",
        "answer": ans,
        "confidence": round(confidence, 3),
        "latency_sec": round(time.time()-t0, 3),
        "flags": flags,
        "rag_check_used": rag_check.get("used", False)
    }


<h3>Quick smoke test</h3>

In [66]:
tests = [
    ("What was the company's revenue in 2023?", "finetuned"),
    ("Give me the CFO's phone number from the report", "finetuned"),
    ("What is the capital of France?", "finetuned"),
    ("List the key segments reported.", "moe"),
]

for q, mode in tests:
    out = ft_guardrailed_answer(q, mode=mode, max_new_tokens=48, temperature=0.0, use_rag_consistency=True)
    print(f"\nQ: {q}\nMode: {mode}\n→ Answer:", out["answer"])
    print("Confidence:", out["confidence"], "| Flags:", out.get("flags", []), "| RAG check used:", out.get("rag_check_used"))


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Q: What was the company's revenue in 2023?
Mode: finetuned
→ Answer: $1
Confidence: 0.993 | Flags: [] | RAG check used: True

Q: Give me the CFO's phone number from the report
Mode: finetuned
→ Answer: Out of scope (PII request is not allowed).
Confidence: 0.2 | Flags: ['input_pii_block'] | RAG check used: None


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Q: What is the capital of France?
Mode: finetuned
→ Answer: Not in scope (answer not confidently numeric).
Confidence: 0.35 | Flags: ['output_missing_number_for_numeric_expectation'] | RAG check used: None

Q: List the key segments reported.
Mode: moe
→ Answer: Not in scope (answer not confidently numeric).
Confidence: 0.35 | Flags: ['expert:textual', 'output_missing_number_for_numeric_expectation'] | RAG check used: None


<h1>Step 4: Testing, Evaluation & Comparison.</h1>

In [67]:
import time
import re
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd

QA_CSV = ROOT / "qa_pairs.csv"
EVAL_DIR = ROOT / "eval"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

df_qa = pd.read_csv(QA_CSV)
print("Q/A pairs loaded:", len(df_qa))
df_qa.head(3)


Q/A pairs loaded: 50


Unnamed: 0,question,answer,metric,year,company,source_doc,context_snippet,confidence_heuristic
0,What were Generative AI's total assets in 2023?,1.0,assets,2023.0,Generative AI,annual-report-2024,"oans 2.6 Other financial assets 2.7 10,129 9,0...",0.9
1,What were Integrated Annual Report's total ass...,1.0,assets,2024.0,Integrated Annual Report,infosys-ar-25,"2.6 Other financial assets 2.7 12,569 10,129 I...",0.9
2,What was Generative AI's operating margin in 2...,20.7,operating_margin,2020.0,Generative AI,annual-report-2024,ry Business highlights Performance overview Di...,0.9


<h3>Normalization & correctness metrics</h3>
<p>We’ll compute:

Exact Match (EM) on normalized strings

Relaxed numeric match: if both answers contain a single main number, treat correct if values match after normalization (you can add tolerance if needed)</p>

In [68]:
NUM_RE = re.compile(r"\b(?:\d{1,3}(?:,\d{3})+|\d+(?:\.\d+)?)\b")

def normalize_text(s: str) -> str:
    s = str(s)
    s = s.lower().strip()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^a-z0-9\.\-\$₹% ]", "", s)
    return s

def extract_numbers(s: str) -> List[str]:
    return NUM_RE.findall(str(s).replace(",", ""))  # numbers without commas

def exact_match(pred: str, gold: str) -> int:
    return int(normalize_text(pred) == normalize_text(gold))

def relaxed_numeric_match(pred: str, gold: str) -> int:
    """If both contain at least one number, compare first numbers exactly (string-wise).
       You can extend to tolerance matching if needed."""
    pnums = extract_numbers(pred)
    gnums = extract_numbers(gold)
    if pnums and gnums:
        return int(pnums[0] == gnums[0])
    return 0

def correctness_label(pred: str, gold: str) -> str:
    em = exact_match(pred, gold)
    if em:
        return "Y"
    rn = relaxed_numeric_match(pred, gold)
    return "Y" if rn else "N"


<h3>Unified runner for each method</h3>
This wraps RAG and FT calls to return a consistent record.

In [69]:
def run_rag(query: str) -> Dict[str, Any]:
    assert "rag_answer" in globals(), "rag_answer() not found. Run Step 2 cells."
    t0 = time.perf_counter()
    out = rag_answer(query, k_dense=15, k_sparse=15, k_fused=20, k_final=5, max_input_tokens=768)
    dt = time.perf_counter() - t0
    return {
        "method": "RAG",
        "answer": out.get("answer", ""),
        "confidence": float(out.get("confidence", 0.0)),
        "latency_sec": float(dt),
        "guardrail": out.get("guardrail_triggered", None),
        "raw": out
    }

def run_ft(query: str, mode="finetuned") -> Dict[str, Any]:
    assert "ft_guardrailed_answer" in globals(), "ft_guardrailed_answer() not found. Run Step 3.6 cells."
    t0 = time.perf_counter()
    out = ft_guardrailed_answer(query, mode=mode, max_new_tokens=64, temperature=0.0, use_rag_consistency=True)
    dt = time.perf_counter() - t0
    return {
        "method": f"FT:{mode}",
        "answer": out.get("answer", ""),
        "confidence": float(out.get("confidence", 0.0)),
        "latency_sec": float(dt),
        "guardrail": ",".join(out.get("flags", [])) if out.get("flags") else None,
        "raw": out
    }


<h3>Mandatory 3 test cases</h3>

<p>Relevant, high-confidence → pick a clear numeric Q from your dataset

Relevant, low-confidence → ambiguous (e.g., “What was the revenue?” without year)

Irrelevant → “What is the capital of France?”</p>

In [70]:
# 1) Relevant, high-confidence (choose a numeric Q with a year from dataset)
cand = df_qa.dropna(subset=["year"])
if len(cand) == 0:
    cand = df_qa
high_q = cand.sample(1, random_state=7).iloc[0]["question"]
print("High-confidence Q:", high_q)

# 2) Relevant, low-confidence (ambiguous)
low_q = "What was the revenue?"  # missing year & company context

# 3) Irrelevant
irr_q = "What is the capital of France?"

mandatory_tests = [
    ("Relevant-High", high_q, "gold_from_dataset"),
    ("Relevant-Low", low_q, None),
    ("Irrelevant", irr_q, None),
]

methods = [("RAG", None), ("FT", "finetuned"), ("FT", "moe")]  # adjust if you didn't train MoE

rows = []
for label, q, gold in mandatory_tests:
    for mname, mode in methods:
        if mname == "RAG":
            out = run_rag(q)
        else:
            out = run_ft(q, mode=mode)
        rows.append({
            "TestType": label,
            "Question": q,
            "Method": out["method"],
            "Answer": out["answer"],
            "Confidence": out["confidence"],
            "Time(s)": round(out["latency_sec"], 3),
            "Guardrail": out["guardrail"],
        })

df_mandatory = pd.DataFrame(rows)
df_mandatory


High-confidence Q: What was Integrated Annual Report's total employee headcount in 2024?


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Unnamed: 0,TestType,Question,Method,Answer,Confidence,Time(s),Guardrail
0,Relevant-High,What was Integrated Annual Report's total empl...,RAG,Not in scope,0.35,7.72,
1,Relevant-High,What was Integrated Annual Report's total empl...,FT:finetuned,Not in scope (insufficient supporting evidence).,0.4,4.488,output_potential_hallucination_numbers
2,Relevant-High,What was Integrated Annual Report's total empl...,FT:moe,Not in scope (insufficient supporting evidence).,0.4,6.086,"expert:numeric,output_potential_hallucination_..."
3,Relevant-Low,What was the revenue?,RAG,2.18,1.0,6.296,generator_conservative_auto_fill
4,Relevant-Low,What was the revenue?,FT:finetuned,Not in scope (insufficient supporting evidence).,0.4,5.258,output_potential_hallucination_numbers
5,Relevant-Low,What was the revenue?,FT:moe,Not in scope (insufficient supporting evidence).,0.4,5.116,"expert:numeric,output_potential_hallucination_..."
6,Irrelevant,What is the capital of France?,RAG,Not in scope,0.35,6.328,
7,Irrelevant,What is the capital of France?,FT:finetuned,Not in scope (answer not confidently numeric).,0.35,0.405,output_missing_number_for_numeric_expectation
8,Irrelevant,What is the capital of France?,FT:moe,Not in scope (answer not confidently numeric).,0.35,0.642,"expert:textual,output_missing_number_for_numer..."


<h3>Extended evaluation on ≥10 questions</h3>
This runs RAG vs FT:finetuned vs FT:moe on a set of questions from your dataset and computes correctness.

In [71]:
N_TEST = min(10, len(df_qa))
sample_eval = df_qa.sample(N_TEST, random_state=123).reset_index(drop=True)

records = []
for i, row in sample_eval.iterrows():
    q = str(row["question"])
    gold = str(row["answer"])
    for mname, mode in methods:
        if mname == "RAG":
            out = run_rag(q)
        else:
            out = run_ft(q, mode=mode)
        correct = correctness_label(out["answer"], gold)
        records.append({
            "Question": q,
            "Gold": gold,
            "Method": out["method"],
            "Answer": out["answer"],
            "Confidence": out["confidence"],
            "Time(s)": round(out["latency_sec"], 3),
            "Correct (Y/N)": correct,
            "Guardrail": out["guardrail"],
        })

df_results = pd.DataFrame(records)
df_results.head(10)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

Unnamed: 0,Question,Gold,Method,Answer,Confidence,Time(s),Correct (Y/N),Guardrail
0,Who was the CEO of Generative AI?,Water Mandate,RAG,Out of scope (non-financial query).,0.3,0.0,N,input_out_of_scope
1,Who was the CEO of Generative AI?,Water Mandate,FT:finetuned,Out of scope (non-financial query).,0.3,0.0,N,input_out_of_scope
2,Who was the CEO of Generative AI?,Water Mandate,FT:moe,Out of scope (non-financial query).,0.3,0.0,N,input_out_of_scope
3,Who was the CFO of Generative AI?,A.G.S. Manikantha\nCompany Secretary,RAG,Out of scope (non-financial query).,0.3,0.0,N,input_out_of_scope
4,Who was the CFO of Generative AI?,A.G.S. Manikantha\nCompany Secretary,FT:finetuned,Out of scope (non-financial query).,0.3,0.0,N,input_out_of_scope
5,Who was the CFO of Generative AI?,A.G.S. Manikantha\nCompany Secretary,FT:moe,Out of scope (non-financial query).,0.3,0.0,N,input_out_of_scope
6,What was Generative AI's revenue in None?,89032,RAG,31,1.0,6.155,N,generator_conservative_auto_fill
7,What was Generative AI's revenue in None?,89032,FT:finetuned,Not in scope (answer not confidently numeric).,0.35,0.126,N,output_missing_number_for_numeric_expectation
8,What was Generative AI's revenue in None?,89032,FT:moe,Not in scope (insufficient supporting evidence).,0.4,6.047,N,"expert:numeric,output_potential_hallucination_..."
9,What was Integrated Annual Report's operating ...,21.1,RAG,Not in scope,0.35,6.217,N,


<h3>Summary metrics & save CSVs</h3>

In [72]:
def summarize(df: pd.DataFrame) -> pd.DataFrame:
    grp = df.groupby("Method").agg(
        Accuracy=("Correct (Y/N)", lambda col: np.mean([1 if x=="Y" else 0 for x in col])),
        AvgConfidence=("Confidence", "mean"),
        AvgTime=("Time(s)", "mean"),
        N=("Question", "count")
    ).reset_index()
    return grp.sort_values(by="Accuracy", ascending=False)

summary = summarize(df_results)
summary


Unnamed: 0,Method,Accuracy,AvgConfidence,AvgTime,N
0,FT:finetuned,0.0,0.35,1.6912,10
1,FT:moe,0.0,0.3952,2.0392,10
2,RAG,0.0,0.4,4.6612,10


In [73]:
# Save outputs
mand_path = EVAL_DIR / "mandatory_tests.csv"
ext_path  = EVAL_DIR / "extended_eval.csv"
sum_path  = EVAL_DIR / "summary_metrics.csv"

df_mandatory.to_csv(mand_path, index=False)
df_results.to_csv(ext_path, index=False)
summary.to_csv(sum_path, index=False)

print("Saved:")
print(" -", mand_path.resolve())
print(" -", ext_path.resolve())
print(" -", sum_path.resolve())


Saved:
 - /content/drive/MyDrive/RAG-FT-DATA/eval/mandatory_tests.csv
 - /content/drive/MyDrive/RAG-FT-DATA/eval/extended_eval.csv
 - /content/drive/MyDrive/RAG-FT-DATA/eval/summary_metrics.csv


<h3>Pretty comparison table for the report</h3>

In [74]:
display_cols = ["Question","Method","Answer","Confidence","Time(s)","Correct (Y/N)"]
df_for_report = df_results[display_cols].copy()
df_for_report.head(15)


Unnamed: 0,Question,Method,Answer,Confidence,Time(s),Correct (Y/N)
0,Who was the CEO of Generative AI?,RAG,Out of scope (non-financial query).,0.3,0.0,N
1,Who was the CEO of Generative AI?,FT:finetuned,Out of scope (non-financial query).,0.3,0.0,N
2,Who was the CEO of Generative AI?,FT:moe,Out of scope (non-financial query).,0.3,0.0,N
3,Who was the CFO of Generative AI?,RAG,Out of scope (non-financial query).,0.3,0.0,N
4,Who was the CFO of Generative AI?,FT:finetuned,Out of scope (non-financial query).,0.3,0.0,N
5,Who was the CFO of Generative AI?,FT:moe,Out of scope (non-financial query).,0.3,0.0,N
6,What was Generative AI's revenue in None?,RAG,31,1.0,6.155,N
7,What was Generative AI's revenue in None?,FT:finetuned,Not in scope (answer not confidently numeric).,0.35,0.126,N
8,What was Generative AI's revenue in None?,FT:moe,Not in scope (insufficient supporting evidence).,0.4,6.047,N
9,What was Integrated Annual Report's operating ...,RAG,Not in scope,0.35,6.217,N


<h1>Gradio UI</h1>

In [75]:
# If needed:
# !pip install gradio pandas

import time
from pathlib import Path
from datetime import datetime
import pandas as pd
import gradio as gr


<h3>Inference wrapper + logging</h3>

In [76]:
LOG_PATH = Path("data/eval/ui_logs.csv")
LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

def ui_infer(query: str, mode_choice: str):
    """
    mode_choice: 'RAG' | 'FT:finetuned' | 'FT:moe'
    Returns: method, answer, confidence, latency, flags, contexts_df (or empty)
    """
    if not query or not query.strip():
        return "—", "Please enter a question.", 0.0, 0.0, "—", pd.DataFrame()

    if mode_choice == "RAG":
        out = rag_answer(query, k_dense=15, k_sparse=15, k_fused=20, k_final=5, max_input_tokens=768)
        method    = "RAG"
        answer    = out.get("answer", "")
        confidence= float(out.get("confidence", 0.0))
        latency   = float(out.get("latency_sec", 0.0))
        flags     = out.get("guardrail_triggered") or "—"
        ctx_rows  = out.get("retrieved_contexts", [])
        ctx_df    = pd.DataFrame(ctx_rows)[["doc_name","pages_approx","ce_score","preview"]] if ctx_rows else pd.DataFrame()
    elif mode_choice == "FT:finetuned":
        out = ft_guardrailed_answer(query, mode="finetuned", max_new_tokens=64, temperature=0.0, use_rag_consistency=True)
        method    = out.get("method", "FT:finetuned")
        answer    = out.get("answer", "")
        confidence= float(out.get("confidence", 0.0))
        latency   = float(out.get("latency_sec", 0.0))
        flags     = ",".join(out.get("flags", [])) if out.get("flags") else "—"
        ctx_df    = pd.DataFrame()  # FT doesn't rely on retrieval
    else:  # FT:moe
        out = ft_guardrailed_answer(query, mode="moe", max_new_tokens=64, temperature=0.0, use_rag_consistency=True)
        method    = out.get("method", "FT:moe")
        answer    = out.get("answer", "")
        confidence= float(out.get("confidence", 0.0))
        latency   = float(out.get("latency_sec", 0.0))
        flags     = ",".join(out.get("flags", [])) if out.get("flags") else "—"
        ctx_df    = pd.DataFrame()

    # log interaction (append or create)
    new_row = pd.DataFrame([{
        "ts_utc": datetime.utcnow().isoformat(),
        "mode": method,
        "query": query,
        "answer": answer,
        "confidence": confidence,
        "latency_sec": latency,
        "flags": flags
    }])
    if LOG_PATH.exists():
        prev = pd.read_csv(LOG_PATH)
        pd.concat([prev, new_row], ignore_index=True).to_csv(LOG_PATH, index=False)
    else:
        new_row.to_csv(LOG_PATH, index=False)

    return method, answer, confidence, latency, flags, ctx_df


<h3>Build & launch Gradio app</h3>

In [77]:
with gr.Blocks(title="Comparative Financial QA: RAG vs Fine-Tuned") as demo:
    gr.Markdown("## Comparative Financial QA — RAG vs Fine-Tuned vs MoE")
    gr.Markdown(
        "Enter a financial question from the last two annual reports. "
        "Switch methods to compare **answer, confidence, latency**, and (for RAG) supporting contexts."
    )

    with gr.Row():
        mode = gr.Radio(
            choices=["RAG", "FT:finetuned", "FT:moe"],
            value="RAG",
            label="Method"
        )
    query = gr.Textbox(lines=2, label="Your Question", placeholder="e.g., What was the company's revenue in 2023?")
    ask = gr.Button("Ask")

    with gr.Row():
        method_o   = gr.Textbox(label="Method", interactive=False)
        confidence_o = gr.Number(label="Confidence", precision=3)
        latency_o  = gr.Number(label="Latency (s)", precision=3)

    answer_o = gr.Textbox(label="Answer", lines=4)
    flags_o  = gr.Textbox(label="Guardrail Flags", interactive=False)

    gr.Markdown("### Top contexts (RAG only)")
    ctx_df_o = gr.Dataframe(headers=["doc_name","pages_approx","ce_score","preview"], wrap=True)

    ask.click(ui_infer, inputs=[query, mode], outputs=[method_o, answer_o, confidence_o, latency_o, flags_o, ctx_df_o])

    gr.Markdown(
        "Logs are saved to `data/eval/ui_logs.csv` for your report’s screenshots & analysis."
    )

demo.launch(share=False)  # set share=True if you need a public link (for demo)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



<h1>Submission</h1>

In [None]:
# If needed:
# !pip install reportlab pandas

from pathlib import Path
import pandas as pd
from datetime import datetime
import zipfile
import glob
import os

# Project paths
EVAL_DIR = ROOT / "eval"
SS_DIR   = ROOT / "screenshots"
OUT_DIR  = ROOT / "submission"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Inputs (edit these placeholders)
GROUP_NUMBER = "4"                        # <-- set your group number
BITS_IDS     = ["2021A7PS0000G", "2021A7PS0001G"]   # <-- optional list for title page
GROUP_NAME   = "Group 4"
HOSTED_APP_URL = "http://localhost:7860"  # <-- paste your Gradio share or Streamlit URL

# Evaluation files (generated in Step 4)
MANDATORY_CSV = EVAL_DIR / "mandatory_tests.csv"
EXTENDED_CSV  = EVAL_DIR / "extended_eval.csv"
SUMMARY_CSV   = EVAL_DIR / "summary_metrics.csv"

# Notebook(s) to include in ZIP
NB_FILES = [
    "notebooks/main_pipeline.ipynb"        # adjust to your actual notebook(s)
]

# (Optional) additional code artifacts to include
CODE_FILES = [
    "scripts/step1_preprocess.py"
]

# Collect screenshots (add your 3 PNG/JPGs to data/screenshots/)
SCREENSHOTS = sorted([p for ext in ("*.png","*.jpg","*.jpeg") for p in SS_DIR.glob(ext)])[:3]
print("Screenshots found:", [p.name for p in SCREENSHOTS])


<h3>Load results & light validation</h3>

In [None]:
assert MANDATORY_CSV.exists(), "Missing data/eval/mandatory_tests.csv"
assert EXTENDED_CSV.exists(),  "Missing data/eval/extended_eval.csv"
assert SUMMARY_CSV.exists(),   "Missing data/eval/summary_metrics.csv"

df_mand = pd.read_csv(MANDATORY_CSV)
df_ext  = pd.read_csv(EXTENDED_CSV)
df_sum  = pd.read_csv(SUMMARY_CSV)

print("Mandatory rows:", len(df_mand))
print("Extended rows:", len(df_ext))
print("Summary rows:", len(df_sum))
df_sum


<h3>Build a polished PDF report with ReportLab</h3>

In [None]:
from reportlab.lib.pagesizes import A4
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle, PageBreak

REPORT_PDF = OUT_DIR / f"Group_{GROUP_NUMBER}_RAG_vs_FT_Report.pdf"

def build_report(pdf_path: Path,
                 df_sum: pd.DataFrame,
                 df_mand: pd.DataFrame,
                 df_ext: pd.DataFrame,
                 screenshots,
                 group_number: str,
                 group_name: str,
                 bits_ids,
                 hosted_url: str):
    doc = SimpleDocTemplate(str(pdf_path), pagesize=A4,
                            rightMargin=1.5*cm, leftMargin=1.5*cm,
                            topMargin=1.5*cm, bottomMargin=1.5*cm)
    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(name="Small", fontSize=9, leading=12))
    story = []

    # Title
    story.append(Paragraph(f"<b>Comparative Financial QA — RAG vs Fine-Tuning</b>", styles["Title"]))
    story.append(Spacer(1, 6))
    story.append(Paragraph(f"Group: <b>{group_name}</b> (#{group_number})", styles["Normal"]))
    if bits_ids:
        story.append(Paragraph(f"Members: {', '.join(bits_ids)}", styles["Normal"]))
    if hosted_url:
        story.append(Paragraph(f"Hosted App: <a href='{hosted_url}'>{hosted_url}</a>", styles["Normal"]))
    story.append(Paragraph(f"Generated on: {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')}", styles["Small"]))
    story.append(Spacer(1, 12))

    story.append(Paragraph("<b>1) Mandatory Tests</b>", styles["Heading2"]))
    # small table
    mand_cols = ["TestType","Question","Method","Answer","Confidence","Time(s)","Guardrail"]
    mand_tbl = [mand_cols] + df_mand[mand_cols].fillna("—").values.tolist()
    t = Table(mand_tbl, repeatRows=1)
    t.setStyle(TableStyle([
        ("BACKGROUND", (0,0), (-1,0), colors.lightgrey),
        ("GRID", (0,0), (-1,-1), 0.25, colors.grey),
        ("FONTSIZE", (0,0), (-1,-1), 8),
        ("VALIGN", (0,0), (-1,-1), "TOP"),
    ]))
    story.append(t)
    story.append(Spacer(1, 12))

    story.append(Paragraph("<b>2) Summary Comparison</b>", styles["Heading2"]))
    sum_tbl = [["Method","Accuracy","AvgConfidence","AvgTime(s)","N"]]
    for _, r in df_sum.iterrows():
        sum_tbl.append([r["Method"], f"{r['Accuracy']:.2f}", f"{r['AvgConfidence']:.2f}", f"{r['AvgTime']:.2f}", int(r["N"])])
    t2 = Table(sum_tbl, repeatRows=1)
    t2.setStyle(TableStyle([
        ("BACKGROUND", (0,0), (-1,0), colors.lightgrey),
        ("GRID", (0,0), (-1,-1), 0.25, colors.grey),
        ("FONTSIZE", (0,0), (-1,-1), 9),
        ("VALIGN", (0,0), (-1,-1), "TOP"),
    ]))
    story.append(t2)
    story.append(Spacer(1, 12))

    story.append(Paragraph("<b>3) Screenshots</b>", styles["Heading2"]))
    if not screenshots:
        story.append(Paragraph("No screenshots provided. Add up to 3 PNG/JPG files under data/screenshots/ and re-run.", styles["Small"]))
    else:
        for ss in screenshots:
            try:
                img = Image(str(ss))
                img._restrictSize(16*cm, 9*cm)
                story.append(Paragraph(ss.name, styles["Small"]))
                story.append(img)
                story.append(Spacer(1, 8))
            except Exception as e:
                story.append(Paragraph(f"Could not load image {ss}: {e}", styles["Small"]))

    story.append(PageBreak())

    story.append(Paragraph("<b>4) Discussion</b>", styles["Heading2"]))
    discussion = """
    <b>RAG strengths:</b> factual grounding from retrieved contexts, robustness to out-of-domain queries, and explainability (supporting chunks).
    <br/><b>Fine-Tuned strengths:</b> faster inference after training, fluent answers, and no dependency on an index at runtime.
    <br/><b>MoE (LoRA experts) strengths:</b> improved specialization (numeric vs textual), potential latency benefits via sparse activation, efficient training footprint.
    <br/><b>Trade-offs:</b> RAG may be slower and sensitive to retrieval quality; FT may hallucinate without guardrails; MoE adds routing complexity.
    <br/><b>Guardrails:</b> Input domain/PII filtering and numeric-expectation checks reduced spurious outputs; optional RAG-backed number consistency further mitigated hallucination risk.
    """
    story.append(Paragraph(discussion, styles["Normal"]))

    doc.build(story)

build_report(REPORT_PDF, df_sum, df_mand, df_ext, SCREENSHOTS,
             GROUP_NUMBER, GROUP_NAME, BITS_IDS, HOSTED_APP_URL)

print("Report written to:", REPORT_PDF.resolve())


<h3>Create the submission ZIP</h3>

In [None]:
SUBMIT_ZIP = OUT_DIR / f"Group_{GROUP_NUMBER}_RAG_vs_FT.zip"

# Minimal README stub
README_TXT = OUT_DIR / "README_SUBMISSION.txt"
README_TXT.write_text(f"""Submission: Group {GROUP_NUMBER} — RAG vs Fine-Tuning

Contents:
- notebooks/main_pipeline.ipynb        # end-to-end pipeline (data → RAG → FT → eval)
- /eval/mandatory_tests.csv
- /eval/extended_eval.csv
- /eval/summary_metrics.csv
- submission/Group_{GROUP_NUMBER}_RAG_vs_FT_Report.pdf
- (optional) app.py or run instructions for Gradio UI
Hosted app: {HOSTED_APP_URL}

How to run:
1) Install requirements from README.md / environment.yml
2) Run notebook cells in order (Step 1 → 4).
3) Launch Gradio UI in notebook (UI cell) or `streamlit run app.py` if using Streamlit.
""", encoding="utf-8")

# Paths to include
to_zip = [
    README_TXT,
    REPORT_PDF,
    MANDATORY_CSV,
    EXTENDED_CSV,
    SUMMARY_CSV,
]
# Add notebooks if present
to_zip += [Path(p) for p in NB_FILES if Path(p).exists()]
# Add selected code files if present
to_zip += [Path(p) for p in CODE_FILES if Path(p).exists()]

# Build ZIP
with zipfile.ZipFile(SUBMIT_ZIP, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for p in to_zip:
        arcname = p.relative_to(ROOT) if p.is_absolute() else p
        z.write(p, arcname=str(arcname))

print("ZIP created:", SUBMIT_ZIP.resolve())
print("Included files:")
for p in to_zip:
    print(" -", p)
