<h3>Imports & Paths</h3>

In [13]:
import json
from pathlib import Path
from typing import List, Tuple, Dict, Any

import fitz  # PyMuPDF
import pandas as pd

# Project paths
ROOT = Path("/Users/dulaldas5/Group_43_RAG_vs_FT")
RAW_DIR   = ROOT / "data" / "raw"
CLEAN_DIR = ROOT / "data" / "cleaned_text"
PROC_DIR  = ROOT / "data" / "processed"

CLEAN_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

list(RAW_DIR.glob("*.pdf"))


[PosixPath('/Users/dulaldas5/Group_43_RAG_vs_FT/data/raw/infosys-ar-25.pdf'),
 PosixPath('/Users/dulaldas5/Group_43_RAG_vs_FT/data/raw/annual-report-2024.pdf')]

<h3>Utility Helpers</h3>

In [20]:
HEADER_FOOTER_HINTS = [
    r"^page\s*\d+(\s*of\s*\d+)?$",   # Page 1 of 100
    r"^\d+$",                        # just a number (often a page number)
]

def clean_lines(lines: List[str]) -> List[str]:
    """
    Basic line-level cleanup:
    - strip
    - drop obvious headers/footers/page numbers
    - drop very short all-caps tokens
    - collapse multiple spaces
    """
    cleaned = []
    for ln in lines:
        s = ln.strip()
        if not s:
            continue
        drop = False
        for pat in HEADER_FOOTER_HINTS:
            if re.fullmatch(pat, s, flags=re.IGNORECASE):
                drop = True
                break
        if drop:
            continue
        if len(s) <= 3 and s.isupper():
            continue
        s = re.sub(r"\s+", " ", s)
        cleaned.append(s)
    return cleaned


<h3>Metric Patterns & Extraction</h3>

In [21]:
def extract_pdf_text(pdf_path: Path) -> Tuple[str, List[Tuple[int, str]]]:
    """
    Extract text per page using PyMuPDF and clean it.
    Returns: (full_clean_text, [(page_no, page_clean_text), ...])
    """
    doc = fitz.open(pdf_path)
    pages = []
    for pno in range(len(doc)):
        page = doc[pno]
        raw = page.get_text("text") or ""
        clines = clean_lines(raw.splitlines())
        pages.append((pno + 1, "\n".join(clines)))
    doc.close()

    full = "\n\n".join(t for _, t in pages if t.strip())
    return full, pages

def write_clean_text(doc_name: str, full_text: str, pages: List[Tuple[int, str]]):
    (CLEAN_DIR / f"{doc_name}.txt").write_text(full_text, encoding="utf-8")
    per_page = [{"page": p, "text": t} for p, t in pages]
    (CLEAN_DIR / f"{doc_name}.pages.json").write_text(
        json.dumps(per_page, ensure_ascii=False, indent=2), encoding="utf-8"
    )


In [22]:
SECTION_PATTERNS = {
    "balance_sheet": r"\b(balance\s+sheet|statement\s+of\s+financial\s+position)\b",
    "income_statement": r"\b(income\s+statement|profit\s+and\s+loss|statement\s+of\s+operations)\b",
    "cash_flow": r"\b(cash\s+flow|statement\s+of\s+cash\s+flows)\b",
    "mdna": r"\b(management\s+discussion\s+and\s+analysis|md&a)\b",
    "notes": r"\b(notes\s+to\s+the\s+financial\s+statements|notes\s+to\s+accounts)\b",
}

def rough_section_indices(text: str) -> Dict[str, List[int]]:
    idx = {}
    low = text.lower()
    for name, pat in SECTION_PATTERNS.items():
        idx[name] = [m.start() for m in re.finditer(pat, low)]
    return idx

def segment_sections(text: str) -> Dict[str, str]:
    indices = rough_section_indices(text)
    all_starts = []
    for sec, starts in indices.items():
        for s in starts:
            all_starts.append((s, sec))
    if not all_starts:
        return {"full_report": text}

    all_starts.sort(key=lambda x: x[0])
    result = {}
    for i, (start, sec) in enumerate(all_starts):
        end = all_starts[i+1][0] if i+1 < len(all_starts) else len(text)
        if sec not in result:  # keep first occurrence only
            result[sec] = text[start:end].strip()
    if not result:
        result["full_report"] = text
    return result


<h3>Chunking (100-word & 400-word windows with overlap)</h3>

In [23]:
def chunk_words(words, chunk_size, overlap=20):
    i = 0
    n = len(words)
    while i < n:
        j = min(i + chunk_size, n)
        yield (i, j, " ".join(words[i:j]))
        if j == n:
            break
        i = max(j - overlap, i + 1)

def make_chunks(doc_name: str, full_text: str, pages: List[Tuple[int, str]], sizes=(100, 400)):
    # Map approx page spans for traceability
    page_word_spans = []
    cum = 0
    for p, t in pages:
        wc = len(t.split())
        page_word_spans.append((p, cum, cum + wc))
        cum += wc

    def span_to_pages(start_w: int, end_w: int):
        touched = []
        for p, a, b in page_word_spans:
            if end_w <= a:
                break
            if start_w >= b:
                continue
            touched.append(p)
        if not touched:
            return []
        return [min(touched), max(touched)] if len(touched) > 1 else [touched[0], touched[0]]

    words = full_text.split()
    outputs = {}
    for sz in sizes:
        chunks = []
        for idx, (start, end, text) in enumerate(chunk_words(words, chunk_size=sz, overlap=20)):
            chunks.append({
                "chunk_id": f"{doc_name}_{sz}_{idx:05d}",
                "doc_name": doc_name,
                "chunk_size_words": sz,
                "start_word_index": start,
                "end_word_index": end,
                "pages_approx": span_to_pages(start, end),
                "text": text
            })
        outputs[sz] = chunks
    return outputs

def save_chunks(all_chunks: Dict[int, list]):
    PROC_DIR.mkdir(parents=True, exist_ok=True)
    for sz, chs in all_chunks.items():
        # JSONL
        jsonl_path = PROC_DIR / f"chunks_{sz}.jsonl"
        with open(jsonl_path, "w", encoding="utf-8") as f:
            for c in chs:
                f.write(json.dumps(c, ensure_ascii=False) + "\n")
        # JSON (convenience)
        (PROC_DIR / f"chunks_{sz}.json").write_text(
            json.dumps(chs, ensure_ascii=False, indent=2), encoding="utf-8"
        )


<h3>Run on All Raw PDFs</h3>

In [24]:
pdfs = sorted(RAW_DIR.glob("*.pdf"))
assert len(pdfs) > 0, "No PDFs found in data/raw/. Please add your annual reports."

all_chunks_100, all_chunks_400 = [], []

for pdf in pdfs:
    doc_name = pdf.stem
    print(f"[INFO] Processing {pdf.name} ...")
    full_text, pages = extract_pdf_text(pdf)
    # Save cleaned text + per-page JSON
    write_clean_text(doc_name, full_text, pages)
    # Save rough section splits for reference
    sections = segment_sections(full_text)
    (CLEAN_DIR / f"{doc_name}.sections.json").write_text(
        json.dumps(sections, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    # Build chunks (100 & 400)
    chunks_by_size = make_chunks(doc_name, full_text, pages, sizes=(100, 400))
    all_chunks_100.extend(chunks_by_size[100])
    all_chunks_400.extend(chunks_by_size[400])

# Save combined chunk files
save_chunks({100: all_chunks_100, 400: all_chunks_400})

print("\n[STATS] Documents processed:", [p.name for p in pdfs])
print("[STATS] 100-word chunks:", len(all_chunks_100))
print("[STATS] 400-word chunks:", len(all_chunks_400))
print("[OK] Outputs saved to:", PROC_DIR.resolve())


[INFO] Processing annual-report-2024.pdf ...
[INFO] Processing infosys-ar-25.pdf ...

[STATS] Documents processed: ['annual-report-2024.pdf', 'infosys-ar-25.pdf']
[STATS] 100-word chunks: 4212
[STATS] 400-word chunks: 888
[OK] Outputs saved to: /Users/dulaldas5/Group_43_RAG_vs_FT/data/processed


<h3>Quick Peek / Sanity Check</h3>

In [25]:
# Show a few cleaned lines and a couple of chunks for verification
txt_files = sorted(CLEAN_DIR.glob("*.txt"))
print("Cleaned text files:", [p.name for p in txt_files][:5])

sample_txt = txt_files[0].read_text(encoding="utf-8").splitlines()[:15]
print("\nSample cleaned lines:\n", "\n".join(sample_txt))

import json
sample_100 = json.loads((PROC_DIR / "chunks_100.json").read_text(encoding="utf-8"))[:3]
pd.DataFrame(sample_100)[["chunk_id","pages_approx","text"]].head(3)


Cleaned text files: ['annual-report-2024.txt', 'infosys-ar-25.txt']

Sample cleaned lines:
 Generative AI and You
Integrated Annual Report 2023-24

Infosys Integrated Annual Report 2023-24
We barely saw it happen. AI walking into our lives. Through
the ads that follow us on social media. The personalized
pick of movies and shows. Our cars. The maps helping us
navigate. Right there in our hands – our super-powerful
phones. And now, it’s happening again. This time with
generative AI. In the form of handy tools – like ChatGPT,
MetaAI and Stable Diffusion – that pique our imagination,
and stoke our curiosity.
Generative AI technology’s path into enterprises too has
been just as accelerated and enthusiastic, supported by
an exponential increase in investments. While almost


Unnamed: 0,chunk_id,pages_approx,text
0,annual-report-2024_100_00000,"[1, 2]",Generative AI and You Integrated Annual Report...
1,annual-report-2024_100_00001,"[2, 2]","pique our imagination, and stoke our curiosity..."
2,annual-report-2024_100_00002,"[2, 2]","believe, some ongoing AI pilots will scale to ..."


In [26]:
OUT_QA_JSONL = ROOT / "data" / "qa_pairs.jsonl"
OUT_QA_CSV   = ROOT / "data" / "qa_pairs.csv"

In [27]:
def load_clean_texts(clean_dir: Path) -> Dict[str, str]:
    texts = {}
    for p in clean_dir.glob("*.txt"):
        texts[p.stem] = p.read_text(encoding="utf-8", errors="ignore")
    return texts

def load_sections(clean_dir: Path) -> Dict[str, Dict[str, str]]:
    sections = {}
    for p in clean_dir.glob("*.sections.json"):
        try:
            sections[p.stem.replace(".sections","")] = json.loads(p.read_text(encoding="utf-8"))
        except Exception:
            pass
    return sections

def normalize_spaces(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def guess_company_from_text(doc_name: str, text: str) -> str:
    first = "\n".join(text.splitlines()[:20])
    m = re.search(r"\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+){0,3})\b", first)
    return m.group(1) if m else doc_name

def find_years(text: str):
    return sorted({int(y) for y in re.findall(r"\b(20\d{2})\b", text)})


<h3>Patterns & Extraction</h3>

In [28]:
# Amount/units patterns
CURRENCY = r"(?:₹|\$|USD|EUR|INR|Rs\.?)?"
NUM      = r"(?:\d{1,3}(?:[,\s]\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?)"
UNITS    = r"(?:\s?(?:million|billion|mn|bn|crore|lakh|cr|m|bn))?"
AMOUNT   = CURRENCY + r"\s*" + NUM + r"\s*" + UNITS

METRIC_PATTERNS = {
    "revenue":      r"\b(revenue|total\s+revenue|net\s+sales|sales)\b[:\s\-–]*" + AMOUNT,
    "net_income":   r"\b(net\s+(?:income|profit)|profit\s+after\s+tax|PAT)\b[:\s\-–]*" + AMOUNT,
    "ebitda":       r"\b(EBITDA)\b[:\s\-–]*" + AMOUNT,
    "eps":          r"\b(EPS|earnings\s+per\s+share)\b[:\s\-–]*" + AMOUNT,
    "cash_flow":    r"\b(net\s+cash\s+from\s+operating\s+activities|operating\s+cash\s+flow)\b[:\s\-–]*" + AMOUNT,
    "assets":       r"\b(total\s+assets)\b[:\s\-–]*" + AMOUNT,
    "liabilities":  r"\b(total\s+liabilities)\b[:\s\-–]*" + AMOUNT,
}

def clean_amount(val: str) -> str:
    v = normalize_spaces(val)
    v = v.replace("USD", "USD ").replace("INR", "INR ").replace("Rs.", "Rs ")
    v = re.sub(r"\s+", " ", v)
    return v.strip()

def extract_metric_sentences(text: str, metric: str, pattern: str) -> List[Dict[str, Any]]:
    out = []
    for m in re.finditer(pattern, text, flags=re.IGNORECASE):
        span = m.span()
        start = max(0, span[0] - 160)
        end   = min(len(text), span[1] + 160)
        context = normalize_spaces(text[start:end])
        raw = normalize_spaces(m.group(0))
        amt_m = re.search(AMOUNT, raw, flags=re.IGNORECASE)
        amount = clean_amount(amt_m.group(0)) if amt_m else raw
        near = text[max(0, span[0]-80): min(len(text), span[1]+80)]
        year_m = re.search(r"\b(20\d{2})\b", near)
        year = int(year_m.group(1)) if year_m else None
        out.append({"metric": metric, "amount": amount, "year": year, "raw": raw, "context": context})
    return out

def extract_all_metrics(text: str) -> List[Dict[str, Any]]:
    results = []
    for metric, pat in METRIC_PATTERNS.items():
        results.extend(extract_metric_sentences(text, metric, pat))
    return results


<h3>Build Q/A Candidates</h3>

In [29]:
texts = load_clean_texts(CLEAN_DIR)
sections = load_sections(CLEAN_DIR)

candidates = []
for doc_name, txt in texts.items():
    company = guess_company_from_text(doc_name, txt)
    facts = extract_all_metrics(txt)

    for f in facts:
        metric, amount, year, ctx = f["metric"], f["amount"], f["year"], f["context"]

        if metric == "revenue":
            q = f"What was {company}'s revenue in {year}?"
        elif metric == "net_income":
            q = f"What was {company}'s net income in {year}?"
        elif metric == "ebitda":
            q = f"What was {company}'s EBITDA in {year}?"
        elif metric == "eps":
            q = f"What was {company}'s EPS in {year}?"
        elif metric == "cash_flow":
            q = f"What was {company}'s operating cash flow in {year}?"
        elif metric == "assets":
            q = f"What were {company}'s total assets in {year}?"
        elif metric == "liabilities":
            q = f"What were {company}'s total liabilities in {year}?"
        else:
            q = f"What was {metric.replace('_',' ')} in {year} for {company}?"

        conf = 0.9 if (year is not None and re.search(NUM, amount)) else 0.6
        candidates.append({
            "question": q,
            "answer": amount,
            "metric": metric,
            "year": year,
            "company": company,
            "source_doc": doc_name,
            "context_snippet": ctx,
            "confidence_heuristic": conf
        })

len(candidates)


26

<h3>Deduplicate & Add YoY Comparison Q/As</h3>

In [30]:
# Deduplicate by (question, answer)
seen = set()
uniq = []
for it in candidates:
    key = (it["question"], it["answer"])
    if key not in seen:
        seen.add(key)
        uniq.append(it)

def build_yoy_pairs(items: List[Dict[str, Any]], metric: str, company: str) -> List[Dict[str, Any]]:
    by_year = {}
    for it in items:
        if it["metric"] == metric and it["company"] == company and it["year"]:
            by_year[it["year"]] = it["answer"]
    years_sorted = sorted(by_year.keys(), reverse=True)
    out = []
    if len(years_sorted) >= 2:
        y1, y2 = years_sorted[0], years_sorted[1]
        q = f"Compare {company}'s {metric.replace('_',' ')} in {y2} vs {y1}."
        a = f"{y2}: {by_year[y2]}; {y1}: {by_year[y1]}."
        out.append({
            "question": q,
            "answer": a,
            "metric": f"{metric}_comparison",
            "year": f"{y2} vs {y1}",
            "company": company,
            "source_doc": "multiple",
            "context_snippet": "",
            "confidence_heuristic": 0.75
        })
    return out

companies = sorted({it["company"] for it in uniq})
yoy = []
for comp in companies:
    yoy += build_yoy_pairs(uniq, "revenue", comp)
    yoy += build_yoy_pairs(uniq, "net_income", comp)

qa_dataset = uniq + yoy
len(qa_dataset)


21

<h3>Balance to ~50 Q/As</h3>

In [31]:
# If you want to cap to ~50 for FT while keeping variety
def balanced_sample(items: List[Dict[str, Any]], per_metric: int = 8, max_total: int = 50):
    by_metric = {}
    for it in items:
        by_metric.setdefault(it["metric"], []).append(it)
    sampled = []
    # sort by confidence within each metric
    for m, arr in by_metric.items():
        arr_sorted = sorted(arr, key=lambda x: x.get("confidence_heuristic", 0), reverse=True)
        sampled.extend(arr_sorted[:per_metric])
    # if more than max_total, keep highest-confidence globally
    if len(sampled) > max_total:
        sampled = sorted(sampled, key=lambda x: x.get("confidence_heuristic", 0), reverse=True)[:max_total]
    return sampled

qa_balanced = balanced_sample(qa_dataset, per_metric=8, max_total=50)
print("Total candidates:", len(qa_dataset))
print("Balanced sample:", len(qa_balanced))
df_preview = pd.DataFrame(qa_balanced)[["question","answer","metric","year","company","source_doc","confidence_heuristic"]]
df_preview.head(15)


Total candidates: 21
Balanced sample: 18


Unnamed: 0,question,answer,metric,year,company,source_doc,confidence_heuristic
0,What was Integrated Annual Report's revenue in...,94111.0,revenue,,Integrated Annual Report,infosys-ar-25,0.6
1,What was Integrated Annual Report's revenue in...,1.0,revenue,,Integrated Annual Report,infosys-ar-25,0.6
2,What was Integrated Annual Report's revenue in...,6713.0,revenue,,Integrated Annual Report,infosys-ar-25,0.6
3,What was Integrated Annual Report's revenue in...,18.7,revenue,,Integrated Annual Report,infosys-ar-25,0.6
4,What was Integrated Annual Report's revenue in...,8492.0,revenue,,Integrated Annual Report,infosys-ar-25,0.6
5,What was Generative AI's revenue in None?,89032.0,revenue,,Generative AI,annual-report-2024,0.6
6,What was Generative AI's revenue in None?,1.0,revenue,,Generative AI,annual-report-2024,0.6
7,What was Generative AI's revenue in None?,5698.0,revenue,,Generative AI,annual-report-2024,0.6
8,What was Integrated Annual Report's net income...,25568.0,net_income,,Integrated Annual Report,infosys-ar-25,0.6
9,What was Integrated Annual Report's net income...,26750.0,net_income,,Integrated Annual Report,infosys-ar-25,0.6


<h3>Save Q/As (JSONL + CSV)</h3>

In [32]:
to_save = qa_balanced if len(qa_balanced) >= 40 else qa_dataset  # fall back if not enough
OUT_QA_JSONL.parent.mkdir(parents=True, exist_ok=True)

with open(OUT_QA_JSONL, "w", encoding="utf-8") as f:
    for row in to_save:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

pd.DataFrame(to_save).to_csv(OUT_QA_CSV, index=False)

print("Saved:")
print(" -", OUT_QA_JSONL.resolve())
print(" -", OUT_QA_CSV.resolve())
print("Total Q/A pairs saved:", len(to_save))


Saved:
 - /Users/dulaldas5/Group_43_RAG_vs_FT/data/qa_pairs.jsonl
 - /Users/dulaldas5/Group_43_RAG_vs_FT/data/qa_pairs.csv
Total Q/A pairs saved: 21


In [33]:
df = pd.read_csv(OUT_QA_CSV)
display_cols = ["question","answer","metric","year","company","source_doc","confidence_heuristic"]
df.sort_values(by="confidence_heuristic", ascending=False)[display_cols].head(25)


Unnamed: 0,question,answer,metric,year,company,source_doc,confidence_heuristic
20,What were Generative AI's total assets in 2023?,1,assets,2023.0,Generative AI,annual-report-2024,0.9
9,What were Integrated Annual Report's total ass...,1,assets,2024.0,Integrated Annual Report,infosys-ar-25,0.9
11,What was Generative AI's revenue in None?,1,revenue,,Generative AI,annual-report-2024,0.6
19,What were Generative AI's total assets in None?,26.6,assets,,Generative AI,annual-report-2024,0.6
18,What were Generative AI's total assets in None?,1,assets,,Generative AI,annual-report-2024,0.6
17,What was Generative AI's net income in None?,26248,net_income,,Generative AI,annual-report-2024,0.6
16,What was Generative AI's net income in None?,27234,net_income,,Generative AI,annual-report-2024,0.6
15,What was Generative AI's revenue in None?,"107,413 102,353",revenue,,Generative AI,annual-report-2024,0.6
14,What was Generative AI's revenue in None?,7341,revenue,,Generative AI,annual-report-2024,0.6
13,What was Generative AI's revenue in None?,21.1,revenue,,Generative AI,annual-report-2024,0.6


<h1>Step 2 (RAG)</h1>

In [38]:
# If needed:
# !pip install sentence-transformers faiss-cpu rank-bm25 scikit-learn pandas numpy tqdm

from pathlib import Path
import json
import re
import pickle
import time
from typing import List, Dict, Tuple
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi

import faiss  # CPU index

# Project paths (relative to repo root)
ROOT = Path("/Users/dulaldas5/Group_43_RAG_vs_FT")
PROC_DIR   = ROOT / "data" / "processed"
EMB_DIR    = ROOT / "embeddings"
FAISS_DIR  = EMB_DIR / "faiss_index"
BM25_DIR   = EMB_DIR / "bm25_index"

for p in [EMB_DIR, FAISS_DIR, BM25_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Choose which chunk set to index (100 or 400-word chunks)
CHUNK_SIZE_TO_USE = 400
CHUNKS_PATH = PROC_DIR / f"chunks_{CHUNK_SIZE_TO_USE}.jsonl"

assert CHUNKS_PATH.exists(), f"Missing {CHUNKS_PATH}. Run Step 1 to generate chunk files."
print("Using chunks file:", CHUNKS_PATH)


ModuleNotFoundError: No module named 'faiss'

In [37]:
pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Frameworks/Python.framework/Versions/3.13/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [39]:
pip install faiss-gpu

[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Frameworks/Python.framework/Versions/3.13/bin/python3 -m pip install --upgrade pip[0m
[31mERROR: No matching distribution found for faiss-gpu[0m[31m
Note: you may need to restart the kernel to use updated packages.
