# Goal: Extract PDF text with page refs, detect competition boundaries, and persist raw artifacts.

## Imports & paths

In [4]:
# --- 0) Imports & paths ---
import json
import re
import statistics as stats
from pathlib import Path

import fitz  # PyMuPDF
import pandas as pd

# All paths hang off ROOT, the parent of the kernel's working directory
# (assumes the notebook is launched from a folder directly under the repo root).
NOTEBOOK_DIR = Path.cwd()
ROOT = NOTEBOOK_DIR.parent

PDF_PATH = ROOT / "data/raw/rag-example-qa.pdf"
OUT_DIR = ROOT / "data/processed"

# Create the output folders up front; exist_ok keeps re-runs idempotent.
for _subdir in ("md", "jsonl", "csv"):
    (OUT_DIR / _subdir).mkdir(parents=True, exist_ok=True)

## Extract text **with page numbers**

- Use **PyMuPDF (fitz)** to iterate pages.

- Store a list of dicts: `{"page": 1, "text": "...raw page text..."}`.

- Save to `data/processed/page_dump.jsonl` (one JSON object per page) for traceability.

In [5]:
# --- 1) Extract text page-by-page ---
def extract_pages(pdf_path: Path):
    """Return the plain text of every page in a PDF, tagged with its page number.

    Args:
        pdf_path: Location of the PDF to open with PyMuPDF.

    Returns:
        A list of ``{"page": int, "text": str}`` dicts in document order,
        with page numbers starting at 1.
    """
    # The context manager closes the document once all pages have been read.
    with fitz.open(pdf_path) as pdf:
        return [
            # "text" mode yields plain text in a reasonable reading order.
            {"page": number, "text": pg.get_text("text")}
            for number, pg in enumerate(pdf, start=1)
        ]

# Run the extraction once; `pages` (list of {"page", "text"} dicts) is reused by later cells.
pages = extract_pages(PDF_PATH)
# Cell output: (page count, first page number, first 300 characters of the first page).
len(pages), pages[0]["page"], pages[0]["text"][:300]

(28,
 1,
 "Hava Savunma S)stemler) Yarışması \n1. Hava Savunma S+stemler+ Yarışması'nın temel amacı ned+r? \nYarışmanın amacı, takımların ver+len senaryolara uygun görevler+ başarıyla yer+ne get+recek \nhava savunma s+stemler+ gel+şt+rmes+ ve üretmes+d+r. Aynı zamanda, hava savunma \ns+stemler+n+n önem+n+n ülke ça")

In [9]:
# QUICK CHECK: page count, empty pages, sample preview
# `stats` (the stdlib `statistics` module) is imported in the notebook's import cell.
# Fail fast if the PDF produced no pages, so pages[0] below cannot raise IndexError.
assert len(pages) > 0, "No pages extracted."
lengths = [len(p["text"]) for p in pages]
print(f"Pages: {len(pages)} | avg chars/page: {int(stats.mean(lengths))} | min: {min(lengths)} | max: {max(lengths)}")
# Pages whose text is empty or whitespace-only (e.g. scanned pages without a text layer).
empties = [p["page"] for p in pages if len(p["text"].strip()) == 0]
# Show at most five empty page numbers; the ellipsis signals that the list was truncated.
print("Empty pages:", empties[:5], "..." if len(empties) > 5 else "")
print("Preview p1:\n", pages[0]["text"][:400])

Pages: 28 | avg chars/page: 2324 | min: 1475 | max: 3035
Empty pages: [] 
Preview p1:
 Hava Savunma S)stemler) Yarışması 
1. Hava Savunma S+stemler+ Yarışması'nın temel amacı ned+r? 
Yarışmanın amacı, takımların ver+len senaryolara uygun görevler+ başarıyla yer+ne get+recek 
hava savunma s+stemler+ gel+şt+rmes+ ve üretmes+d+r. Aynı zamanda, hava savunma 
s+stemler+n+n önem+n+n ülke çapında gen+ş b+r tabana yayılarak özgün, yerl+ ve yetenekl+ 
s+stemler+n gel+şt+r+lmes+n+ sağlamak da


## Quick section boundary detection (competition names)

- Scan each page’s text for any of these **anchors** (case-insensitive):

    - `Hava Savunma Sistemleri`

    - `E-Ticaret Hackathonu`

    - `Yapay Zeka Destekli Adres Çözümleme`

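- Create a `page → competition` map by “last seen anchor wins”: a page with no anchor inherits the competition of the most recent page that had one, and leading pages before the first anchor are back-filled from it.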

- Save to `data/processed/page_sections.csv`.

In [11]:
# --- 2) Detect competition per page via simple anchors ---
# Canonical names and short codes
# Anchor phrase (lower-case) -> short competition code.
COMP_NAMES = {
    "hava savunma sistemleri": "HSS",
    "e-ticaret hackathonu": "E-TICARET",
    "yapay zeka destekli adres çözümleme": "ADRES",
}


def _fold_tr(text):
    """Casefold `text` for matching, treating Turkish dotted capital İ as plain i.

    `str.casefold()` maps "İ" (U+0130) to "i" followed by the combining dot
    above (U+0307); dropping that mark lets "SİSTEMLERİ" match "sistemleri".
    """
    return text.casefold().replace("\u0307", "")


def detect_competitions(pages, anchors=None):
    """Assign a competition code to every page using anchor phrases.

    Args:
        pages: Iterable of ``{"page": int, "text": str}`` dicts.
        anchors: Optional mapping of anchor phrase -> competition code.
            Defaults to ``COMP_NAMES``.

    Returns:
        Dict mapping page number -> competition code. A page without an anchor
        inherits the code of the most recent page that had one; pages before
        the first anchor are back-filled from it. Values stay ``None`` only
        when no page contains any anchor.

    Notes:
        - Matching is case-insensitive, including Turkish upper-case "İ".
        - If several anchors occur on the same page, the first one in
          ``anchors`` iteration order wins, regardless of position in the text.
        - NOTE(review): the page-1 preview shows some "i" characters extracted
          as ")" or "+"; spellings garbled that way will not match these
          anchors. Confirm coverage on the full PDF.
    """
    if anchors is None:
        anchors = COMP_NAMES
    # Fold the anchors once instead of once per page.
    folded_anchors = [(_fold_tr(phrase), code) for phrase, code in anchors.items()]

    comp_by_page = {}
    last_seen = None
    for p in pages:
        haystack = _fold_tr(p["text"])
        found = next((code for phrase, code in folded_anchors if phrase in haystack), None)
        last_seen = found or last_seen
        comp_by_page[p["page"]] = last_seen  # None until the first anchor is seen

    # Fill remaining gaps in page-number order: first back-fill from the next
    # known page, then forward-fill from the previous known page. The second
    # pass only matters when `pages` was not supplied in page-number order.
    keys = sorted(comp_by_page)
    for order in (reversed(keys), keys):
        known = None
        for k in order:
            if comp_by_page[k] is None:
                if known is not None:
                    comp_by_page[k] = known
            else:
                known = comp_by_page[k]
    return comp_by_page

comp_by_page = detect_competitions(pages)
# Tabulate page -> competition for a visual sanity check (this PDF has 28 pages).
page_numbers = [p["page"] for p in pages]
pd.DataFrame(
    {
        "page": page_numbers,
        "competition": [comp_by_page[number] for number in page_numbers],
    }
).head(28)


Unnamed: 0,page,competition
0,1,HSS
1,2,HSS
2,3,HSS
3,4,HSS
4,5,HSS
5,6,HSS
6,7,HSS
7,8,HSS
8,9,HSS
9,10,HSS


In [7]:
# --- 3) Save raw JSONL dump and page→competition map ---
# JSONL: one line per page {page, text}
# JSONL dump: one JSON object per page; ensure_ascii=False keeps Turkish characters readable.
dump_path = OUT_DIR / "page_dump.jsonl"
with dump_path.open("w", encoding="utf-8") as sink:
    sink.writelines(json.dumps(page, ensure_ascii=False) + "\n" for page in pages)

# CSV map: page number -> competition code, without the pandas index column.
section_rows = [
    {"page": page["page"], "competition": comp_by_page[page["page"]]}
    for page in pages
]
pd.DataFrame(section_rows).to_csv(OUT_DIR / "page_sections.csv", index=False)


In [12]:
# FILE CHECK: artifacts exist and are non-empty
# Both artifacts must exist on disk and contain at least one byte.
expected_artifacts = (OUT_DIR / "page_dump.jsonl", OUT_DIR / "page_sections.csv")
for artifact in expected_artifacts:
    assert artifact.exists() and artifact.stat().st_size > 0, f"Missing or empty: {artifact}"
print("Saved:", *expected_artifacts)


Saved: /Users/macbook/T3/rag-tekno/data/processed/page_dump.jsonl /Users/macbook/T3/rag-tekno/data/processed/page_sections.csv


## Persist a raw, linearized dump (optional but handy)

- Concatenate pages into a single text with clear separators:

    ```
    === PAGE 1 (HSS) ===
    ...text...
    === PAGE 2 (HSS) ===
    ```

- Save as `data/processed/md/raw_linearized.md` for quick greps later.

In [8]:
# --- 4) Linearized Markdown (handy for quick manual checks/greps) ---
# One "=== PAGE n (CODE) ===" header per page, then the page text with trailing
# whitespace trimmed; pages without a detected competition are labelled UNKNOWN.
linear_md = [
    f"=== PAGE {p['page']} ({comp_by_page[p['page']] or 'UNKNOWN'}) ===\n{p['text'].rstrip()}\n\n"
    for p in pages
]
md_path = OUT_DIR / "md" / "raw_linearized.md"
md_path.write_text("".join(linear_md), encoding="utf-8")

print("Ingest complete →",
      "\n- data/processed/page_dump.jsonl",
      "\n- data/processed/page_sections.csv",
      "\n- data/processed/md/raw_linearized.md")


Ingest complete → 
- data/processed/page_dump.jsonl 
- data/processed/page_sections.csv 
- data/processed/md/raw_linearized.md
