# Notebook 01 — Ingestion & Structure
Goal: extract PDF → normalize Turkish text → map pages to competitions → prepare linearized MD.

In [26]:
# --- 0) Setup & paths ---
from pathlib import Path

# Project layout: the notebook is expected to be launched from <root>/notebooks/,
# so the project root is taken to be the parent of the working directory.
# NOTE(review): launching from any other directory shifts ROOT (and every path
# derived from it) one level up from wherever the kernel started.
NOTEBOOK_DIR = Path.cwd()
ROOT = NOTEBOOK_DIR.parent

# Standard data layout under the project root
RAW_DIR = ROOT / "data" / "raw"
PROCESSED_DIR = ROOT / "data" / "processed"
MD_DIR = PROCESSED_DIR / "md"
JSONL_DIR = PROCESSED_DIR / "jsonl"
CSV_DIR = PROCESSED_DIR / "csv"

# Create every input/output directory up front (no-op when already present)
for directory in [RAW_DIR, PROCESSED_DIR, MD_DIR, JSONL_DIR, CSV_DIR]:
    directory.mkdir(parents=True, exist_ok=True)

# Where the source PDF may live, in priority order
PDF_CANDIDATES = [
    RAW_DIR / "rag-example-qa.pdf",
    ROOT / "rag-example-qa.pdf",  # fallback: file kept at the project root
]

# Take the first candidate that exists; fail loudly when none does
pdf_path = None
for candidate in PDF_CANDIDATES:
    if candidate.exists():
        pdf_path = candidate
        break
if pdf_path is None:
    raise FileNotFoundError(
        f"PDF not found. Place your file at: {RAW_DIR / 'rag-example-qa.pdf'} and re-run this cell."
    )

# Artifacts written by the later cells of this notebook
OUT_PAGE_JSONL = PROCESSED_DIR / "page_dump.jsonl"
OUT_PAGE_SECTIONS = PROCESSED_DIR / "page_sections.csv"
OUT_RAW_LINEAR_MD = MD_DIR / "raw_linearized.md"

# Echo the resolved locations so a wrong working directory is obvious at a glance
planned_outputs = (OUT_PAGE_JSONL, OUT_PAGE_SECTIONS, OUT_RAW_LINEAR_MD)
print("Project ROOT     :", ROOT)
print("Using PDF        :", pdf_path)
print("Outputs will be  :", *planned_outputs, sep="\n  - ")

Project ROOT     : /Users/macbook/T3/rag-tekno
Using PDF        : /Users/macbook/T3/rag-tekno/data/raw/rag-example-qa.pdf
Outputs will be  :
  - /Users/macbook/T3/rag-tekno/data/processed/page_dump.jsonl
  - /Users/macbook/T3/rag-tekno/data/processed/page_sections.csv
  - /Users/macbook/T3/rag-tekno/data/processed/md/raw_linearized.md


## Extract text page-by-page (writes page_dump.jsonl)

In [28]:
# --- 1) Extract text page-by-page ---
import fitz  # PyMuPDF
import json, statistics as stats

# Pull the plain-text layer of every page; page numbers are 1-based
with fitz.open(pdf_path) as doc:
    pages = [
        {"page": page_no, "text": pdf_page.get_text("text")}
        for page_no, pdf_page in enumerate(doc, start=1)
    ]

# Persist the raw dump as JSON Lines: one page object per line.
# ensure_ascii=False keeps non-ASCII characters readable in the file.
with open(OUT_PAGE_JSONL, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(page_rec, ensure_ascii=False) + "\n" for page_rec in pages)

# Quick checks: page count, size distribution, and a peek at the first page
lengths = [len(page_rec["text"]) for page_rec in pages]
avg_chars = int(stats.mean(lengths))
print(f"Pages extracted: {len(pages)}")
print(f"Avg chars/page: {avg_chars} | min: {min(lengths)} | max: {max(lengths)}")
print("Preview p1:\n", pages[0]["text"][:400])

Pages extracted: 28
Avg chars/page: 2324 | min: 1475 | max: 3035
Preview p1:
 Hava Savunma S)stemler) Yarışması 
1. Hava Savunma S+stemler+ Yarışması'nın temel amacı ned+r? 
Yarışmanın amacı, takımların ver+len senaryolara uygun görevler+ başarıyla yer+ne get+recek 
hava savunma s+stemler+ gel+şt+rmes+ ve üretmes+d+r. Aynı zamanda, hava savunma 
s+stemler+n+n önem+n+n ülke çapında gen+ş b+r tabana yayılarak özgün, yerl+ ve yetenekl+ 
s+stemler+n gel+şt+r+lmes+n+ sağlamak da


## Normalize Turkish text (dehyphenation, newline cleanup) + quick checks

Purpose: create a clean text_norm per page; verify dehyphenation improved things.

In [29]:
# --- 2) Normalize Turkish text (idempotent) ---
import re

SOFT_HYPHEN = "\u00ad"

def normalize_text(s: str) -> str:
    """Clean up raw per-page PDF text without changing its wording.

    Steps, in order:
      1. unify CRLF / CR line endings to LF, so every later newline rule
         sees the same line terminator;
      2. drop soft hyphens and turn tabs into spaces;
      3. re-join words that were hyphenated across a line break;
      4. collapse runs of three or more newlines into one blank line;
      5. insert the missing space after a list marker glued to its text
         ("1.Soru" becomes "1. Soru").

    The function is idempotent: normalizing already-normalized text returns
    it unchanged.

    Args:
        s: Raw text of one page.

    Returns:
        The normalized text.
    """
    # NOTE(review): the page-1 preview captured in this notebook shows "i"
    # extracted as "+" / ")" (e.g. "S+stemler+"). That looks like a font-mapping
    # problem in the PDF; it is deliberately NOT patched here because no safe
    # character mapping is known - confirm and handle at extraction time.

    # Line endings first: the dehyphenation and blank-line rules below only
    # look for "\n", so a CRLF file would otherwise slip past them.
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    # remove soft hyphens / tabs
    s = s.replace(SOFT_HYPHEN, "").replace("\t", " ")
    # dehyphenate across line breaks: "kelime-\nler" -> "kelimeler".
    # Lookarounds leave the neighbouring letters unconsumed, so chained breaks
    # ("a-\nb-\nc") are all joined in a single pass.
    s = re.sub(r"(?<=\w)-\n(?=\w)", "", s)
    # collapse 3+ newlines to 2
    s = re.sub(r"\n{3,}", "\n\n", s)
    # ensure numbered lists have a space after "1." etc. A digit right after
    # the dot means a decimal or a date ("1.5", "15.08.2025"), not a list
    # marker, so those are left untouched.
    s = re.sub(r"(?m)^(\s*\d+)\.(?=[^\s\d])", r"\1. ", s)
    return s

# Apply normalization to every page and measure its effect on hyphen breaks
def _hyphen_breaks(field: str) -> int:
    """Count words split by a hyphen at a line end, summed over all pages."""
    return sum(len(re.findall(r"(\w+)-\n(\w+)", page_rec[field])) for page_rec in pages)

hyphen_breaks_before = _hyphen_breaks("text")
# Adds a "text_norm" key next to the raw "text"; the raw text is left as-is
for page_rec in pages:
    page_rec["text_norm"] = normalize_text(page_rec["text"])
hyphen_breaks_after = _hyphen_breaks("text_norm")

print(f"Dehyphenation: before={hyphen_breaks_before} → after={hyphen_breaks_after}")
print("Normalized preview p1:\n", pages[0]["text_norm"][:400])

Dehyphenation: before=3 → after=0
Normalized preview p1:
 Hava Savunma S)stemler) Yarışması 
1. Hava Savunma S+stemler+ Yarışması'nın temel amacı ned+r? 
Yarışmanın amacı, takımların ver+len senaryolara uygun görevler+ başarıyla yer+ne get+recek 
hava savunma s+stemler+ gel+şt+rmes+ ve üretmes+d+r. Aynı zamanda, hava savunma 
s+stemler+n+n önem+n+n ülke çapında gen+ş b+r tabana yayılarak özgün, yerl+ ve yetenekl+ 
s+stemler+n gel+şt+r+lmes+n+ sağlamak da


## Detect competition per page (robust anchors + fill)
Purpose: tag each page as HSS, E-TICARET, or ADRES via tolerant matching (handles hyphen/spacing/diacritics), then back/forward-fill gaps. Prints counts and boundary pages.

In [30]:
# --- 3) Detect competition per page (robust anchors) ---
import re, unicodedata, pandas as pd

def norm_for_match(s: str) -> str:
    """Fold text into a lowercase, accent-free, single-spaced form for matching.

    The result is meant to be searched with plain ASCII patterns: diacritics
    are removed, letters are lowercased, hyphen-like and separator characters
    become spaces, and whitespace runs (including newlines) collapse to one
    space.

    Args:
        s: Text to fold (e.g. the normalized text of one page).

    Returns:
        The folded text, stripped of leading/trailing whitespace.
    """
    # NFKD splits accented letters into base letter + combining mark;
    # dropping the combining marks leaves just the base letter.
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.lower()
    # Turkish dotless i (U+0131) has no decomposition, so NFKD leaves it
    # untouched; fold it by hand so it matches ASCII "i" in the patterns.
    s = s.replace("\u0131", "i")
    # Collapse separators to spaces: ASCII "-", "_", ".", "/" plus the Unicode
    # hyphen/dash family (U+2010..U+2015) and the minus sign (U+2212), which
    # PDF text often uses in place of a plain hyphen.
    s = re.sub(r"[-_./\u2010-\u2015\u2212]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Anchor phrases per competition code. Patterns are written against folded
# text (lowercase, no diacritics, separators turned into spaces), hence the
# ASCII-only spellings such as "cozumleme" and "e ticaret".
# Insertion order is HSS, E-TICARET, ADRES.
ANCHORS = {}
ANCHORS["HSS"] = [r"\bhava savunma sistemleri\b", r"\bhss\b"]
ANCHORS["E-TICARET"] = [r"\be ticaret hackathonu\b", r"\be ticaret\b", r"\beticaret\b"]
ANCHORS["ADRES"] = [
    r"\byapay zeka destekli adres cozumleme\b",
    r"\byapay zeka destekli adres\b",
    r"\badres cozumleme\b",
]

# Same table with every pattern pre-compiled, keyed and ordered identically
ANCHORS_RE = dict(
    (comp_code, list(map(re.compile, phrase_patterns)))
    for comp_code, phrase_patterns in ANCHORS.items()
)

def detect_competitions(pages, anchors=None, normalize=None):
    """Assign a competition code to every page.

    A page that matches an anchor pattern gets that code; a page with no
    match inherits the code of the closest labelled page before it. Pages
    that come before the first labelled page inherit the code of the first
    labelled page after them.

    Args:
        pages: Iterable of dicts with a "page" key (page number) and a
            "text_norm" key (normalized page text).
        anchors: Mapping of competition code -> list of compiled patterns
            (objects with a ``search`` method). Defaults to the module-level
            ``ANCHORS_RE``. When several codes match the same page, the first
            code in mapping order wins.
        normalize: Callable applied to each page's text before matching.
            Defaults to the module-level ``norm_for_match``.

    Returns:
        Dict mapping page number -> competition code, in the order the pages
        were given. Values are ``None`` only when no page matched any anchor.
    """
    # Resolve defaults lazily so the function picks up the tables defined in
    # this notebook unless the caller supplies its own.
    if anchors is None:
        anchors = ANCHORS_RE
    if normalize is None:
        normalize = norm_for_match

    # Pass 1: match anchors and carry the last seen label forward, so after
    # this loop only pages before the first hit can still be None.
    comp_by_page = {}
    last_seen = None
    for p in pages:
        t_norm = normalize(p["text_norm"])
        found = next(
            (
                code
                for code, patterns in anchors.items()
                if any(rx.search(t_norm) for rx in patterns)
            ),
            None,
        )
        last_seen = found or last_seen
        comp_by_page[p["page"]] = last_seen  # may be None before the first hit

    # Pass 2: back-fill those leading pages from the next labelled page.
    # (No further forward-fill is needed: pass 1 already did it.)
    next_seen = None
    for k in sorted(comp_by_page, reverse=True):
        if comp_by_page[k] is None:
            comp_by_page[k] = next_seen
        else:
            next_seen = comp_by_page[k]
    return comp_by_page

comp_by_page = detect_competitions(pages)

# Tabulate the mapping, one row per page, in page order
page_numbers = [p["page"] for p in pages]
df_comp = pd.DataFrame({
    "page": page_numbers,
    "competition": [comp_by_page[n] for n in page_numbers],
})

# Label distribution; dropna=False keeps unlabeled pages visible in the counts
comp_counts = df_comp["competition"].value_counts(dropna=False)
print("by competition:\n", comp_counts)

# A row marks a boundary when its label differs from the row above it
# (the first row always qualifies because its shifted neighbour is missing)
is_boundary = df_comp["competition"] != df_comp["competition"].shift(1)
transitions = df_comp[is_boundary]
print("\nDetected competition boundaries (first few):\n", transitions.head(10))

by competition:
 competition
HSS          14
ADRES        10
E-TICARET     4
Name: count, dtype: int64

Detected competition boundaries (first few):
     page competition
0      1         HSS
14    15   E-TICARET
18    19       ADRES


## Save page→competition map (CSV)
Purpose: persist the routing we just detected for downstream notebooks.

In [32]:
# --- 4) Persist page→competition map ---
import pandas as pd

# Rebuild the page -> competition table from the in-memory mapping so this
# cell does not depend on the frame built for display earlier
page_numbers = [p["page"] for p in pages]
competition_labels = [comp_by_page[n] for n in page_numbers]
df_comp = pd.DataFrame({"page": page_numbers, "competition": competition_labels})

# Write without the index so the CSV has exactly the two columns above
df_comp.to_csv(OUT_PAGE_SECTIONS, index=False, encoding="utf-8")
print("Wrote:", OUT_PAGE_SECTIONS, "| rows:", len(df_comp))
print(df_comp.head(28))

Wrote: /Users/macbook/T3/rag-tekno/data/processed/page_sections.csv | rows: 28
    page competition
0      1         HSS
1      2         HSS
2      3         HSS
3      4         HSS
4      5         HSS
5      6         HSS
6      7         HSS
7      8         HSS
8      9         HSS
9     10         HSS
10    11         HSS
11    12         HSS
12    13         HSS
13    14         HSS
14    15   E-TICARET
15    16   E-TICARET
16    17   E-TICARET
17    18   E-TICARET
18    19       ADRES
19    20       ADRES
20    21       ADRES
21    22       ADRES
22    23       ADRES
23    24       ADRES
24    25       ADRES
25    26       ADRES
26    27       ADRES
27    28       ADRES


## Linearized Markdown (one block per page)
Purpose: quick, human-readable file for spot checks and future regex work.

In [33]:
# --- 5) Write linearized Markdown (per page with headers) ---
# Each page becomes: header line, blank line, page text, horizontal rule.
page_blocks = []
for p in pages:
    comp = comp_by_page[p["page"]] or "UNKNOWN"  # unlabeled pages are marked explicitly
    page_blocks.append("\n".join((
        f"# PAGE {p['page']} | {comp}",
        "",                        # blank line after the header
        p["text_norm"].rstrip(),   # trailing whitespace would pile up before the rule
        "\n---\n",                 # page separator
    )))

md_text = "\n".join(page_blocks)
MD_DIR.mkdir(parents=True, exist_ok=True)
OUT_RAW_LINEAR_MD.write_text(md_text, encoding="utf-8")

# Quick checks: size, presence of a few expected markers, first lines
expected_markers = ("PAGE 1", "E-TICARET", "ADRES")
print("Wrote:", OUT_RAW_LINEAR_MD, "| chars:", len(md_text))
print("Anchors present:", *(marker in md_text for marker in expected_markers))
print("\nPreview:\n", md_text.splitlines()[:12])

Wrote: /Users/macbook/T3/rag-tekno/data/processed/md/raw_linearized.md | chars: 65709
Anchors present: True True True

Preview:
 ['# PAGE 1 | HSS', '', 'Hava Savunma S)stemler) Yarışması ', "1. Hava Savunma S+stemler+ Yarışması'nın temel amacı ned+r? ", 'Yarışmanın amacı, takımların ver+len senaryolara uygun görevler+ başarıyla yer+ne get+recek ', 'hava savunma s+stemler+ gel+şt+rmes+ ve üretmes+d+r. Aynı zamanda, hava savunma ', 's+stemler+n+n önem+n+n ülke çapında gen+ş b+r tabana yayılarak özgün, yerl+ ve yetenekl+ ', 's+stemler+n gel+şt+r+lmes+n+ sağlamak da hedeﬂenmekted+r. ', ' ', '2. Yarışmaya k+mler katılab+l+r? ', "Yarışmaya, Türk+ye'de veya yurt dışında öğren+m gören yükseköğret+m (ön l+sans, l+sans ve ", 'yüksek l+sans) öğrenc+ler+ takım hal+nde başvuru yapab+lmekted+r. ']


In [34]:
# --- 6) Final sanity checks ---
import json, pandas as pd

# Both artifacts must hold exactly one record per extracted page. The
# expected count comes from the in-memory page list, so the check follows
# the PDF actually processed instead of a fixed number.
expected_pages = len(pages)
assert expected_pages > 0, "no pages were extracted from the PDF"

# Count JSONL records; the context manager closes the file handle promptly
with open(OUT_PAGE_JSONL, "r", encoding="utf-8") as fh:
    line_count = sum(1 for _ in fh)
print("page_dump.jsonl lines:", line_count)

# Re-read the page -> competition map from disk to validate what was written
df_map = pd.read_csv(OUT_PAGE_SECTIONS)
print("page_sections rows:", len(df_map))
print(df_map["competition"].value_counts(dropna=False))

assert line_count == expected_pages, (
    f"page_dump.jsonl has {line_count} lines, expected {expected_pages}"
)
assert len(df_map) == expected_pages, (
    f"page_sections.csv has {len(df_map)} rows, expected {expected_pages}"
)
# Every page must carry a competition label after the fill passes
unlabeled = int(df_map["competition"].isna().sum())
assert unlabeled == 0, f"{unlabeled} page(s) have no competition label"
print("Notebook 01 ✔️ complete")

page_dump.jsonl lines: 28
page_sections rows: 28
competition
HSS          14
ADRES        10
E-TICARET     4
Name: count, dtype: int64
Notebook 01 ✔️ complete
