# Goal: Normalize Turkish text, split into atomic Q&A chunks per competition, and emit Markdown, JSONL, and optional CSVs.

## One-time installs (if you haven’t already)

In [None]:
#%pip install regex dateparser python-dateutil rapidfuzz

## Load page data and section map

In [1]:
# --- 0) Imports & load raw artifacts ---
from pathlib import Path
import re, json, math
import pandas as pd
from datetime import datetime
from dateparser.search import search_dates

# All paths are resolved relative to the parent of the current working directory.
# NOTE(review): presumably the notebook is launched from a sub-folder of the
# project root (e.g. notebooks/) — confirm before running from elsewhere.
NOTEBOOK_DIR = Path.cwd()
ROOT = NOTEBOOK_DIR.parent

# Inputs: one JSON object per line for the page dump, plus a CSV page map.
IN_JSONL = ROOT / "data/processed/page_dump.jsonl"
PAGE_MAP_CSV = ROOT / "data/processed/page_sections.csv"
# Outputs: a directory of Markdown files, a JSONL sidecar, and two CSVs.
OUT_MD = ROOT / "data/processed/md"
OUT_JSONL = ROOT / "data/processed/jsonl" / "qa_meta.jsonl"
OUT_DEADLINES = ROOT / "data/processed/csv" / "deadlines.csv"
OUT_SCORING = ROOT / "data/processed/csv" / "scoring.csv"

# Load
# Read the page dump through a context manager so the file handle is closed
# deterministically; blank lines (e.g. a trailing newline at EOF) are skipped
# instead of crashing json.loads.
with open(IN_JSONL, "r", encoding="utf-8") as fh:
    pages = [json.loads(line) for line in fh if line.strip()]
page_map = pd.read_csv(PAGE_MAP_CSV)
# Lookup table built from the CSV's `page` and `competition` columns.
comp_by_page = dict(zip(page_map.page, page_map.competition))


## **Normalize Turkish text**

Build a deterministic, idempotent pipeline (functions you can safely re-run):

- **Unicode normalize** (NFC) and **fix common OCR artifacts**:

    - Remove soft hyphens `\u00AD`, stray `+`, duplicated spaces.

    - Dehyphenate split words at linebreaks: `r"(\w+)-\n(\w+)" → r"\1\2"`.

- **Line hygiene**:

    - Replace multi-newlines with a single blank line.

    - Standardize bullet/number formats (e.g., `1.Soru` → `1. Soru`).

- **Diacritics**: If the source lost Turkish diacritics, you _can’t reliably infer them_ programmatically; keep original if present. Ensure you **don’t strip** valid diacritics (`çğıöşüÇĞİÖŞÜ`).

- **Keep page anchors** so you can back-cite later.

In [2]:
# --- 1) Normalization helpers (idempotent) ---
SOFT_HYPHEN = "\u00ad"


def normalize_text(s: str) -> str:
    """Return a cleaned copy of ``s``; safe to apply repeatedly (idempotent).

    Steps, in order:

    1. Unicode NFC normalization, so letters stored as base + combining mark
       become single code points. No diacritics are stripped.
    2. Remove soft hyphens and turn tabs into spaces.
    3. Unify ``\\r\\n`` / ``\\r`` to ``\\n``. This must happen *before*
       dehyphenation, otherwise words split across CRLF line breaks are
       never rejoined.
    4. Join words hyphenated across a line break (``kelime-\\nler``).
    5. Blank out lines that consist only of a ``+``.
    6. Collapse runs of spaces to one space and 3+ newlines to two.
    7. Ensure ``1.Text`` becomes ``1. Text`` at the start of a line. The
       character after the dot must not be a digit, so line-initial decimals
       and dates such as ``3.5`` or ``17.03.2025`` are left intact.

    Args:
        s: Raw page text.

    Returns:
        The normalized text.
    """
    import unicodedata  # local import keeps this helper self-contained

    s = unicodedata.normalize("NFC", s)
    s = s.replace(SOFT_HYPHEN, "").replace("\t", " ")
    # unify newlines first so the "-\n" dehyphenation also sees CRLF input
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    # dehyphenate across line breaks: "kelime-\nler" -> "kelimeler"
    s = re.sub(r"(\w+)-\n(\w+)", r"\1\2", s)
    # remove lines that are just '+'
    s = re.sub(r"^\s*\+\s*$", "", s, flags=re.MULTILINE)
    # collapse duplicated spaces
    s = re.sub(r" {2,}", " ", s)
    # collapse >2 newlines to 2
    s = re.sub(r"\n{3,}", "\n\n", s)
    # ensure numbered lists have a space after "1." (but never split "1.5")
    s = re.sub(r"(?m)^(\s*\d+)\.([^\s\d])", r"\1. \2", s)
    return s

# Store the cleaned text next to the raw text on every page record.
for record in pages:
    record.update(text_norm=normalize_text(record["text"]))


In [3]:
# --- 2) Build competition-specific corpora with page boundaries ---
from collections import defaultdict

# One running text buffer per competition, plus the character span each page
# occupies inside that buffer (used later to map offsets back to pages).
corpora = defaultdict(lambda: {"text": "", "page_spans": []})
for page_rec in pages:
    page_no = page_rec["page"]
    comp_name = comp_by_page.get(page_no)
    if comp_name is not None:  # pages missing from the page map are skipped
        corpus = corpora[comp_name]
        span_start = len(corpus["text"])
        corpus["text"] = corpus["text"] + page_rec["text_norm"] + "\n\n"
        span_end = len(corpus["text"])
        corpus["page_spans"].append(
            {"page": page_no, "start": span_start, "end": span_end}
        )

list(corpora)

['HSS', 'E-TICARET', 'ADRES']

## **Split into atomic Q&A items**

Use layered heuristics—stop at the first that yields stable splits:

1. **Numbered questions** pattern (most common):

    - Split on lines that start with `^\s*\d+\.\s+` and where the line **ends with `?`** within 200 chars.

    - The block until the next question number is the **answer**.

2. **Fallback** if the doc mixes formats:

    - Consider “Soru:” / “S:” / “?” line endings as question starts (but avoid splitting inside answers by requiring a blank line before).

3. **Guardrails**:

    - Minimum answer length (e.g., ≥ 40 chars) to avoid headers being misread as Qs.

    - Merge tiny fragments back to previous answer if a false positive occurs.

In [7]:
# --- 3) Q&A splitting (one question + its answer as an atomic chunk) ---
# Heuristic: questions are numbered lines ("12. ...?") ending with '?'.
# Group 1 is the question number, group 2 the question text (at most 200 chars).
# Leading and trailing padding is limited to non-newline whitespace
# ([^\S\n]) so that a match starts and ends on the question's own line; with a
# plain \s the match could begin on a preceding blank line and swallow the
# blank lines that follow, which skews every offset derived from the match.
Q_PATTERN = re.compile(r"(?m)^[^\S\n]*(\d+)\.\s+(.{1,200}?\?)[^\S\n]*$")

def page_range_from_offsets(comp: str, start_idx: int, end_idx: int):
    """Return (first_page, last_page) touched by the slice [start_idx, end_idx).

    Looks the slice up in the page spans recorded for ``comp`` in ``corpora``.
    Both spans and the slice are half-open, so a slice that merely touches a
    span's boundary does not count as overlapping it. Returns ``(None, None)``
    when no span overlaps.
    """
    overlapping = []
    for span in corpora[comp]["page_spans"]:
        if end_idx > span["start"] and start_idx < span["end"]:
            overlapping.append(span["page"])
    if overlapping:
        return min(overlapping), max(overlapping)
    return None, None

def split_qa_for_comp(comp: str):
    """Split one competition's corpus into atomic question/answer records.

    Every ``Q_PATTERN`` match starts a question; the text up to the next match
    (or the end of the corpus) is its answer. Candidates whose stripped answer
    is shorter than 40 characters are dropped as likely headers/noise.

    The page range is computed from the *content* only: from the question
    number to the last non-whitespace character of the answer. Using the raw
    match start instead can point into the blank separator appended after the
    previous page, which attributes the first question of a page to the page
    before it.

    Args:
        comp: Competition key in ``corpora``.

    Returns:
        A list of dicts with keys ``qa_id``, ``competition``, ``page_start``,
        ``page_end``, ``question`` and ``answer``.
    """
    text = corpora[comp]["text"]
    matches = list(Q_PATTERN.finditer(text))
    qas = []
    for i, m in enumerate(matches):
        q_num = int(m.group(1))
        question = m.group(2).strip()
        ans_start = m.end()
        ans_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        raw_answer = text[ans_start:ans_end]
        answer = raw_answer.strip()

        # guardrail: too short = likely a header or noise, not a real answer
        if len(answer) < 40:
            continue

        # Content-only span: starts at the question number and stops after the
        # last non-whitespace character of the answer.
        content_start = m.start(1)
        content_end = ans_start + len(raw_answer.rstrip())
        page_start, page_end = page_range_from_offsets(comp, content_start, content_end)
        qas.append({
            "qa_id": f"{comp}-Q{q_num:03d}",
            "competition": comp,
            "page_start": page_start, "page_end": page_end,
            "question": question, "answer": answer,
        })
    return qas

# Flatten the per-competition splits into one list, in corpus order.
all_qas = [item for comp in corpora for item in split_qa_for_comp(comp)]

len(all_qas), all_qas[:3]


(49,
 [{'qa_id': 'HSS-Q001',
   'competition': 'HSS',
   'page_start': 1,
   'page_end': 1,
   'question': "Hava Savunma S+stemler+ Yarışması'nın temel amacı ned+r?",
   'answer': 'Yarışmanın amacı, takımların ver+len senaryolara uygun görevler+ başarıyla yer+ne get+recek \nhava savunma s+stemler+ gel+şt+rmes+ ve üretmes+d+r. Aynı zamanda, hava savunma \ns+stemler+n+n önem+n+n ülke çapında gen+ş b+r tabana yayılarak özgün, yerl+ ve yetenekl+ \ns+stemler+n gel+şt+r+lmes+n+ sağlamak da hedeﬂenmekted+r.'},
  {'qa_id': 'HSS-Q002',
   'competition': 'HSS',
   'page_start': 1,
   'page_end': 1,
   'question': 'Yarışmaya k+mler katılab+l+r?',
   'answer': "Yarışmaya, Türk+ye'de veya yurt dışında öğren+m gören yükseköğret+m (ön l+sans, l+sans ve \nyüksek l+sans) öğrenc+ler+ takım hal+nde başvuru yapab+lmekted+r."},
  {'qa_id': 'HSS-Q003',
   'competition': 'HSS',
   'page_start': 1,
   'page_end': 1,
   'question': 'Takım oluşturma kuralları nelerd+r?',
   'answer': 'Takımların en az 3, en faz

## **Topic labeling** (rule-based for now)

Create a simple keyword rule set; you can swap to ML later if needed.

- **eligibility**: `katılım|kimler|başvuru koşul|uygun|gereklilik|şart`

- **team**: `takım|üye|danışman|roller|katılımcı sayısı`

- **stages**: `aşama|görev|süreç|workflow|sunum`

- **scoring**: `puan|puanlama|BSP|bonus|değerlendir|kriter`

- **penalties**: `ceza|diskalifiye|ihlal|yasak|kural dışı`

- **logistics**: `konaklama|ulaşım|destek|sunum süresi|sponsor`

- **timeline**: `son başvuru|tarih|takvim|deadline|program`


Assign **the first matching topic**; if none match, label `other`. Keep it **overridable** with a tiny YAML later.

In [8]:
# --- 4) Topic labeling (simple rule-based for now) ---
# Ordered rule table: topics are tried top to bottom and the first hit wins,
# so broad keywords in early rules shadow later ones.
TOPIC_RULES = {
    "eligibility": r"(katılım|kimler|başvuru koşul|uygun|gereklilik|şart)",
    "team": r"(takım|üye|danışman|rol|katılımcı sayısı)",
    "stages": r"(aşama|görev|süreç|workflow|sunum)",
    "scoring": r"(puan|puanlama|bsp|bonus|değerlendir|kriter)",
    "penalties": r"(ceza|diskalifiye|ihlal|yasak|kural dışı|kuraldışı)",
    "logistics": r"(konaklama|ulaşım|destek|sunum süresi|sponsor)",
    "timeline": r"(son başvuru|tarih|takvim|deadline|program)",
}
TOPIC_COMPILED = {k: re.compile(v, re.IGNORECASE) for k, v in TOPIC_RULES.items()}


def label_topic(q, a):
    """Return the first topic whose keyword pattern occurs in the question or answer.

    Matching is case-insensitive through ``re.IGNORECASE`` only. The text is
    deliberately *not* passed through ``str.casefold()``: casefolding rewrites
    the Turkish capital ``İ`` as ``i`` plus a combining dot (U+0307), which
    breaks matching for words such as ``DİSKALİFİYE``, whereas the regex
    engine's own case-insensitive comparison treats ``i``, ``I``, ``İ`` and
    ``ı`` as equivalent.

    Args:
        q: Question text.
        a: Answer text.

    Returns:
        A key of ``TOPIC_RULES``, or ``"other"`` when no rule matches.
    """
    text = q + "\n" + a
    for topic, rx in TOPIC_COMPILED.items():
        if rx.search(text):
            return topic
    return "other"

# Tag every Q&A record in place with its rule-based topic.
for record in all_qas:
    record.update(topic=label_topic(record["question"], record["answer"]))


In [20]:
# TOPIC DISTRIBUTION: sanity check that key topics exist
topic_counts = df["topic"].value_counts()
print(topic_counts)
# value_counts() skips missing labels, so the totals only agree when every
# row carries a topic.
labelled_total = topic_counts.sum()
assert labelled_total == len(df), "Topic labeling missing for some QAs."
# Print the first question of each of the five most frequent topics.
for t in topic_counts.index[:5]:
    row = df.loc[df["topic"] == t].iloc[0]
    print(f"\nTopic={t} → {row['qa_id']}: {row['question']}")


topic
team           31
other           7
stages          5
eligibility     4
timeline        1
scoring         1
Name: count, dtype: int64

Topic=team → HSS-Q002: Yarışmaya k+mler katılab+l+r?

Topic=other → HSS-Q006: Ön Tasarım Raporu (ÖTR) +ç+n son tesl+m tar+h+ ne zamandır?

Topic=stages → HSS-Q016: Görev Kab+l+yet Göster+m+ V+deosu +ç+n çözünürlük ve süre gereks+n+mler+ nelerd+r?

Topic=eligibility → HSS-Q001: Hava Savunma S+stemler+ Yarışması'nın temel amacı ned+r?

Topic=timeline → HSS-Q005: Yarışma +ç+n son başvuru tar+h+ ned+r?


## **Metadata extraction** (dates, numbers, formulas, links)

- **Dates**: handle Turkish and ISO; regex + `dateparser`:

    - Match patterns like `\b\d{1,2}[./]\d{1,2}[./]\d{2,4}\b`, or Turkish months:

        - `Ocak|Şubat|Mart|Nisan|Mayıs|Haziran|Temmuz|Ağustos|Eylül|Ekim|Kasım|Aralık`

    - Normalize to `YYYY-MM-DD` (store original + normalized).

- **Numbers & units**:

    - Percentages: `\b\d{1,3}\s?%|\%\d{1,3}\b`

    - Minutes/seconds: `\b\d+\s?(dk|dakika|sn|saniye)\b`

    - Team sizes: `\b(en az|en fazla)\s?\d+\b`

    - Money: `\b\d{1,3}(\.\d{3})*(,\d+)?\s?(TL|₺|KZT|USD|EUR)\b`

- **Formulas**:

    - Look for known tokens like `BSP` and capture math around them:

        - e.g., `BSP\s*[:=]\s*([^\n]+)` → store the formula string exactly.

- **Links**:

    - `https?://\S+` (keep raw; you won’t auto-open in answers unless allowed).


In [9]:
# --- 5) Metadata extraction: dates, numbers, formulas, links ---
# Turkish month names, used as a regex alternation inside DATE_RX.
MONTHS_TR = "Ocak|Şubat|Mart|Nisan|Mayıs|Haziran|Temmuz|Ağustos|Eylül|Ekim|Kasım|Aralık"
# Numeric dates (d.m.y or d/m/y) or "<day> <Turkish month> <4-digit year>".
DATE_RX = re.compile(
    rf"(\b\d{{1,2}}[./]\d{{1,2}}[./]\d{{2,4}}\b|(\b\d{{1,2}}\s+(?:{MONTHS_TR})\s+\d{{4}}\b))",
    re.IGNORECASE
)
# "20%", "20 %" or the "%20" form.
PERCENT_RX = re.compile(r"\b\d{1,3}\s?%|%\s?\d{1,3}\b")
# "5 dk", "5dakika" (minutes only).
MINUTES_RX = re.compile(r"\b\d+\s?(?:dk|dakika)\b", re.IGNORECASE)
# "<n> puan"; group 1 is the number.
POINTS_RX  = re.compile(r"\b(\d+)\s*puan\b", re.IGNORECASE)
# Amounts with '.' thousands separators and an optional ',' decimal part.
# The trailing \b applies only to the alphabetic currency codes: '₺' is not a
# word character, so a \b after it could never match before a space or the end
# of the text and such amounts would be silently missed.
MONEY_RX   = re.compile(r"\b\d{1,3}(?:\.\d{3})*(?:,\d+)?\s?(?:(?:TL|KZT|USD|EUR)\b|₺)")
# Captures from the token "BSP" to the end of its line.
# NOTE(review): this matches any mention of BSP, not only "BSP = ..." formulas.
BSP_RX     = re.compile(r"\bBSP\b.*", re.IGNORECASE)
# Raw http(s) URLs up to the next whitespace.
LINK_RX    = re.compile(r"https?://\S+")

def extract_dates_tr(text: str):
    """Collect date mentions from ``text`` as ``{"raw": ..., "iso": ...}`` dicts.

    ``DATE_RX`` pre-filters candidate substrings; each candidate is then parsed
    by ``search_dates`` with Turkish language and day-month-year order. A
    candidate the parser rejects is kept with ``iso`` set to ``None``. Results
    are de-duplicated on ``(raw, iso)`` while keeping first-seen order.
    """
    unique_hits = {}
    for candidate in DATE_RX.finditer(text):
        raw = candidate.group(0)
        parsed = search_dates(raw, languages=["tr"], settings={"DATE_ORDER": "DMY"})
        # search_dates yields (matched_text, datetime) tuples, or nothing.
        if parsed:
            iso_values = [dt.strftime("%Y-%m-%d") for _, dt in parsed]
        else:
            iso_values = [None]
        for iso in iso_values:
            unique_hits[(raw, iso)] = {"raw": raw, "iso": iso}
    return list(unique_hits.values())

def extract_numbers(text: str):
    """Return every percent/minutes/points/money mention found in ``text``.

    Each hit is a ``{"raw": matched_text, "kind": label}`` dict. Hits are
    grouped by kind in the fixed order percent, minutes, points, money, and
    appear in text order within each kind.
    """
    labelled_patterns = (
        ("percent", PERCENT_RX),
        ("minutes", MINUTES_RX),
        ("points", POINTS_RX),
        ("money", MONEY_RX),
    )
    return [
        {"raw": hit.group(0), "kind": label}
        for label, pattern in labelled_patterns
        for hit in pattern.finditer(text)
    ]

def extract_formulas(text: str):
    """Return one ``{"name": "BSP", "expr": ...}`` dict per ``BSP_RX`` match.

    ``expr`` is the matched text with surrounding whitespace stripped.
    """
    formulas = []
    for hit in BSP_RX.finditer(text):
        formulas.append({"name": "BSP", "expr": hit.group(0).strip()})
    return formulas

def extract_links(text: str):
    """Return all raw ``LINK_RX`` matches in ``text``, in order of appearance."""
    return [hit.group(0) for hit in LINK_RX.finditer(text)]

# Compiled once; captures the whole number after "aşama" so that a stage such
# as "Aşama 12" is not truncated to its first digit.
_STAGE_RX = re.compile(r"aşama\s*(\d+)", re.IGNORECASE)


def detect_stage(text: str):
    """Return the number following the first "aşama" in ``text``.

    Matching is case-insensitive and allows optional whitespace between the
    word and the number.

    Args:
        text: Text to scan.

    Returns:
        The stage number as an ``int``, or ``None`` when no "aşama <number>"
        occurs.
    """
    m = _STAGE_RX.search(text)
    return int(m.group(1)) if m else None

# Field name -> extractor, applied in this order to question + answer text.
_METADATA_EXTRACTORS = {
    "stage": detect_stage,
    "dates": extract_dates_tr,
    "numbers": extract_numbers,
    "formulas": extract_formulas,
    "links": extract_links,
}

for qa in all_qas:
    full = "\n".join((qa["question"], qa["answer"]))
    for field_name, extractor in _METADATA_EXTRACTORS.items():
        qa[field_name] = extractor(full)


In [21]:
# METADATA COVERAGE: how many QAs have dates/numbers/formulas/links
# A row counts when its metadata value is truthy (e.g. a non-empty list).
has_dates, has_numbers, has_formulas, has_links = (
    sum(1 for value in df[column] if value)
    for column in ("dates", "numbers", "formulas", "links")
)
print(f"Dates:{has_dates}  Numbers:{has_numbers}  Formulas:{has_formulas}  Links:{has_links}")

# Spot-check BSP presence if expected
mentions_bsp = df["formulas"].astype(str).str.contains("BSP", case=False, na=False)
bsp_rows = df[mentions_bsp]
print("BSP rows:", len(bsp_rows))
if len(bsp_rows) > 0:
    first_formulas = bsp_rows.iloc[0]["formulas"]
    print("Example BSP expr:", first_formulas[0])

Dates:4  Numbers:11  Formulas:1  Links:1
BSP rows: 1
Example BSP expr: {'name': 'BSP', 'expr': 'BSP)'}


## **Output artifacts**

1. **Markdown corpus** (for human review + optional BM25 later):

    - One file per competition, e.g., `data/processed/md/HSS.md`

    - Format:

        ```
        # Hava Savunma Sistemleri

        ## Q23. <question text>
        <answer paragraph(s)>

        [page: 7–8] [qa_id: HSS-Q023]
        ---
        ```

2. **JSONL sidecar** with one object per Q&A:

    - `data/processed/jsonl/qa_meta.jsonl`

    - Fields: `qa_id, competition, topic, stage, page_start, page_end, question, answer, dates, numbers, formulas, links`

3. **CSVs (optional, for exact lookups)**

    - `deadlines.csv` columns: `competition, qa_id, label, date_raw, date_iso, page_start`

    - `scoring.csv` columns: `competition, qa_id, points_mentions, formulas, page_start`

In [10]:
# --- 6) Write Markdown per competition ---
OUT_MD.mkdir(parents=True, exist_ok=True)
# Group the Q&A records by competition; one Markdown file is written per group.
grouped = {}
for qa in all_qas:
    grouped.setdefault(qa["competition"], []).append(qa)

for comp, items in grouped.items():
    # qa_id carries a zero-padded number, so a plain string sort is numeric.
    items = sorted(items, key=lambda x: x["qa_id"])
    lines = [f"# {comp}\n"]
    for it in items:
        lines.append(f"## {it['qa_id']}: {it['question']}")
        lines.append(it["answer"].rstrip())
        # Blank line so the footer is its own paragraph, not part of the answer.
        lines.append("")
        lines.append(f"[page: {it['page_start']}–{it['page_end']}] [topic: {it['topic']}]")
        # Blank line before the rule: a '---' directly under a text line is a
        # setext heading underline and would turn that text into an <h2>.
        lines.append("")
        lines.append("---\n")
    (OUT_MD / f"{comp}.md").write_text("\n".join(lines), encoding="utf-8")

In [11]:
# --- 7) Write JSONL sidecar with rich metadata ---
# Fields written to the sidecar, in output order.
JSONL_FIELDS = (
    "qa_id", "competition", "topic", "stage",
    "page_start", "page_end",
    "question", "answer",
    "dates", "numbers", "formulas", "links",
)

# Create the target directory first; opening the file for writing fails with
# FileNotFoundError when the directory does not exist yet.
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for qa in all_qas:
        rec = {field: qa[field] for field in JSONL_FIELDS}
        # ensure_ascii=False keeps Turkish characters readable in the file.
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")


In [12]:
# --- 7) Write JSONL sidecar with rich metadata ---
# This cell writes the same sidecar file again; the file is opened in "w"
# mode, so re-running simply overwrites it with the current records.
JSONL_FIELDS = (
    "qa_id", "competition", "topic", "stage",
    "page_start", "page_end",
    "question", "answer",
    "dates", "numbers", "formulas", "links",
)

# Create the target directory first; opening the file for writing fails with
# FileNotFoundError when the directory does not exist yet.
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for qa in all_qas:
        rec = {field: qa[field] for field in JSONL_FIELDS}
        # ensure_ascii=False keeps Turkish characters readable in the file.
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")


In [13]:
# --- 8) Optional CSVs: deadlines & scoring (simple first cut) ---
# Deadlines: one row per extracted date of every QA that has dates. Rows from
# QAs labelled "timeline" are marked as such; all others are "general".
DEADLINE_COLUMNS = ["competition", "qa_id", "label", "date_raw", "date_iso", "page_start"]
deadline_rows = []
for qa in all_qas:
    if not qa["dates"]:
        continue
    label = "timeline" if qa["topic"] == "timeline" else "general"
    for d in qa["dates"]:
        deadline_rows.append({
            "competition": qa["competition"],
            "qa_id": qa["qa_id"],
            "label": label,
            "date_raw": d["raw"],
            "date_iso": d["iso"],
            "page_start": qa["page_start"],
        })
# The csv/ directory is not created anywhere else, so create it here.
OUT_DEADLINES.parent.mkdir(parents=True, exist_ok=True)
# Explicit columns keep the header row even when no dates were found; a
# header-less empty file cannot be read back with pd.read_csv.
pd.DataFrame(deadline_rows, columns=DEADLINE_COLUMNS).to_csv(OUT_DEADLINES, index=False)

# Scoring: one row per QA that is labelled scoring/penalties or carries a
# formula; collects its point and percent mentions plus any BSP expressions.
SCORING_COLUMNS = ["competition", "qa_id", "points_mentions", "formulas", "page_start"]
scoring_rows = []
for qa in all_qas:
    if qa["topic"] not in ("scoring", "penalties") and not qa["formulas"]:
        continue
    pts = [n["raw"] for n in qa["numbers"] if n["kind"] in ("points", "percent")]
    scoring_rows.append({
        "competition": qa["competition"],
        "qa_id": qa["qa_id"],
        "points_mentions": "; ".join(pts),
        "formulas": "; ".join(f["expr"] for f in qa["formulas"]),
        "page_start": qa["page_start"],
    })
# The csv/ directory is not created anywhere else, so create it here.
OUT_SCORING.parent.mkdir(parents=True, exist_ok=True)
# Explicit columns keep the header row even when no scoring rows were found; a
# header-less empty file cannot be read back with pd.read_csv.
pd.DataFrame(scoring_rows, columns=SCORING_COLUMNS).to_csv(OUT_SCORING, index=False)

# Final summary of the artifacts written above (pieces are space-separated).
summary_parts = [
    "Split & structure complete →",
    "\n- data/processed/md/{HSS,E-TICARET,ADRES}.md",
    "\n- data/processed/jsonl/qa_meta.jsonl",
    "\n- data/processed/csv/deadlines.csv (optional)",
    "\n- data/processed/csv/scoring.csv (optional)",
]
print(*summary_parts)


Split & structure complete → 
- data/processed/md/{HSS,E-TICARET,ADRES}.md 
- data/processed/jsonl/qa_meta.jsonl 
- data/processed/csv/deadlines.csv (optional) 
- data/processed/csv/scoring.csv (optional)


In [23]:
# CSV SHAPES: optional but useful quick checks
# Read each CSV back (when present) and show its shape plus the first two rows.
if OUT_DEADLINES.exists():
    dfd = pd.read_csv(OUT_DEADLINES)
    deadline_examples = dfd.head(2).to_dict("records")
    print("deadlines.csv →", dfd.shape, "| examples:", deadline_examples)
if OUT_SCORING.exists():
    dfs = pd.read_csv(OUT_SCORING)
    scoring_examples = dfs.head(2).to_dict("records")
    print("scoring.csv →", dfs.shape, "| examples:", scoring_examples)


deadlines.csv → (4, 6) | examples: [{'competition': 'HSS', 'qa_id': 'HSS-Q005', 'label': 'timeline', 'date_raw': '1 Mart 2025', 'date_iso': '2025-03-01', 'page_start': 1}, {'competition': 'HSS', 'qa_id': 'HSS-Q006', 'label': 'general', 'date_raw': '17 Mart 2025', 'date_iso': '2025-03-17', 'page_start': 1}]
scoring.csv → (2, 5) | examples: [{'competition': 'HSS', 'qa_id': 'HSS-Q022', 'points_mentions': '40 puan; 0 puan', 'formulas': nan, 'page_start': 4}, {'competition': 'HSS', 'qa_id': 'HSS-Q050', 'points_mentions': '60 puan; 100 puan; 140 puan; 0 puan; 5 puan; 20 puan; 10 Puan; 10 Puan; 10 \npuan', 'formulas': 'BSP); BSP), Hava Savunma Sistemleri yarışmasının her üç görev aşamasında', 'page_start': 8}]


## **Quality gates** (small, but critical)

- **No-empty**: Assert question and answer are non-empty for all items.

- **Length bounds**: Flag any answer > 2,000 chars (may indicate merge error).

- **Competition consistency**: All Q&As must inherit a competition. If any `None`, inspect page anchors.

- **Duplicate detection**: Use `rapidfuzz` to flag 95%+ near-duplicates; collapse if needed.

In [14]:
# --- 9) Minimal quality gates (fail early if something is off) ---
import sys

df = pd.DataFrame(all_qas)
# An empty frame has no columns, so fail with a clear message instead of a
# KeyError further down.
if df.empty:
    raise RuntimeError("No Q&A items were produced; check the splitting step")

# No-empty gate: reject missing values *and* blank strings. Raised explicitly
# (rather than via `assert`) so the check also runs under `python -O`.
for column in ("question", "answer"):
    blank = df[column].isna() | df[column].astype(str).str.strip().eq("")
    if blank.any():
        raise AssertionError("Empty Q/A found")

# Length gate: very long answers may indicate that several items were merged.
too_long = df["answer"].str.len() > 2000
if too_long.any():
    print("WARNING: Very long answers detected:\n", df[too_long][["qa_id","competition"]].head())

if df["competition"].isna().any():
    raise RuntimeError("Some QAs have no competition assigned")

# Duplicate gate: flags questions whose text is exactly identical.
dups = df["question"].duplicated(keep=False)
if dups.any():
    print("NOTE: Exact duplicate questions detected; consider post-processing:\n", df[dups][["qa_id","question"]].head())
print("Quality checks passed.")


        qa_id competition
48  HSS-Q050         HSS
Quality checks passed.
