# Notebook 02 — Split, Label, Metadata, Sidecars
Goal: build corpora by competition → split into atomic Q&A → label topics/stage → extract metadata → write MD/JSONL/CSVs.

## Load inputs & quick sanity

Purpose: load the artifacts from Notebook 01 (page_dump.jsonl, page_sections.csv) and join them.

In [1]:
# --- 0) Setup, inputs, and sanity ---
from pathlib import Path
import json, pandas as pd

# Resolve project root (same layout as Notebook 01)
NOTEBOOK_DIR = Path.cwd()
ROOT = NOTEBOOK_DIR.parent
RAW_DIR        = ROOT / "data" / "raw"
PROCESSED_DIR  = ROOT / "data" / "processed"
MD_DIR         = PROCESSED_DIR / "md"
JSONL_DIR      = PROCESSED_DIR / "jsonl"
CSV_DIR        = PROCESSED_DIR / "csv"

IN_PAGE_JSONL    = PROCESSED_DIR / "page_dump.jsonl"
IN_PAGE_SECTIONS = PROCESSED_DIR / "page_sections.csv"

assert IN_PAGE_JSONL.exists(), f"Missing {IN_PAGE_JSONL}"
assert IN_PAGE_SECTIONS.exists(), f"Missing {IN_PAGE_SECTIONS}"

# Load pages
pages = [json.loads(l) for l in open(IN_PAGE_JSONL, "r", encoding="utf-8")]
df_sections = pd.read_csv(IN_PAGE_SECTIONS)

# Attach competition to each page dict
comp_map = dict(zip(df_sections["page"], df_sections["competition"]))
for p in pages:
    p["competition"] = comp_map.get(p["page"])

print("Pages:", len(pages))
print("By competition:\n", df_sections["competition"].value_counts())
print("Preview p1 (raw):\n", pages[0]["text"][:200])

Pages: 28
By competition:
 competition
HSS          14
ADRES        10
E-TICARET     4
Name: count, dtype: int64
Preview p1 (raw):
 Hava Savunma S)stemler) Yarışması 
1. Hava Savunma S+stemler+ Yarışması'nın temel amacı ned+r? 
Yarışmanın amacı, takımların ver+len senaryolara uygun görevler+ başarıyla yer+ne get+recek 
hava savunm


## Normalize again & build corpora with page spans

Purpose: re-use a light normalizer, then create a single text per competition plus page offset spans (so later we can map Q&A chunks to page ranges).

In [8]:
# --- 1) Normalize text & build corpora (text + page_spans) ---
import re

SOFT_HYPHEN = "\u00ad"
def normalize_text(s: str) -> str:
    s = s.replace(SOFT_HYPHEN, "").replace("\t", " ")
    s = re.sub(r"(\w+)-\n(\w+)", r"\1\2", s)
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"\n{3,}", "\n\n", s)
    s = re.sub(r"(?m)^(\s*\d+)\.(\S)", r"\1. \2", s)
    return s

for p in pages:
    p["text_norm"] = normalize_text(p["text"])

# Build corpora ordered by page
corpora = {}
for p in pages:
    comp = p["competition"]
    if comp is None:
        continue
    if comp not in corpora:
        corpora[comp] = {"text": "", "page_spans": []}
    start = len(corpora[comp]["text"])
    corpora[comp]["text"] += p["text_norm"] + "\n\n"
    end = len(corpora[comp]["text"])
    corpora[comp]["page_spans"].append({"page": p["page"], "start": start, "end": end})

# Quick stats
for comp, bundle in corpora.items():
    print(f"{comp}: chars={len(bundle['text'])} | pages={len(bundle['page_spans'])}")
    # tiny preview
    print("  preview:", bundle["text"][:120].replace("\n"," ") + "…")

HSS: chars=30030 | pages=14
  preview: Hava Savunma S)stemler) Yarışması  1. Hava Savunma S+stemler+ Yarışması'nın temel amacı ned+r?  Yarışmanın amacı, takıml…
E-TICARET: chars=10875 | pages=4
  preview: teslimlerine, itiraz süreçlerinden üye ekleme/çıkarma işlemlerine ve resmi duyurulara kadar  tüm organizasyonel faaliyet…
ADRES: chars=24217 | pages=10
  preview: sıralama yapmaları beklenir. Ayrıca, bu aşama "Önyüz Tasarlama" görevini de içerdiği için26,  takımların   React, Vue gi…


## Q&A patterns + splitters (with robust fallbacks)
Purpose: split into one question + its answer per item. Priorities:
	1.	Numbered questions (HSS/ADRES),
	2.	“Soru:” fallback,
	3.	Generic lines ending with “?” (covers E-Ticaret prose).

We also map each QA back to page_start–page_end

In [9]:
# --- 2) Q&A splitting (one question + its answer) ---
import re

# 1) Numbered questions (the "?" is optional in this doc)
Q_PATTERN = re.compile(r"(?m)^\s*(\d+)\.\s+(.{1,200}?)(\?)?\s*$")
# 2) "Soru:" fallback
Q_FALLBACK = re.compile(r"(?m)^\s*Soru[:：]\s+(.{1,200}?)(\?)?\s*$", re.IGNORECASE)
# 3) Generic question line at **paragraph start** (covers prose, avoids in-answer rhetoricals)
Q_GENERIC = re.compile(r"(?m)(?:^|\n\n)(?P<q>.{4,200}\?)\s*$")

# Apply generic-question fallback only to prose-style sections
GENERIC_ALLOWED = {"E-TICARET", "ADRES"}

MIN_ANS_CHARS = 20  # skip headers/noise

def _norm_qtext(s: str) -> str:
    s = re.sub(r"\s+", " ", s).strip().lower()
    s = s[:-1] if s.endswith("?") else s
    return s

def page_range_from_offsets(comp: str, start_idx: int, end_idx: int):
    spans = corpora[comp]["page_spans"]
    pages_hit = [s["page"] for s in spans if not (end_idx <= s["start"] or start_idx >= s["end"])]
    if not pages_hit:
        return None, None
    return min(pages_hit), max(pages_hit)

def split_by_matches(text: str, comp: str, matches, with_numbers=True, tag="Q"):
    qas, matches = [], list(matches)
    for i, m in enumerate(matches):
        if with_numbers:
            q_num = int(m.group(1))
            question = m.group(2).strip()
            qa_id = f"{comp}-{tag}{q_num:03d}"
            q_start = m.start()
        else:
            q_num = i + 1
            # "Soru:" fallback (group 1) or generic (?P<q>)
            question = (m.group(1) if m.groups() else m.group("q")).strip()
            qa_id = f"{comp}-{tag}{q_num:03d}"
            q_start = m.start()

        ans_start = m.end()
        ans_end = matches[i+1].start() if i+1 < len(matches) else len(text)
        answer = text[ans_start:ans_end].strip()

        if len(answer) < MIN_ANS_CHARS:
            continue

        page_start, page_end = page_range_from_offsets(comp, q_start, ans_end)
        qas.append({
            "qa_id": qa_id,
            "competition": comp,
            "page_start": page_start, "page_end": page_end,
            "question": question, "answer": answer,
        })
    return qas

def split_qa_for_comp(comp: str):
    text = corpora[comp]["text"]

    # Collect candidates from all matchers
    qas_all = []

    # Priority tags help you later, but we UNION them here
    qas_all += split_by_matches(text, comp, Q_PATTERN.finditer(text), with_numbers=True,  tag="Q")
    qas_all += split_by_matches(text, comp, Q_FALLBACK.finditer(text), with_numbers=False, tag="QF")
    # Only apply generic-question matcher to prose sections (avoid double-count in HSS)
    if comp in GENERIC_ALLOWED:
        qas_all += split_by_matches(text, comp, Q_GENERIC.finditer(text), with_numbers=False, tag="QG")

    # Deduplicate: same competition + page_start + normalized question
    seen, deduped = set(), []
    for qa in sorted(qas_all, key=lambda r: (r["page_start"] or 0, r["qa_id"])):
        key = (qa["competition"], qa["page_start"], _norm_qtext(qa["question"]))
        if key in seen:
            continue
        seen.add(key)
        deduped.append(qa)

    return deduped

all_qas = []
for comp in corpora:
    all_qas.extend(split_qa_for_comp(comp))

## Quick checks (totals + per-competition) & tiny preview
Purpose: verify we captured all three sections (E-Ticaret should be > 0 now) and see a sample.

In [10]:
# --- 3) Q&A split checks ---
import pandas as pd

df = pd.DataFrame(all_qas)
print("QAs total:", len(df))
print("QAs per competition:\n", df["competition"].value_counts(dropna=False))

# show 1 example per competition
for comp in df["competition"].unique():
    row = df[df["competition"]==comp].iloc[0]
    print(f"\n[{comp}] {row['qa_id']}\nQ:", row["question"][:120], "\nA:", row["answer"][:200], "…")

QAs total: 102
QAs per competition:
 competition
HSS          51
ADRES        45
E-TICARET     6
Name: count, dtype: int64

[HSS] HSS-Q001
Q: Hava Savunma S+stemler+ Yarışması'nın temel amacı ned+r 
A: Yarışmanın amacı, takımların ver+len senaryolara uygun görevler+ başarıyla yer+ne get+recek 
hava savunma s+stemler+ gel+şt+rmes+ ve üretmes+d+r. Aynı zamanda, hava savunma 
s+stemler+n+n önem+n+n ülk …

[E-TICARET] E-TICARET-QG001
Q: önemi nedir? Bu süre takımlara ne gibi avantajlar sağlar? 
A: E-Ticaret Hackathonu takviminde, ﬁnalistlerin açıklanması ile ﬁziksel etkinliğin başlaması 
arasında yaklaşık 15 günlük bir süre bırakılması66, takımlara stratejik bir hazırlık ve proje 
olgunlaştırma …

[ADRES] ADRES-QG001
Q: Yarışmanın temel amacı nedir? 
A: Yarışmanın temel amacı, özellikle son kilometre teslimatlarında yaşanan adres verisi 
tutarsızlıkları ve yönetim sorunlarına yönelik yapay zeka, doğal dil işleme (NLP), coğraﬁ bilgi 
sistemleri (GIS)  …


## Topic labeling (regex, first-match wins)

Purpose: assign topic for routing/filters (eligibility, team, stages, scoring, penalties, logistics, timeline, other).

In [13]:
# --- 4) Topic labeling ---
import re

TOPIC_RULES = {
    "eligibility": r"(katılım|kimler|başvuru koşul|uygun|gereklilik|şart)",
    "team":        r"(takım|üye|danışman|rol|katılımcı sayısı)",
    "stages":      r"(aşama|görev|süreç|workflow|sunum)",
    "scoring":     r"(puan|puanlama|bsp|bonus|değerlendir|kriter|puan kesinti)",
    "penalties":   r"(ceza\w*|diskalifiye\w*|ihlal\w*|yasak\w*|kural\s*dışı|kuraldışı|dost\s+(hedef|ateş[iı]))",
    "logistics":   r"(konaklama|ulaşım|destek|sponsor|mekan|yer|konaklama)",
    "timeline":    r"(son başvuru|son tarih|takvim|deadline|program|başvuru tarihi|teslim tarihi)",
}
TOPIC_RE = {k: re.compile(v, re.IGNORECASE) for k,v in TOPIC_RULES.items()}

def label_topic(q, a):
    text = f"{q}\n{a}"
    for name, rx in TOPIC_RE.items():
        if rx.search(text):
            return name
    return "other"

df["topic"] = [label_topic(q, a) for q,a in zip(df["question"], df["answer"])]
print("Topic counts:\n", df["topic"].value_counts())

Topic counts:
 topic
team           49
eligibility    33
stages          9
other           6
logistics       2
scoring         2
timeline        1
Name: count, dtype: int64


## Stage detection (Aşama 1/2/3 and roman numerals)

Purpose: fill the stage column when “aşama”/“stage” is mentioned near a number.

In [17]:
# --- 5) Stage detection ---
import re
ROMAN = {"i":1, "ii":2, "iii":3, "iv":4, "v":5}

def detect_stage(text: str):
    t = text.lower()
    m = re.search(r"(?:aşama|stage)\s*[:\-]?\s*(\d+)", t)
    if m: return int(m.group(1))
    m = re.search(r"(?:aşama|stage)\s*[:\-]?\s*([ivx]{1,3})\b", t)
    if m: return ROMAN.get(m.group(1), None)
    m = re.search(r"\b([ivx]{1,3})\.\s*aşama\b", t)
    if m: return ROMAN.get(m.group(1), None)
    return None

df["stage"] = [detect_stage(f"{q}\n{a}") for q,a in zip(df["question"], df["answer"])]
print("Stage distribution:\n", df["stage"].value_counts(dropna=False).head(10))

Stage distribution:
 stage
NaN    99
1.0     3
Name: count, dtype: int64


### Stage from page headings (page context + neighbor fallback)

Purpose: infer stage from page headings like “I. AŞAMA”, “Aşama 2”, etc., and fill NaNs.

In [18]:
# --- 5b) Stage from page context (headings + neighbor-page fallback) ---
import pandas as pd
import re

# Reuse ROMAN from previous cell
# ROMAN = {"i":1, "ii":2, "iii":3, "iv":4, "v":5}

def page_stage(text: str):
    t = text.lower()
    # Prefer Roman headings like "I. AŞAMA" or "AŞAMA II"
    m = re.search(r"\b([ivx]{1,3})\.\s*aşama\b", t)
    if m:
        return ROMAN.get(m.group(1))
    m = re.search(r"\başama\s*[:\-]?\s*([ivx]{1,3})\b", t)
    if m:
        return ROMAN.get(m.group(1))
    # Numeric forms: "Aşama 1", "Stage 2"
    m = re.search(r"(?:aşama|stage)\s*[:\-]?\s*(\d+)\b", t)
    if m:
        try:
            return int(m.group(1))
        except:
            return None
    return None

# Build a per-page stage map from normalized page text
stage_by_page = {p["page"]: page_stage(p.get("text_norm") or p["text"]) for p in pages}

# Fill from the QA's start page when stage is NaN
df["stage_page"] = df["page_start"].map(stage_by_page)
df["stage"] = df["stage"].fillna(df["stage_page"])

# Neighbor-page fallback (within same competition)
def neighbor_stage(row):
    if pd.notna(row["stage"]):
        return row["stage"]
    start = row["page_start"]
    comp = row["competition"]
    for off in (-1, 1):
        pg = (start or 0) + off
        if pg in stage_by_page and stage_by_page[pg] is not None and comp_map.get(pg) == comp:
            return stage_by_page[pg]
    return None

df["stage"] = df.apply(lambda r: r["stage"] if pd.notna(r["stage"]) else neighbor_stage(r), axis=1)

print("Stage distribution (after page-context fill):")
print(df["stage"].value_counts(dropna=False).head(10))

Stage distribution (after page-context fill):
stage
NaN    77
1.0    25
Name: count, dtype: int64


 ## Metadata extraction (add fields: dates, numbers, formulas, links)

In [19]:
# --- 6) Metadata extraction ---
import re

TR_MONTHS = {
    "ocak":"01","şubat":"02","subat":"02","mart":"03","nisan":"04","mayıs":"05","mayis":"05",
    "haziran":"06","temmuz":"07","ağustos":"08","agustos":"08","eylül":"09","eylul":"09",
    "ekim":"10","kasım":"11","kasim":"11","aralık":"12","aralik":"12"
}

date_dd_mon_yyyy = re.compile(r"\b(\d{1,2})\s+([a-zçğıöşü]+)\s+(\d{4})\b", re.IGNORECASE)
date_dd_mm_yyyy  = re.compile(r"\b(\d{1,2})[./](\d{1,2})[./](\d{4})\b")
http_re     = re.compile(r"https?://\S+")
money_re    = re.compile(r"(?:₺|TL)\s*\d[\d.,]*", re.IGNORECASE)
percent_re  = re.compile(r"(?:\b\d{1,3}\s*%|%\s*\d{1,3}\b)")
points_re   = re.compile(r"\b\d+\s*puan\b", re.IGNORECASE)
time_re     = re.compile(r"\b(\d{1,2})\s*(?:dk|dakika|saat|hour|min)\b", re.IGNORECASE)
formula_re  = re.compile(r"\bBSP\b|=|formül", re.IGNORECASE)

def extract_dates_tr(text: str):
    dates = []
    for d,m,y in date_dd_mon_yyyy.findall(text):
        mm = TR_MONTHS.get(m.lower())
        dates.append({"raw": f"{d} {m} {y}", "iso": f"{y}-{mm}-{int(d):02d}" if mm else None})
    for d,m,y in date_dd_mm_yyyy.findall(text):
        dates.append({"raw": f"{d}.{m}.{y}", "iso": f"{y}-{int(m):02d}-{int(d):02d}"})
    # dedupe by raw
    out, seen = [], set()
    for r in dates:
        if r["raw"] in seen: continue
        seen.add(r["raw"]); out.append(r)
    return out

def extract_numbers(text: str):
    vals = []
    vals += percent_re.findall(text)
    vals += points_re.findall(text)
    vals += money_re.findall(text)
    vals += time_re.findall(text)
    # normalize tuples from regex groups
    return list(dict.fromkeys([v if isinstance(v, str) else " ".join(v) for v in vals]))

def extract_formulas(text: str):
    lines = [ln.strip() for ln in text.splitlines() if formula_re.search(ln)]
    return lines[:5]

def extract_links(text: str):
    return http_re.findall(text)[:5]

df["dates"]    = [extract_dates_tr(f"{q}\n{a}") for q,a in zip(df["question"], df["answer"])]
df["numbers"]  = [extract_numbers(f"{q}\n{a}") for q,a in zip(df["question"], df["answer"])]
df["formulas"] = [extract_formulas(f"{q}\n{a}") for q,a in zip(df["question"], df["answer"])]
df["links"]    = [extract_links(f"{q}\n{a}") for q,a in zip(df["question"], df["answer"])]

print("Has dates   :", sum(bool(x) for x in df['dates']))
print("Has numbers :", sum(bool(x) for x in df['numbers']))
print("Has formulas:", sum(bool(x) for x in df['formulas']))
print("Has links   :", sum(bool(x) for x in df['links']))

Has dates   : 6
Has numbers : 20
Has formulas: 4
Has links   : 1


## Write Markdown per competition (human-readable) — brief
Purpose: save quick review files you can skim; gets overwritten on re-runs.

In [20]:
# --- 7) Markdown per competition ---
MD_DIR.mkdir(parents=True, exist_ok=True)

def md_block(row):
    head = f"## {row['qa_id']}: {row['question']}".strip()
    meta = f"[page: {row['page_start']}-{row['page_end']}] [topic: {row['topic']}]"
    return f"{head}\n{row['answer']}\n\n{meta}\n---\n"

for comp in sorted(df["competition"].unique()):
    text = "".join(md_block(r) for _, r in df[df["competition"]==comp].iterrows())
    outp = MD_DIR / f"{comp}.md"
    outp.write_text(text, encoding="utf-8")
    print("Wrote:", outp, "| items:", (df["competition"]==comp).sum())

Wrote: /Users/macbook/T3/rag-tekno/data/processed/md/ADRES.md | items: 45
Wrote: /Users/macbook/T3/rag-tekno/data/processed/md/E-TICARET.md | items: 6
Wrote: /Users/macbook/T3/rag-tekno/data/processed/md/HSS.md | items: 51


## Write JSONL sidecar + optional CSVs (deadlines, scoring)
Purpose: machine-readable data for your RAG + two small lookup tables.

In [21]:
# --- 8) JSONL sidecar + optional CSVs ---
import json

JSONL_DIR.mkdir(parents=True, exist_ok=True)
sidecar_path = JSONL_DIR / "qa_meta.jsonl"

fields = ["qa_id","competition","topic","stage","page_start","page_end",
          "question","answer","dates","numbers","formulas","links"]

with open(sidecar_path, "w", encoding="utf-8") as f:
    for _, r in df[fields].iterrows():
        f.write(json.dumps({k: r[k] for k in fields}, ensure_ascii=False) + "\n")

print("Wrote:", sidecar_path)

# Optional: deadlines.csv & scoring.csv
CSV_DIR.mkdir(parents=True, exist_ok=True)

# Deadlines: any QA with a date
dead_rows = []
for _, r in df.iterrows():
    for d in r["dates"]:
        dead_rows.append({
            "qa_id": r["qa_id"], "competition": r["competition"],
            "topic": r["topic"], "raw_date": d["raw"], "iso": d["iso"],
            "page_start": r["page_start"], "page_end": r["page_end"],
        })
import pandas as pd
pd.DataFrame(dead_rows).to_csv(CSV_DIR / "deadlines.csv", index=False, encoding="utf-8")
print("Wrote:", CSV_DIR / "deadlines.csv", "| rows:", len(dead_rows))

# Scoring: topic in {scoring, penalties} or formulas present
sc_mask = (df["topic"].isin(["scoring","penalties"])) | (df["formulas"].astype(bool))
sc_df = df.loc[sc_mask, ["qa_id","competition","topic","numbers","formulas","page_start","page_end"]]
sc_df.to_csv(CSV_DIR / "scoring.csv", index=False, encoding="utf-8")
print("Wrote:", CSV_DIR / "scoring.csv", "| rows:", len(sc_df))

Wrote: /Users/macbook/T3/rag-tekno/data/processed/jsonl/qa_meta.jsonl
Wrote: /Users/macbook/T3/rag-tekno/data/processed/csv/deadlines.csv | rows: 9
Wrote: /Users/macbook/T3/rag-tekno/data/processed/csv/scoring.csv | rows: 6


## Quality gates (must pass before embeddings)

Purpose: sanity-check the dataset so we don’t carry bad artifacts into indexing.

In [22]:
# --- 9) Quality gates ---
import pandas as pd

# 1) basic shape & empties
assert len(df) > 0, "No QAs!"
assert df["question"].str.len().min() > 0, "Empty question found"
assert df["answer"].str.len().min() > 0, "Empty answer found"

# 2) competitions & per-comp coverage
allowed = {"HSS","E-TICARET","ADRES"}
assert set(df["competition"].unique()) <= allowed, f"Unexpected competition labels: {set(df['competition'].unique())-allowed}"
per_comp = df["competition"].value_counts()
assert (per_comp > 0).all(), "Some competition has zero items"
print("Per-competition counts:\n", per_comp.to_string())

# 3) duplicates (same competition + question)
dup_mask = df.duplicated(subset=["competition","question"], keep=False)
print("Duplicates by (competition, question):", int(dup_mask.sum()))
if dup_mask.any():
    print(df.loc[dup_mask, ["competition","qa_id","question"]].head(5).to_string(index=False))

# 4) very long answers (inspect if any)
long_mask = df["answer"].str.len() > 2000
print("Very long answers (>2000 chars):", int(long_mask.sum()))
if long_mask.any():
    print(df.loc[long_mask, ["qa_id","competition","page_start","page_end"]].head().to_string(index=False))

print("\nNotebook 02 ✅ complete")

Per-competition counts:
 competition
HSS          51
ADRES        45
E-TICARET     6
Duplicates by (competition, question): 0
Very long answers (>2000 chars): 2
     qa_id competition  page_start  page_end
ADRES-Q002       ADRES          26        28
  HSS-Q003         HSS           9        14

Notebook 02 ✅ complete
