In [6]:
import re, json
import pandas as pd
from pathlib import Path

In [7]:
# Fine-grained placeholder map: placeholder label -> set of upper-cased
# column names that are reported under that label. Every label accepts its
# own name; any extra names listed next to it are additional aliases.
PLACEHOLDERS = {
    label: {label, *aliases}
    for label, aliases in (
        ("VORNAME", ()),
        ("NACHNAME", ()),
        ("FIRMA", ()),
        ("TELEFONNUMMER", ()),
        ("EMAIL", ()),
        ("FAX", ()),
        ("STRASSE", ()),
        ("HAUSNUMMER", ()),
        ("POSTLEITZAHL", ()),
        ("WOHNORT", ()),
        ("ZÄHLERNUMMER", ()),
        ("ZÄHLERSTAND", ()),
        # Offer and customer numbers are folded into the contract-number label.
        ("VERTRAGSNUMMER", ("ANGEBOTSNUMMER", "KUNDENNUMMER")),
        ("ZAHLUNG", ()),
        ("BANK", ()),
        ("IBAN", ()),
        ("BIC", ()),
        ("DATUM", ()),
        ("TITEL", ()),
        ("GESENDET_MIT", ()),
        ("LINK", ()),
    )
}


In [8]:
# ── 2) Helper to map any column → its placeholder label ────────────────
def col_to_label(col_name: str) -> str | None:
    """Return the placeholder label a metadata column maps to, or None.

    The header is converted to a string, stripped of surrounding whitespace
    and upper-cased before it is looked up in the key sets of PLACEHOLDERS.

    Args:
        col_name: Column header as found in the metadata sheet.

    Returns:
        The first label in PLACEHOLDERS whose key set contains the
        normalised header, or None when no label matches.
    """
    # Spreadsheet headers are not guaranteed to be clean strings: coerce to
    # str so a non-string header yields None instead of AttributeError, and
    # strip so a header such as "Vorname " still matches.
    key = str(col_name).strip().upper()
    for label, keys in PLACEHOLDERS.items():
        if key in keys:
            return label
    return None

# ── 3) Load your metadata CSV/Excel ────────────────────────────────────
#    Make sure it has a "TextFile" column with the filename of each email
META_PATH = Path("data/Daia_Manual_Labelling_granular.xlsx")
meta = pd.read_excel(META_PATH, dtype=str)

# Fail early with a clear message instead of a bare KeyError on the first row.
if "TextFile" not in meta.columns:
    raise KeyError(f'Metadata sheet {META_PATH} has no "TextFile" column')

# ── 4) Iterate and build JSON records ─────────────────────────────────
EMAIL_DIR = Path("data/golden_dataset_original")
OUTPUT    = []

# Regex to find any placeholder-like span (<<...>>). It is not used in this
# cell; the definition is kept in case other cells reference it.
tag_re = re.compile(r"<<[^>]+>>")

for _, row in meta.iterrows():
    fname = row["TextFile"]
    # With dtype=str an empty cell is read as NaN; `EMAIL_DIR / NaN` would
    # raise TypeError, so rows without a usable filename are skipped.
    if pd.isna(fname) or not str(fname).strip():
        print("⚠️  Row without a TextFile entry skipped")
        continue
    fname = str(fname).strip()

    email_path = EMAIL_DIR / fname
    # is_file() rather than exists(): a directory entry cannot be read as text.
    if not email_path.is_file():
        print(f"⚠️  File not found: {fname}")
        continue

    # Undecodable bytes are dropped; all offsets below refer to this decoded
    # text, which is the same string stored in the record.
    text = email_path.read_text(encoding="utf-8", errors="ignore")

    # Collected as a set of (start, end, label) tuples: several columns map to
    # the same label (see PLACEHOLDERS), so an identical value in two of them
    # would otherwise emit the same span twice.
    spans = set()

    # For each metadata column (except TextFile), extract spans
    for col, val in row.items():
        if col == "TextFile" or pd.isna(val):
            continue

        label = col_to_label(col)
        if not label:
            continue

        literal = str(val).strip()
        if not literal:
            continue

        # Find all non-overlapping occurrences of the literal.
        # NOTE(review): this is a plain substring match, so a short value can
        # also match inside a longer token — confirm this is intended.
        for m in re.finditer(re.escape(literal), text):
            spans.add((m.start(), m.end(), label))

    # Sorted by position so the output is deterministic and easy to inspect.
    labels = [
        {"start": start, "end": end, "label": label}
        for start, end, label in sorted(spans)
    ]

    OUTPUT.append({
        "file":   fname,
        "text":   text,
        "labels": labels
    })

# ── 5) Write out the JSON ──────────────────────────────────────────────
OUT_PATH = Path("data/golden_dataset_with_spans.json")
OUT_PATH.write_text(json.dumps(OUTPUT, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"✓ Wrote {len(OUTPUT)} records with spans to {OUT_PATH}")

✓ Wrote 160 records with spans to data/golden_dataset_with_spans.json
