In [8]:
import re
import json
import pandas as pd
from pathlib import Path
import unicodedata

# ── Normalisierung ───────────────────────────────
_INVISIBLE_CHARS = re.compile(r'[\u200b\u2028\u2029\ufeff]')


def normalize(text):
    """Return *text* in a canonical form suitable for literal matching.

    Steps, applied in this order:
      1. Unicode NFKC normalisation.
      2. No-break spaces (U+00A0) become ordinary spaces.
      3. U+200B, U+2028, U+2029 and U+FEFF are removed.
      4. "\\r\\n" and lone "\\r" line endings become "\\n".

    The result may be shorter or longer than the input, so character
    offsets into the result do not generally apply to the input.
    """
    cleaned = unicodedata.normalize("NFKC", text).replace('\xa0', ' ')
    cleaned = _INVISIBLE_CHARS.sub('', cleaned)
    # CRLF has to be handled before lone CR, otherwise it would yield two "\n".
    for line_ending in ('\r\n', '\r'):
        cleaned = cleaned.replace(line_ending, '\n')
    return cleaned

# ── Überlappende Labels filtern ──────────────────
def remove_exactly_nested_labels(labels, text):
    """Drop labels that repeat the text of an enclosing, already kept label.

    Labels are visited from the longest span to the shortest. A label is
    discarded when some previously kept label starts at or before it, ends
    at or after it, and covers exactly the same substring of *text*.

    Args:
        labels: iterable of dicts carrying at least "start" and "end".
        text:   the string the offsets refer to.

    Returns:
        A new list with the surviving label dicts, ordered by "start".
    """
    by_length_desc = sorted(
        labels,
        key=lambda lab: lab["end"] - lab["start"],
        reverse=True,
    )

    survivors = []
    for candidate in by_length_desc:
        candidate_text = text[candidate["start"]:candidate["end"]]
        is_duplicate = any(
            kept["start"] <= candidate["start"]
            and kept["end"] >= candidate["end"]
            and text[kept["start"]:kept["end"]] == candidate_text
            for kept in survivors
        )
        if is_duplicate:
            continue
        survivors.append(candidate)

    survivors.sort(key=lambda lab: lab["start"])
    return survivors

# ── Placeholder-Mapping ───────────────────────────
# Placeholder tag -> substrings that identify a column belonging to that tag.
# The dict is scanned in insertion order by map_col, which returns the first
# tag with a matching substring, so the order of the entries is significant.
PLACEHOLDERS = {
    "TITEL": ["TITEL"],
    "VORNAME": ["VORNAME"],
    "NACHNAME": ["NACHNAME"],
    "FIRMA": ["FIRMA"],
    "TELEFONNUMMER": ["TELEFONNUMMER"],
    "EMAIL": ["EMAIL"],
    "FAX": ["FAX"],
    "STRASSE": ["STRASSE"],
    "HAUSNUMMER": ["HAUSNUMMER"],
    "POSTLEITZAHL": ["POSTLEITZAHL", "PLZ", "ZIP"],
    "WOHNORT": ["WOHNORT", "ORT", "CITY"],
    "ZÄHLERNUMMER": ["ZÄHLERNUMMER", "METER_ID"],
    "ZÄHLERSTAND": ["ZÄHLERSTAND", "METER_READING"],
    "VERTRAGSNUMMER": [
        "VERTRAGSNUMMER",
        "ANGEBOTSNUMMER",
        "KUNDENNUMMER",
        "RECHNUNGSNUMMER",
    ],
    "ZAHLUNG": ["BETRAG", "ZAHLUNG", "AMOUNT"],
    "BANK": ["BANK"],
    "IBAN": ["IBAN"],
    "BIC": ["BIC"],
    "DATUM": ["DATUM", "DATE"],
    "GESENDET_MIT": ["GESENDET_MIT"],
    "LINK": ["LINK"],
}

# Tags whose literals are searched with word-boundary anchors in the main loop.
SENSITIVE_TAGS = {"HAUSNUMMER", "POSTLEITZAHL", "ZAHLUNG"}

def map_col(col: object) -> str | None:
    """Return the ``<<TAG>>`` placeholder for a column label, or ``None``.

    The label is upper-cased and compared against the key substrings in
    ``PLACEHOLDERS``; the first tag (in dict order) with any key contained
    in the label wins.

    Args:
        col: column label. Non-string labels (for example an integer header
            delivered by pandas) are converted with ``str()`` instead of
            raising ``AttributeError`` on ``.upper()``.

    Returns:
        ``"<<TAG>>"`` for the first matching tag, ``None`` when nothing matches.
    """
    up = str(col).upper()
    for tag, keys in PLACEHOLDERS.items():
        if any(k in up for k in keys):
            return f"<<{tag}>>"
    return None

def extract_repls(row: pd.Series):
    """Collect ``(literal, placeholder)`` pairs from one metadata row.

    A cell contributes a pair when it is not NaN, its column maps to a
    placeholder via ``map_col``, and its stripped string value is non-empty.

    Returns:
        List of ``(literal, placeholder)`` tuples, longest literal first.
        Pairs with equally long literals keep their column order.
    """
    pairs = [
        (literal, placeholder)
        for column, value in row.items()
        # Order matters: NaN check first, then column mapping, then the value.
        if not pd.isna(value)
        and (placeholder := map_col(column))
        and (literal := str(value).strip())
    ]
    # Stable sort on negated length == descending length with ties in order.
    pairs.sort(key=lambda pair: -len(pair[0]))
    return pairs

# ── Pfade ─────────────────────────────────────────
META = Path("../../../data/excel_manual_labeling/Daia_Manual_Labelling_granular.xlsx")
RAW_DIR = Path("../../../data/original/golden_dataset_original")
JSON_OUT = Path("../../../data/original/golden_dataset_with_spans_norm.json")


def _normalize_with_offsets(text):
    """Normalise *text* and remember where every output character came from.

    The text is cut into small clusters (a character plus any combining
    marks that follow it; "\\r\\n" counts as one cluster) and each cluster is
    passed through ``normalize``. Working cluster by cluster keeps an exact
    link between normalised and original positions even when normalisation
    removes or expands characters.

    NOTE(review): the joined result is expected to equal ``normalize(text)``
    for ordinary text; compositions spanning two clusters (e.g. conjoining
    Hangul jamo) would not be merged here — confirm if such input can occur.

    Returns:
        (normalized, starts, ends) where, for every index ``k`` of
        ``normalized``, ``text[starts[k]:ends[k]]`` is the original cluster
        that produced that character.
    """
    pieces = []
    starts = []
    ends = []
    i, n = 0, len(text)
    while i < n:
        j = i + 1
        if text[i] == "\r" and j < n and text[j] == "\n":
            j += 1  # CRLF collapses into a single "\n"
        else:
            while j < n and unicodedata.combining(text[j]):
                j += 1
        cluster = text[i:j]
        # Plain ASCII (other than "\r") is unchanged by normalize(); skip the call.
        if cluster.isascii() and "\r" not in cluster:
            piece = cluster
        else:
            piece = normalize(cluster)
        pieces.append(piece)
        starts.extend([i] * len(piece))
        ends.extend([j] * len(piece))
        i = j
    return "".join(pieces), starts, ends


df = pd.read_excel(META, dtype=str)
tag_re = re.compile(r"<<([^>]+)>>")

output = []

# ── Process one metadata row (= one text file) at a time ──
for _, row in df.iterrows():
    fname = row["TextFile"]
    orig_text = (RAW_DIR / fname).read_text(encoding="utf-8", errors="ignore")
    # Matching happens on the normalised text, but the exported spans must
    # index into orig_text, so keep a normalised -> original offset map.
    norm_text, span_starts, span_ends = _normalize_with_offsets(orig_text)

    labels = []

    for literal, placeholder in extract_repls(row):
        tag = tag_re.match(placeholder).group(1)
        norm_literal = normalize(literal)
        if not norm_literal:
            # An empty pattern would match at every position.
            continue
        pattern = re.escape(norm_literal)

        # Sensitive or purely numeric values must not match inside a longer
        # token. Lookarounds are used instead of \b because \b can never
        # match next to a literal that starts or ends with a non-word
        # character (e.g. a trailing currency sign).
        if tag in SENSITIVE_TAGS or literal.isdigit():
            pattern = rf"(?<!\w){pattern}(?!\w)"

        for m in re.finditer(pattern, norm_text, flags=re.IGNORECASE):
            # Translate the normalised match back to original offsets.
            labels.append({
                "start": span_starts[m.start()],
                "end": span_ends[m.end() - 1],
                "label": tag,
            })

    labels = remove_exactly_nested_labels(labels, orig_text)

    output.append({
        "file": fname,
        "text": orig_text,
        "labels": labels,
    })

# ── Export ──
JSON_OUT.write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"✓ Wrote {len(output)} records to {JSON_OUT}")


✓ Wrote 160 records to ..\..\..\data\original\golden_dataset_with_spans_norm.json
