In [201]:
'''
!git clone https://github.com/AnnaGhost2713/daia-eon.git
%cd daia-eon/piiranha_refinement
'''

'\n!git clone https://github.com/AnnaGhost2713/daia-eon.git\n%cd daia-eon/piiranha_refinement\n'

In [202]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import re
import spacy
import json
from sklearn.metrics import classification_report
import pandas as pd

Label Mapping der einzelnen Identifier

In [203]:
# 📌 Priorität definieren: je höher, desto stärker
PRIORITY_MAP = {
    "regex": 3,
    "piiranha": 1,
    "spacy": 2
}

TARGET_LABELS = ["TITEL", "VORNAME", "NACHNAME", "FIRMA", "TELEFONNUMMER", "EMAIL", "FAX", "STRASSE",
                 "HAUSNUMMER", "POSTLEITZAHL", "WOHNORT", "ZÄHLERNUMMER", "ZÄHLERSTAND", "VERTRAGSNUMMER",
                 "ZAHLUNG", "BANK", "IBAN", "BIC", "DATUM", "GESENDET_MIT", "LINK"]


LABEL_MAP = {
    # spaCy-Labels
    "PER": "NAME", "LOC": "ADRESSE", "ORG": "FIRMA", "DATE": "DATUM", "TIME": "DATUM",
    "GPE": "ADRESSE", "NORP": "GRUPPE", "MONEY": "ZAHLUNG",

    # PIIranha-Labels
    "I-GIVENNAME": "NAME", "I-SURNAME": "NAME", "I-DATEOFBIRTH": "DATUM",
    "I-EMAIL": "KONTAKT", "I-TELEPHONENUM": "KONTAKT", "I-USERNAME": "KONTAKT",
    "I-CREDITCARDNUMBER": "ZAHLUNG",
    "I-ACCOUNTNUM": "VERTRAG", "I-BILLINGNUM": "VERTRAG",
    "I-IDCARDNUM": "VERTRAG", "I-TAXNUM": "VERTRAG",
    "I-CITY": "ADRESSE", "I-ZIPCODE": "ADRESSE", "I-STREET": "ADRESSE", "I-BUILDINGNUM": "ADRESSE",
}

REGEX_PATTERNS = {
    # ✅ Sehr sicheres E-Mail-Muster
    "EMAIL": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}\b",

    # ✅ IBAN beginnt mit Ländercode und ist sehr eindeutig
    "IBAN": r"\bDE\d{20}\b",

    # ✅ BIC hat klar definierte Struktur
    "BIC": r"\b[A-Z]{6}[A-Z2-9][A-NP-Z0-9]{1}([A-Z0-9]{3})?\b",

    # ✅ URLs sind eindeutig
    "URL": r"\bhttps?://[^\s]+|www\.[^\s]+\b",

    # ✅ Vertragsnummer: nur wenn z. B. „vertragsnummer: 406027919“
    "VERTRAG": r"\b(vertragsnummer|vertragsnr\.?|vnr|vn)[\s:]{1,3}\d{7,10}\b",

    # ✅ Datum: nur vollständige, gut formatierte Datumsangaben
    "DATUM": (
    r"\b\d{2}\.\d{2}\.\d{4}\b|"    # z. B. 15.08.2024
    r"\b\d{4}-\d{2}-\d{2}\b|"      # ISO 2024-08-15
    r"\b(19|20)\d{2}\b"            # Jahreszahlen wie 1999, 2023
    ),

    # ✅ Telefonnummer: nur mit internationalem Prefix
    "TELEFON": r"\b\+49\s?\d[\d\s/-]{6,}\b",

    # ✅ Zählernummer: alphanumerisch, min. 10 Zeichen, typischer Stil
    "ZÄHLERNUMMER": r"\b[A-Z]{2}[A-Z0-9]{8,}\b",

    # ✅ Zahlung: nur Beträge mit Währungszeichen – reine Zahlen werden ignoriert
    "ZAHLUNG": r"\b\d{1,5}[.,]\d{2}\s?(€|Euro|EUR|Cent)\b",

    # ✅ Straße: nur wenn typisches Straßenwort UND Zahl folgt
    "STRASSE": r"\b\w+(straße|gasse|allee|weg|platz|str\.|grund)\b"
}


PIIranha Spans

In [204]:
model_name = "iiiorg/piiranha-v1-detect-personal-information"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_piiranha_spans(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, return_offsets_mapping=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    offset_mapping = inputs.pop("offset_mapping")[0].tolist()

    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()

    spans = []
    current_label = None
    current_start = None

    for i, (start, end) in enumerate(offset_mapping):
        if start == end:  # Special tokens
            continue

        raw_label = model.config.id2label[predictions[i]]
        mapped_label = LABEL_MAP.get(raw_label, None)

        if mapped_label in TARGET_LABELS:
            if current_label == mapped_label:
                continue  # Laufzeit verlängert sich bis label endet
            else:
                # Wenn neuer Start: alten Span abschließen
                if current_label is not None:
                    spans.append({"start": current_start, "end": offset_mapping[i-1][1], "label": current_label})
                current_label = mapped_label
                current_start = start
        else:
            if current_label is not None:
                spans.append({"start": current_start, "end": offset_mapping[i-1][1], "label": current_label})
                current_label = None
                current_start = None

    # Letzten Span abschließen
    if current_label is not None:
        spans.append({"start": current_start, "end": offset_mapping[-1][1], "label": current_label})

    return spans


SpaCy Ruler laden & Spans

In [205]:
from pathlib import Path

nlp = spacy.load("../custom_spacy_model_synthetic_data_b_push")
#nlp = spacy.load("../custom_spacy_model_doccano_labeling")

'''
# 🧩 Schritt 7.1: EntityRuler hinzufügen
# ❌ Entferne ggf. vorher vorhandenen EntityRuler
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")

# ✅ EntityRuler nach dem NER einfügen, damit er bevorzugt wird
ruler = nlp.add_pipe("entity_ruler", before="ner")
# Beispiel: Lade Muster
# Funktion zum Laden von Namen aus Datei
def load_names(path, label):
    with open(path, "r", encoding="utf-8") as f:
          names = [name.strip() for name in f if name.strip()]
          patterns = [{"label": label, "pattern": name} for name in names]
    return patterns, names

# Gazetteer laden
vornamen_patterns, vornamen_liste = load_names("../Gazetteer/Vornamen.txt", "VORNAME")
nachnamen_patterns, nachnamen_liste = load_names("../Gazetteer/Nachnamen.txt", "NACHNAME")
titel_patterns, titel_liste = load_names("../Gazetteer/Titel.txt", "TITEL")
wohnort_patterns, wohnort_liste = load_names("../Gazetteer/Orte.txt", "WOHNORT")
postleitzahl_patterns, postleitzahl_liste = load_names("../Gazetteer/Postleitzahlen.txt", "POSTLEITZAHL")
strasse_patterns = [
    {
        "label": "STRASSE",
        "pattern": [
            {"TEXT": {"REGEX": r".*(straße|gasse|allee|weg|platz|str.|grund)$"}},
            {"TEXT": {"REGEX": r"^\d+[a-zA-Z]?$"}}
        ]
    }
]
vertragsnummer_patterns = [
    {
        "label": "VERTRAGSNUMMER",
        "pattern": [
            {"LOWER": {"IN": ["vertragsnummer", "vertragsnr.", "vnr", "vn"]}},
            {"IS_PUNCT": True, "OP": "*"},
            {"TEXT": {"REGEX": r"^\d{6,12}\.?$"}}

        ]
    }
]

kundennummer_patterns = [
    {
        "label": "KUNDENNUMMER",
        "pattern": [
            {"LOWER": {"IN": ["kundennummer", "kundennr.", "kdnr", "kd"]}},
            {"IS_PUNCT": True, "OP": "*"},
            {"TEXT": {"REGEX": r"^\d{6,12}\.?$"}}
        ]
    }
]

zuordnungsnummer_patterns = [
    {
        "label": "ZUORDNUNGSNUMMER",
        "pattern": [
            {"LOWER": {"IN": ["znr", "zuordnungsnummer"]}},
            {"IS_PUNCT": True, "OP": "*"},
            {"TEXT": {"REGEX": r"^\d{6,12}\.?$"}}
        ]
    }
]
iban_pattern = [
    {"label": "IBAN", "pattern": [{"TEXT": {"REGEX": r"^[A-Z]{2}[0-9]{2}[A-Z0-9]{11,30}$"}}]}
]

bic_pattern = [
    {"label": "BIC", "pattern": [{"TEXT": {"REGEX": r"^[A-Z]{6}[A-Z2-9][A-NP-Z0-9]([A-Z0-9]{3})?$"}}]}
]

zahlung_pattern = [
    {
        "label": "ZAHLUNG",
        "pattern": [
            {"TEXT": {"REGEX": r"^\d+[.,]?\d{0,2}$"}},
            {"TEXT": {"REGEX": r"^(€|euro|eur)$"}}
        ]
    }
]

zählerstand_patterns = [
    {
        "label": "ZÄHLERSTAND",
        "pattern": [
            {"LOWER": {"IN": ["zählerstand"]}},
            {"IS_PUNCT": True, "OP": "*"},
            {"TEXT": {"REGEX": r"^\d+(\.\d+)?$"}}
        ]
    },
    {
        "label": "ZÄHLERSTAND",
        "pattern": [
            {"LOWER": {"IN": ["zählerstand"]}},
            {"IS_PUNCT": True, "OP": "*"},
            {"TEXT": {"IN": [":"]}, "OP": "?"},
            {"TEXT": {"REGEX": r"^\d{1,5}([.,]\d{1,2})?$"}}
        ]
    }
]

zählernummer_patterns = [
    {
        "label": "ZÄHLERNUMMER",
        "pattern": [
            {"LOWER": {"IN": ["zählernummer"]}},
            {"IS_PUNCT": True, "OP": "*"},
            {"TEXT": {"REGEX": r"^[A-Z0-9]{6,20}$"}}  # Groß- und Kleinbuchstaben + Ziffern erlaubt
        ]
    }
]

verbrauch_patterns = [
    {
        "label": "VERBRAUCH",
        "pattern": [
            {"LOWER": {"IN": ["verbrauch"]}},
            {"IS_PUNCT": True, "OP": "*"},
            {"TEXT": {"REGEX": r"^\d+(\.\d+)?$"}},
            {"LOWER": {"IN": ["kwh", "m³", "kw"]}, "OP": "?"}
        ]
    }
]

verbrauch_patterns += [
    {
        "label": "VERBRAUCH",
        "pattern": [
            {"LOWER": {"IN": ["verbrauch"]}},
            {"IS_PUNCT": True, "OP": "*"},
            {"TEXT": {"REGEX": r"^\d+(?:[.,]\d+)?(kwh|m³|kw)$"}}
        ]
    }
]


wlv_patterns = [
    {
        "label": "WLV",
        "pattern": [
            {"LOWER": {"IN": ["wlv"]}},
            {"IS_PUNCT": True, "OP": "*"},
            {"TEXT": {"REGEX": r"^\d{4,12}$"}}
        ]
    }
]

email_pattern = [
    {
        "label": "EMAIL",
        "pattern": [
            {"TEXT": {"REGEX": r"^[\w\.-]+@[\w\.-]+\.\w{2,}$"}}
        ]
    }
]

telefon_pattern = [
    {
        "label": "TELEFON",
        "pattern": [
            {"TEXT": {"REGEX": r"^(\+49|0)[\d\s/-]{7,}$"}}
        ]
    }
]

url_pattern = [
    {
        "label": "LINK",
        "pattern": [
            {"TEXT": {"REGEX": r"^https?://[\w\-\.]+\.\w{2,}(/[\w\-\.]*)*$"}}
        ]
    },
    {
        "label": "LINK",
        "pattern": [
            {"TEXT": {"REGEX": r"^www\.[\w\-\.]+\.\w{2,}(/[\w\-\.]*)*$"}}
        ]
    }
]

datum_pattern = [
    {
        "label": "DATUM",
        "pattern": [
            {"TEXT": {"REGEX": r"^(\d{1,2}[./-]){2}\d{2,4}$"}}  # z. B. 15.06.2024
        ]
    },
    {
        "label": "DATUM",
        "pattern": [
            {"TEXT": {"REGEX": r"^\d{4}-\d{2}-\d{2}$"}}  # z. B. 2024-06-15
        ]
    },
    {
        "label": "DATUM",
        "pattern": [
            {"TEXT": {"REGEX": r"^\d{1,2}$"}},  # z. B. 15
            {"LOWER": {"IN": [
                "januar", "jan", "februar", "feb", "märz", "maerz", "mrz", "april", "apr",
                "mai", "juni", "jun", "juli", "jul", "august", "aug", "september", "sep",
                "oktober", "okt", "november", "nov", "dezember", "dez"
            ]}},
            {"TEXT": {"REGEX": r"^\d{2,4}$"}, "OP": "?"}  # optional Jahr
        ]
    }
]





# EntityRuler erstellen und Muster hinzufügen

ruler.add_patterns(zahlung_pattern + url_pattern + iban_pattern + bic_pattern + zahlung_pattern + zählerstand_patterns + email_pattern + telefon_pattern)  # 👈 Muster hinzufügen!


# 💾 (Optional) Modell MIT Ruler neu speichern
output_dir_ruler = Path("custom_spacy_model_with_ruler")
output_dir_ruler.mkdir(exist_ok=True)
nlp.to_disk(output_dir_ruler)
print(f"✅ Modell mit EntityRuler gespeichert unter: {output_dir_ruler.resolve()}")
'''

'\n# 🧩 Schritt 7.1: EntityRuler hinzufügen\n# ❌ Entferne ggf. vorher vorhandenen EntityRuler\nif "entity_ruler" in nlp.pipe_names:\n    nlp.remove_pipe("entity_ruler")\n\n# ✅ EntityRuler nach dem NER einfügen, damit er bevorzugt wird\nruler = nlp.add_pipe("entity_ruler", before="ner")\n# Beispiel: Lade Muster\n# Funktion zum Laden von Namen aus Datei\ndef load_names(path, label):\n    with open(path, "r", encoding="utf-8") as f:\n          names = [name.strip() for name in f if name.strip()]\n          patterns = [{"label": label, "pattern": name} for name in names]\n    return patterns, names\n\n# Gazetteer laden\nvornamen_patterns, vornamen_liste = load_names("../Gazetteer/Vornamen.txt", "VORNAME")\nnachnamen_patterns, nachnamen_liste = load_names("../Gazetteer/Nachnamen.txt", "NACHNAME")\ntitel_patterns, titel_liste = load_names("../Gazetteer/Titel.txt", "TITEL")\nwohnort_patterns, wohnort_liste = load_names("../Gazetteer/Orte.txt", "WOHNORT")\npostleitzahl_patterns, postleitzahl_li

In [206]:
def get_spacy_spans(text):
    doc = nlp(text)
    spans = []
    for ent in doc.ents:
        label = LABEL_MAP.get(ent.label_, ent.label_)
        if label in TARGET_LABELS:
            spans.append({"start": ent.start_char, "end": ent.end_char, "label": label})
    return spans

Regex Spans

In [207]:
def get_regex_spans(text):
    spans = []
    for raw_label, pattern in REGEX_PATTERNS.items():
        mapped_label = LABEL_MAP.get(raw_label, raw_label)  # bleibt bei IBAN = IBAN
        if mapped_label not in TARGET_LABELS:
            continue
        for match in re.finditer(pattern, text):
            spans.append({
                "start": match.start(),
                "end": match.end(),
                "label": mapped_label
            })
    return spans

In [208]:
# Beispieltext zum Testen
sample_text = """
Sehr geehrter Herr John Doe,
Ihre Kundennummer 4012345678 ist aktiv.
Bitte kontaktieren Sie uns unter max@eon.de oder +49 171 1234567.
Ihre Zahlung über 89,99 € wurde am 15. August 2024 verbucht.
"""

# PIIranha-Spans abrufen
piiranha_spans = get_piiranha_spans(sample_text)
spacy_spans = get_spacy_spans(sample_text)
regex_spans = get_regex_spans(sample_text)

# Ergebnisse ausgeben
print(piiranha_spans)
print(spacy_spans)
print(regex_spans)
print("Hi")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[]
[{'start': 20, 'end': 24, 'label': 'VORNAME'}, {'start': 25, 'end': 28, 'label': 'NACHNAME'}, {'start': 48, 'end': 58, 'label': 'VERTRAGSNUMMER'}, {'start': 119, 'end': 135, 'label': 'TELEFONNUMMER'}, {'start': 154, 'end': 159, 'label': 'ZAHLUNG'}]
[{'start': 103, 'end': 113, 'label': 'EMAIL'}, {'start': 182, 'end': 186, 'label': 'DATUM'}]
Hi


In [209]:
# 🧠 Duplikate/Überschneidungen optional vereinfachen
def merge_spans(spans):
    return sorted(spans, key=lambda x: x['start'])

def resolve_conflicts(spans):
    # Sortiere Spans nach Startindex, dann nach Länge absteigend (damit äußere zuerst), dann nach Priorität
    spans = sorted(spans, key=lambda x: (x["start"], -(x["end"] - x["start"]), -PRIORITY_MAP.get(x.get("source", ""), 0)))

    resolved = []
    occupied = set()

    for span in spans:
        span_range = set(range(span["start"], span["end"]))
        conflict = False

        for existing in resolved:
            existing_range = set(range(existing["start"], existing["end"]))

            # ❌ Wenn Spans sich überschneiden
            if span_range & existing_range:
                # ➕ Wenn span vollständig in existing liegt oder umgekehrt → Priorität entscheidet
                if span["start"] >= existing["start"] and span["end"] <= existing["end"]:
                    if PRIORITY_MAP[span["source"]] > PRIORITY_MAP[existing["source"]]:
                        resolved.remove(existing)
                        break
                    else:
                        conflict = True
                        break
                elif existing["start"] >= span["start"] and existing["end"] <= span["end"]:
                    if PRIORITY_MAP[span["source"]] < PRIORITY_MAP[existing["source"]]:
                        conflict = True
                        break
                    else:
                        resolved.remove(existing)
                        break
                else:
                    conflict = True
                    break

        if not conflict:
            resolved.append(span)
            occupied.update(span_range)

    return resolved

# 🔐 Redaktion anwenden mit Labelnummerierung
def apply_final_redaction(text, spans):
    spans = sorted(spans, key=lambda x: x["start"])
    redacted = []
    last_index = 0
    label_counter = defaultdict(int)

    for span in spans:
        label = span["label"]
        label_counter[label] += 1
        label_with_id = f"{label}_{label_counter[label]}"

        # Text vor dem Span beibehalten
        redacted.append(text[last_index:span["start"]])
        # Ersetzung einfügen
        redacted.append(f"[{label_with_id}]")
        # Update der Position
        last_index = span["end"]

    # Rest anhängen
    redacted.append(text[last_index:])
    return ''.join(redacted)


# 🧩 Hauptfunktion
def mask_text_with_all(text, components=["regex"]):
    all_spans = []

    if "regex" in components:
        for span in get_regex_spans(text):
            span["source"] = "regex"
            all_spans.append(span)

    if "piiranha" in components:
        for span in get_piiranha_spans(text):
            span["source"] = "piiranha"
            all_spans.append(span)

    if "spacy" in components:
        for span in get_spacy_spans(text):
            span["source"] = "spacy"
            all_spans.append(span)

    # 🔧 Konflikte auflösen
    spans = resolve_conflicts(all_spans)
    merged = merge_spans(spans)
    return apply_final_redaction(text, merged)


def mask_text_with_single_component(text, component="regex"):
    if component == "regex":
        all_spans = get_regex_spans(text)
    elif component == "piiranha":
        all_spans = get_piiranha_spans(text)
    elif component == "spacy":
        all_spans = get_spacy_spans(text)
    else:
        raise ValueError(f"Unbekannte Komponente: {component}")

    # Optional: Konflikte lösen, falls eine Komponente mehrere Spans mit Überschneidung liefert
    spans = resolve_conflicts(all_spans)
    merged = merge_spans(spans)

    # Gib nur den maskierten Text zurück – analog zur all-Funktion
    return apply_final_redaction(text, merged)


In [210]:
sample = "01.08.2023\n Isabelle Eckhauer : (+49 (0) 5402 008802)\n"
        
print(get_piiranha_spans(sample))
print(get_spacy_spans(sample))
print(get_regex_spans(sample))

text = mask_text_with_all(sample)
print(text)

[]
[{'start': 0, 'end': 10, 'label': 'DATUM'}, {'start': 12, 'end': 20, 'label': 'VORNAME'}, {'start': 21, 'end': 29, 'label': 'NACHNAME'}, {'start': 32, 'end': 52, 'label': 'TELEFONNUMMER'}]
[{'start': 0, 'end': 10, 'label': 'DATUM'}]
[DATUM_1]
 Isabelle Eckhauer : (+49 (0) 5402 008802)



In [211]:
import json
import pandas as pd
from collections import defaultdict, Counter

# 📂 Ground Truth laden
with open("../../../../data/original/ground_truth_split/test_norm.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# 🔄 Hilfsfunktion: span → tuple
def to_tuple(span):
    return (span["start"], span["end"], span["label"])

def evaluate_entities(pred_fn, data, threshold=0.8):
    stats = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
    total_tp = total_fp = total_fn = 0

    for entry in data:
        text = entry["text"]
        gold_spans = [to_tuple(s) for s in entry["labels"]]
        pred_spans = [to_tuple(s) for s in pred_fn(text)]
        matched_gold = set()
        matched_pred = set()

        # Versuche jedes pred_span mit einem passenden gold_span zu matchen
        for pi, p in enumerate(pred_spans):
            best_match = None
            best_overlap = 0
            for gi, g in enumerate(gold_spans):
                if g[2] != p[2]:  # nur gleicher Labeltyp
                    continue
                # Overlap berechnen
                overlap = max(0, min(p[1], g[1]) - max(p[0], g[0]))
                union = max(p[1], g[1]) - min(p[0], g[0])
                jaccard = overlap / union if union > 0 else 0
                if jaccard >= threshold and jaccard > best_overlap:
                    best_overlap = jaccard
                    best_match = gi
            if best_match is not None:
                matched_gold.add(best_match)
                matched_pred.add(pi)
                stats[p[2]]["tp"] += 1
                total_tp += 1

        # False Positives: vorhergesagte Spans ohne passenden Gold-Span
        for i, p in enumerate(pred_spans):
            if i not in matched_pred:
                stats[p[2]]["fp"] += 1
                total_fp += 1

        # False Negatives: goldene Spans ohne passenden Vorhersage-Span
        for i, g in enumerate(gold_spans):
            if i not in matched_gold:
                stats[g[2]]["fn"] += 1
                total_fn += 1

    # Reporting
    rows = []
    for label, counts in stats.items():
        tp, fp, fn = counts["tp"], counts["fp"], counts["fn"]
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall    = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1        = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        rows.append({
            "component": "Combined",
            "label": label,
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "precision": precision,
            "recall": recall,
            "f1": f1
        })

    # Overall-Zeile
    overall_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    overall_recall    = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    overall_f1        = 2 * overall_precision * overall_recall / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0

    rows.append({
        "component": "Combined",
        "label": "OVERALL",
        "tp": total_tp,
        "fp": total_fp,
        "fn": total_fn,
        "precision": overall_precision,
        "recall": overall_recall,
        "f1": overall_f1
    })

    return pd.DataFrame(rows)

# 🔌 Kombinierte Vorhersagefunktion mit conflict resolution
def run_combined_spans(text):
    all_spans = []
    '''
    for span in get_regex_spans(text):
        span["source"] = "regex"
        all_spans.append(span)
    
    for span in get_piiranha_spans(text):
        span["source"] = "piiranha"
        all_spans.append(span)
    '''
    for span in get_spacy_spans(text):
        span["source"] = "spacy"
        all_spans.append(span)

    resolved = resolve_conflicts(all_spans)
    return merge_spans(resolved)

# 🧾 Evaluation ausführen und speichern
df_eval = evaluate_entities(run_combined_spans, test_data, threshold=0.5)
df_eval.to_csv("Results_synthetic_data_b_only_spacy_no_ruler.csv", index=False)
print("✓ Evaluation auf Entity-Ebene gespeichert: evaluation_entity_level_combined.csv")

✓ Evaluation auf Entity-Ebene gespeichert: evaluation_entity_level_combined.csv


In [212]:
df_eval

Unnamed: 0,component,label,tp,fp,fn,precision,recall,f1
0,Combined,GESENDET_MIT,3,0,3,1.0,0.5,0.666667
1,Combined,ZAHLUNG,7,3,1,0.7,0.875,0.777778
2,Combined,LINK,0,0,3,0.0,0.0,0.0
3,Combined,VERTRAGSNUMMER,28,0,12,1.0,0.7,0.823529
4,Combined,VORNAME,57,1,0,0.982759,1.0,0.991304
5,Combined,NACHNAME,60,2,1,0.967742,0.983607,0.97561
6,Combined,EMAIL,5,6,0,0.454545,1.0,0.625
7,Combined,BANK,0,3,1,0.0,0.0,0.0
8,Combined,DATUM,14,0,16,1.0,0.466667,0.636364
9,Combined,IBAN,1,1,3,0.5,0.25,0.333333


In [213]:
def extract_error_spans(pred_fn, data, threshold=0.8):
    false_positives = []
    false_negatives = []

    def to_tuple(span):
        return (span["start"], span["end"], span["label"])

    for entry in data:
        text = entry["text"]
        gold_spans = [to_tuple(s) for s in entry["labels"]]
        pred_spans = [to_tuple(s) for s in pred_fn(text)]
        matched_gold = set()
        matched_pred = set()

        for pi, p in enumerate(pred_spans):
            for gi, g in enumerate(gold_spans):
                if g[2] != p[2]:
                    continue
                # Overlap berechnen
                overlap = max(0, min(p[1], g[1]) - max(p[0], g[0]))
                union = max(p[1], g[1]) - min(p[0], g[0])
                jaccard = overlap / union if union > 0 else 0
                if jaccard >= threshold:
                    matched_gold.add(gi)
                    matched_pred.add(pi)
                    break  # nur erster Treffer zählt

        # False Positives
        for pi, p in enumerate(pred_spans):
            if pi not in matched_pred:
                false_positives.append({
                    "type": "FP",
                    "text": text[p[0]:p[1]],
                    "label": p[2],
                    "start": p[0],
                    "end": p[1],
                    "source": "pred_only"
                })

        # False Negatives
        for gi, g in enumerate(gold_spans):
            if gi not in matched_gold:
                false_negatives.append({
                    "type": "FN",
                    "text": text[g[0]:g[1]],
                    "label": g[2],
                    "start": g[0],
                    "end": g[1],
                    "source": "gold_only"
                })

    return pd.DataFrame(false_positives + false_negatives)


In [214]:
df_errors = extract_error_spans(run_combined_spans, test_data, threshold=0.1)
df_errors.to_csv("error_analysis_partial_match.csv", index=False)