In [26]:
!git clone https://github.com/AnnaGhost2713/daia-eon.git
%cd daia-eon/piiranha_refinement

Cloning into 'daia-eon'...
remote: Enumerating objects: 857, done.[K
remote: Counting objects: 100% (857/857), done.[K
remote: Compressing objects: 100% (631/631), done.[K
remote: Total 857 (delta 451), reused 596 (delta 218), pack-reused 0 (from 0)[K
Receiving objects: 100% (857/857), 3.05 MiB | 2.70 MiB/s, done.
Resolving deltas: 100% (451/451), done.
/content/daia-eon/notebooks/daia-eon/piiranha_refinement


In [22]:
import spacy
nlp = spacy.load("de_core_news_sm")

In [23]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import re
import spacy
import json
from sklearn.metrics import classification_report
import pandas as pd

Label Mapping der einzelnen Identifier

In [24]:
# 📌 Priorität definieren: je höher, desto stärker
PRIORITY_MAP = {
    "regex": 3,
    "piiranha": 1,
    "spacy": 2
}

TARGET_LABELS = ["TITEL", "VORNAME", "NACHNAME", "FIRMA", "TELEFONNUMMER", "EMAIL", "FAX", "STRASSE",
                 "HAUSNUMMER", "POSTLEITZAHL", "WOHNORT", "ZÄHLERNUMMER", "ZÄHLERSTAND", "VERTRAGSNUMMER",
                 "ZAHLUNG", "BANK", "IBAN", "BIC", "DATUM", "GESENDET_MIT", "LINK"]


LABEL_MAP = {
    # spaCy-Labels
    "PER": "NAME", "LOC": "ADRESSE", "ORG": "FIRMA", "DATE": "DATUM", "TIME": "DATUM",
    "GPE": "ADRESSE", "NORP": "GRUPPE", "MONEY": "ZAHLUNG",

    # PIIranha-Labels
    "I-GIVENNAME": "NAME", "I-SURNAME": "NAME", "I-DATEOFBIRTH": "DATUM",
    "I-EMAIL": "KONTAKT", "I-TELEPHONENUM": "KONTAKT", "I-USERNAME": "KONTAKT",
    "I-CREDITCARDNUMBER": "ZAHLUNG",
    "I-ACCOUNTNUM": "VERTRAG", "I-BILLINGNUM": "VERTRAG",
    "I-IDCARDNUM": "VERTRAG", "I-TAXNUM": "VERTRAG",
    "I-CITY": "ADRESSE", "I-ZIPCODE": "ADRESSE", "I-STREET": "ADRESSE", "I-BUILDINGNUM": "ADRESSE",
}

REGEX_PATTERNS = {
    "KONTAKT": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+|\+49[\d\s\-\(\)]+" ,
    "VERTRAG": r"\b\d{9,10}\b|4\s?0(?:\s?\d){7}",
    "ZAHLUNG": r"\b\d{1,5},\d{2}\s?(€|Euro|Cent)?\b",
    "IBAN": r"DE\d{20}",
    "DATUM": (
        r"\b\d{1,2}\.\d{1,2}\.\d{4}\b|"  # 15.08.2024
        r"\b\d{1,2}\s+(Januar|Februar|März|April|Mai|Juni|Juli|August|"
        r"September|Oktober|November|Dezember)\s+\d{4}\b|"  # 15 August 2024
        r"\b\d{1,2}\.\s+(Januar|Februar|März|April|Mai|Juni|Juli|August|"
        r"September|Oktober|November|Dezember)\s+\d{4}\b|"  # 15. August 2024
        r"\b(Januar|Februar|März|April|Mai|Juni|Juli|August|"
        r"September|Oktober|November|Dezember)\b|"          # August
        r"\b(19|20)\d{2}\b"                                 # Jahreszahlen wie 2023
    )
}


PIIranha Spans

In [None]:
model_name = "iiiorg/piiranha-v1-detect-personal-information"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_piiranha_spans(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, return_offsets_mapping=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    offset_mapping = inputs.pop("offset_mapping")[0].tolist()

    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()

    spans = []
    current_label = None
    current_start = None

    for i, (start, end) in enumerate(offset_mapping):
        if start == end:  # Special tokens
            continue

        raw_label = model.config.id2label[predictions[i]]
        mapped_label = LABEL_MAP.get(raw_label, None)

        if mapped_label in TARGET_LABELS:
            if current_label == mapped_label:
                continue  # Laufzeit verlängert sich bis label endet
            else:
                # Wenn neuer Start: alten Span abschließen
                if current_label is not None:
                    spans.append({"start": current_start, "end": offset_mapping[i-1][1], "label": current_label})
                current_label = mapped_label
                current_start = start
        else:
            if current_label is not None:
                spans.append({"start": current_start, "end": offset_mapping[i-1][1], "label": current_label})
                current_label = None
                current_start = None

    # Letzten Span abschließen
    if current_label is not None:
        spans.append({"start": current_start, "end": offset_mapping[-1][1], "label": current_label})

    return spans


SpaCy Spans

In [None]:
nlp = spacy.load("de_core_news_sm")

def get_spacy_spans(text):
    doc = nlp(text)
    spans = []
    for ent in doc.ents:
        label = LABEL_MAP.get(ent.label_, ent.label_)
        if label in TARGET_LABELS:
            spans.append({"start": ent.start_char, "end": ent.end_char, "label": label})
    return spans

Regex Spans

In [None]:
def get_regex_spans(text):
    spans = []
    for raw_label, pattern in REGEX_PATTERNS.items():
        mapped_label = LABEL_MAP.get(raw_label, raw_label)  # bleibt bei IBAN = IBAN
        if mapped_label not in TARGET_LABELS:
            continue
        for match in re.finditer(pattern, text):
            spans.append({
                "start": match.start(),
                "end": match.end(),
                "label": mapped_label
            })
    return spans

In [None]:
# Beispieltext zum Testen
sample_text = """
Sehr geehrter Herr John Doe,
Ihre Kundennummer 4012345678 ist aktiv.
Bitte kontaktieren Sie uns unter max@eon.de oder +49 171 1234567.
Ihre Zahlung über 89,99 € wurde am 15. August 2024 verbucht.
"""

# PIIranha-Spans abrufen
piiranha_spans = get_piiranha_spans(sample_text)
spacy_spans = get_spacy_spans(sample_text)
regex_spans = get_regex_spans(sample_text)

# Ergebnisse ausgeben
print(piiranha_spans)
print(spacy_spans)
print(regex_spans)
print("Hi")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'start': 47, 'end': 58, 'label': 'VERTRAG'}, {'start': 102, 'end': 113, 'label': 'KONTAKT'}, {'start': 118, 'end': 135, 'label': 'KONTAKT'}]
[{'start': 20, 'end': 28, 'label': 'NAME'}]
[{'start': 103, 'end': 113, 'label': 'KONTAKT'}, {'start': 119, 'end': 134, 'label': 'KONTAKT'}, {'start': 48, 'end': 58, 'label': 'VERTRAG'}, {'start': 154, 'end': 159, 'label': 'ZAHLUNG'}, {'start': 171, 'end': 186, 'label': 'DATUM'}]


In [None]:
# 🧠 Duplikate/Überschneidungen optional vereinfachen
def merge_spans(spans):
    return sorted(spans, key=lambda x: x['start'])

def resolve_conflicts(spans):
    # Sortiere Spans nach Startindex, dann nach Länge absteigend (damit äußere zuerst), dann nach Priorität
    spans = sorted(spans, key=lambda x: (x["start"], -(x["end"] - x["start"]), -PRIORITY_MAP.get(x.get("source", ""), 0)))

    resolved = []
    occupied = set()

    for span in spans:
        span_range = set(range(span["start"], span["end"]))
        conflict = False

        for existing in resolved:
            existing_range = set(range(existing["start"], existing["end"]))

            # ❌ Wenn Spans sich überschneiden
            if span_range & existing_range:
                # ➕ Wenn span vollständig in existing liegt oder umgekehrt → Priorität entscheidet
                if span["start"] >= existing["start"] and span["end"] <= existing["end"]:
                    if PRIORITY_MAP[span["source"]] > PRIORITY_MAP[existing["source"]]:
                        resolved.remove(existing)
                        break
                    else:
                        conflict = True
                        break
                elif existing["start"] >= span["start"] and existing["end"] <= span["end"]:
                    if PRIORITY_MAP[span["source"]] < PRIORITY_MAP[existing["source"]]:
                        conflict = True
                        break
                    else:
                        resolved.remove(existing)
                        break
                else:
                    conflict = True
                    break

        if not conflict:
            resolved.append(span)
            occupied.update(span_range)

    return resolved



# 🔐 Redaktion anwenden
def apply_final_redaction(text, spans):
    spans = sorted(spans, key=lambda x: x["start"])
    redacted = []
    last_index = 0

    for span in spans:
        # Text vor dem Span beibehalten
        redacted.append(text[last_index:span["start"]])
        # Ersetzung einfügen
        redacted.append(f"[{span['label']}]")
        # Update der Position
        last_index = span["end"]

    # Rest anhängen
    redacted.append(text[last_index:])
    return ''.join(redacted)


# 🧩 Hauptfunktion
def mask_text_with_all(text):
    all_spans = []

    # Ergebnisse holen und mit 'source' annotieren
    for span in get_regex_spans(text):
        span["source"] = "regex"
        all_spans.append(span)

    for span in get_piiranha_spans(text):
        span["source"] = "piiranha"
        all_spans.append(span)

    for span in get_spacy_spans(text):
        span["source"] = "spacy"
        all_spans.append(span)

    # 🔧 Konflikte auflösen
    spans = resolve_conflicts(all_spans)

    merged = merge_spans(spans)
    return apply_final_redaction(text, merged)

def mask_text_with_single_component(text, component="regex"):
    if component == "regex":
        all_spans = get_regex_spans(text)
    elif component == "piiranha":
        all_spans = get_piiranha_spans(text)
    elif component == "spacy":
        all_spans = get_spacy_spans(text)
    else:
        raise ValueError(f"Unbekannte Komponente: {component}")

    # Optional: Konflikte lösen, falls eine Komponente mehrere Spans mit Überschneidung liefert
    spans = resolve_conflicts(all_spans)
    merged = merge_spans(spans)

    # Gib nur den maskierten Text zurück – analog zur all-Funktion
    return apply_final_redaction(text, merged)


In [None]:
sample = "Sehr geehrte Damen und Herren,\nIch habe 2021 das Haus meines verstorbenen\nOnkels \u00fcbernommen.\nLeider wurde damals vers\u00e4umt den Namen im Vertrag zu \u00e4ndern.\nIch bitte die Daten im Vertrag entsprechend zu \u00e4ndern.\nKarl-Friedrich R\u00f6rricht\n20.06.1980\nDie Bankverbindung(Einziehung vom Konto meiner Schwester John R\u00f6rricht) kann weiter verwendet werden.\nMit freundlichen Gr\u00fc\u00dfen\nKarl-Friedrich R\u00f6rricht\nDiese Nachricht wurde von meinem Android Mobiltelefon mit GMX Mail gesendet.\n"

print(get_piiranha_spans(sample))
print(get_spacy_spans(sample))
print(get_regex_spans(sample))
print()
print(sample)
print(mask_text_with_all(sample))
print(mask_text_with_single_component(sample, component="piiranha"))
print(mask_text_with_single_component(sample, component="regex"))
print(mask_text_with_single_component(sample, component="spacy"))

[{'start': 39, 'end': 44, 'label': 'ADRESSE'}, {'start': 208, 'end': 232, 'label': 'NAME'}, {'start': 305, 'end': 314, 'label': 'NAME'}, {'start': 369, 'end': 393, 'label': 'NAME'}]
[{'start': 49, 'end': 60, 'label': 'ADRESSE'}, {'start': 209, 'end': 232, 'label': 'NAME'}, {'start': 291, 'end': 314, 'label': 'NAME'}, {'start': 370, 'end': 393, 'label': 'ADRESSE'}, {'start': 452, 'end': 460, 'label': 'FIRMA'}]
[{'start': 40, 'end': 44, 'label': 'DATUM'}, {'start': 233, 'end': 243, 'label': 'DATUM'}]

Sehr geehrte Damen und Herren,
Ich habe 2021 das Haus meines verstorbenen
Onkels übernommen.
Leider wurde damals versäumt den Namen im Vertrag zu ändern.
Ich bitte die Daten im Vertrag entsprechend zu ändern.
Karl-Friedrich Rörricht
20.06.1980
Die Bankverbindung(Einziehung vom Konto meiner Schwester John Rörricht) kann weiter verwendet werden.
Mit freundlichen Grüßen
Karl-Friedrich Rörricht
Diese Nachricht wurde von meinem Android Mobiltelefon mit GMX Mail gesendet.

Sehr geehrte Damen und 

In [27]:
import json
import pandas as pd
from sklearn.metrics import classification_report

# 📂 Testdaten laden
with open("data_piiranha/test_labels.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# 🎯 Ground Truth in zeichenbasierte Labels umwandeln
def extract_true_labels(data):
    texts, labels = [], []
    for entry in data:
        text = entry["text"]
        char_labels = ["O"] * len(text)
        for label in entry["labels"]:
            for i in range(label["start"], label["end"]):
                char_labels[i] = label["label"]
        texts.append(text)
        labels.append(char_labels)
    return texts, labels

# 🧩 Hilfsfunktion: PII-Spans → Zeichenbasierte Labels
def spans_to_charlabels(text, spans):
    labels = ["O"] * len(text)
    for span in spans:
        for i in range(span["start"], span["end"]):
            labels[i] = span["label"]
    return labels

# 🔍 Komponenten (diese Funktionen nutzt du aus deinem Erkennungscode)
def run_regex_component(text):
    return spans_to_charlabels(text, get_regex_spans(text))

def run_spacy_component(text):
    return spans_to_charlabels(text, get_spacy_spans(text))

def run_piiranha_component(text):
    return spans_to_charlabels(text, get_piiranha_spans(text))

def run_combined_component(text):
    all_spans = []
    for span in get_regex_spans(text):
        span["source"] = "regex"
        all_spans.append(span)
    for span in get_piiranha_spans(text):
        span["source"] = "piiranha"
        all_spans.append(span)
    for span in get_spacy_spans(text):
        span["source"] = "spacy"
        all_spans.append(span)

    resolved = resolve_conflicts(all_spans)
    merged = merge_spans(resolved)
    return spans_to_charlabels(text, merged)

# 🧪 Evaluation pro Komponente
def evaluate_component(name, component_fn, texts, y_true):
    y_pred = [component_fn(text) for text in texts]
    y_true_flat = [label for seq in y_true for label in seq]
    y_pred_flat = [label for seq in y_pred for label in seq]
    report = classification_report(y_true_flat, y_pred_flat, output_dict=True, zero_division=0)
    df = pd.DataFrame(report).transpose()
    df["component"] = name
    return df

# 🚀 Hauptauswertung starten
texts, y_true = extract_true_labels(test_data)

results = [
    evaluate_component("Regex", run_regex_component, texts, y_true),
    evaluate_component("spaCy", run_spacy_component, texts, y_true),
    evaluate_component("PIIranha", run_piiranha_component, texts, y_true),
    evaluate_component("combined", run_combined_component, texts, y_true)
]

# 📊 Ergebnisse kombinieren
result_df = pd.concat(results).reset_index().rename(columns={"index": "label"})

# 🔍 Relevante PII-Kategorien auswählen
relevant_labels = ["NAME", "ADRESSE", "FIRMA", "DATUM", "KONTAKT", "VERTRAG", "ZAHLUNG", "TECHNISCHE_DATEN"]
filtered_df = result_df[result_df["label"].isin(relevant_labels)]

# ✅ Finale Übersicht
final_df = filtered_df[["component", "label", "precision", "recall", "f1-score", "support"]]
print(final_df.sort_values(["label", "component"]).round(3))


   component             label  precision  recall  f1-score  support
25  PIIranha           ADRESSE      0.000   0.000     0.000    427.0
0      Regex           ADRESSE      0.000   0.000     0.000    427.0
37  combined           ADRESSE      0.000   0.000     0.000    427.0
13     spaCy           ADRESSE      0.000   0.000     0.000    427.0
26  PIIranha             DATUM      0.000   0.000     0.000    236.0
1      Regex             DATUM      0.907   0.822     0.862    236.0
38  combined             DATUM      0.907   0.822     0.862    236.0
14     spaCy             DATUM      0.000   0.000     0.000    236.0
27  PIIranha             FIRMA      0.000   0.000     0.000    140.0
2      Regex             FIRMA      0.000   0.000     0.000    140.0
39  combined             FIRMA      0.271   0.564     0.367    140.0
15     spaCy             FIRMA      0.252   0.564     0.349    140.0
28  PIIranha           KONTAKT      0.000   0.000     0.000    336.0
4      Regex           KONTAKT    