# Piiranha Raw Test

In [None]:
!pip install transformers

In [None]:
# pip install ipywidgets

## Import Piiranha Model from Huggingfase and Map the Piiranha Labels to our Custom Labels

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load model and tokenizer
model_name = "iiiorg/piiranha-v1-detect-personal-information"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Label mapping: PIIRANHA → your schema
PIIRANHA_TO_CUSTOM = {
    "GIVENNAME": "VORNAME",
    "SURNAME": "NACHNAHME",
    "ACCOUNTNUM": "KUNDENNUMMER",
    "CREDITCARD": "KUNDENNUMMER",
    "EMAIL": "EMAIL",
    "PHONENUMBER": "TELEFONNUMMER",
    "IBAN": "IBAN",
    "BIC": "BIC",
    "URL": "LINK",
    "DATE": "DATUM",
    "LOCATION": "ADRESSE",
    "USERNAME": "NACHNAHME",
    "ZIPCODE": "ADRESSE",
    "STREET": "ADRESSE",
    "CITY": "ADRESSE",
    "STATE": "ADRESSE",
    "ORGANIZATION": "FIRMA",
    "TITLE": "TITEL"
}

def apply_redaction(masked_text, start, end, pii_type, aggregate_redaction):
    for j in range(start, end):
        masked_text[j] = ''
    if aggregate_redaction:
        masked_text[start] = '[redacted]'
    else:
        masked_text[start] = f'[{pii_type}]'

def convert_labels_to_custom(spans):
    converted = []
    for span in spans:
        raw_label = span["label"].replace("I-", "").replace("B-", "")
        mapped_label = PIIRANHA_TO_CUSTOM.get(raw_label)
        if mapped_label:
            converted.append({
                "start": span["start"],
                "end": span["end"],
                "label": mapped_label
            })
    return converted

def mask_pii_spans(text, aggregate_redaction=True, return_spans=False):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=-1)

    encoded_inputs = tokenizer.encode_plus(
        text, return_offsets_mapping=True, add_special_tokens=True
    )
    offset_mapping = encoded_inputs['offset_mapping']

    masked_text = list(text)
    is_redacting = False
    redaction_start = 0
    current_pii_type = ''
    predicted_spans = []

    for i, (start, end) in enumerate(offset_mapping):
        if start == end:
            continue  # Skip special tokens

        label_id = predictions[0][i].item()
        label = model.config.id2label[label_id]

        if label != 'O':
            if not is_redacting:
                is_redacting = True
                redaction_start = start
                current_pii_type = label
            elif not aggregate_redaction and label != current_pii_type:
                predicted_spans.append({"start": redaction_start, "end": start, "label": current_pii_type})
                apply_redaction(masked_text, redaction_start, start, current_pii_type, aggregate_redaction)
                redaction_start = start
                current_pii_type = label
        else:
            if is_redacting:
                predicted_spans.append({"start": redaction_start, "end": end, "label": current_pii_type})
                apply_redaction(masked_text, redaction_start, end, current_pii_type, aggregate_redaction)
                is_redacting = False

    if is_redacting:
        predicted_spans.append({"start": redaction_start, "end": len(masked_text), "label": current_pii_type})
        apply_redaction(masked_text, redaction_start, len(masked_text), current_pii_type, aggregate_redaction)

    result = ''.join(masked_text)
    if return_spans:
        return result, predicted_spans
    return result

## Load Original Mails and execute Anonymization

In [35]:
from pathlib import Path

folder = Path("../../Nicolas_Testing/data/golden_dataset_original_copy")

for file_path in folder.glob("*.txt"):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    masked_text, spans = mask_pii_spans(text, return_spans=True)
    mapped_spans = convert_labels_to_custom(spans)

    print(f"\n=== {file_path.name} ===")
    print(masked_text[:500])  # Preview first 500 chars
    print("Predicted PII spans:")
    for span in mapped_spans:
        print(span)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



=== 29.txt ===
Sehr geehrte Damen und Herren,
hiermit möchte ich dass all meine personenbezogenen Daten (Vertragskonto[redacted]öscht werden.
Ich bitte um eine schriftliche Bestätigung darüber, dass alle Daten von mir mit dem o.g. Vertragskonto gelöscht worden sind.
Mit freundlichen Grüßen,
Liselotte Metz

Predicted PII spans:
{'start': 103, 'end': 118, 'label': 'KUNDENNUMMER'}

=== 15.txt ===
Sehr geehrte Damen und Herren,
hiermit lege Ich,[redacted] Vertragsnummer[redacted] bei der Schlussrechnung 2022/ 2023 Wiederspruch ein.
Die Wohnungsabnahme war am 15.05.2022, anbei das Übergabeprotokoll der Hausverwaltung. Der Abbrechnungszeitraum vom 16.05.-31.05.22 fäll somit nicht mehr in meinen Bemessungszeitraum. Anbei auch die neue Meldebescheinigung. Ich bitte hiermit um Klärung der Abrechnung.
Mit freundlichen Grüßen[redacted]
Predicted PII spans:
{'start': 48, 'end': 65, 'label': 'VORNAME'}
{'start': 80, 'end': 91, 'label': 'KUNDENNUMMER'}
{'start': 437, 'end': 454, 'label': 'VORNAME'}

In [39]:
with open("../../Nicolas_Testing/data/golden_dataset_original_copy/1.txt", "r", encoding="utf-8") as f:
    text = f.read()

masked_text, spans = mask_pii_spans(text, return_spans=True)
mapped_spans = convert_labels_to_custom(spans)

print(masked_text)
print("Predicted PII spans:")
for span in mapped_spans:
    print(span)

Hallo liebes Eon Team,
es geht um die Vertragsnummer[redacted] der Einrichtung meines neuen Vertrages wurde leider die Überweisung als
Zahlungsart gewählt von dem jungen Kollegen an der Wohnungstür. Ich würde
es gerne wieder per Lastschrift abbuchen lassen, um mir den Stress zu
ersparen.
Verbraucherstelle ist weiterhin die[redacted][redacted]
Gruß Berthold Huhn

Predicted PII spans:
{'start': 52, 'end': 67, 'label': 'KUNDENNUMMER'}
{'start': 329, 'end': 345, 'label': 'ADRESSE'}
{'start': 345, 'end': 359, 'label': 'ADRESSE'}


## Load Ground Truth and Calculate Precision, Recall & F1

In [40]:
import json

# Load ground truth
with open("../../Nicolas_Testing/data/piranha_ground_truth.json", "r", encoding="utf-8") as f:
    ground_truth = json.load(f)

# Metrics counters
TP, FP, FN = 0, 0, 0

# Optional: group similar gold labels
GROUPED_LABELS = {
    "KUNDENNUMMER": {"KUNDENNUMMER", "VERTRAGSNUMMER", "ZUORDNUNGSNUMMER", "WLV"},
    "ADRESSE": {"ADRESSE", "ZIPCODE", "CITY", "STREET"},
    "VORNAME": {"VORNAME", "GIVENNAME"},
    "NACHNAME": {"NACHNAME", "SURNAME", "USERNAME"},
    "EMAIL": {"EMAIL", "MAIL"},
    "TELEFONNUMMER": {"TELEFONNUMMER", "PHONENUMBER"},
    "IBAN": {"IBAN"},
    "BIC": {"BIC"},
    "LINK": {"LINK", "URL"},
    "DATUM": {"DATUM", "DATE"},
    "FIRMA": {"FIRMA", "ORGANIZATION"},
    "TITEL": {"TITEL", "TITLE"}
}

# Helper: normalize labels to group base (e.g. map VERTRAGSNUMMER → KUNDENNUMMER)
def normalize_label(label):
    for group, aliases in GROUPED_LABELS.items():
        if label in aliases:
            return group
    return label

# Relaxed span match with IoU
def relaxed_overlap(pred, gold, iou_threshold=0.5):
    pred_label = normalize_label(pred["label"])
    gold_label = normalize_label(gold["label"])
    if pred_label != gold_label:
        return False
    inter_start = max(pred["start"], gold["start"])
    inter_end = min(pred["end"], gold["end"])
    intersection = max(0, inter_end - inter_start)
    union = max(pred["end"], gold["end"]) - min(pred["start"], gold["start"])
    iou = intersection / union if union > 0 else 0
    return iou >= iou_threshold

# Evaluation loop
for example in ground_truth:
    text = example["text"]
    gold_spans = [label for label in example["labels"] if label["start"] < label["end"]]

    _, raw_preds = mask_pii_spans(text, return_spans=True)
    pred_spans = convert_labels_to_custom(raw_preds)

    matched_gold = set()
    matched_pred = set()

    for i, pred in enumerate(pred_spans):
        for j, gold in enumerate(gold_spans):
            if relaxed_overlap(pred, gold):
                TP += 1
                matched_gold.add(j)
                matched_pred.add(i)
                break

    FP += len(pred_spans) - len(matched_pred)
    FN += len(gold_spans) - len(matched_gold)

# Compute metrics
precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

print("=== Evaluation Metrics ===")
print(f"True Positives: {TP}")
print(f"False Positives: {FP}")
print(f"False Negatives: {FN}")
print(f"\nPrecision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

=== Evaluation Metrics ===
True Positives: 119
False Positives: 214
False Negatives: 345

Precision: 0.3574
Recall:    0.2565
F1 Score:  0.2986


## Per Label Results

In [41]:
from collections import defaultdict
import json

# Load ground truth
with open("../../Nicolas_Testing/data/piranha_ground_truth.json", "r", encoding="utf-8") as f:
    ground_truth = json.load(f)

# Initialize per-label counts
label_metrics = defaultdict(lambda: {"TP": 0, "FP": 0, "FN": 0})

# Evaluation loop
for example in ground_truth:
    text = example["text"]
    gold_spans = [label for label in example["labels"] if label["start"] < label["end"]]

    _, raw_preds = mask_pii_spans(text, return_spans=True)
    pred_spans = convert_labels_to_custom(raw_preds)

    matched_gold = set()
    matched_pred = set()

    for i, pred in enumerate(pred_spans):
        for j, gold in enumerate(gold_spans):
            if relaxed_overlap(pred, gold):
                norm_label = normalize_label(gold["label"])
                label_metrics[norm_label]["TP"] += 1
                matched_gold.add(j)
                matched_pred.add(i)
                break

    # Count unmatched predictions as FP
    for i, pred in enumerate(pred_spans):
        if i not in matched_pred:
            norm_label = normalize_label(pred["label"])
            label_metrics[norm_label]["FP"] += 1

    # Count unmatched gold spans as FN
    for j, gold in enumerate(gold_spans):
        if j not in matched_gold:
            norm_label = normalize_label(gold["label"])
            label_metrics[norm_label]["FN"] += 1

# Print per-label metrics
print("=== Per-Label Evaluation ===")
for label, counts in sorted(label_metrics.items()):
    TP = counts["TP"]
    FP = counts["FP"]
    FN = counts["FN"]
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    print(f"\nLabel: {label}")
    print(f"  TP: {TP}, FP: {FP}, FN: {FN}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1 Score:  {f1:.4f}")

=== Per-Label Evaluation ===

Label: ADRESSE
  TP: 50, FP: 41, FN: 24
  Precision: 0.5495
  Recall:    0.6757
  F1 Score:  0.6061

Label: BETRAG
  TP: 0, FP: 0, FN: 25
  Precision: 0.0000
  Recall:    0.0000
  F1 Score:  0.0000

Label: BIC
  TP: 0, FP: 0, FN: 1
  Precision: 0.0000
  Recall:    0.0000
  F1 Score:  0.0000

Label: DATUM
  TP: 0, FP: 0, FN: 99
  Precision: 0.0000
  Recall:    0.0000
  F1 Score:  0.0000

Label: EMAIL
  TP: 1, FP: 8, FN: 13
  Precision: 0.1111
  Recall:    0.0714
  F1 Score:  0.0870

Label: FAX
  TP: 0, FP: 0, FN: 5
  Precision: 0.0000
  Recall:    0.0000
  F1 Score:  0.0000

Label: FIRMA
  TP: 0, FP: 0, FN: 25
  Precision: 0.0000
  Recall:    0.0000
  F1 Score:  0.0000

Label: GESENDET_MIT
  TP: 0, FP: 0, FN: 26
  Precision: 0.0000
  Recall:    0.0000
  F1 Score:  0.0000

Label: IBAN
  TP: 0, FP: 0, FN: 7
  Precision: 0.0000
  Recall:    0.0000
  F1 Score:  0.0000

Label: KUNDENNUMMER
  TP: 68, FP: 17, FN: 20
  Precision: 0.8000
  Recall:    0.7727
  F1 Sco