In [19]:
import json
import json
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
import evaluate


# Pfade anpassen
train_path = "../data/real_focus_zusammengefasst/train_zusammengefasst.json"
dev_path = "../data/real_focus_zusammengefasst/dev_zusammengefasst.json"

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


train_data = load_json(train_path)
dev_data = load_json(dev_path)


print(f"{len(train_data)} Trainingsbeispiele, {len(dev_data)} Validierungsbeispiele")


112 Trainingsbeispiele, 24 Validierungsbeispiele


In [20]:
model_name = "iiiorg/piiranha-v1-detect-personal-information"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_tag(data):
    tokenized = []
    for example in data:
        text = example["text"]
        spans = example["labels"]
        encoding = tokenizer(text, return_offsets_mapping=True, truncation=True)
        tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
        offsets = encoding["offset_mapping"]
        ner_tags = []

        for (start, end) in offsets:
            if start == end:
                ner_tags.append("O")
                continue
            label = "O"
            for span in spans:
                if start >= span["start"] and end <= span["end"]:
                    label = span["label"]
                    break
            ner_tags.append(label)

        tokenized.append({
            "tokens": tokens,
            "labels": ner_tags
        })
    return tokenized

tokenized_train_raw = tokenize_and_tag(train_data)
tokenized_dev_raw = tokenize_and_tag(dev_data)

ds_train = Dataset.from_list(tokenized_train_raw)
ds_dev = Dataset.from_list(tokenized_dev_raw)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [21]:
unique_labels = sorted(set(lab for x in ds_train["labels"] for lab in x if lab != "O"))
label2id = {"O": 0, **{label: i+1 for i, label in enumerate(unique_labels)}}
id2label = {v: k for k, v in label2id.items()}

def encode_labels(example):
    example["labels"] = [label2id[label] for label in example["labels"]]
    return example

ds_train = ds_train.map(encode_labels)
ds_dev = ds_dev.map(encode_labels)


Map: 100%|██████████| 112/112 [00:00<00:00, 6185.47 examples/s]
Map: 100%|██████████| 24/24 [00:00<00:00, 7411.52 examples/s]


In [22]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True, padding="max_length", max_length=256)
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        prev_word_id = None
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            elif word_id != prev_word_id:
                label_ids.append(label[word_id])
            else:
                label_ids.append(label[word_id])
            prev_word_id = word_id
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train = ds_train.map(tokenize_and_align_labels, batched=True)
tokenized_dev = ds_dev.map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 112/112 [00:00<00:00, 3440.75 examples/s]
Map: 100%|██████████| 24/24 [00:00<00:00, 3642.60 examples/s]


In [32]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate
import numpy as np

# Modell und Tokenizer laden
model_name = "iiiorg/piiranha-v1-detect-personal-information"
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Collator und Metrik
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval", scheme_type="IOB2")

# Metrics-Funktion

# Trainingsparameter
training_args = TrainingArguments(
    output_dir="./piiranha-custom-model",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=200,
    save_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

# Trainer definieren
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 🔁 Modell trainieren
trainer.train()

# 🧪 Optional: Evaluation auf Validierungsdaten explizit anstoßen
print("\n🔎 Manuelle Evaluation auf Validierungsdaten:")
trainer.evaluate()


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at iiiorg/piiranha-v1-detect-personal-information and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([18]) in the checkpoint and torch.Size([9]) in the model instantiated
- classifier.weight: found shape torch.Size([18, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss



🔎 Manuelle Evaluation auf Validierungsdaten:


{'eval_loss': 0.28166326880455017,
 'eval_precision': 0.11827956989247312,
 'eval_recall': 0.10476190476190476,
 'eval_f1': 0.1111111111111111,
 'eval_accuracy': 0.9229226361031518,
 'eval_runtime': 1.7135,
 'eval_samples_per_second': 14.006,
 'eval_steps_per_second': 1.751,
 'epoch': 5.0}

In [55]:
def compute_metrics(p):
    preds, labels = p
    preds = np.argmax(preds, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(preds, labels)
    ]

    # Metriken berechnen
    results = metric.compute(predictions=true_predictions, references=true_labels)

    # ✅ Detaillierte Ergebnisse ausgeben
    print("\n📊 Ergebnisse pro Label:")
    for label in results['entities']:
        scores = results['entities'][label]
        print(f"{label}: P={scores['precision']:.2%}, R={scores['recall']:.2%}, F1={scores['f1']:.2%}")

    # Ergebnisse zurückgeben (gesamt + je Label)
    return {
        "precision": results["overall"]["precision"],
        "recall": results["overall"]["recall"],
        "f1": results["overall"]["f1"],
        "accuracy": results["overall"]["accuracy"],
        **{f"{label}_f1": results["entities"][label]["f1"] for label in results["entities"]}
    }


In [56]:
results = trainer.evaluate()
print(results)


{'eval_loss': 0.28166326880455017, 'eval_precision': 0.11827956989247312, 'eval_recall': 0.10476190476190476, 'eval_f1': 0.1111111111111111, 'eval_accuracy': 0.9229226361031518, 'eval_runtime': 5.7013, 'eval_samples_per_second': 4.21, 'eval_steps_per_second': 0.526}


In [58]:
from pathlib import Path
import nbformat

# Lade das Notebook
notebook_path = Path("../data/piiranha_file.ipynb")
with open(notebook_path, "r", encoding="utf-8") as f:
    notebook_content = f.read()

# Parse das Notebook
nb = nbformat.reads(notebook_content, as_version=4)

# Durchsuche alle Codezellen nach compute_metrics und ersetze sie
for cell in nb.cells:
    if cell.cell_type == "code" and "def compute_metrics" in cell.source:
        cell.source = '''
def compute_metrics(p):
    preds, labels = p
    preds = np.argmax(preds, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(preds, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)

    print("\\n📊 Ergebnisse pro Label:")
    for label in results['entities']:
        scores = results['entities'][label]
        print(f"{label}: P={scores['precision']:.2%}, R={scores['recall']:.2%}, F1={scores['f1']:.2%}")

    return {
        "precision": results["overall"]["precision"],
        "recall": results["overall"]["recall"],
        "f1": results["overall"]["f1"],
        "accuracy": results["overall"]["accuracy"],
        **{f"{label}_f1": results["entities"][label]["f1"] for label in results["entities"]}
    }
'''

# Speichere das aktualisierte Notebook
updated_path = "/mnt/data/piiranha_file_updated.ipynb"
with open(updated_path, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

updated_path


FileNotFoundError: [Errno 2] No such file or directory: '../data/piiranha_file.ipynb'