# Vorbereitung des Trainingsdatensatzes für Piranha-Finetuning mit gruppierten Labels
Speichert die erzeugten Dateien (Trainings- und Evaluationsdaten als .jsonl) im Ordner ../old_data/converted_piranha

In [None]:


import json
from transformers import AutoTokenizer
from pathlib import Path

# Coarse label groups: each key is the group name that appears in the BIO
# tags; each value lists the fine-grained source labels folded into it.
GROUPED_LABELS = {
    "NAME": [
        "VORNAME",
        "NACHNAME",
        "TITEL",
        "SKYPE",
    ],
    "ADRESSE": [
        "STRASSE",
        "HAUSNUMMER",
        "POSTLEITZAHL",
        "WOHNORT",
    ],
    "VERTRAG": [
        "VERTRAGSNUMMER",
        "KUNDENNUMMER",
        "ZUORDNUNGSNUMMER",
    ],
    "ZAHLUNG": [
        "ZAHLUNG",
        "IBAN",
        "BIC",
        "FAX",
    ],
    "TECHNISCHE_DATEN": [
        "ZÄHLERSTAND",
        "ZÄHLERNUMMER",
        "VERBRAUCH",
        "WLV",
    ],
    "KONTAKT": [
        "TELEFONNUMMER",
        "EMAIL",
        "MAIL",
        "LINK",
        "GESENDET_MIT",
        "FIRMENDATEN",
    ],
    "FIRMA": ["FIRMA"],
    "DATUM": ["DATUM"],
}

# Final tag inventory: "O" first, then a B-/I- pair per group in the
# declaration order of GROUPED_LABELS.
label_list = [
    "O",
    *(f"{tag}-{group}" for group in GROUPED_LABELS for tag in "BI"),
]

# Load the tokenizer used to split texts into tokens and to obtain the
# per-token character offsets that the span-to-token alignment relies on.
# NOTE(review): the tokens written to the output files come from this
# tokenizer; confirm it matches the tokenizer of the model that is
# fine-tuned on these files.
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

# Conversion function

def convert_to_token_classification_format(json_data):
    """Convert span-annotated records into token-level BIO-tagged examples.

    Each record must provide a ``"text"`` string and may provide a
    ``"labels"`` list of annotations, each a mapping with ``"start"``,
    ``"end"`` and ``"label"`` keys. The start/end values are compared
    directly against the character offsets reported by the module-level
    ``tokenizer``. Annotation labels are matched case-insensitively against
    ``GROUPED_LABELS`` (either a member label or the group name itself);
    annotations that belong to no group are ignored.

    A token is tagged with a span's group only when the token lies completely
    inside that span. The first token assigned to a span is tagged ``B-``,
    every later token of the same span ``I-``. All other tokens are ``"O"``.

    Args:
        json_data: Iterable of record dicts as described above.

    Returns:
        A list with one dict per record: ``"tokens"`` holds the tokenizer's
        token strings and ``"labels"`` holds one tag per entry of the
        tokenizer's offset mapping.

    Raises:
        KeyError: If a record has no ``"text"`` or an annotation lacks
            ``"start"``, ``"end"`` or ``"label"``.
    """
    # Reverse lookup from an upper-cased label (or group name) to its group.
    # setdefault keeps the first group in GROUPED_LABELS order, which is what
    # a linear scan over the groups would return.
    group_of = {}
    for group, members in GROUPED_LABELS.items():
        for name in (*members, group):
            group_of.setdefault(name, group)

    dataset = []
    for entry in json_data:
        text = entry["text"]

        # (start, end) -> group; a later annotation with the same span wins.
        span_groups = {}
        for annotation in entry.get("labels", []):
            ann_start = annotation["start"]
            ann_end = annotation["end"]
            group = group_of.get(annotation["label"].upper())
            if group is not None:
                span_groups[(ann_start, ann_end)] = group

        encoding = tokenizer(text, return_offsets_mapping=True, truncation=True)
        tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])

        token_labels = []
        # Spans that already received their B- token. Tracking this instead of
        # testing ``start == ent_start`` guarantees that every span opens with
        # B- even when the annotation starts on whitespace or inside a token,
        # where no token offset equals the span start exactly.
        opened_spans = set()
        for start, end in encoding["offset_mapping"]:
            assigned = "O"
            # Zero-width offsets (presumably special tokens such as
            # [CLS]/[SEP]) never carry an entity tag.
            if start != end:
                for span, group in span_groups.items():
                    ent_start, ent_end = span
                    if ent_start <= start and end <= ent_end:
                        prefix = "I-" if span in opened_spans else "B-"
                        opened_spans.add(span)
                        assigned = f"{prefix}{group}"
                        break
            token_labels.append(assigned)

        dataset.append({"tokens": tokens, "labels": token_labels})
    return dataset

# Read both annotated corpora from disk.
with open("../old_data/piranha_old/piranha_training_data.json", encoding="utf-8") as f:
    real_data = json.loads(f.read())

with open("../old_data/piranha_old/piranha_synthetic_data.json", encoding="utf-8") as f:
    synthetic_data = json.loads(f.read())

# The synthetic records become the training split, the real records the
# evaluation split.
train_dataset = convert_to_token_classification_format(synthetic_data)
eval_dataset = convert_to_token_classification_format(real_data)

# Write each split as JSON Lines (one example per line); create the target
# folder first if it does not exist yet.
Path("../old_data/converted_piranha").mkdir(parents=True, exist_ok=True)

with open("../old_data/converted_piranha/train_converted_piranha.jsonl", "w", encoding="utf-8") as f:
    for item in train_dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

with open("../old_data/converted_piranha/eval_converted_piranha.jsonl", "w", encoding="utf-8") as f:
    for item in eval_dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")
