In [1]:
import os
import re
import json
import pandas as pd
from pathlib import Path

GROUPED_PLACEHOLDERS = {
    "NAME": ["NAME", "SKYPE"],
    "ADRESSE": ["ADRESSE"],
    "VERTRAG": ["VERTRAGSNUMMER", "KUNDENNUMMER", "ZUORDNUNGSNUMMER"],
    "ZAHLUNG": ["ZAHLUNG", "IBAN", "BIC"],
    "TECHNISCHE_DATEN": ["ZÄHLERSTAND", "ZÄHLERNUMMER", "VERBRAUCH", "WLV"],
    "KONTAKT": ["TELEFONNUMMER", "EMAIL", "MAIL", "LINK", "GESENDET_MIT", "FIRMENDATEN", "FAX"],
    "FIRMA": ["FIRMA"],
    "DATUM": ["DATUM"]
}

def map_column_to_label(col_name):
    upper_col = col_name.upper()
    for label, keywords in GROUPED_PLACEHOLDERS.items():
        if any(keyword in upper_col for keyword in keywords):
            return label
    return None

def find_all_occurrences(text, value):
    """Finde alle (nicht überlappenden) Vorkommen von `value` im `text`, case-insensitive."""
    matches = []
    pattern = re.escape(value)
    for match in re.finditer(pattern, text, re.IGNORECASE):
        matches.append((match.start(), match.end()))
    return matches

def generate_piranha_training_data(df, email_folder_path, output_file_path):
    training_data = []

    for index, row in df.iterrows():
        file_name = row["TextFile"]
        email_path = os.path.join(email_folder_path, file_name)

        try:
            with open(email_path, "r", encoding="utf-8") as file:
                text = file.read()
        except FileNotFoundError:
            print(f"❌ Datei nicht gefunden: {file_name}")
            continue

        labels = []
        for col in row.index:
            if pd.isna(row[col]):
                continue
            label_type = map_column_to_label(col)
            if label_type:
                value = str(row[col]).strip()
                if not value:
                    continue

                matches = find_all_occurrences(text, value)
                if not matches:
                    print(f"⚠️ Wert nicht gefunden: '{value}' in Datei {file_name}")
                for start_idx, end_idx in matches:
                    labels.append({
                        "start": start_idx,
                        "end": end_idx,
                        "label": label_type
                    })

        training_data.append({
            "text": text,
            "labels": labels
        })

    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(training_data, f, ensure_ascii=False, indent=2)

    print(f"✅ Piranha-Trainingsdaten gespeichert unter: {output_file_path}")


    # Beispielhafte Anwendung:
df = pd.read_excel("../data/DAiA_Manual_Labeling_zusammengefasst.xlsx")
generate_piranha_training_data(
    df,
    email_folder_path="../data/golden_dataset_original",        # Pfad zu den Original-E-Mails
    output_file_path="../data/piranha_training_data_zusammengefasst.json"       # Ziel-Datei mit JSON-Daten
)



⚠️ Wert nicht gefunden: '919684277.0' in Datei 24.txt
⚠️ Wert nicht gefunden: '0.0' in Datei 26.txt
⚠️ Wert nicht gefunden: '14042.0' in Datei 100.txt
⚠️ Wert nicht gefunden: '2019.0' in Datei 162.txt
✅ Piranha-Trainingsdaten gespeichert unter: ../data/piranha_training_data_zusammengefasst.json
