In [29]:
import pandas as pd
from pathlib import Path

df = pd.read_excel(Path("../data/DAiA Manual Labeling.xlsx"))


In [30]:
import json

with open("../data/piranha_training_data.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)


In [31]:
result_df.to_csv("label_fund_analyse.csv", index=False, encoding="utf-8")
print("Analyse gespeichert unter: label_fund_analyse.csv")


Analyse gespeichert unter: label_fund_analyse.csv


In [43]:
import pandas as pd
import json
from pathlib import Path

# Mappingfunktion aus deinem ursprünglichen Code
GROUPED_PLACEHOLDERS = {
    "NAME": ["TITEL", "VORNAME", "NACHNAME", "SKYPE"],
    "ADRESSE": ["STRASSE", "HAUSNUMMER", "POSTLEITZAHL", "WOHNORT"],
    "VERTRAG": ["VERTRAGSNUMMER", "KUNDENNUMMER", "ZUORDNUNGSNUMMER"],
    "ZAHLUNG": ["ZAHLUNG", "IBAN", "BIC"],
    "TECHNISCHE_DATEN": ["ZÄHLERSTAND", "ZÄHLERNUMMER", "VERBRAUCH", "WLV"],
    "KONTAKT": ["TELEFONNUMMER", "EMAIL", "MAIL", "LINK", "GESENDET_MIT", "FIRMENDATEN", "FAX"],
    "FIRMA": ["FIRMA"],
    "DATUM": ["DATUM"]
}

def map_column_to_label(col_name):
    upper_col = col_name.upper()
    for label, keywords in GROUPED_PLACEHOLDERS.items():
        for keyword in keywords:
            if keyword in upper_col:
                return label
    return None

def extract_missing_entries(df, training_data):
    missing_entries = []

    for i, row in df.iterrows():
        textfile = row["TextFile"]
        text_entry = training_data[i]
        text = text_entry["text"].lower()

        for col in row.index:
            if pd.isna(row[col]) or col == "TextFile":
                continue

            expected_value = str(row[col]).strip()

            # 🛠️ Saubere Entfernung von .0-Endungen (z. B. aus Excel-Zahlen)
            if expected_value.endswith(".0"):
                try:
                    expected_value = str(int(float(expected_value)))
                except ValueError:
                    pass

            expected_lower = expected_value.lower()
            label_type = map_column_to_label(col)

            if label_type and expected_lower not in text:
                missing_entries.append({
                    "TextFile": textfile,
                    "LabelType": label_type,
                    "Column": col,
                    "ExpectedValue": expected_value
        })

    return pd.DataFrame(missing_entries)

# Lade Daten
excel_path = Path("../data/DAiA Manual Labeling.xlsx")  # oder deine konkrete Datei
json_path = Path("../data/piranha_training_data.json")

df = pd.read_excel(excel_path)
with open(json_path, "r", encoding="utf-8") as f:
    training_data = json.load(f)

# Analyse durchführen
missing_df = extract_missing_entries(df, training_data)

# Ausgabe oder Export
print(missing_df.head())  # Optional anzeigen
missing_df.to_csv("fehlende_labels.csv", index=False, encoding="utf-8")
print("Fehlende Labels gespeichert unter: fehlende_labels.csv")


Empty DataFrame
Columns: []
Index: []
Fehlende Labels gespeichert unter: fehlende_labels.csv


In [9]:
import pandas as pd

def analyze_label_success(df, training_data):
    results = []

    for i, row in df.iterrows():
        textfile = row["TextFile"]
        text_entry = training_data[i]
        text = text_entry["text"].lower()
        found_labels = set()
        missing_labels = []

        for col in row.index:
            if pd.isna(row[col]) or col == "TextFile":
                continue

            expected_value = str(row[col]).strip()
            expected_lower = expected_value.lower()
            matched = expected_lower in text

            label_type = map_column_to_label(col)
            if label_type is None:
                continue

            if matched:
                found_labels.add(label_type)
            else:
                missing_labels.append({"column": col, "label": label_type, "value": expected_value})

        results.append({
            "TextFile": textfile,
            "FoundLabels": sorted(found_labels),
            "MissingLabels": [m["label"] for m in missing_labels],
            "MissingDetails": missing_labels
        })

    return pd.DataFrame(results)
