In [1]:
import json
import spacy
from pathlib import Path

# Load the spaCy model (alternatively: nlp = spacy.blank("de"))
nlp = spacy.load("de_core_news_md")

# Input and output files (relative paths, resolved against the working directory)
input_path = Path("ground_truth.json")
output_path = Path("ground_truth_cleaned.json")

def clean_labels(entry, nlp):
    """Snap an entry's character-offset labels onto token boundaries.

    The entry's text is run through ``nlp`` and every label is passed to
    ``doc.char_span`` with ``alignment_mode="expand"``. Labels for which a
    span comes back are re-emitted with the span's offsets; the others are
    reported on stdout and dropped.

    Args:
        entry: Mapping with a required ``"text"`` key and optional ``"file"``
            and ``"labels"`` keys. Each label is a mapping with ``"start"``,
            ``"end"`` and ``"label"``.
        nlp: Callable that turns the text into a document object providing
            ``char_span`` (here: the loaded spaCy pipeline).

    Returns:
        A new dict with the keys ``"file"`` (empty string when the entry has
        none), ``"text"`` and ``"labels"``. Other keys of ``entry`` are not
        copied.

    Raises:
        KeyError: If ``entry`` lacks ``"text"`` or a label lacks one of its
            three keys.
    """
    text = entry["text"]
    # "labels" may be missing or explicitly null in the JSON; both mean
    # "no labels" and must not crash the loop below.
    labels = entry.get("labels") or []
    doc = nlp(text)
    cleaned_labels = []

    for label in labels:
        span = doc.char_span(label["start"], label["end"], label=label["label"], alignment_mode="expand")
        if span:
            # Store the span's offsets, which may be wider than the
            # annotated ones because of the "expand" alignment.
            cleaned_labels.append({
                "start": span.start_char,
                "end": span.end_char,
                "label": span.label_
            })
        else:
            print(f"⚠️ Label nicht ausrichtbar in {entry.get('file', '??')}: '{text[label['start']:label['end']]}' @ {label['start']}-{label['end']}")

    return {
        "file": entry.get("file", ""),
        "text": text,
        "labels": cleaned_labels
    }

def process_file(input_path, output_path):
    """Clean every entry of a JSON annotation file and save the result.

    The JSON document at ``input_path`` is loaded, each of its items is
    passed through ``clean_labels`` together with the module-level ``nlp``
    object, and the resulting list is written to ``output_path`` as UTF-8
    JSON (2-space indentation, non-ASCII characters left unescaped). The
    resolved output location is printed at the end.

    Args:
        input_path: Path object of the JSON file to read.
        output_path: Path object of the JSON file to write.
    """
    with input_path.open("r", encoding="utf-8") as source:
        entries = json.load(source)

    # Process the entries one by one, in file order.
    results = []
    for item in entries:
        results.append(clean_labels(item, nlp))

    with output_path.open("w", encoding="utf-8") as sink:
        json.dump(results, sink, ensure_ascii=False, indent=2)

    saved_to = output_path.resolve()
    print(f"\n✅ Bereinigte Datei gespeichert unter: {saved_to}")

# Run the cleaning step on the configured input and output files
process_file(input_path, output_path)


✅ Bereinigte Datei gespeichert unter: /Users/timonmartens/Library/CloudStorage/OneDrive-Persönlich/Desktop/Veranstaltungen/Data Analytics in Applications/daia-eon/data/original/ground_truth_cleaned.json


In [3]:
import json
import spacy
from pathlib import Path

# Load the spaCy model
nlp = spacy.load("de_core_news_md")

# Input and output paths (relative, resolved against the working directory)
input_path = Path("ground_truth.json")
output_path = Path("ground_truth_cleaned.json")

def clean_labels(entry, nlp):
    """Align an entry's character-offset labels to token boundaries.

    The entry's text is run through ``nlp``. Each label is first aligned with
    ``doc.char_span(..., alignment_mode="strict")``. If that yields no span,
    the label is retried with ``alignment_mode="expand"``; the widened span is
    reported on stdout and (for now) accepted automatically. Labels that
    yield no span in either mode are reported and dropped.

    Args:
        entry: Mapping with a required ``"text"`` key and optional ``"file"``
            and ``"labels"`` keys. Each label is a mapping with ``"start"``,
            ``"end"`` and ``"label"``.
        nlp: Callable that turns the text into a document object providing
            ``char_span`` (here: the loaded spaCy pipeline).

    Returns:
        A new dict with the keys ``"file"`` (empty string when the entry has
        none), ``"text"`` and ``"labels"``. Other keys of ``entry`` are not
        copied.

    Raises:
        KeyError: If ``entry`` lacks ``"text"`` or a label lacks one of its
            three keys.
    """
    text = entry["text"]
    # "labels" may be missing or explicitly null in the JSON; both mean
    # "no labels" and must not crash the loop below.
    labels = entry.get("labels") or []
    doc = nlp(text)
    cleaned_labels = []

    for label in labels:
        start, end, label_name = label["start"], label["end"], label["label"]
        span_exact = doc.char_span(start, end, label=label_name, alignment_mode="strict")

        if span_exact:
            # Offsets already sit on token boundaries.
            cleaned_labels.append({
                "start": span_exact.start_char,
                "end": span_exact.end_char,
                "label": span_exact.label_
            })
            continue

        # The expanded alignment is only needed once strict alignment failed,
        # so it is computed here rather than for every label.
        span_expand = doc.char_span(start, end, label=label_name, alignment_mode="expand")

        if span_expand:
            # Show what the expanded alignment would cover.
            print(f"🔍 Nicht exakt ausrichtbar ({entry.get('file', '??')}): '{text[start:end]}' @ {start}-{end}")
            print(f"➡️  Würde erweitert auf: '{span_expand.text}' @ {span_expand.start_char}-{span_expand.end_char}")
            print(f"    Tokens: {[t.text for t in span_expand]}")
            print("    ❓ Akzeptieren oder anpassen?\n")

            # Decision point: the expanded span is accepted automatically
            # for the time being; adjust here to handle it differently.
            cleaned_labels.append({
                "start": span_expand.start_char,
                "end": span_expand.end_char,
                "label": span_expand.label_
            })
        else:
            print(f"⚠️ Gar nicht zuweisbar ({entry.get('file', '??')}): '{text[start:end]}' @ {start}-{end}")

    return {
        "file": entry.get("file", ""),
        "text": text,
        "labels": cleaned_labels
    }

def process_file(input_path, output_path):
    """Clean every entry of a JSON annotation file and save the result.

    The JSON document at ``input_path`` is loaded, each of its items is
    passed through ``clean_labels`` together with the module-level ``nlp``
    object, and the resulting list is written to ``output_path`` as UTF-8
    JSON (2-space indentation, non-ASCII characters left unescaped). The
    resolved output location is printed at the end.

    Args:
        input_path: Path object of the JSON file to read.
        output_path: Path object of the JSON file to write.
    """
    with input_path.open("r", encoding="utf-8") as source:
        entries = json.load(source)

    # Process the entries one by one, in file order.
    results = []
    for item in entries:
        results.append(clean_labels(item, nlp))

    with output_path.open("w", encoding="utf-8") as sink:
        json.dump(results, sink, ensure_ascii=False, indent=2)

    saved_to = output_path.resolve()
    print(f"\n✅ Bereinigte Datei gespeichert unter: {saved_to}")

# Run the cleaning step on the configured input and output files
process_file(input_path, output_path)

🔍 Nicht exakt ausrichtbar (2.txt): '148' @ 202-205
➡️  Würde erweitert auf: 'von148' @ 199-205
    Tokens: ['von148']
    ❓ Akzeptieren oder anpassen?


✅ Bereinigte Datei gespeichert unter: /Users/timonmartens/Library/CloudStorage/OneDrive-Persönlich/Desktop/Veranstaltungen/Data Analytics in Applications/daia-eon/data/original/ground_truth_cleaned.json
