In [4]:
import json
import spacy
from pathlib import Path

# Load the spaCy model used for tokenisation
# (alternatively, a blank pipeline: nlp = spacy.blank("de")).
nlp = spacy.load("de_core_news_md")

# Input and output files (relative paths, resolved against the current
# working directory).
input_path = Path("golden_dataset_with_spans_norm.json")
output_path = Path("golden_dataset_norm_cleaned.json")

def clean_labels(entry, nlp):
    """Snap an entry's character-offset labels onto token boundaries.

    The entry's text is passed through ``nlp`` and every label is mapped onto
    the resulting document with ``char_span(..., alignment_mode="expand")``.
    The character offsets and label name of the span that comes back are
    written to the result. Labels for which no (non-empty) span is obtained
    are reported on stdout and dropped.

    Args:
        entry: Mapping with a ``"text"`` string and, optionally, ``"file"``
            and ``"labels"`` (an iterable of dicts with ``"start"``,
            ``"end"`` and ``"label"``). A missing or ``None`` ``"labels"``
            value is treated as "no labels".
        nlp: Callable that turns the text into a document object exposing
            ``char_span`` (e.g. a loaded spaCy pipeline).

    Returns:
        A new dict with the keys ``"file"`` (empty string if absent in
        ``entry``), ``"text"`` and ``"labels"`` (the aligned labels).

    Raises:
        KeyError: If ``entry`` has no ``"text"`` or a label lacks one of
            ``"start"``, ``"end"`` or ``"label"``.
    """
    text = entry["text"]
    # ``.get("labels", [])`` would return None for an explicit JSON ``null``
    # and the loop below would then fail; fall back to an empty list instead.
    labels = entry.get("labels") or []
    doc = nlp(text)
    cleaned_labels = []

    for label in labels:
        start, end = label["start"], label["end"]
        span = doc.char_span(start, end, label=label["label"], alignment_mode="expand")

        # Truthiness check on purpose: skips both a ``None`` result and a
        # span object that evaluates as empty.
        if not span:
            print(f"⚠️ Label nicht ausrichtbar in {entry.get('file', '??')}: '{text[start:end]}' @ {start}-{end}")
            continue

        cleaned_labels.append({
            "start": span.start_char,
            "end": span.end_char,
            "label": span.label_,
        })

    return {
        "file": entry.get("file", ""),
        "text": text,
        "labels": cleaned_labels,
    }

def process_file(input_path, output_path):
    """Clean every entry of a JSON dataset and write the result to disk.

    Reads a JSON document from ``input_path``, runs ``clean_labels`` on each
    of its elements using the module-level ``nlp`` pipeline, and dumps the
    cleaned entries to ``output_path`` as indented, non-ASCII-escaped JSON.
    A confirmation line with the resolved output path is printed at the end.

    Args:
        input_path: Path-like object providing ``open``; the file is read as
            UTF-8 JSON.
        output_path: Path-like object providing ``open`` and ``resolve``; the
            file is (over)written as UTF-8 JSON.
    """
    with input_path.open("r", encoding="utf-8") as source:
        entries = json.load(source)

    # Process the entries one by one, keeping their original order.
    results = []
    for record in entries:
        results.append(clean_labels(record, nlp))

    with output_path.open("w", encoding="utf-8") as sink:
        json.dump(results, sink, ensure_ascii=False, indent=2)

    destination = output_path.resolve()
    print(f"\n✅ Bereinigte Datei gespeichert unter: {destination}")

# Run: clean the configured input file and write the cleaned output file
process_file(input_path, output_path)


✅ Bereinigte Datei gespeichert unter: /Users/timonmartens/Library/CloudStorage/OneDrive-Persönlich/Desktop/Veranstaltungen/Data Analytics in Applications/daia-eon/data/original/golden_dataset_norm_cleaned.json
