In [1]:
import json
import spacy
from pathlib import Path
from spacy.tokens import DocBin

def convert_labeled_json_to_spacy(input_json, output_spacy):
    """Convert a JSON file of labeled texts into a serialized spaCy ``DocBin``.

    The JSON file must contain a list of entries. Every entry needs a
    ``"text"`` key and may carry a ``"labels"`` list whose items provide
    ``"start"`` and ``"end"`` (character offsets into the text) and
    ``"label"``. A missing or ``null`` ``"labels"`` value means "no entities".

    An annotation is dropped when

    * ``Doc.char_span`` returns ``None`` for its offsets (spaCy does this
      when the offsets cannot be mapped onto tokens of the blank German
      tokenizer), or
    * it shares at least one token with an annotation that was already
      accepted for the same text (the earlier annotation wins).

    Dropped annotations are counted and reported after saving, so that a
    loss of training data does not go unnoticed.

    Args:
        input_json: Path of the UTF-8 encoded JSON file to read.
        output_spacy: Path the ``DocBin`` is written to.

    Raises:
        KeyError: If an entry lacks ``"text"`` or a label item lacks
            ``"start"``, ``"end"`` or ``"label"``.
    """
    nlp = spacy.blank("de")
    doc_bin = DocBin()
    misaligned = 0
    overlapping = 0

    with open(input_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    for entry in data:
        doc = nlp.make_doc(entry["text"])
        spans = []
        seen_tokens = set()

        # `or []` also covers an explicit JSON null for "labels".
        for lbl in entry.get("labels") or []:
            span = doc.char_span(lbl["start"], lbl["end"], label=lbl["label"])
            if span is None:
                misaligned += 1
                continue

            # doc.ents must not contain spans that share tokens, so keep
            # only the first annotation that claims a given token.
            token_ids = {t.i for t in span}
            if not seen_tokens.isdisjoint(token_ids):
                overlapping += 1
                continue

            spans.append(span)
            seen_tokens |= token_ids

        doc.ents = spans
        doc_bin.add(doc)

    doc_bin.to_disk(output_spacy)
    print(f"✅ Gespeichert: {output_spacy}")

    # Only report when something was lost; clean data prints exactly the
    # same single line as before.
    if misaligned or overlapping:
        print(
            f"⚠️ Übersprungene Entitäten: {misaligned} nicht ausrichtbar, "
            f"{overlapping} überlappend"
        )


# ⚙️ Input/output file pairs (adjust if necessary): each labeled JSON file
# is converted into the .spacy file listed next to it.
for _json_path, _spacy_path in (
    (
        "../data/real_focus_zusammengefasst/train_zusammengefasst.json",
        "../data/train_zusammengefasst.spacy",
    ),
    (
        "../data/real_focus_zusammengefasst/dev_zusammengefasst.json",
        "../data/dev_zusammengefasst.spacy",
    ),
):
    convert_labeled_json_to_spacy(_json_path, _spacy_path)





✅ Gespeichert: ../data/train_zusammengefasst.spacy
✅ Gespeichert: ../data/dev_zusammengefasst.spacy
