# In [None]:
# Notebook: convert token/label JSONL files to spaCy's binary .spacy format

import json
from pathlib import Path
import spacy
from spacy.tokens import DocBin, Doc

# Set the language (German)
nlp = spacy.blank("de")  # Blank spaCy pipeline; its vocab is shared by every Doc built below

# Paths: JSONL inputs (train / eval split) and the .spacy files written from them
train_path = Path("../old_data/converted_piranha/train_converted_piranha.jsonl")
eval_path = Path("../old_data/converted_piranha/eval_converted_piranha.jsonl")
output_train = Path("../data/train.spacy")
output_dev = Path("../data/dev.spacy")

# Helpers and entry point for converting a JSONL file into a DocBin
def _bio_to_token_spans(labels):
    """Translate a BIO tag sequence into ``(start, end, label)`` token spans.

    ``start`` is inclusive and ``end`` exclusive, both as token indices.

    * ``B-X`` closes any open entity and opens a new one labelled ``X``.
    * ``I-X`` extends the open entity only if that entity is also labelled
      ``X``; an ``I-`` tag with a different label closes the open entity
      instead of being merged into it.
    * An ``I-`` tag with no open entity, and any other tag (e.g. ``O``),
      opens nothing.
    """
    spans = []
    start = None
    label = None
    for i, tag in enumerate(labels):
        if tag.startswith("I-") and label is not None and tag[2:] == label:
            continue  # Still inside the current entity.
        if start is not None:
            # Any other tag ends the entity that was open up to token i - 1.
            spans.append((start, i, label))
            start = None
            label = None
        if tag.startswith("B-"):
            start = i
            label = tag[2:]
    if start is not None:
        # Entity runs to the end of the sequence.
        spans.append((start, len(labels), label))
    return spans


def convert_to_spacy(input_path, output_path):
    """Convert a JSONL file of BIO-tagged tokens into a spaCy ``.spacy`` file.

    Each non-blank line of ``input_path`` must be a JSON object with a
    ``"tokens"`` list and a parallel ``"labels"`` list of BIO tags. A leading
    ``[CLS]`` and a trailing ``[SEP]`` token (with their labels) are dropped.
    One ``Doc`` per line is built on the module-level ``nlp`` vocab, its
    entities are set from the tags, and all docs are written as a ``DocBin``.

    Blank lines and examples with no tokens left after stripping are skipped.

    Args:
        input_path: Path of the JSONL file to read (UTF-8).
        output_path: Path of the ``.spacy`` file to write; missing parent
            directories are created.
    """
    doc_bin = DocBin()
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            example = json.loads(line)
            tokens = example["tokens"]
            labels = example["labels"]

            # Drop the [CLS] / [SEP] markers added by a BERT tokenizer, if any.
            # The `tokens and` guards keep empty token lists from raising.
            if tokens and tokens[0] == "[CLS]":
                tokens = tokens[1:]
                labels = labels[1:]
            if tokens and tokens[-1] == "[SEP]":
                tokens = tokens[:-1]
                labels = labels[:-1]
            if not tokens:
                continue

            # Build the spaCy document from the pre-tokenized words.
            doc = Doc(nlp.vocab, words=tokens)

            # Map token spans to character offsets and let spaCy build the
            # entity spans; char_span returns None for unalignable offsets.
            ents = [
                doc.char_span(
                    doc[start].idx,
                    doc[end - 1].idx + len(doc[end - 1]),
                    label=label,
                )
                for start, end, label in _bio_to_token_spans(labels)
            ]

            doc.ents = [e for e in ents if e is not None]
            doc_bin.add(doc)

    # to_disk does not create directories, so make sure the target exists.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    doc_bin.to_disk(output_path)
    print(f"Gespeichert: {output_path}")

# Run the conversion: training split -> train.spacy, evaluation split -> dev.spacy
convert_to_spacy(train_path, output_train)
convert_to_spacy(eval_path, output_dev)
