In [2]:
# 🧩 Schritt 1: Imports und Setup
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
import json
import random
from pathlib import Path
from spacy.util import minibatch

# 📁 Schritt 2: Funktion zum Laden von Daten aus JSON
def load_data_from_json(path):
    with open(path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    if isinstance(raw_data, dict):
        raw_data = [raw_data]

    TRAIN_DATA = []
    for entry in raw_data:
        text = entry["text"]
        entities = [(label["start"], label["end"], label["label"]) for label in entry["labels"]]
        TRAIN_DATA.append((text, {"entities": entities}))
    return TRAIN_DATA

# 🔄 Lade Trainings- und Dev-Daten separat
train_data = load_data_from_json("./spacy_split/train.json")
dev_data = load_data_from_json("./spacy_split/dev.json")
print(f"📥 Trainingsbeispiele: {len(train_data)}, Dev-Beispiele: {len(dev_data)}")

# 🧠 Schritt 3: Lade spaCy-Basismodell
base_model = "de_core_news_md"
nlp = spacy.load(base_model)

# Stelle sicher, dass NER-Komponente existiert
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Registriere alle Labels aus beiden Datensätzen
for dataset in (train_data, dev_data):
    for _, annotations in dataset:
        for start, end, label in annotations["entities"]:
            ner.add_label(label)

# 🚀 Schritt 4: Modell-Initialisierung mit allen Daten (nur für Labels!)
def get_examples():
    for text, ann in train_data + dev_data:
        yield Example.from_dict(nlp.make_doc(text), ann)

optimizer = nlp.initialize(get_examples=get_examples)

# 🏋️ Schritt 5: Training (nur auf Trainingsdaten)
n_iter = 20
for i in range(n_iter):
    random.shuffle(train_data)
    losses = {}

    batches = minibatch(train_data, size=8)
    for batch in batches:
        examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in batch]
        nlp.update(examples, drop=0.35, losses=losses)

    print(f"🔁 Iteration {i+1}/{n_iter}, Loss: {losses['ner']:.4f}")

# 💾 Schritt 6: Modell speichern
output_dir = Path("custom_spacy_model_new")
output_dir.mkdir(exist_ok=True)
nlp.to_disk(output_dir)
print(f"\n✅ Modell gespeichert unter: {output_dir.resolve()}")

# 🔍 Schritt 7: Modell laden und auf dev_data testen
nlp2 = spacy.load(output_dir)

print("\n📊 Evaluation auf dev_data:")
for text, _ in random.sample(dev_data, min(5, len(dev_data))):  # max. 5 Beispiele
    doc = nlp2(text)
    print(f"\n> {text}")
    for ent in doc.ents:
        print(f"  - {ent.text} ({ent.label_})")


Lade 112 Trainingsbeispiele.


ValueError: [E103] Trying to set conflicting doc.ents: '(130, 136, 'VORNAME')' and '(130, 136, 'FIRMA')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.