# In [None]:
### 📒 Jupyter Notebook zum Training eines benutzerdefinierten spaCy-Modells

# 🧩 Schritt 1: Imports und Setup
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
import json
import random
from pathlib import Path

# 📁 Schritt 2: Lade und konvertiere deine Trainingsdaten aus JSON

def load_data_from_json(path):
    """Load span annotations from a JSON file as spaCy-style training tuples.

    The file must contain either a single example object or a list of
    example objects. Each example needs a ``"text"`` entry and may carry a
    ``"labels"`` list whose items have ``"start"``, ``"end"`` and ``"label"``
    keys.

    Args:
        path: Location of the JSON file (anything ``open`` accepts).

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        in file order. Examples whose ``"labels"`` entry is missing, ``null``
        or empty yield an empty entity list.

    Raises:
        KeyError: If an example lacks ``"text"`` or a label lacks one of its
            three keys.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    # A single example may be stored as a bare object; normalise it to a list.
    if isinstance(raw_data, dict):
        raw_data = [raw_data]

    examples = []
    for entry in raw_data:
        text = entry["text"]
        # A text without annotations is still a valid (negative) example.
        spans = entry.get("labels") or []
        entities = [(span["start"], span["end"], span["label"]) for span in spans]
        examples.append((text, {"entities": entities}))
    return examples

# Read the annotated templates and report how many examples were found.
train_file = "templates_with_spans.json"
train_data = load_data_from_json(train_file)
print(f"Lade {len(train_data)} Trainingsbeispiele.")

# 🔧 Step 3: Start from the medium-sized German spaCy base model
base_model = "de_core_news_md"
nlp = spacy.load(base_model)

# Reuse the pipeline's 'ner' component, or append a new one if it has none
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner", last=True)

# Register every entity label that occurs in the training data
# (the label is the third item of each entity tuple)
seen_labels = (
    entity[2]
    for _text, annotation in train_data
    for entity in annotation.get("entities")
)
for label in seen_labels:
    ner.add_label(label)

# 🏋️ Step 4: Prepare training
# Only the 'ner' component is updated; every other pipe is disabled for the
# duration of the `with` block. `select_pipes` is the spaCy v3 replacement for
# the deprecated `disable_pipes`.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    # Continue from the loaded weights; the returned optimizer is handed to
    # every update call below so it is explicit which one is in use.
    optimizer = nlp.resume_training()

    # 🧠 Step 5: Train
    n_iter = 20
    for i in range(n_iter):
        # Present the examples in a new order on every pass.
        random.shuffle(train_data)
        losses = {}  # accumulated by nlp.update over the whole pass
        for text, annotations in train_data:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.35, sgd=optimizer, losses=losses)
        print(f"Iteration {i+1}/{n_iter}, Losses: {losses}")

# 💾 Step 6: Save the model
# Make sure the target directory exists, then serialize the pipeline into it.
model_dir_name = "custom_spacy_model_new"
output_dir = Path(model_dir_name)
output_dir.mkdir(exist_ok=True)
nlp.to_disk(output_dir)
print(f"✅ Modell gespeichert unter {output_dir.resolve()}")

# 🔍 Step 7: Test the model
# Reload the pipeline from disk and print the entities found in one sentence.
sample_text = "Hallo Herr Jacob Mangold, wie kann ich Ihnen helfen?"
nlp2 = spacy.load(output_dir)
doc = nlp2(sample_text)
for ent in doc.ents:
    print(ent.text, ent.label_)