In [4]:
import spacy
from spacy.training import offsets_to_biluo_tags

# Scan every training example for entity spans that do not line up with the
# tokenisation; offsets_to_biluo_tags marks the affected tokens with "-".
print("üîç Pr√ºfe auf nicht ausrichtbare Entities...")
for i, (text, ann) in enumerate(train_data):
    tags = offsets_to_biluo_tags(nlp.make_doc(text), ann["entities"])
    if "-" not in tags:
        continue  # every span of this example aligns with token boundaries
    print(f"‚ùå Fehlerhafte Entity-Ausrichtung in Beispiel {i}:")
    # All annotated spans of the affected example are listed, not only the
    # misaligned ones.
    for start, end, label in ann["entities"]:
        span_text = text[start:end]
        print(f"  ‚Üí '{span_text}' ({label}) [{start}:{end}]")



# Look at one example in detail (index 2, taken from the list printed by the
# alignment check): show every token next to its BILUO tag.
text, example_annotations = train_data[2]
entities = example_annotations["entities"]
doc = nlp.make_doc(text)
tags = offsets_to_biluo_tags(doc, entities)

print("Tokens + Tags:")
for token, tag in zip(doc, tags):
    token_column = f"{token.text:20}"  # pad the token text to 20 characters
    print(f"{token_column} {tag}")

üîç Pr√ºfe auf nicht ausrichtbare Entities...
‚ùå Fehlerhafte Entity-Ausrichtung in Beispiel 2:
  ‚Üí 'SSE>> <<HAUSNUMMER>>' (STRASSE) [356:376]
  ‚Üí '<<EMAIL>>
<mailto:' (EMAIL) [81:99]
  ‚Üí '> gew√§hlt. Da wir ' (EMAIL) [108:126]
  ‚Üí 'geteilt, dass ich
' (EMAIL) [834:852]
  ‚Üí ' <mailto:<<EMAIL>>' (EMAIL) [861:879]
  ‚Üí '<<EMAIL>> <mailto:' (EMAIL) [1056:1074]
  ‚Üí '>
bedanke ich mich' (EMAIL) [1083:1101]
  ‚Üí '>>
Z√§hlernummer' (Z√ÑHLERNUMMER) [438:453]
  ‚Üí 'TZAHL>> <<W' (WOHNORT) [387:398]
  ‚Üí 'skonto:
<' (VERTRAGSNUMMER) [414:423]
  ‚Üí 'NAME>>' (VORNAME) [343:349]
  ‚Üí 'NAME>>' (VORNAME) [1155:1161]
  ‚Üí '<<STR' (NACHNAME) [350:355]
  ‚Üí 'B.A.
' (NACHNAME) [1162:1167]
  ‚Üí 'OSTLE' (POSTLEITZAHL) [381:386]
  ‚Üí ' <' (HAUSNUMMER) [377:379]
‚ùå Fehlerhafte Entity-Ausrichtung in Beispiel 3:
  ‚Üí '456123-5007643' (Z√ÑHLERNUMMER) [199:213]
  ‚Üí '409291555' (VERTRAGSNUMMER) [177:186]
  ‚Üí 'Nerger' (VORNAME) [164:170]
  ‚Üí 'Grauel' (NACHNAME) [171:177]
‚ùå Fehlerhaf

In [2]:
# Step 1: Imports and setup
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
import json
import random
from pathlib import Path
from spacy.util import minibatch
!python -m spacy download de_core_news_md
!pip install spacy-lookups-data



# Step 2: helper that loads annotated examples from a JSON file
def load_data_from_json(path):
    """Read annotated texts from a JSON file as spaCy-style training tuples.

    The file may contain a single record (a JSON object) or a list of records.
    Every record must provide a ``"text"`` value and a ``"labels"`` list whose
    items carry ``"start"``, ``"end"`` and ``"label"``.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        A list with one ``(text, {"entities": [(start, end, label), ...]})``
        tuple per record, in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        records = json.load(handle)
    # A lone record is wrapped so that both layouts share one code path.
    if isinstance(records, dict):
        records = [records]

    def to_example(record):
        # Convert one raw record into a (text, annotations) pair.
        body = record["text"]
        spans = [(item["start"], item["end"], item["label"]) for item in record["labels"]]
        return body, {"entities": spans}

    return [to_example(record) for record in records]

# Load the training and dev sets from their separate files
TRAIN_JSON = "../../../data/original/granular_dataset_split_norm_cleaned/train_norm_cleaned.json"
DEV_JSON = "../../../data/original/granular_dataset_split_norm_cleaned/validation_norm_cleaned.json"
train_data = load_data_from_json(TRAIN_JSON)
dev_data = load_data_from_json(DEV_JSON)
n_train, n_dev = len(train_data), len(dev_data)
print(f"üì• Trainingsbeispiele: {n_train}, Dev-Beispiele: {n_dev}")

# Step 3: load the pretrained spaCy pipeline that serves as the starting point
base_model = "de_core_news_md"
nlp = spacy.load(base_model)

# Reuse the pipeline's NER component, or append a fresh one if it has none
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner", last=True)

# Register every label that occurs in either dataset, in order of appearance
# (train first, then dev); add_label is called once per annotated span.
annotated_spans = (
    span
    for dataset in (train_data, dev_data)
    for _, annotations in dataset
    for span in annotations["entities"]
)
for start, end, label in annotated_spans:
    ner.add_label(label)

# Step 4: model initialisation with all data (only for the labels!)
def get_examples():
    """Yield one ``Example`` per ``(text, annotations)`` pair of train and dev data.

    Each text is tokenised with ``nlp.make_doc`` and paired with its annotation
    dict via ``Example.from_dict``.

    NOTE(review): nothing in the visible cells calls this generator.
    """
    combined = train_data + dev_data
    yield from (Example.from_dict(nlp.make_doc(text), ann) for text, ann in combined)

# Continue training the loaded pipeline; keep a handle on the optimizer it returns.
optimizer = nlp.resume_training()


# Step 5: training (on the training data only)
n_iter = 20
# Shuffle a private copy: shuffling train_data itself reorders the shared list,
# so index-based lookups in other cells (e.g. train_data[2] or the
# "Beispiel {i}" numbers of the alignment check) would refer to different
# examples after every run of this cell.
epoch_data = list(train_data)
# NOTE(review): nlp.update() is applied to the whole loaded pipeline, although
# the examples only carry entity annotations. Consider wrapping the loop in
# `with nlp.select_pipes(enable="ner"):` if only NER should be trained — confirm.
# NOTE(review): no random seed is set, so shuffling and dropout differ per run.
for i in range(n_iter):
    random.shuffle(epoch_data)
    losses = {}

    for batch in minibatch(epoch_data, size=8):
        examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in batch]
        # Hand over the optimizer returned by nlp.resume_training() explicitly
        # instead of relying on the pipeline's implicit default.
        nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

    # .get() avoids a KeyError when no batch was processed (empty train_data
    # leaves `losses` without a 'ner' entry).
    print(f"üîÅ Iteration {i+1}/{n_iter}, Loss: {losses.get('ner', 0.0):.4f}")

# Step 6: save the trained pipeline to disk
output_dir = Path("custom_spacy_model_new")
output_dir.mkdir(exist_ok=True)
nlp.to_disk(output_dir)
saved_location = output_dir.resolve()
print(f"\n‚úÖ Modell gespeichert unter: {saved_location}")

# Step 7: reload the saved pipeline and print its predictions for a few
# randomly chosen dev texts (a spot check, not a scored evaluation)
nlp2 = spacy.load(output_dir)

print("\nüìä Evaluation auf dev_data:")
preview_size = min(5, len(dev_data))  # at most 5 examples
for text, _ in random.sample(dev_data, preview_size):
    doc = nlp2(text)
    print(f"\n> {text}")
    for ent in doc.ents:
        entity_line = f"  - {ent.text} ({ent.label_})"
        print(entity_line)

Collecting de-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.8.0/de_core_news_md-3.8.0-py3-none-any.whl (44.4 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.4/44.4 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_md')
üì• Trainingsbeispiele: 96, Dev-Beispiele: 25


wie mit Ihrem Kolle..." with entities "[(177, 183, 'VORNAME'), (184, 192, 'NACHNAME'), (2...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
nach Tarifwechsel S..." with entities "[(164, 170, 'VORNAME'), (171, 183, 'NACHNAME'), (1...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


üîÅ Iteration 1/20, Loss: 1108.7584
üîÅ Iteration 2/20, Loss: 999.4371
üîÅ Iteration 3/20, Loss: 809.8356
üîÅ Iteration 4/20, Loss: 728.1325
üîÅ Iteration 5/20, Loss: 565.4085
üîÅ Iteration 6/20, Loss: 453.6583
üîÅ Iteration 7/20, Loss: 397.7180
üîÅ Iteration 8/20, Loss: 516.4252
üîÅ Iteration 9/20, Loss: 376.2762
üîÅ Iteration 10/20, Loss: 340.9275
üîÅ Iteration 11/20, Loss: 400.3647
üîÅ Iteration 12/20, Loss: 326.5828
üîÅ Iteration 13/20, Loss: 285.6752
üîÅ Iteration 14/20, Loss: 325.3445
üîÅ Iteration 15/20, Loss: 304.2646
üîÅ Iteration 16/20, Loss: 277.3331
üîÅ Iteration 17/20, Loss: 247.3439
üîÅ Iteration 18/20, Loss: 344.3083
üîÅ Iteration 19/20, Loss: 295.9506
üîÅ Iteration 20/20, Loss: 305.7312

‚úÖ Modell gespeichert unter: /Users/timonmartens/Library/CloudStorage/OneDrive-PersoÃànlich/Desktop/Veranstaltungen/Data Analytics in Applications/daia-eon/notebooks/3_model_training_and_testing/spacy_pipeline/custom_spacy_model_new

üìä Evaluation auf dev_data:

>

In [5]:
from spacy.scorer import Scorer
from spacy.training import Example
from collections import defaultdict
import pandas as pd

def evaluate_model(nlp, examples):
    """Score the entity predictions of ``nlp`` against gold annotations, per label.

    Args:
        nlp: Pipeline that is called on each text to produce the predicted doc.
        examples: Iterable of ``(text, annotations)`` pairs; ``annotations`` is
            passed to ``Example.from_dict`` as the gold standard.

    Returns:
        A ``pandas.DataFrame`` with one row per entity label and the columns
        ``Label``, ``Precision``, ``Recall`` and ``F1-Score`` (percentages
        rounded to two decimals). The frame is empty, but keeps these columns,
        when the scorer reports no per-label results.
    """
    columns = ["Label", "Precision", "Recall", "F1-Score"]
    scorer = Scorer()

    # Pair each predicted doc with its gold annotations.
    example_list = [Example.from_dict(nlp(text), ann) for text, ann in examples]
    scores = scorer.score(example_list)

    # spaCy reports "ents_per_type" as None when there was nothing to score
    # (e.g. no examples or no entity annotations); treat that as "no rows"
    # instead of failing on None.items().
    per_type = scores.get("ents_per_type") or {}

    # One row of rounded percentage scores per entity label.
    results = [
        {
            "Label": label,
            "Precision": round(metrics["p"] * 100, 2),
            "Recall": round(metrics["r"] * 100, 2),
            "F1-Score": round(metrics["f"] * 100, 2),
        }
        for label, metrics in per_type.items()
    ]

    # Explicit columns keep the schema stable even for an empty result.
    return pd.DataFrame(results, columns=columns)

In [6]:
# Score the in-memory pipeline `nlp` on the dev set and render the per-label
# precision/recall/F1 table.
results_df = evaluate_model(nlp, dev_data)
display(results_df)

Unnamed: 0,Label,Precision,Recall,F1-Score
0,VERTRAGSNUMMER,54.84,89.47,68.0
1,DATUM,57.14,70.59,63.16
2,VORNAME,90.32,90.32,90.32
3,NACHNAME,96.97,91.43,94.12
4,Z√ÑHLERNUMMER,0.0,0.0,0.0
5,STRASSE,66.67,88.89,76.19
6,GESENDET_MIT,0.0,0.0,0.0
7,HAUSNUMMER,77.78,77.78,77.78
8,POSTLEITZAHL,88.89,100.0,94.12
9,WOHNORT,85.71,75.0,80.0
