This notebook processes the synthetic emails. For every generated email and its spans (stored in JSON format), it checks whether the annotated spans are in line with the token boundaries of the spaCy model. If this is the case, the email is added directly to the final JSON. Otherwise, it checks whether extending a span by one character leads to correct boundaries (e.g. because of a following period or comma). If so, the boundaries are adapted and the email is also added to the final JSON. If this extension does not lead to correct boundaries, the email is added to the failed file.

In [1]:
import json
import spacy
from pathlib import Path

# Load the German spaCy pipeline whose tokenization the label spans are checked against
nlp = spacy.load("de_core_news_md")

# Input file with the annotated emails, plus the two output files
# (entries whose labels all align / entries with at least one failed label)
input_path = Path("../../data/synthetic/synthetic_mails_option_b.json")
cleaned_path = Path("../../data/synthetic/synthetic_mails_option_b_cleaned_new.json")
failed_path = Path("../../data/synthetic/synthetic_mails_option_b_failed_new.json")


def try_expand_one_char(doc, text, start, end, label_name):
    """
    Retry a failed strict alignment with the span widened by a single character.

    The left-widened range (start - 1, end) is tried first, then the
    right-widened range (start, end + 1). A candidate that would reach
    outside of `text` is not tried at all.

    Returns the first candidate for which `doc.char_span` (strict mode)
    yields a span, or None when no candidate does.
    """
    candidates = []
    if start > 0:
        candidates.append((start - 1, end))
    if end < len(text):
        candidates.append((start, end + 1))

    # Order matters: the left-widened candidate wins when both would align.
    for lo, hi in candidates:
        widened = doc.char_span(lo, hi, label=label_name, alignment_mode="strict")
        if widened:
            return widened
    return None


def clean_labels(entry, nlp):
    """
    Build token-aligned entity spans for one annotated entry.

    The entry's "text" is run through `nlp`, and every item of its "labels"
    list (each with "start", "end" and "label") is aligned against the
    resulting doc:

      1. strict alignment of the given character range,
      2. otherwise the range widened by one character (left, then right),
      3. otherwise the label is recorded as a failure, together with the
         span obtained in "expand" alignment mode as a suggestion (or None
         if that yields nothing).

    Returns:
      - cleaned_labels: list of {"start", "end", "label"} dicts for every
        label that aligned in step 1 or 2
      - failures: list of {"orig_label", "suggestion"} dicts for the rest
    """
    text = entry.get("text", "")
    doc = nlp(text)

    cleaned_labels = []
    failures = []

    for annotation in entry.get("labels", []):
        start = annotation["start"]
        end = annotation["end"]
        label_name = annotation["label"]

        # Steps 1 and 2: the one-character retry only runs when the strict
        # attempt produced nothing.
        aligned = (
            doc.char_span(start, end, label=label_name, alignment_mode="strict")
            or try_expand_one_char(doc, text, start, end, label_name)
        )
        if aligned:
            cleaned_labels.append({
                "start": aligned.start_char,
                "end": aligned.end_char,
                "label": aligned.label_
            })
            continue

        # Step 3: the expanded span is only reported, never accepted.
        expanded = doc.char_span(start, end, label=label_name, alignment_mode="expand")
        suggestion = None
        if expanded:
            suggestion = {
                "start": expanded.start_char,
                "end": expanded.end_char,
                "text": expanded.text
            }

        # Record the failed alignment with the original label data
        failures.append({
            "orig_label": {
                "start": start,
                "end": end,
                "label": label_name,
                "text": text[start:end]
            },
            "suggestion": suggestion
        })

    return cleaned_labels, failures


def process_file(input_path, cleaned_path, failed_path):
    """
    Split an annotated JSON file into aligned and problematic entries.

    Reads the list of entries from `input_path`, runs `clean_labels` on each
    one (using the module-level `nlp` pipeline) and writes two JSON files:

      - `cleaned_path`: entries whose labels all aligned, with the aligned
        spans under "labels"
      - `failed_path`: entries with at least one failed label, keeping the
        original labels under "orig_labels" and the failure details under
        "failures"

    Afterwards a short summary and up to five failed entries are printed.
    All three path arguments are expected to be `pathlib.Path` objects.
    """

    # 1. Load the annotated entries
    data = json.loads(input_path.read_text(encoding="utf-8"))

    # 2. Sort every entry into one of the two result lists
    cleaned_data, failed_data = [], []
    for entry in data:
        cleaned_labels, failures = clean_labels(entry, nlp)

        record = {
            "file": entry.get("file", ""),
            "text": entry.get("text", ""),
        }
        if not failures:
            # Every label aligned: keep the cleaned spans
            record["labels"] = cleaned_labels
            cleaned_data.append(record)
        else:
            # One failed label is enough to send the whole entry to the failed file
            record["orig_labels"] = entry.get("labels", [])
            record["failures"] = failures
            failed_data.append(record)

    # 3. Write both result files (cleaned first, then failed)
    for target, payload in ((cleaned_path, cleaned_data), (failed_path, failed_data)):
        with target.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    # 4. Summary
    print(f"Total emails processed: {len(data)}")
    print(f"Fully cleaned entries:     {len(cleaned_data)}")
    print(f"Entries with label issues (failed):      {len(failed_data)}\n")

    # Show the first few failed entries with their suggestions
    if failed_data:
        print("Examples of failed entries:")
        for record in failed_data[:5]:
            print(f" • File {record['file']!r}:")
            for failure in record["failures"]:
                original = failure["orig_label"]
                print(f"    – Orig: '{original['text']}' @{original['start']}-{original['end']} ({original['label']})")
                hint = failure["suggestion"]
                if hint:
                    print(f"      ↳ Suggested: '{hint['text']}' @{hint['start']}-{hint['end']}")
                else:
                    print("      ↳ No valid suggestion")
            print()

    print(f"Cleaned output saved to: {cleaned_path.resolve()}")
    print(f"Failed output saved to:   {failed_path.resolve()}")


# Run the cleaning on the configured input file; writes the cleaned and the
# failed output files and prints a summary
process_file(input_path, cleaned_path, failed_path)

Total emails processed: 14360
Fully cleaned entries:     13506
⚠️  Entries with label issues (failed):      854

Examples of failed entries:
 • File '16':
    – Orig: 'Dipl.-Ing' @121-130 (TITEL)
      ↳ Suggested: 'ADipl.-Ingesse' @120-134

 • File '18':
    – Orig: 'Aurelia' @340-347 (VORNAME)
      ↳ Suggested: 'Aurelia_Löffler' @340-355
    – Orig: 'Löffler' @348-355 (NACHNAME)
      ↳ Suggested: 'Aurelia_Löffler' @340-355

 • File '49':
    – Orig: 'Eugen' @205-210 (VORNAME)
      ↳ Suggested: 'EugenKlemt' @205-215
    – Orig: 'Klemt' @210-215 (NACHNAME)
      ↳ Suggested: 'EugenKlemt' @205-215

 • File '76':
    – Orig: 'Mirjam' @184-190 (VORNAME)
      ↳ Suggested: 'MirjamDowerg' @184-196
    – Orig: 'Dowerg' @190-196 (NACHNAME)
      ↳ Suggested: 'MirjamDowerg' @184-196

 • File '79':
    – Orig: 'Klapp' @189-194 (NACHNAME)
      ↳ Suggested: 'Klapp402' @189-197
    – Orig: '402 862 472 874' @194-209 (VERTRAGSNUMMER)
      ↳ Suggested: 'Klapp402 862 472 874' @189-209

✅ Cleane