##  validate_alignment

This function checks whether the annotated entity offsets in a JSON file align correctly with token boundaries in a spaCy `Doc`.
It uses `offsets_to_biluo_tags` to detect misalignments that would cause errors during spaCy training.
Misaligned spans are printed along with their file name and position.

In [5]:
import json
import spacy

from spacy.training import offsets_to_biluo_tags
from spacy.lang.de import German
from pathlib import Path


def validate_alignment(file_path):
    """
    Check that annotated entity offsets line up with spaCy token boundaries.

    Loads a JSON list of entries from ``file_path``. For each entry the "text"
    value is tokenized with a blank German tokenizer and the optional "labels"
    list (dicts with "start", "end" and "label") is converted with
    ``offsets_to_biluo_tags``. A "-" tag in the result means at least one span
    does not start and end on a token boundary.

    For every affected entry, only the spans that actually miss a token
    boundary are printed. Entries for which the conversion raises are reported
    and counted separately, so they cannot hide behind "Misaligned: 0".

    Args:
        file_path: Location (str or Path) of the annotated JSON file.

    Returns:
        None. All findings and the final summary are printed.
    """

    # Initialize a blank German tokenizer (no full pipeline needed for alignment check)
    nlp = German()
    file_path = Path(file_path)

    # Load annotated JSON data
    with file_path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    total_files = len(data)
    misaligned_files = 0
    failed_files = 0

    # Iterate through each annotated example
    for entry in data:
        text = entry["text"]
        source = entry.get("file", "unknown")
        labels = [
            (label["start"], label["end"], label["label"])
            for label in entry.get("labels", [])
        ]
        doc = nlp.make_doc(text)

        try:
            # Convert offsets to BILUO tags to check alignment
            tags = offsets_to_biluo_tags(doc, labels)
        except Exception as exc:
            # Deliberately broad: one bad entry must not abort the whole report,
            # but it is counted so the summary reflects it.
            failed_files += 1
            print(f"Error in '{source}': {exc}")
            continue

        # If BILUO contains "-", at least one span is not alignable
        if "-" not in tags:
            continue

        misaligned_files += 1
        print(f"\nMisaligned labels in file {source}")

        # A span aligns only if it starts on a token start and ends on a token end
        token_starts = {token.idx for token in doc}
        token_ends = {token.idx + len(token) for token in doc}
        for start, end, label in labels:
            if start in token_starts and end in token_ends:
                continue
            original = text[start:end].replace("\n", "\\n")
            print(f"  → {label}: '{original}' @ {start}-{end}")

    # Summary
    print("Result:")
    print(f"Total checked: {total_files}")
    print(f"Misaligned: {misaligned_files}")
    if failed_files:
        print(f"Errors: {failed_files}")


# Check the ground-truth annotations for entity offsets that miss token boundaries
validate_alignment(file_path="../../data/original/ground_truth.json")

Result:
Total checked: 160
Misaligned: 0


## anonymize_file

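This function replaces every labeled entity span in a JSON file with a placeholder made of the span's label in square brackets (shown here as `[LABEL]`),
producing an anonymized version of the input texts that is saved to a new JSON file with one `file` / `anonymized_text` record per entry.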

In [7]:
import json
from pathlib import Path


def _anonymize_text(text, labels):
    """Return ``text`` with every labeled span replaced by ``[<label>]``."""
    pieces = []
    cursor = 0  # end of the part of ``text`` that has already been emitted or redacted

    # Walk the spans left to right (longest first on equal starts) and rebuild
    # the string from untouched slices of the ORIGINAL text, so offsets never
    # shift and overlapping spans cannot leave labeled characters behind.
    for span in sorted(labels, key=lambda span: (span["start"], -span["end"])):
        start, end = span["start"], span["end"]
        placeholder = f"[{span['label']}]"

        if start >= cursor:
            # Regular case: keep the text before the span, then redact the span
            pieces.append(text[cursor:start])
            pieces.append(placeholder)
            cursor = max(start, end)
        elif end > cursor:
            # Partial overlap with the previous span: extend the redacted region
            pieces.append(placeholder)
            cursor = end
        # else: span lies entirely inside an already redacted region

    pieces.append(text[cursor:])
    return "".join(pieces)


def anonymize_file(input_path, output_path):
    """
    Write an anonymized copy of an annotated JSON file.

    Reads a JSON list of entries from ``input_path``. Each entry must provide
    "file" and "text"; "labels" is an optional list of dicts with "start",
    "end" and "label". Every labeled span is replaced by ``[<label>]`` and the
    result is written to ``output_path`` as a list of
    ``{"file": ..., "anonymized_text": ...}`` objects.

    Args:
        input_path: Location (str or Path) of the annotated JSON file.
        output_path: Location (str or Path) of the file to write.

    Returns:
        None. The resolved output location is printed.
    """
    # Accept plain strings as well as Path objects for both arguments
    input_path = Path(input_path)
    output_path = Path(output_path)

    # Load the input JSON file
    with input_path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    # Entries without a "labels" key are passed through unchanged
    anonymized_entries = [
        {
            "file": entry["file"],
            "anonymized_text": _anonymize_text(entry["text"], entry.get("labels", [])),
        }
        for entry in data
    ]

    # Save the anonymized entries to the output file
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(anonymized_entries, f, ensure_ascii=False, indent=2)

    print(f"Anonymized file saved to: {output_path.resolve()}")


# Source annotations and destination of the anonymized copy (adjust per test)
input_file = Path("../../data/original/ground_truth.json")
output_file = Path("../../data/original/ground_truth_anonymized.json")

# Produce the anonymized copy
anonymize_file(input_path=input_file, output_path=output_file)

Anonymized file saved to: /Users/timonmartens/Library/CloudStorage/OneDrive-Persönlich/Desktop/Veranstaltungen/Data Analytics in Applications/daia-eon/data/original/ground_truth_anonymized.json
