In [5]:
import json
import spacy
from spacy.training import offsets_to_biluo_tags
from spacy.lang.de import German
from pathlib import Path

def validate_alignment(file_path, verbose=False):
    """Report how many entries have entity offsets that do not align with tokens.

    The JSON file at ``file_path`` is expected to hold a list of entries. Each
    entry must have a ``"text"`` key and may have a ``"labels"`` list whose
    items carry ``"start"``, ``"end"`` and ``"label"``, plus an optional
    ``"file"`` name used for reporting.

    Every text is tokenized with a blank German pipeline and its labels are
    converted to BILUO tags. An entry counts as misaligned when at least one
    resulting tag is ``"-"``.

    Args:
        file_path: Path (``str`` or ``Path``) of the JSON file to check.
        verbose: When true, additionally print the labels of every misaligned
            entry and the error message of every entry that could not be
            tagged. Defaults to ``False`` (summary only).

    Returns:
        None. The summary is printed to stdout.
    """
    nlp = German()
    file_path = Path(file_path)

    with file_path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    total_files = len(data)
    misaligned_details = []
    failed_details = []

    for entry in data:
        text = entry["text"]
        labels = [
            (label["start"], label["end"], label["label"])
            for label in entry.get("labels", [])
        ]
        doc = nlp.make_doc(text)
        file_name = entry.get("file", "unknown")

        try:
            tags = offsets_to_biluo_tags(doc, labels)
        except Exception as exc:
            # Previously these entries were dropped silently, which made the
            # summary look cleaner than the data really is. Keep going (best
            # effort), but remember the failure so it can be reported.
            # NOTE(review): presumably raised for overlapping spans - confirm
            # against the spaCy documentation.
            failed_details.append((file_name, f"{type(exc).__name__}: {exc}"))
            continue

        if "-" in tags:
            details = []
            for start, end, tag in labels:
                # Escape newlines so each label stays on one output line.
                original = text[start:end].replace("\n", "\\n")
                details.append(f"{tag}: '{original}' @ {start}-{end}")
            misaligned_details.append((file_name, details))

    misaligned_files = len(misaligned_details)

    print("\nüîç Ergebnis:")
    print(f"  Insgesamt √ºberpr√ºft: {total_files}")
    print(f"  Davon mit fehlerhaften Labels: {misaligned_files}\n")
    if failed_details:
        # Only shown when something actually failed, so clean runs keep the
        # original two-line summary.
        print(f"  Nicht auswertbar (Fehler beim Tagging): {len(failed_details)}\n")

    if verbose:
        for filename, issues in misaligned_details:
            print(f"Datei: {filename}")
            for issue in issues:
                print(f"  -> {issue}")
            print()
        for filename, message in failed_details:
            print(f"Datei: {filename}")
            print(f"  -> {message}")
            print()

# Example call
validate_alignment("synthetic_mails_option_b.json")


üîç Ergebnis:
  Insgesamt √ºberpr√ºft: 14360
  Davon mit fehlerhaften Labels: 4485



In [11]:
import json
from pathlib import Path


def anonymize_file(input_path, output_path):
    """Replace every labeled span in each entry's text with a ``[LABEL]`` placeholder.

    The input JSON is expected to hold a list of entries with a ``"text"`` key,
    an optional ``"labels"`` list (items carrying ``"start"``, ``"end"`` and
    ``"label"``) and an optional ``"file"`` name. The result is written to
    ``output_path`` as a JSON list of ``{"file", "anonymized_text"}`` objects.

    Args:
        input_path: Path (``str`` or ``Path``) of the labeled JSON file.
        output_path: Path (``str`` or ``Path``) the anonymized JSON is written to.

    Returns:
        None. The output location is printed to stdout.
    """
    # Accept plain strings as well as Path objects; the final message needs
    # Path.resolve(), which a str does not provide.
    input_path = Path(input_path)
    output_path = Path(output_path)

    # Load the JSON file
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    anonymized_entries = []

    for entry in data:
        text = entry["text"]
        # Replace labels from back to front so earlier offsets stay valid and
        # no offset bookkeeping is needed.
        # NOTE(review): this assumes label spans do not overlap - confirm.
        labels = sorted(
            entry.get("labels", []),
            key=lambda label: label["start"],
            reverse=True,
        )

        for label in labels:
            start, end = label["start"], label["end"]
            placeholder = f"[{label['label']}]"
            text = text[:start] + placeholder + text[end:]

        anonymized_entries.append({
            "file": entry.get("file", "unknown"),
            "anonymized_text": text
        })

    # Save the result as a new JSON file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(anonymized_entries, f, ensure_ascii=False, indent=2)

    print(f"‚úÖ Anonymisierte Datei gespeichert unter: {output_path.resolve()}")

# Example: use the actual file name, e.g. "granu_train.json"
input_file = Path("./golden_dataset_norm_cleaned.json")  # adjust this to the file you uploaded
output_file = Path("./anonymized_output_cleaned.json")

anonymize_file(input_file, output_file)

‚úÖ Anonymisierte Datei gespeichert unter: /Users/timonmartens/Library/CloudStorage/OneDrive-PersoÃànlich/Desktop/Veranstaltungen/Data Analytics in Applications/daia-eon/data/original/anonymized_output_cleaned.json
