In [1]:
# !pip install pandas regex
import pandas as pd, re, random
from pathlib import Path

In [7]:
# Input / output locations (relative paths, resolved against the notebook's working directory)
EXCEL = Path("../../../data/excel_manual_labeling/Daia_Manual_Labelling_granular.xlsx")  # manually labelled metadata workbook
RAW_DIR = Path("../../../data/original/golden_dataset_original")  # raw .txt mails (160 files per the original note)
OUT_DIR = Path("../../../data/original/golden_dataset_anonymized_granular")  # anonymised copies are written here

# Create the output folder (and any missing parents); does nothing if it already exists.
OUT_DIR.mkdir(exist_ok=True, parents=True)

In [8]:
# Fine-grained placeholder map
# key   = placeholder tag (rendered as <<TAG>> in the anonymised text)
# value = substrings that identify a matching column header in the metadata workbook
#
# The insertion order matters: lookups walk this dict top to bottom and the
# first tag with a matching substring wins.
#
# NOTE(review): the earlier TODO asked to add TITLE / GESENDET_MIT later; both
# TITEL and GESENDET_MIT are mapped below, so it looks resolved — confirm.
PLACEHOLDERS = {
    # person / company
    "TITEL": ["TITEL"],
    "VORNAME": ["VORNAME"],
    "NACHNAME": ["NACHNAME"],
    "FIRMA": ["FIRMA"],
    # contact
    "TELEFONNUMMER": ["TELEFONNUMMER"],
    "EMAIL": ["EMAIL"],
    "FAX": ["FAX"],
    # address
    "STRASSE": ["STRASSE"],
    "HAUSNUMMER": ["HAUSNUMMER"],
    "POSTLEITZAHL": ["POSTLEITZAHL", "PLZ", "ZIP"],
    "WOHNORT": ["WOHNORT", "ORT", "CITY"],
    # meter
    "ZÄHLERNUMMER": ["ZÄHLERNUMMER", "METER_ID"],
    "ZÄHLERSTAND": ["ZÄHLERSTAND", "METER_READING"],
    # contract / payment
    "VERTRAGSNUMMER": ["VERTRAGSNUMMER", "ANGEBOTSNUMMER", "KUNDENNUMMER"],
    "ZAHLUNG": ["BETRAG", "ZAHLUNG", "AMOUNT"],
    "BANK": ["BANK"],
    "IBAN": ["IBAN"],
    "BIC": ["BIC"],
    # misc
    "DATUM": ["DATUM", "DATE"],
    "GESENDET_MIT": ["GESENDET_MIT"],
    "LINK": ["LINK"],
}


In [9]:
# Defining helper functions
def map_col(col: str) -> str | None:
    """Return <<PLACEHOLDER>> for a column header, else None."""
    up = col.upper()
    for tag, keys in PLACEHOLDERS.items():
        if any(k in up for k in keys):
            return f"<<{tag}>>"
    return None

def extract_repls(row: pd.Series):
    """Build the replacement list for one metadata row.

    Every non-missing cell whose column header maps to a placeholder (via
    ``map_col``) contributes one ``(regex-escaped value, "<<TAG>>")`` pair.
    Values are stripped of surrounding whitespace; cells that are empty after
    stripping are skipped.

    Returns:
        A list of pairs ordered by the length of the escaped value, longest
        first. Pairs of equal length keep their column order.
    """
    pairs = []
    for column, cell in row.items():
        # Missing cells are skipped before the header is even looked at.
        if pd.isna(cell):
            continue
        tag = map_col(column)
        if not tag:
            continue
        literal = str(cell).strip()
        if not literal:
            continue
        pairs.append((re.escape(literal), tag))
    # Longest pattern first, so a value that contains another value is replaced
    # before its shorter part.
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
    return pairs

In [10]:
# Loading metadata & anonymizing mails
def _apply_replacements(text: str, replacements) -> str:
    """Replace every literal in ``replacements`` with its placeholder.

    ``replacements`` is a sequence of ``(escaped literal, placeholder)`` pairs,
    applied in the given order and matched case-insensitively. Text that has
    already been turned into a placeholder is tracked separately and never
    searched again. Otherwise a later, short value (e.g. "ort" or "an") could
    match inside an inserted tag such as ``<<WOHNORT>>`` and corrupt it.
    """
    # Each segment is (chunk, is_placeholder); only raw chunks are searched.
    segments = [(text, False)]
    for pattern, placeholder in replacements:
        # The single capture group makes split() return the pieces as
        # [raw, match, raw, match, ..., raw]: odd indices are the matches.
        splitter = re.compile(f"({pattern})", flags=re.IGNORECASE)
        updated = []
        for chunk, is_placeholder in segments:
            if is_placeholder:
                updated.append((chunk, True))
                continue
            for i, piece in enumerate(splitter.split(chunk)):
                if i % 2:
                    updated.append((placeholder, True))
                elif piece:
                    updated.append((piece, False))
        segments = updated
    return "".join(chunk for chunk, _ in segments)


meta = pd.read_excel(EXCEL, dtype=str)   # column "TextFile" must exist

for idx, row in meta.iterrows():
    filename = row["TextFile"]
    # An empty "TextFile" cell is read as NaN; building a path from it would raise.
    if pd.isna(filename):
        print("❌ no TextFile value in metadata row", idx); continue

    raw_path = RAW_DIR / filename
    if not raw_path.exists():
        print("❌ missing:", filename); continue

    # errors="ignore" silently drops bytes that are not valid UTF-8.
    text = raw_path.read_text(encoding="utf-8", errors="ignore")

    text = _apply_replacements(text, extract_repls(row))

    (OUT_DIR / filename).write_text(text, encoding="utf-8")

print("✓ anonymised templates written to", OUT_DIR)


✓ anonymised templates written to data/golden_dataset_anonymized_granular


In [13]:
# Quick visual sanity check: print the first 600 characters of one random output file
candidates = sorted(OUT_DIR.glob("*.txt"))
if candidates:
    sample = random.choice(candidates)
    print("\n--- sanity-check sample:", sample.name, "---\n")
    # The files were written as UTF-8, so read them back the same way instead
    # of relying on the platform's default encoding.
    print(sample.read_text(encoding="utf-8")[:600])
else:
    # random.choice raises IndexError on an empty list; report it instead.
    print("no anonymised .txt files found in", OUT_DIR)


--- sanity-check sample: 28.txt ---

Hallo, leider kann ich mich in mein Kundenkonto nicht einloggen
Mit freundlichen Grüßen
<<VORNAME>> <<NACHNAME>>
<<GESENDET_MIT>>

