In [7]:
!git clone https://github.com/AnnaGhost2713/daia-eon.git
%cd daia-eon/notebooks

Cloning into 'daia-eon'...
remote: Enumerating objects: 819, done.[K
remote: Counting objects: 100% (819/819), done.[K
remote: Compressing objects: 100% (603/603), done.[K
remote: Total 819 (delta 424), reused 576 (delta 208), pack-reused 0 (from 0)[K
Receiving objects: 100% (819/819), 1.07 MiB | 12.54 MiB/s, done.
Resolving deltas: 100% (424/424), done.
/content/daia-eon/notebooks


In [9]:
!pip install faker
from faker import Faker
from faker.providers import bank, internet, misc, date_time
import random, re, json, itertools
import string, random

# ---------- 1.  Faker setup ----------
fake = Faker("de_DE")          # German locale => names, streets, phone, PLZ …
fake.add_provider(bank)        # iban(), swift_ascii()
fake.add_provider(internet)    # email(), uri()
fake.add_provider(misc)
fake.add_provider(date_time)

random.seed()                  # remove or pass a seed for reproducibility


Collecting faker
  Downloading faker-37.4.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.4.0


In [10]:

### HOUSENUMBER ###

def german_house_number():
    num = random.randint(1, 2000)          # 1 … 2000
    if random.random() < 0.50:             # 50 % will *not* carry a letter
        return str(num)

    # choose letter case
    letter = random.choice(string.ascii_lowercase + string.ascii_uppercase)

    # 50 % chance of a space between number and letter
    sep = " " if random.random() < 0.50 else ""

    return f"{num}{sep}{letter}"

In [11]:
### ZAEHLERNUMMER ###

# --- helper: optionally sprinkle spaces into a sequence -----------
def insert_random_spaces(seq: str, prob: float = 0.4) -> str:
    """
    With probability *prob* return the sequence with random
    spaces (groups of 1–4 chars).  Otherwise return seq unchanged.
    """
    if random.random() > prob:
        return seq
    out, i = [], 0
    while i < len(seq):
        grp_len = random.randint(1, 4)
        out.append(seq[i: i + grp_len])
        i += grp_len
    return " ".join(out)

# --- main generator ----------------------------------------------
def zaehlernummer() -> str:
    """
    Pattern mix (weights: 0.4 / 0.3 / 0.3):
      1)  Alphanumeric – digit + 2-4 letters + 7-12 digits  (e.g. 1GMT00984726553)
      2)  Pure digits, 5-12 chars                           (e.g. 486498046387)
      3)  Digits (5-8) + hyphen + 4-digit year              (e.g. 63746253-1992)
    Each may contain random internal spaces.
    """
    r = random.random()

    if r < 0.4:                                 # --- variant 1
        prefix  = str(random.randint(1, 9))
        letters = ''.join(random.choices(string.ascii_uppercase,
                                         k=random.randint(2, 4)))
        digits  = ''.join(random.choices(string.digits,
                                         k=random.randint(7, 12)))
        core = prefix + letters + digits
        return insert_random_spaces(core)

    elif r < 0.7:                               # --- variant 2
        digits = ''.join(random.choices(string.digits,
                                        k=random.randint(5, 12)))
        return insert_random_spaces(digits)

    else:                                       # --- variant 3
        left  = ''.join(random.choices(string.digits,
                                       k=random.randint(5, 8)))
        year  = str(random.randint(1900, 2099))
        core  = f"{left}-{year}"
        return insert_random_spaces(core, prob=0.25)  # fewer spaces here



In [25]:
### VERTRAGSNUMMER ###

def vertragsnummer():
    a = str(random.randint(400, 409))
    b = str(random.randint(100_000_000, 999_999_999))
    if random.random() < 0.35:
        # join characters of b into 3-char groups with spaces
        b_spaced = " ".join(re.findall("...", b))
        return f"{a} {b_spaced}"                # ← SINGLE string
    return a + b

In [13]:
### ZÄHLERSTAND ###

# --- pre-compute all 8 case combinations of "kwh" -----------------
_KWH_VARIANTS = [''.join(p) for p in itertools.product(
    ('k', 'K'), ('w', 'W'), ('h', 'H')
)]
# -> ['kwh', 'kwH', 'kWh', 'kWH', 'Kwh', 'KwH', 'KWh', 'KWH']

def zaehlstand() -> str:
    """
    Returns a realistic German meter reading, e.g.
      1234567
      1.234 kWh
      7.890.123,45 KW H
      987,6kWh
    """
    # -------- 1.  integer part (1 … 9 999 999) -------------------
    value = random.randint(1, 9_999_999)

    # optional thousands dots
    if value >= 1000 and random.random() < 0.35:
        int_part = f"{value:,}".replace(",", ".")    # 1.234.567
    else:
        int_part = str(value)

    # -------- 2.  decimal part (0–2 digits, comma) ---------------
    if random.random() < 0.5:                        # 50 % with decimals
        dec_len = random.choice([1, 2])
        decimals = f",{random.randint(0, 10**dec_len - 1):0{dec_len}d}"
    else:
        decimals = ""

    # -------- 3.  unit (kWh variants or nothing) -----------------
    if random.random() < 0.65:                       # 65 % show unit
        unit = random.choice(_KWH_VARIANTS)
        spacer = " " if random.random() < 0.5 else ""  # optional space
        suffix = f"{spacer}{unit}"
    else:
        suffix = ""

    return f"{int_part}{decimals}{suffix}"


In [14]:
### ZAHLUNG ###
import random, string

# ── all common Euro tokens, upper-/lower-case variants ───────────
_EURO_TOKENS = ["€", " EUR", "EUR", " Euro", "Euro", " EURO",
                " eur", "eur", "EURO"]

def zahlung() -> str:
    """
    Builds a German-style payment amount such as
      512,30€
      € 9.800
      12.345,6 Euro
      EUR 1.234,56
      7400
    """
    # 1.  choose magnitude 10 … 50 000  (tweak upper bound as needed)
    amount = random.uniform(10, 50_000)

    # 2.  integer / decimal decision
    decimals = random.choices([0, 1, 2], weights=[0.4, 0.3, 0.3])[0]
    fmt = f"{{:,.{decimals}f}}".format(amount).replace(",", "X").replace(".", ",").replace("X", ".")
    # German format → thousands '.'  decimal ','

    # strip trailing ",0" or ",00" if decimals==0
    if decimals == 0:
        fmt = fmt.split(",")[0]

    # 3.  euro token (or none) and position
    token = random.choice(_EURO_TOKENS + [""])        # ~10 % chance of empty
    before = random.random() < 0.25 and token         # 25 % “€ 123”
    after  = not before and token                     # otherwise after / none

    # optional spaces around token
    space = " " if random.random() < 0.6 else ""      # 60 % get a space
    if before:
        return f"{token}{space}{fmt}"
    elif after:
        return f"{fmt}{space}{token.lstrip()}"        # keep trailing space logic
    else:
        return fmt

In [36]:
### IBAN ###
def iban_de():
    bban = fake.bban()                     # 18-digit basic bank account no.
    return "DE" + bban

In [15]:
### BIC ###

def bic():
    # Use faker's swift_ascii() (length 8 or 11) but force country DE
    base = fake.swift_ascii()
    return base if base[4:6] == "DE" else "DEUTDEFFXXX"

In [16]:
### GESENDET MIT ###

# ── building blocks ───────────────────────────────────────────────
PREFIXES_DE = [
    "Gesendet von meinem", "Von meinem", "Mit meinem",
    "Gesendet mit meinem", "Gesendet mit der", "Mit der"
]
PREFIXES_EN = ["Sent from my", "Sent using my"]

DEVICES = [
    "iPhone", "iPad", "MacBook Pro", "Samsung Galaxy S23",
    "Samsung Galaxy", "Google Pixel 8", "Fairphone 5",
    "Huawei P30", "Xiaomi Redmi Note 12", "Surface Pro 9",
    "Lenovo ThinkPad", "OnePlus 12", "Nokia 8.3",
    "BlackBerry Key2", "Galaxy Tab S9", "Steam Deck"
]

MAIL_APPS = [
    "Mail App", "Outlook", "Gmail", "GMX Mail", "web.de Mail",
    "Yahoo Mail", "Thunderbird", "Apple Mail", "BlueMail",
    "Telekom Mail", "Proton Mail", "Posteo", "Tutanota"
]

QUALIFIERS = ["", " für Android", " für iOS", " for Android", " for iOS", " Desktop"]

# ── generator ─────────────────────────────────────────────────────
def gesendet_mit() -> str:
    """Return a varied German/English mobile mail footer."""
    # Choose language flavour (30 % English, 70 % German)
    if random.random() < 0.30:
        prefix = random.choice(PREFIXES_EN)
        device = random.choice(DEVICES)
        # ~50 % add app + qualifier
        if random.random() < 0.5:
            app = random.choice(MAIL_APPS)
            qual = random.choice(QUALIFIERS).strip()
            return f"{prefix} {device} using {app}{(' ' + qual) if qual else ''}".strip()
        return f"{prefix} {device}"

    # German variant
    prefix = random.choice(PREFIXES_DE)
    device = random.choice(DEVICES)
    # ~65 % add “mit <App> <Qualifier>”
    if random.random() < 0.65:
        app = random.choice(MAIL_APPS)
        qual = random.choice(QUALIFIERS).strip()
        suffix = f" {app}{(' ' + qual) if qual else ''}"
    else:
        suffix = ""
    return f"{prefix} {device}{suffix}".strip()


In [18]:
### BANK ###
!pip install schwifty

from schwifty import registry
import random

# 👉 returns a list - not a dict
bank_entries = registry.get("bank")           # [{'bank_code': '10000000', 'name': 'Bundesbank', ...}, …]

# pick only German institutes and de-duplicate
banks_de = list({e["name"] for e in bank_entries if e.get("country_code") == "DE"})

def german_bank() -> str:
    return random.choice(banks_de)

In [37]:
# ---------- 3.  Placeholder → generator map ----------
from typing import Dict, Callable   # ← add this import

GEN: Dict[str, Callable[[], str]] = {
    "TITEL"         : lambda: fake.prefix().rstrip("."),
    "VORNAME"       : fake.first_name,
    "NACHNAME"      : fake.last_name,
    "FIRMA"         : fake.company,
    "TELEFONNUMMER" : fake.phone_number,
    "EMAIL"         : fake.email,
    "FAX"           : fake.phone_number,
    "STRASSE"       : fake.street_name,
    "HAUSNUMMER"    : german_house_number,
    "POSTLEITZAHL"  : fake.postcode,
    "WOHNORT"       : fake.city,
    "ZÄHLERNUMMER"  : zaehlernummer,
    "ZÄHLERSTAND"   : zaehlstand,
    "VERTRAGSNUMMER": vertragsnummer,
    "ZAHLUNG"       : zahlung,
    "BANK"          : german_bank,
    "IBAN"          : iban_de,
    "BIC"           : bic,
    "DATUM"         : lambda: fake.date(pattern="%d.%m.%Y"),
    "GESENDET_MIT"  : gesendet_mit,
    "LINK"          : fake.uri,
}

In [39]:
# ────────────────────────────────────────────────────────────────────
# 4.  Placeholder substitution helper
# ────────────────────────────────────────────────────────────────────
_alias_to_key = {
    alias: key
    for key, aliases in {
        "TITEL":["TITEL"], "VORNAME":["VORNAME"], "NACHNAME":["NACHNAME"],
        "FIRMA":["FIRMA"], "TELEFONNUMMER":["TELEFONNUMMER"], "EMAIL":["EMAIL"],
        "FAX":["FAX"], "STRASSE":["STRASSE"], "HAUSNUMMER":["HAUSNUMMER"],
        "POSTLEITZAHL":["POSTLEITZAHL","PLZ","ZIP"],
        "WOHNORT":["WOHNORT","ORT","CITY"],
        "ZÄHLERNUMMER":["ZÄHLERNUMMER","METER_ID"],
        "ZÄHLERSTAND":["ZÄHLERSTAND","METER_READING"],
        "VERTRAGSNUMMER":["VERTRAGSNUMMER","ANGEBOTSNUMMER","KUNDENNUMMER"],
        "ZAHLUNG":["BETRAG","ZAHLUNG","AMOUNT"],
        "BANK":["BANK"], "IBAN":["IBAN"], "BIC":["BIC"],
        "DATUM":["DATUM","DATE"], "GESENDET_MIT":["GESENDET_MIT"], "LINK":["LINK"],
    }.items() for alias in aliases
}
_pattern = re.compile(r"<<\s*([A-Z_]+)\s*>>")

def substitute_placeholders(text: str) -> str:
    def repl(match):
        alias = match.group(1)
        key   = _alias_to_key.get(alias)
        return GEN[key]() if key in GEN else match.group(0)
    return _pattern.sub(repl, text)

In [40]:
# Test if it works
with open("data/option_a_paraphrases.json", encoding="utf-8") as fh:
    data = json.load(fh)

first = data[0]
out   = [[substitute_placeholders(tpl) for _ in range(3)]
         for tpl in first["variants"]]

print(json.dumps(out, ensure_ascii=False, indent=2))


[
  [
    "Hallo liebes Eon Team, es geht um die Vertragsnummer 400 101 406 305. Bei der Errichtung meines neuen Vertrages wurde leider die Banküberweisung von dem jungen Kollegen an der Wohnungstür als Zahlungsmittel gewählt. Ich möchte, dass es wieder per Lastschrift belastet wird, um den Stress zu vermeiden. Das Konsumbüro ist immer noch die Guenther-Hövel-Gasse 1804 E in 45635 Uffenheim. Gruß Lars Wagenknecht",
    "Hallo liebes Eon Team, es geht um die Vertragsnummer 406691017632. Bei der Errichtung meines neuen Vertrages wurde leider die Banküberweisung von dem jungen Kollegen an der Wohnungstür als Zahlungsmittel gewählt. Ich möchte, dass es wieder per Lastschrift belastet wird, um den Stress zu vermeiden. Das Konsumbüro ist immer noch die Nelli-Gieß-Ring 1698 in 83159 Scheinfeld. Gruß Miguel Löwer",
    "Hallo liebes Eon Team, es geht um die Vertragsnummer 402646830823. Bei der Errichtung meines neuen Vertrages wurde leider die Banküberweisung von dem jungen Kollegen an der Woh

In [41]:
# Test if it works in NER-friendly output format # PART 1
# --- if _alias_to_key, _pattern, GEN are already in memory -----------------
def fill_and_tag(text: str):
    """
    Replace <<PLACEHOLDER>> markers and return
      filled_text,  [ [start, end, LABEL], … ]
    ready for spaCy JSONL.
    """
    spans = []
    offset = 0                             # byte drift as we substitute

    def repl(m):
        nonlocal offset
        alias = m.group(1)
        key   = _alias_to_key.get(alias)
        value = GEN[key]() if key in GEN else m.group(0)

        # record entity span only if we recognised the placeholder
        if key in GEN:
            start = m.start() + offset
            end   = start + len(value)
            spans.append([start, end, key])

        offset += len(value) - len(m.group(0))
        return value

    filled = _pattern.sub(repl, text)
    return filled, spans


In [42]:
# Test if it works in NER-friendly output format # PART 2
# ---------------------------------------------------------------
# CONFIG
# ---------------------------------------------------------------
SOURCE         = "data/option_a_paraphrases.json"   # original templates
OUT_PATH       = "filled_mails.json"                # final dataset
VARIANTS_EACH  = 3                                  # tweak to 10 later
START_INDEX    = 1                                  # first file number

# ---------------------------------------------------------------
# DRIVER
# ---------------------------------------------------------------
import json, pathlib, itertools, random

# 1) load templates ------------------------------------------------
with open(SOURCE, encoding="utf-8") as fh:
    data = json.load(fh)

# 2) generate ------------------------------------------------------
records = []
counter  = START_INDEX
for record in data:                        # each original “*.txt” group
    for template in record["variants"]:
        for _ in range(VARIANTS_EACH):
            text, ents = fill_and_tag(template)
            records.append({
                "file": str(counter),      # "1", "2", …
                "text": text,
                "labels": [{"start": s, "end": e, "label": L} for s, e, L in ents]
            })
            counter += 1

# 3) save ----------------------------------------------------------
pathlib.Path(OUT_PATH).write_text(
    json.dumps(records, ensure_ascii=False, indent=2),
    encoding="utf-8"
)
print(f"✅ wrote {len(records):,} mails to {OUT_PATH}")


AttributeError: 'Generator' object has no attribute 'swift_ascii'