In [None]:
import random, re, json, itertools
import string, random
from collections.abc import Callable, Mapping
from typing import Optional

from faker import Faker
from faker.providers import bank, internet, misc, date_time

# ---------- 1.  Faker setup ----------
# One shared Faker instance; the generator cells below all draw from `fake`.
fake = Faker("de_DE")          # German locale => names, streets, phone, PLZ …
# Register the objects imported from faker.providers on the shared instance.
# NOTE(review): these four names are provider *modules*; confirm that
# add_provider() accepts modules here (its documentation shows provider classes).
fake.add_provider(bank)        # iban(); NOTE(review): swift_ascii() is not in Faker's documented bank API (swift/swift8/swift11) — confirm
fake.add_provider(internet)    # email(), uri()
fake.add_provider(misc)
fake.add_provider(date_time)

# Called without an argument, so the stdlib `random` module is seeded from a
# non-deterministic source and every run yields different values.
# NOTE(review): this only seeds stdlib `random`; values produced through `fake`
# presumably need Faker's own seeding to be reproducible — verify.
random.seed()                  # remove or pass a seed for reproducibility


In [None]:

### HOUSENUMBER ###

def german_house_number():
    """Return a German-style house number as a string.

    The number is drawn uniformly from 1..2000.  Half of the results are the
    bare number; the other half carry a single ASCII letter (either case),
    which is separated from the number by one space in half of those cases.
    Examples: "17", "204b", "9 C".
    """
    number = str(random.randint(1, 2000))

    # First coin flip: bare number vs. number with a letter suffix.
    bare_only = random.random() < 0.50
    if bare_only:
        return number

    # Letter may be lower- or upper-case.
    suffix = random.choice(string.ascii_lowercase + string.ascii_uppercase)

    # Second coin flip: put a space between number and letter or not.
    gap = "" if random.random() >= 0.50 else " "

    return number + gap + suffix

In [None]:
### ZAEHLERNUMMER ###

# --- helper: optionally sprinkle spaces into a sequence -----------
def insert_random_spaces(seq: str, prob: float = 0.4) -> str:
    """Optionally break *seq* into space-separated chunks.

    One random draw decides whether anything happens: if the draw is greater
    than *prob* the input is returned untouched.  Otherwise *seq* is cut, left
    to right, into chunks of 1-4 characters (each length drawn independently;
    the last chunk may be shorter) and the chunks are joined with one space.

    Args:
        seq:  The character sequence to decorate.
        prob: Probability-like threshold for inserting spaces.

    Returns:
        Either *seq* itself or the spaced-out variant.
    """
    leave_untouched = random.random() > prob
    if leave_untouched:
        return seq

    chunks = []
    pos = 0
    total = len(seq)
    while pos < total:
        step = random.randint(1, 4)
        chunks.append(seq[pos:pos + step])
        pos += step
    return " ".join(chunks)

# --- main generator ----------------------------------------------
def zaehlernummer() -> str:
    """Generate a synthetic meter number in one of three shapes.

    A single random draw selects the shape (weights 0.4 / 0.3 / 0.3):
      1)  alphanumeric: one digit 1-9, 2-4 upper-case letters, 7-12 digits
          (e.g. 1GMT00984726553)
      2)  digits only, 5-12 characters            (e.g. 486498046387)
      3)  5-8 digits, a hyphen, a year 1900-2099  (e.g. 63746253-1992)
    The result is passed through insert_random_spaces(), so it may contain
    internal spaces (shape 3 uses a lower probability of 0.25).
    """
    def _random_run(alphabet: str, shortest: int, longest: int) -> str:
        # Length is drawn first, then the characters, matching the draw order
        # of an inline random.choices(..., k=random.randint(...)).
        length = random.randint(shortest, longest)
        return ''.join(random.choices(alphabet, k=length))

    roll = random.random()

    if roll >= 0.7:                             # --- shape 3
        serial = _random_run(string.digits, 5, 8)
        year = random.randint(1900, 2099)
        return insert_random_spaces(f"{serial}-{year}", prob=0.25)

    if roll >= 0.4:                             # --- shape 2
        return insert_random_spaces(_random_run(string.digits, 5, 12))

    # --- shape 1
    lead = str(random.randint(1, 9))
    alpha = _random_run(string.ascii_uppercase, 2, 4)
    numeric = _random_run(string.digits, 7, 12)
    return insert_random_spaces(lead + alpha + numeric)



In [None]:
### VERTRAGSNUMMER ###

def vertragsnummer() -> str:
    """Generate a synthetic 12-digit contract number as a string.

    The number is a 3-digit prefix in 400..409 followed by a 9-digit body.
    In about 35 % of cases the result is written in space-separated groups
    of three ("401 123 456 789"); otherwise it is one solid run of digits.

    Returns:
        Always a single ``str`` (never a tuple), so the value can be used
        directly as a regex replacement.
    """
    prefix = str(random.randint(400, 409))
    body = str(random.randint(100000000, 999999999))

    if random.random() < 0.35:  # sprinkle spaces like in sample data
        # Keep the prefix as one group and split the 9-digit body into triples.
        return " ".join([prefix, *re.findall("...", body)])
    return prefix + body

In [None]:
### ZÄHLERSTAND ###

# --- every upper/lower-case spelling of "kwh" (2 x 2 x 2 = 8) ------
_KWH_VARIANTS = [k + w + h for k in "kK" for w in "wW" for h in "hH"]
# -> ['kwh', 'kwH', 'kWh', 'kWH', 'Kwh', 'KwH', 'KWh', 'KWH']

def zaehlstand() -> str:
    """Generate a German-formatted meter reading.

    Layout: <integer>[,<1-2 decimals>][<optional space><kWh spelling>], e.g.
      1234567
      1.234 kWh
      7.890.123,45 KWH
      987,6kWh
    The integer is 1..9 999 999; values of 1000 and above get '.' thousands
    separators in about 35 % of cases.  Half of the readings carry a decimal
    part and about 65 % carry a unit.
    """
    reading = random.randint(1, 9_999_999)

    # Integer part: plain digits unless the thousands-dot variant is drawn.
    whole = str(reading)
    if reading >= 1000 and random.random() < 0.35:
        whole = format(reading, ",").replace(",", ".")    # 1.234.567

    # Decimal part: comma followed by a zero-padded 1- or 2-digit fraction.
    fraction = ""
    if random.random() < 0.5:
        width = random.choice([1, 2])
        fraction = "," + str(random.randint(0, 10**width - 1)).zfill(width)

    # Unit: skipped entirely in roughly 35 % of cases.
    if random.random() >= 0.65:
        return whole + fraction

    unit = random.choice(_KWH_VARIANTS)
    gap = " " if random.random() < 0.5 else ""
    return whole + fraction + gap + unit


In [None]:
### ZAHLUNG ###
import random, string

# ── all common Euro tokens, upper-/lower-case variants ───────────
_EURO_TOKENS = ["€", " EUR", "EUR", " Euro", "Euro", " EURO",
                " eur", "eur", "EURO"]

def zahlung() -> str:
    """Build a German-style payment amount.

    Examples:
      512,30€
      € 9.800
      12.345,6 Euro
      EUR 1.234,56
      7400.. written as 7.400 (thousands are always dot-grouped)

    The amount is drawn uniformly from 10..50 000 and rendered with 0, 1 or 2
    decimals (weights 0.4 / 0.3 / 0.3) using '.' for thousands and ',' for
    decimals.  A currency token is drawn from ``_EURO_TOKENS`` plus the empty
    string; a non-empty token goes in front in ~25 % of cases, otherwise
    behind, and is separated from the number by one space in ~60 % of cases.

    Returns:
        The formatted amount.  The result never starts or ends with
        whitespace: tokens are stripped in both positions (previously the
        leading-space tokens leaked a leading blank when placed in front).
    """
    # 1.  choose magnitude 10 … 50 000  (tweak upper bound as needed)
    amount = random.uniform(10, 50_000)

    # 2.  number of decimals, then format US-style and swap the separators
    #     to get German notation (thousands '.', decimal ',').
    decimals = random.choices([0, 1, 2], weights=[0.4, 0.3, 0.3])[0]
    fmt = f"{amount:,.{decimals}f}".translate(str.maketrans(",.", ".,"))

    # 3.  euro token (or none) and position.  Several list entries differ only
    #     by a leading space; stripping keeps their weight but not the blank.
    token = random.choice(_EURO_TOKENS + [""]).strip()   # ~10 % chance of empty
    put_before = random.random() < 0.25                  # 25 % "€ 123"

    # optional single space between token and number
    space = " " if random.random() < 0.6 else ""         # 60 % get a space

    if not token:
        return fmt
    if put_before:
        return f"{token}{space}{fmt}"
    return f"{fmt}{space}{token}"

In [None]:
### BIC ###

def bic():
    """Return a random BIC/SWIFT code whose country part is "DE".

    The code is generated with Faker's ``swift()`` (8 or 11 characters; the
    bank provider documents ``swift``/``swift8``/``swift11`` — there is no
    ``swift_ascii``).  Characters 5-6 of a BIC are the country code; if the
    generated code is not German, only those two characters are replaced, so
    the result keeps its random bank/location/branch parts and its length
    instead of collapsing to one fixed fallback value.
    """
    code = fake.swift()
    if code[4:6] == "DE":
        return code
    # Force the country segment while preserving the rest of the code.
    return f"{code[:4]}DE{code[6:]}"

In [None]:
### GESENDET MIT ###

# ── building blocks ───────────────────────────────────────────────
PREFIXES_DE = [
    "Gesendet von meinem", "Von meinem", "Mit meinem",
    "Gesendet mit meinem", "Gesendet mit der", "Mit der"
]
PREFIXES_EN = ["Sent from my", "Sent using my"]

DEVICES = [
    "iPhone", "iPad", "MacBook Pro", "Samsung Galaxy S23",
    "Samsung Galaxy", "Google Pixel 8", "Fairphone 5",
    "Huawei P30", "Xiaomi Redmi Note 12", "Surface Pro 9",
    "Lenovo ThinkPad", "OnePlus 12", "Nokia 8.3",
    "BlackBerry Key2", "Galaxy Tab S9", "Steam Deck"
]

MAIL_APPS = [
    "Mail App", "Outlook", "Gmail", "GMX Mail", "web.de Mail",
    "Yahoo Mail", "Thunderbird", "Apple Mail", "BlueMail",
    "Telekom Mail", "Proton Mail", "Posteo", "Tutanota"
]

QUALIFIERS = ["", " für Android", " für iOS", " for Android", " for iOS", " Desktop"]

# ── generator ─────────────────────────────────────────────────────
def _mail_app_phrase() -> str:
    """Return "<App>" or "<App> <Qualifier>" drawn from MAIL_APPS / QUALIFIERS."""
    app = random.choice(MAIL_APPS)
    qualifier = random.choice(QUALIFIERS).strip()
    return f"{app} {qualifier}" if qualifier else app


def gesendet_mit() -> str:
    """Return a varied German/English mobile mail footer.

    About 30 % of footers are English ("Sent from my iPhone using Gmail"),
    the rest German ("Gesendet von meinem iPad mit Outlook für iOS").
    The footer always starts with "<prefix> <device>"; an app phrase is
    appended in ~50 % (English) or ~65 % (German) of cases, introduced by
    the connector "using" / "mit" respectively.  The German branch formerly
    dropped the "mit" connector and glued the app name onto the device.
    """
    english = random.random() < 0.30

    prefix = random.choice(PREFIXES_EN if english else PREFIXES_DE)
    device = random.choice(DEVICES)
    connector, app_probability = ("using", 0.5) if english else ("mit", 0.65)

    footer = f"{prefix} {device}"
    if random.random() < app_probability:
        footer = f"{footer} {connector} {_mail_app_phrase()}"
    return footer


In [6]:
### BANK ###
!pip install schwifty            # run in your terminal or  !pip install schwifty  in a notebook
from schwifty import registry

# --- pull the entire bank registry (cached in ~/.cache/schwifty) ----
banks_reg = registry.get("bank")            # dict  {BIC: {...data...}, ...}

# --- slice out *German* institutes only -----------------------------
banks_de = [entry["name"]
            for entry in banks_reg.values()
            if entry["country_code"] == "DE"]

def german_bank():
    """Return one realistic German bank name."""
    return random.choice(banks_de)

Collecting schwifty
  Downloading schwifty-2025.6.0-py3-none-any.whl.metadata (5.0 kB)
Collecting pycountry (from schwifty)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting rstr (from schwifty)
  Downloading rstr-3.2.2-py3-none-any.whl.metadata (7.1 kB)
Downloading schwifty-2025.6.0-py3-none-any.whl (359 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m359.7/359.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rstr-3.2.2-py3-none-any.whl (10 kB)
Installing collected packages: rstr, pycountry, schwifty
Successfully installed pycountry-24.6.1 rstr-3.2.2 schwifty-2025.6.0


In [None]:
# ---------- 3.  Placeholder → generator map ----------
# Canonical placeholder name -> zero-argument callable producing a fake value.
# The annotation uses Callable: the builtin function `callable` cannot be
# subscripted and raises TypeError when the annotation is evaluated.
GEN: dict[str, Callable[[], str]] = {
    "TITEL"         : lambda: fake.prefix().rstrip("."),
    "VORNAME"       : fake.first_name,
    "NACHNAME"      : fake.last_name,
    "FIRMA"         : fake.company,
    "TELEFONNUMMER" : fake.phone_number,
    "EMAIL"         : fake.email,
    "FAX"           : fake.phone_number,      # fax numbers reuse the phone format
    "STRASSE"       : fake.street_name,
    "HAUSNUMMER"    : german_house_number,
    "POSTLEITZAHL"  : fake.postcode,
    "WOHNORT"       : fake.city,
    "ZÄHLERNUMMER"  : zaehlernummer,
    "ZÄHLERSTAND"   : zaehlstand,
    "VERTRAGSNUMMER": vertragsnummer,
    "ZAHLUNG"       : zahlung,
    "BANK"          : german_bank,
    # Faker's documented iban() takes no arguments; the country is expected to
    # come from the de_DE locale of `fake` (NOTE(review): verify output is DE…).
    "IBAN"          : fake.iban,
    "BIC"           : bic,
    "DATUM"         : lambda: fake.date(pattern="%d.%m.%Y"),
    "GESENDET_MIT"  : gesendet_mit,
    "LINK"          : fake.uri,
}

In [None]:
# ────────────────────────────────────────────────────────────────────
# 4.  Placeholder substitution helper
# ────────────────────────────────────────────────────────────────────
# Alias as written inside <<…>>  ->  canonical key used in GEN.
_alias_to_key = {
    alias: key
    for key, aliases in {
        "TITEL":["TITEL"], "VORNAME":["VORNAME"], "NACHNAME":["NACHNAME"],
        "FIRMA":["FIRMA"], "TELEFONNUMMER":["TELEFONNUMMER"], "EMAIL":["EMAIL"],
        "FAX":["FAX"], "STRASSE":["STRASSE"], "HAUSNUMMER":["HAUSNUMMER"],
        "POSTLEITZAHL":["POSTLEITZAHL","PLZ","ZIP"],
        "WOHNORT":["WOHNORT","ORT","CITY"],
        "ZÄHLERNUMMER":["ZÄHLERNUMMER","METER_ID"],
        "ZÄHLERSTAND":["ZÄHLERSTAND","METER_READING"],
        "VERTRAGSNUMMER":["VERTRAGSNUMMER","ANGEBOTSNUMMER","KUNDENNUMMER"],
        "ZAHLUNG":["BETRAG","ZAHLUNG","AMOUNT"],
        "BANK":["BANK"], "IBAN":["IBAN"], "BIC":["BIC"],
        "DATUM":["DATUM","DATE"], "GESENDET_MIT":["GESENDET_MIT"], "LINK":["LINK"],
    }.items() for alias in aliases
}
# Placeholder syntax: << NAME >> with optional inner whitespace.  The name may
# contain upper-case ASCII letters, upper-case German umlauts and "_"; without
# the umlauts <<ZÄHLERNUMMER>> and <<ZÄHLERSTAND>> could never match.
_pattern = re.compile(r"<<\s*([A-ZÄÖÜ_]+)\s*>>")

def substitute_placeholders(
    text: str,
    generators: Optional[Mapping[str, Callable[[], str]]] = None,
) -> str:
    """Replace every ``<<ALIAS>>`` placeholder in *text* with a generated value.

    Each alias is mapped to its canonical key via ``_alias_to_key`` and the
    matching generator is called once per occurrence, so repeated placeholders
    get independent values.  Placeholders whose alias is unknown, or whose key
    has no generator, are left in the text unchanged.

    Args:
        text:       Template text containing ``<<…>>`` placeholders.
        generators: Optional mapping of canonical key -> zero-argument callable
                    returning ``str``.  Defaults to the module-level ``GEN``.

    Returns:
        The text with all resolvable placeholders substituted.
    """
    # Resolve lazily so GEN is only required when no mapping is injected.
    table = GEN if generators is None else generators

    def repl(match):
        key = _alias_to_key.get(match.group(1))
        if key is None or key not in table:
            return match.group(0)          # leave unknown placeholders as-is
        return table[key]()

    return _pattern.sub(repl, text)