In [48]:
# Fetch the project repository and change into its data/synthetic folder;
# later cells open and write their JSON files via relative paths.
!git clone https://github.com/AnnaGhost2713/daia-eon.git
%cd daia-eon/data/
%cd synthetic

Cloning into 'daia-eon'...
remote: Enumerating objects: 1204, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 1204 (delta 42), reused 60 (delta 28), pack-reused 1113 (from 1)[K
Receiving objects: 100% (1204/1204), 48.44 MiB | 16.08 MiB/s, done.
Resolving deltas: 100% (685/685), done.
/content/daia-eon/data
/content/daia-eon/data/synthetic


In [18]:
!pip install faker
from faker import Faker
from faker.providers import bank, internet, misc, date_time
import random, re, json, itertools
import string, random

# ---------- 1.  Faker setup ----------
# `fake` is the shared Faker instance that the generator functions below call.
fake = Faker("de_DE")          # German locale => names, streets, phone, PLZ …
# NOTE(review): `bank`, `internet`, `misc` and `date_time` are the provider
# *modules* imported above. Faker presumably ships these standard providers
# by default, so the registrations may be redundant — confirm before removing.
fake.add_provider(bank)        # iban(), swift_ascii()
fake.add_provider(internet)    # email(), uri()
fake.add_provider(misc)
fake.add_provider(date_time)

# Called without an argument, so no fixed seed is set here.
# NOTE(review): this only touches the stdlib `random` module; Faker presumably
# keeps its own random state (see Faker.seed) — confirm if reproducible output
# is required.
random.seed()                  # remove or pass a seed for reproducibility




In [19]:

### HOUSENUMBER ###

### improved logic after evaluation results of first synthetic dataset
# -> used built-in faker set for more realistic house numbers

def german_house_number():
    """Return a house number drawn from the shared Faker instance."""
    house_number = fake.building_number()
    return house_number

In [20]:
### ZAEHLERNUMMER ###

### improved logic after evaluation results of first synthetic dataset
# -> lowering the overall prob in insert_random_spaces to 0.3 so that most numbers are unbroken or only lightly broken up (especially for the all‑digit variant).
# -> biasing the group sizes toward 3–4 characters instead of 1–4, to mimic how people chunk things
# -> Variant 1 (alphanumeric): the prefix is a random digit 0–9, so a leading zero (“0gmt…”) occurs occasionally.
# -> Variant 2 (pure digits): real German meter numbers are often exactly 12 digits, so the length is biased toward 12 via random.choices([12,11,10,9], weights=[0.6,0.1,0.1,0.2])[0].
# -> Variant 3 (year suffix): those are rare—maybe drop to 10% rather than 30%.

# --- helper: optionally sprinkle spaces into a sequence -----------
def insert_random_spaces(seq: str, prob: float = 0.3) -> str:
    """
    Optionally break *seq* into space-separated chunks.

    With probability *prob* the sequence is cut, left to right, into groups
    of 3 or 4 characters (the final group holds whatever remains and may be
    shorter) and the groups are joined with single spaces.  Otherwise *seq*
    is returned unchanged.

    Args:
        seq:  the string to chunk.
        prob: chance in [0, 1] that spaces are inserted.

    Returns:
        Either *seq* itself or its chunked form.
    """
    # random.random() lies in [0.0, 1.0).  Using ">=" makes prob=0 mean
    # "never" (a strict ">" would still chunk on an exact 0.0 draw) while
    # prob=1 keeps meaning "always".
    if random.random() >= prob:
        return seq
    groups, pos = [], 0
    while pos < len(seq):
        size = random.randint(3, 4)
        groups.append(seq[pos:pos + size])
        pos += size
    return " ".join(groups)

# --- main generator ----------------------------------------------
def zaehlernummer() -> str:
    """
    Build one synthetic meter number.

    A single uniform draw selects the layout:
      * below 0.35  -> one digit, 2-4 lowercase letters, then 7-12 digits
      * below 0.75  -> digits only, length 9-12 with 12 weighted highest
      * otherwise   -> two digit groups (5-8 and 4-6 long) joined by "-" or "/"

    The result is passed through insert_random_spaces with prob=0.25 and
    stripped of surrounding whitespace.
    """
    draw = random.random()

    if draw >= 0.75:
        # Layout 3: hyphen- or slash-separated digit groups
        head_len = random.randint(5, 8)
        head = ''.join(random.choices(string.digits, k=head_len))
        tail_len = random.randint(4, 6)
        tail = ''.join(random.choices(string.digits, k=tail_len))
        divider = random.choice(["-", "/"])
        number = head + divider + tail
    elif draw >= 0.35:
        # Layout 2: pure digits, biased to length 12
        n_digits = random.choices([12, 11, 10, 9], weights=[0.6, 0.1, 0.1, 0.2])[0]
        number = ''.join(random.choices(string.digits, k=n_digits))
    else:
        # Layout 1: alphanumeric
        lead = str(random.randint(0, 9))
        n_letters = random.randint(2, 4)
        alpha = ''.join(random.choices(string.ascii_lowercase, k=n_letters))
        n_digits = random.randint(7, 12)
        numeric = ''.join(random.choices(string.digits, k=n_digits))
        number = lead + alpha + numeric

    # Occasionally chunk the number with spaces
    spaced = insert_random_spaces(number, prob=0.25)
    return spaced.strip()



In [21]:
### VERTRAGSNUMMER ###

def vertragsnummer():
    """
    Build a synthetic contract number.

    The number is a 3-digit head in 400-409 followed by a 9-digit body.
    In roughly 35% of the calls the 12 digits are returned as four
    space-separated groups of three; otherwise as one unbroken string.
    """
    head = str(random.randint(400, 409))
    body = str(random.randint(100_000_000, 999_999_999))
    if random.random() >= 0.35:
        return head + body
    # body is always 9 digits, so this yields exactly three triples
    triples = re.findall("...", body)
    return " ".join([head, *triples])

In [22]:
### ZÄHLERSTAND ###

### improved logic after evaluation results of first synthetic dataset
# -> Restricting “kWh” variants to what people actually write: Eight random letter‑case combinations plus random internal spaces produce things like “Kw H” or “kWH”, which you almost never see.
# -> No stray spaces: the result is assembled as f"{int_part}{decimals}{suffix}", where the only possible space is the optional one in front of the unit.

_KWH_VARIANTS = ["kWh", "kwh", "KWh", "KWH"]

def zaehlstand() -> str:
    # 1) integer part
    value = random.randint(1, 9_999_999)
    if value >= 1000 and random.random() < 0.35:
        int_part = f"{value:,}".replace(",", ".")
    else:
        int_part = str(value)

    # 2) decimal part
    if random.random() < 0.5:
        dec_len = random.choice([1, 2])
        decimals = f",{random.randint(0, 10**dec_len - 1):0{dec_len}d}"
    else:
        decimals = ""

    # 3) unit
    if random.random() < 0.65:
        unit = random.choice(_KWH_VARIANTS)
        # space only 10% of the time
        spacer = " " if random.random() < 0.10 else ""
        suffix = f"{spacer}{unit}"
    else:
        suffix = ""

    return f"{int_part}{decimals}{suffix}"


In [23]:
### ZAHLUNG ###

### improved logic after evaluation results of first synthetic dataset
# -> Converting the sampled amount to integer cents before formatting: floats can introduce odd rounding artifacts.
# -> Biasing toward smaller amounts: Invoices rarely top out at €50 000
# -> Formatting integer + decimal: Deciding 0–2 decimals, but base it on cent_part
# -> Euro token placement & spacing: Tightening the probabilities to mirror real invoices


import math, random

_EURO_TOKENS = ["€", "EUR", "Euro"]

def zahlung() -> str:
    # 1) sample log-uniform cents
    log_min, log_max = math.log(10), math.log(50_000)
    amount = math.exp(random.uniform(log_min, log_max))
    cents = int(amount * 100)
    euros, cent_part = divmod(cents, 100)

    # 2) choose decimals
    decimals = random.choices([0,1,2], weights=[0.4,0.3,0.3])[0]
    if decimals == 2:
        fmt = f"{euros:,}".replace(",",".") + f",{cent_part:02d}"
    elif decimals == 1:
        fmt = f"{euros:,}".replace(",",".") + f",{cent_part//10}"
    else:
        fmt = f"{euros:,}".replace(",",".")

    # 3) euro token placement
    r = random.random()
    if r < 0.10:
        pos, token = "before", random.choice(_EURO_TOKENS)
    elif r < 0.80:
        pos, token = "after", random.choice(_EURO_TOKENS)
    else:
        pos, token = None, ""
    space = " " if token and random.random() < 0.8 else ""

    if pos == "before":
        return f"{token}{space}{fmt}"
    elif pos == "after":
        return f"{fmt}{space}{token}"
    else:
        return fmt

In [24]:
### IBAN ###
def iban_de(bban=None):
    """
    Return a German IBAN with valid check digits.

    An IBAN is the country code, two check digits and the BBAN.  The previous
    version concatenated only "DE" and the BBAN, which drops the check digits
    and yields a string that is two characters short and fails validation.

    Args:
        bban: optional basic bank account number to wrap.  When omitted, a
              fresh one is taken from ``fake.bban()`` (described as 18 digits
              by the original code comment).

    Returns:
        "DE" + two check digits + the BBAN.
    """
    if bban is None:
        bban = fake.bban()
    # Mod-97 check: move the country code with placeholder digits "00" behind
    # the BBAN, replace every character by its base-36 value (digits stay,
    # A=10 … Z=35, so "DE" -> "1314"), and subtract the remainder from 98.
    rearranged = f"{bban}DE00"
    numeric = "".join(str(int(ch, 36)) for ch in rearranged)
    check = 98 - int(numeric) % 97
    return f"DE{check:02d}{bban}"

In [25]:
### BIC ###

def bic():
    """
    Return a BIC whose country part (characters 5-6) is "DE".

    The code comes from ``fake.swift()``; if that attribute is missing,
    ``fake.swift_ascii()`` is used instead.  A code with any other country
    part is replaced by the fixed value "DEUTDEFFXXX".
    """
    try:
        candidate = fake.swift()          # preferred name
    except AttributeError:
        candidate = fake.swift_ascii()    # fallback when swift() is unavailable

    if candidate[4:6] != "DE":
        return "DEUTDEFFXXX"
    return candidate


In [26]:
### GESENDET MIT ###

### improved logic after evaluation results of first synthetic dataset
# -> Adjusting Qualifier Placement: Right now you sometimes end up with double‑qualifiers like “Gmail for Android” after already saying “using”. In English, you’d usually say either “Sent from my iPhone using Mail App for iOS” or “Sent from my iPhone for iOS” but not both
# -> Refine Probabilities for Realism
# -> Handling Punctuation Variants: People sometimes use a dash or parentheses instead of a space
# -> Adding a “no suffix” option



# ── building blocks ───────────────────────────────────────────────
# Opening phrases of the footer, German and English.
PREFIXES_DE = [
    "Gesendet von meinem",
    "Von meinem",
    "Mit meinem",
    "Gesendet mit meinem",
    "Gesendet mit der",
    "Mit der",
]
PREFIXES_EN = ["Sent from my", "Sent using my"]

# Device names that follow the opening phrase.
DEVICES = [
    "iPhone", "iPad", "MacBook Pro",
    "Samsung Galaxy S23", "Samsung Galaxy", "Google Pixel 8",
    "Fairphone 5", "Huawei P30", "Xiaomi Redmi Note 12",
    "Surface Pro 9", "Lenovo ThinkPad", "OnePlus 12",
    "Nokia 8.3", "BlackBerry Key2", "Galaxy Tab S9",
    "Steam Deck",
]

# Mail clients that may be mentioned after the device.
MAIL_APPS = [
    "Mail App", "Outlook", "Gmail", "GMX Mail",
    "web.de Mail", "Yahoo Mail", "Thunderbird", "Apple Mail",
    "BlueMail", "Telekom Mail", "Proton Mail", "Posteo",
    "Tutanota",
]

# Not referenced by gesendet_mit below, which picks from its own shorter lists.
QUALIFIERS = ["", " für Android", " für iOS", " for Android", " for iOS", " Desktop"]

# ── generator ─────────────────────────────────────────────────────
import re

def gesendet_mit() -> str:
    """
    Build a "sent from my device" style e-mail footer.

    About 20% of the footers are English, the rest German.  Each footer is
    an opening phrase plus a device; in about 90% of the calls a mail app
    follows, optionally with a platform qualifier, attached by a space, an
    em dash, or in parentheses.  English footers put "using" between device
    and app.  Runs of whitespace in the result are collapsed to one space.
    """
    english = random.random() < 0.20
    opener = random.choice(PREFIXES_EN if english else PREFIXES_DE)
    gadget = random.choice(DEVICES)

    if random.random() >= 0.10:
        client = random.choice(MAIL_APPS)
        if english:
            platform_options = ["", " for Android", " for iOS"]
        else:
            platform_options = ["", " für Android", " für iOS"]
        platform = random.choice(platform_options)
        joiner = random.choice([" ", " — ", " ("])
        # only the parenthesised style needs a closing bracket
        closing = ")" if joiner == " (" else ""
        linker = " using" if english else ""
        signature = f"{opener} {gadget}{linker}{joiner}{client}{platform}{closing}"
    else:
        # roughly one footer in ten names the device only
        signature = f"{opener} {gadget}"

    return re.sub(r"\s+", " ", signature.strip())


In [27]:
### BANK ###
!pip install schwifty

from schwifty import registry
import random

# 👉 returns a list - not a dict
bank_entries = registry.get("bank")           # [{'bank_code': '10000000', 'name': 'Bundesbank', ...}, …]

# Pick only German institutes and de-duplicate their names via a set.
# The names are sorted because the iteration order of a set of strings can
# differ between interpreter runs; a stable list keeps random.choice
# reproducible whenever the random module is given a fixed seed.
banks_de = sorted({e["name"] for e in bank_entries if e.get("country_code") == "DE"})

def german_bank() -> str:
    """Return the name of one randomly chosen entry of `banks_de`."""
    return random.choice(banks_de)



In [28]:
# ---------- 3.  Placeholder → generator map ----------
from typing import Dict, Callable   # ← add this import

# Maps every canonical label to a zero-argument callable that returns one
# freshly generated value for that label.  Entries are either bound methods
# of the shared `fake` instance, the custom generator functions defined in
# the cells above, or small lambdas that post-process a Faker value.
GEN: Dict[str, Callable[[], str]] = {
    "TITEL"         : lambda: fake.prefix().rstrip("."),   # trailing "." removed from the prefix
    "VORNAME"       : fake.first_name,
    "NACHNAME"      : fake.last_name,
    "FIRMA"         : fake.company,
    "TELEFONNUMMER" : fake.phone_number,
    "EMAIL"         : fake.email,
    "FAX"           : fake.phone_number,                   # same generator as TELEFONNUMMER
    "STRASSE"       : fake.street_name,
    "HAUSNUMMER"    : german_house_number,
    "POSTLEITZAHL"  : fake.postcode,
    "WOHNORT"       : fake.city,
    "ZÄHLERNUMMER"  : zaehlernummer,
    "ZÄHLERSTAND"   : zaehlstand,
    "VERTRAGSNUMMER": vertragsnummer,
    "ZAHLUNG"       : zahlung,
    "BANK"          : german_bank,
    "IBAN"          : iban_de,
    "BIC"           : bic,
    "DATUM"         : lambda: fake.date(pattern="%d.%m.%Y"),   # day.month.year
    "GESENDET_MIT"  : gesendet_mit,
    "LINK"          : fake.uri,
}

In [29]:
# ────────────────────────────────────────────────────────────────────
# 4.  Placeholder substitution helper
# ────────────────────────────────────────────────────────────────────
# Reverse lookup from every placeholder alias to its canonical label.
# The inner dict lists, per canonical label, the aliases that may appear in a
# template; the comprehension flattens it into {alias: canonical_label}.
_alias_to_key = {
    alias: key
    for key, aliases in {
        "TITEL":["TITEL"], "VORNAME":["VORNAME"], "NACHNAME":["NACHNAME"],
        "FIRMA":["FIRMA"], "TELEFONNUMMER":["TELEFONNUMMER"], "EMAIL":["EMAIL"],
        "FAX":["FAX"], "STRASSE":["STRASSE"], "HAUSNUMMER":["HAUSNUMMER"],
        "POSTLEITZAHL":["POSTLEITZAHL"],
        "WOHNORT":["WOHNORT","ORT","CITY"],
        "ZÄHLERNUMMER":["ZÄHLERNUMMER"],
        "ZÄHLERSTAND":["ZÄHLERSTAND"],
        "VERTRAGSNUMMER":["VERTRAGSNUMMER","ANGEBOTSNUMMER","KUNDENNUMMER"],
        "ZAHLUNG":["BETRAG","ZAHLUNG"],
        "BANK":["BANK"], "IBAN":["IBAN"], "BIC":["BIC"],
        "DATUM":["DATUM","DATE"], "GESENDET_MIT":["GESENDET_MIT"], "LINK":["LINK"],
    }.items() for alias in aliases
}
# Matches a placeholder of the form <<ALIAS>>, allowing whitespace just inside
# the brackets; group 1 is the alias, which contains no whitespace, "<" or ">".
_pattern = re.compile(r"<<\s*([^\s<>]+?)\s*>>")

# old placeholder
# def substitute_placeholders(text: str) -> str:
#     def repl(match):
#         alias = match.group(1)
#         key   = _alias_to_key.get(alias)
#         return GEN[key]() if key in GEN else match.group(0)
#     return _pattern.sub(repl, text)

In [30]:
import random
from collections import Counter


def weighted_choice(templates, label_counts, observed_counts, target_dist):
    """
    Pick one template, favouring those whose labels are still under target.

    Args:
        templates:       sequence of template strings with <<ALIAS>> placeholders.
        label_counts:    per-label count of values generated so far.
        observed_counts: mapping whose keys are the labels to consider.
        target_dist:     per-label target count.

    Returns:
        One element of *templates*.

    For every label the deficit is target minus current count.  A template's
    score is the sum of the positive deficits of the labels it contains (one
    contribution per placeholder occurrence) plus a floor of 1e-3, and the
    pick is random with these scores as weights.  Once no label has a positive
    deficit, every score equals the floor and the pick is uniform.
    """
    shortfall = {}
    for label in observed_counts:
        shortfall[label] = target_dist[label] - label_counts[label]

    def _score(template):
        aliases = re.findall(_pattern, template)
        canonical = [_alias_to_key[a] for a in aliases]
        # labels that already met their target contribute nothing
        needed = sum(max(shortfall.get(k, 0), 0) for k in canonical)
        # the floor keeps every template selectable
        return needed + 1e-3

    raw_scores = [_score(t) for t in templates]
    norm = sum(raw_scores)
    probabilities = [value / norm for value in raw_scores]
    return random.choices(templates, probabilities)[0]

def substitute_placeholders(text: str) -> str:
    """
    Replace every ``<<ALIAS>>`` in *text* with a freshly generated value.

    The alias is mapped to its canonical label via `_alias_to_key` and the
    value comes from the matching `GEN` entry.  A placeholder whose alias is
    not known is left in the text unchanged.
    """
    def _fill(match):
        key = _alias_to_key.get(match.group(1))
        return GEN[key]() if key in GEN else match.group(0)
    return _pattern.sub(_fill, text)


def generate_balanced_dataset(templates, N):
    """
    Generate *N* filled-in texts while steering towards equal label counts.

    Args:
        templates: sequence of template strings with <<ALIAS>> placeholders.
        N:         number of texts to produce.

    Returns:
        A list of *N* strings with the placeholders substituted.

    Raises:
        KeyError: if a template contains an alias missing from `_alias_to_key`.

    The function relies on `substitute_placeholders`, which is defined next
    to it so that the call no longer fails with a NameError.
    """
    # How often each label occurs across all templates
    observed_counts = Counter()
    for tpl in templates:
        observed_counts.update(_alias_to_key[a] for a in _pattern.findall(tpl))

    # Every label gets the count of the most frequent one as its target;
    # default=0 covers templates that contain no placeholder at all.
    max_obs = max(observed_counts.values(), default=0)
    target_dist = dict.fromkeys(observed_counts, max_obs)

    label_counts = Counter()
    outputs = []
    for _ in range(N):
        tpl = weighted_choice(templates, label_counts, observed_counts, target_dist)
        outputs.append(substitute_placeholders(tpl))
        # one count per placeholder occurrence in the chosen template
        label_counts.update(_alias_to_key[a] for a in _pattern.findall(tpl))

    return outputs

In [50]:
import json
import re
import math
import random
from collections import Counter

# -- Assumes the helper functions and mappings (weighted_choice, _pattern, _alias_to_key, GEN) are already defined --

# 1. Load paraphrases JSON
# The file name is relative to the current working directory.
with open("option_b_paraphrased.json", encoding="utf-8") as f:
    entries = json.load(f)

# 2. Flatten templates
# Each entry contributes every string of its "paraphrases" list.
templates = [tpl for entry in entries for tpl in entry["paraphrases"]]

# 3. Compute observed label counts
# Counts placeholder occurrences per canonical label across all templates;
# an alias missing from _alias_to_key raises KeyError here.
observed_counts = Counter()
for t in templates:
    for alias in re.findall(_pattern, t):
        observed_counts[_alias_to_key[alias]] += 1

# 4. Define target distribution (equalize to max observed)
# Every label gets the count of the most frequent label as its target.
max_obs = max(observed_counts.values())
target_dist = { label: max_obs for label in observed_counts }

# 5. Define fill_and_label to replace placeholders and record spans
def fill_and_label(template: str):
    """
    Fill every placeholder of *template* and record where each value lands.

    Args:
        template: text containing <<ALIAS>> placeholders.

    Returns:
        A tuple ``(text, labels)``: the filled-in text and a list of dicts
        ``{"start", "end", "label"}`` giving, for each generated value in
        order of appearance, its character span in *text* (end exclusive)
        and its canonical label.

    Raises:
        KeyError: if an alias is missing from `_alias_to_key` or `GEN`.
    """
    pieces = []
    spans = []
    cursor = 0     # read position in the template
    out_len = 0    # length of the text emitted so far
    for match in _pattern.finditer(template):
        key = _alias_to_key[match.group(1)]
        value = GEN[key]()
        # literal text between the previous placeholder and this one
        literal = template[cursor:match.start()]
        pieces.append(literal)
        out_len += len(literal)
        # the running offset replaces re-summing all pieces per placeholder
        spans.append({"start": out_len, "end": out_len + len(value), "label": key})
        pieces.append(value)
        out_len += len(value)
        cursor = match.end()
    # text after the last placeholder
    pieces.append(template[cursor:])
    return "".join(pieces), spans

# 6. Generate labeled, balanced dataset
def generate_labeled_dataset(templates, N):
    """
    Produce *N* labeled examples from *templates*.

    Each example is a dict with "file" (its 1-based index as a string),
    "text" (the filled-in template) and "labels" (the spans returned by
    fill_and_label).  Template selection uses weighted_choice together with
    the module-level `observed_counts` and `target_dist`.

    Returns:
        A tuple ``(examples, counts)`` where *counts* holds how many values
        were generated per label.
    """
    tally = Counter()
    records = []
    for number in range(1, N + 1):
        chosen = weighted_choice(templates, tally, observed_counts, target_dist)
        text, spans = fill_and_label(chosen)
        # one increment per generated value
        tally.update(span['label'] for span in spans)
        records.append({"file": str(number), "text": text, "labels": spans})
    return records, tally

# Generate 14360 examples
N = 14_360
# `generated` is the list of example dicts, `final_counts` the per-label totals.
generated, final_counts = generate_labeled_dataset(templates, N)

# 7. Sanity check: print first two entries and final label frequencies
print("=== Sample Outputs ===")
for entry in generated[:2]:
    # ensure_ascii=False keeps non-ASCII characters readable in the output
    print(json.dumps(entry, ensure_ascii=False, indent=2))
print("\n=== Final Label Frequencies ===")
print(final_counts)

# 8. Save to JSON
# Written to the current working directory; an existing file is overwritten.
with open("synthetic_emails_labeled.json", "w", encoding="utf-8") as f:
    json.dump(generated, f, ensure_ascii=False, indent=2)

print(f"\nGenerated {len(generated)} examples and saved to synthetic_emails_labeled.json")

=== Sample Outputs ===
{
  "file": "1",
  "text": "Sehr geehrte Damen und Herren, wir bitten um die Reduzierung der Abschlagskosten der Ehepaar Schmiedt, da das Anwesen in Girschnerplatz 3-5 in 81121 Gadebusch ab dem 07.10.1982 unbewohnt ist und verkauft wird. Mit freundlichen Grüßen Serpil Finke Steinberg AG Bärbel-Trommler-Weg 1-3 85894 Lichtenfels Tel.: +49(0)9097623861 http://knappe.net/wp-content/blogfaq.html https://klotz.de/category/category/postsindex.html http://etzold.de/search/listabout.jsp",
  "labels": [
    {
      "start": 93,
      "end": 101,
      "label": "NACHNAME"
    },
    {
      "start": 121,
      "end": 135,
      "label": "STRASSE"
    },
    {
      "start": 136,
      "end": 139,
      "label": "HAUSNUMMER"
    },
    {
      "start": 143,
      "end": 148,
      "label": "POSTLEITZAHL"
    },
    {
      "start": 149,
      "end": 158,
      "label": "WOHNORT"
    },
    {
      "start": 166,
      "end": 176,
      "label": "DATUM"
    },
    {
      "st

In [51]:
# Hand the file written by the previous cell to google.colab's download helper
# (presumably triggers a browser download; only available inside Colab).
from google.colab import files
files.download("synthetic_emails_labeled.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>