In [18]:
!git clone https://github.com/AnnaGhost2713/daia-eon.git
%cd daia-eon/notebooks

Cloning into 'daia-eon'...
remote: Enumerating objects: 778, done.[K
remote: Counting objects: 100% (778/778), done.[K
remote: Compressing objects: 100% (566/566), done.[K
remote: Total 778 (delta 396), reused 568 (delta 204), pack-reused 0 (from 0)[K
Receiving objects: 100% (778/778), 986.67 KiB | 7.89 MiB/s, done.
Resolving deltas: 100% (396/396), done.
/content/daia-eon/notebooks/daia-eon/notebooks/daia-eon/notebooks


In [19]:
import json
from pathlib import Path

# Sanity check: make sure data/ exists and that a JSONL file can be written
# there and read back unchanged.
OUT = Path("data/synthetic_train.jsonl")
# parents=True so the call also succeeds if an intermediate folder is missing
OUT.parent.mkdir(parents=True, exist_ok=True)

# Dummy two-record write for sanity check
sample = [
  {"src":"1.txt","variant":1,"text":"Dies ist ein Test <<VORNAME>>."},
  {"src":"2.txt","variant":1,"text":"Noch ein Test <<NACHNAME>>."}
]
# Overwrite any old file
OUT.unlink(missing_ok=True)
with OUT.open("w", encoding="utf-8") as f:
    for rec in sample:
        # ensure_ascii=False keeps umlauts readable in the file
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# List directory and head of file
print("Files in data/:", list(Path("data").iterdir()))
print("\nFirst two lines:")
# Read back with the same encoding that was used for writing; the platform
# default is not guaranteed to be UTF-8.
print(OUT.read_text(encoding="utf-8").splitlines()[:2])


Files in data/: [PosixPath('data/golden_dataset_original'), PosixPath('data/original_with_spans.json'), PosixPath('data/golden_dataset_anonymized_granular'), PosixPath('data/synthetic_train.jsonl'), PosixPath('data/spacy_split'), PosixPath('data/granular_dataset_split'), PosixPath('data/separating_data.ipynb'), PosixPath('data/spacy_train_new.ipynb'), PosixPath('data/granular_data_split.ipynb'), PosixPath('data/generate_new_json_for training.ipynb'), PosixPath('data/Daia_Manual_Labelling_granular.xlsx')]

First two lines:
['{"src": "1.txt", "variant": 1, "text": "Dies ist ein Test <<VORNAME>>."}', '{"src": "2.txt", "variant": 1, "text": "Noch ein Test <<NACHNAME>>."}']


In [20]:
# First, check the tag frequencies to find underrepresented tags.
# The following data files are excluded from synthetic data generation because they are reserved for the test set:
# 0, 142, 2, 3, 146, 145, 157, 165, 19, 18, 20, 166, 176, 177, 32, 34, 40, 45, 52, 57, 61, 65, 66, 70, 71, 73, 75, 78, 81, 96, 102, 105, 108, 109, 112, 115, 122, 129, 132, 134

import json
from collections import Counter
from pathlib import Path

# 1) IDs of the emails held out for testing; the matching file names are "<id>.txt"
test_ids = {
    0, 142, 2, 3, 146, 145, 157, 165, 19, 18, 20,
    166, 176, 177, 32, 34, 40, 45, 52, 57, 61, 65,
    66, 70, 71, 73, 75, 78, 81, 96, 102, 105, 108,
    109, 112, 115, 122, 129, 132, 134,
}
test_files = {str(i) + ".txt" for i in test_ids}

# 2) Read the original emails together with their label spans
with Path("data/original_with_spans.json").open(encoding="utf-8") as fh:
    records = json.load(fh)

# 3) Partition the records into train and test in a single pass
train_recs, test_recs = [], []
for rec in records:
    bucket = test_recs if rec["file"] in test_files else train_recs
    bucket.append(rec)

print(f"{len(train_recs)} training‐set emails, {len(test_recs)} test emails")

# 4) Count how often each tag occurs, looking at the training set only
tag_counts = Counter()
for rec in train_recs:
    tag_counts.update(lab["label"] for lab in rec["labels"])

# 5) Report every tag relative to the most frequent one to spot the rare tags
max_count = max(tag_counts.values())
for tag, cnt in tag_counts.most_common():
    print(f"{tag:15s}: {cnt}   ({cnt/max_count:.2%} of max)")



120 training‐set emails, 40 test emails
NACHNAME       : 156   (100.00% of max)
VORNAME        : 143   (91.67% of max)
DATUM          : 69   (44.23% of max)
STRASSE        : 58   (37.18% of max)
VERTRAGSNUMMER : 57   (36.54% of max)
WOHNORT        : 57   (36.54% of max)
HAUSNUMMER     : 56   (35.90% of max)
POSTLEITZAHL   : 55   (35.26% of max)
ZÄHLERNUMMER   : 34   (21.79% of max)
TELEFONNUMMER  : 25   (16.03% of max)
GESENDET_MIT   : 20   (12.82% of max)
FIRMA          : 17   (10.90% of max)
ZAHLUNG        : 17   (10.90% of max)
EMAIL          : 15   (9.62% of max)
TITEL          : 13   (8.33% of max)
LINK           : 10   (6.41% of max)
ZÄHLERSTAND    : 9   (5.77% of max)
FAX            : 3   (1.92% of max)
IBAN           : 3   (1.92% of max)
BANK           : 2   (1.28% of max)
BIC            : 1   (0.64% of max)


In [21]:
# Compute per-record variant counts.
# For each training email, look at which labels it contains and take the rarest one (i.e. the one with the lowest training-set frequency). Then set:
#     variants_for_email = ceil(max_count / freq_of_rarest_label)
# That way an email whose rarest label is BIC (freq = 1) gets ceil(156 / 1) = 156 variants, whereas one with only NACHNAME (freq = 156) gets ceil(156 / 156) = 1 variant. Mixed emails land in between.

from math import ceil

def n_variants_for(rec):
    """Return how many synthetic variants to generate for one record.

    The count is ``ceil(max_count / f)``, where ``f`` is the training-set
    frequency (from the module-level ``tag_counts``) of the rarest label
    in ``rec["labels"]``. A record without labels gets exactly one variant.
    """
    label_freqs = [tag_counts[entry["label"]] for entry in rec["labels"]]
    # The rarest label drives the oversampling factor.
    return ceil(max_count / min(label_freqs)) if label_freqs else 1

In [22]:
from math import ceil
from collections import Counter
import json
from pathlib import Path

# 1) Load the records and rebuild the training split and tag statistics.
#    The encoding is given explicitly: the data contains non-ASCII labels
#    (e.g. ZÄHLERNUMMER) and the platform default need not be UTF-8.
records     = json.loads(Path("data/original_with_spans.json").read_text(encoding="utf-8"))
test_ids    = {0,142,2,3,146,145,157,165,19,18,20,
            166,176,177,32,34,40,45,52,57,61,65,
            66,70,71,73,75,78,81,96,102,105,108,
            109,112,115,122,129,132,134}  # 40 test IDs
# Build the held-out file-name set once instead of once per record
test_files  = {f"{i}.txt" for i in test_ids}
train_recs  = [r for r in records if r["file"] not in test_files]
tag_counts  = Counter(lab["label"] for r in train_recs for lab in r["labels"])
max_count   = max(tag_counts.values())

def n_variants_for(rec):
    """Return the number of synthetic variants to generate for ``rec``.

    Uses ``ceil(max_count / rarest)``, where ``rarest`` is the lowest
    training-set frequency among the labels in ``rec["labels"]``.
    Records without any label get one variant.
    """
    # frequencies of all labels in this record
    freqs = [tag_counts[lab["label"]] for lab in rec["labels"]]
    if not freqs:
        return 1
    rarest = min(freqs)
    return ceil(max_count / rarest)

# 2) preview a few
for rec in train_recs[:5]:
    n = n_variants_for(rec)
    tags = {lab["label"] for lab in rec["labels"]}
    print(f"{rec['file']}: tags={tags}, variants={n}")


1.txt: tags={'VERTRAGSNUMMER', 'HAUSNUMMER', 'NACHNAME', 'WOHNORT', 'STRASSE', 'POSTLEITZAHL', 'VORNAME'}, variants=3
4.txt: tags={'VERTRAGSNUMMER', 'HAUSNUMMER', 'FAX', 'FIRMA', 'WOHNORT', 'STRASSE', 'TELEFONNUMMER', 'VORNAME', 'POSTLEITZAHL', 'NACHNAME', 'ZÄHLERNUMMER'}, variants=52
5.txt: tags={'VERTRAGSNUMMER', 'TELEFONNUMMER', 'VORNAME', 'NACHNAME', 'ZÄHLERNUMMER'}, variants=7
6.txt: tags={'VERTRAGSNUMMER', 'ZAHLUNG', 'WOHNORT', 'STRASSE', 'DATUM', 'VORNAME', 'POSTLEITZAHL', 'NACHNAME'}, variants=10
7.txt: tags={'VERTRAGSNUMMER', 'HAUSNUMMER', 'DATUM', 'STRASSE', 'VORNAME', 'NACHNAME', 'ZÄHLERNUMMER'}, variants=5


In [23]:
### FULL PARAPHRASING GOING ON HERE

# 0) (Re)install & imports
!pip install -q transformers sentencepiece tqdm

import re, time, json
from math import ceil
from pathlib import Path
from collections import Counter
from transformers import pipeline
from tqdm.auto import tqdm

# 1) Load both MT pipelines with deterministic beam search (5 beams, no sampling).
#    device=-1 runs them on the CPU, because there were not enough free GPU
#    credits left.
_translator_opts = dict(device=-1, do_sample=False, num_beams=5)
de_en = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en", **_translator_opts)
en_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de", **_translator_opts)

# 2) Back-translation (de -> en -> de) that keeps <<TAG>> placeholders intact
_TAG_RE  = re.compile(r"<<[^>]+>>")
# Tolerant mask pattern: also matches masks whose spacing or letter case was
# altered on the way through the translation models, e.g. "[tag 1]".
_MASK_RE = re.compile(r"\[\s*TAG\s*(\d+)\s*\]", re.IGNORECASE)

def backtranslate_preserve_tags(text: str) -> str:
    """Paraphrase German ``text`` by translating it to English and back.

    Every ``<<...>>`` placeholder is replaced by a neutral mask ``[TAGn]``
    before translation and put back afterwards, so the models never see the
    placeholder itself.

    Args:
        text: German input text, possibly containing ``<<TAG>>`` placeholders.

    Returns:
        The back-translated German text with the placeholders restored.
        Empty or whitespace-only input is returned unchanged.

    Note:
        Both translation calls use ``max_length=512, truncation=True``.
        A mask the models drop entirely cannot be restored.
    """
    if not text.strip():
        # Nothing to translate; avoid feeding empty input to the models.
        return text

    # One mask per distinct tag, numbered in order of first appearance.
    unique_tags = list(dict.fromkeys(_TAG_RE.findall(text)))
    mask_of     = {tag: f"[TAG{i}]" for i, tag in enumerate(unique_tags, 1)}
    masked      = _TAG_RE.sub(lambda m: mask_of[m.group(0)], text)

    # The models run locally, so no pause between the calls is needed.
    en = de_en(masked, max_length=512, truncation=True)[0]["translation_text"]
    de = en_de(en,     max_length=512, truncation=True)[0]["translation_text"]

    def _restore(match):
        # Map the mask number back to its tag; leave unknown numbers untouched.
        idx = int(match.group(1))
        return unique_tags[idx - 1] if 1 <= idx <= len(unique_tags) else match.group(0)

    return _MASK_RE.sub(_restore, de)

# 3) Read the original emails with their label spans and drop the held-out test files
with Path("data/original_with_spans.json").open(encoding="utf-8") as fh:
    ALL_RECS = json.load(fh)
TEST_IDS = {
    0, 142, 2, 3, 146, 145, 157, 165, 19, 18, 20,
    166, 176, 177, 32, 34, 40, 45, 52, 57, 61, 65,
    66, 70, 71, 73, 75, 78, 81, 96, 102, 105, 108,
    109, 112, 115, 122, 129, 132, 134,
}
TEST_FILES = {str(i) + ".txt" for i in TEST_IDS}

train_recs = list(filter(lambda r: r["file"] not in TEST_FILES, ALL_RECS))

# 4) Tag frequencies over the training records, and the largest of them
tag_counts = Counter()
for rec in train_recs:
    tag_counts.update(lab["label"] for lab in rec["labels"])
max_count = max(tag_counts.values())

# 5) Number of variants per record, driven by its rarest label
def n_variants_for(rec):
    """Return ``ceil(max_count / rarest)`` for ``rec``.

    ``rarest`` is the smallest ``tag_counts`` value among the labels listed
    in ``rec["labels"]``; a record with no labels yields 1.
    """
    rarest = min((tag_counts[lab["label"]] for lab in rec["labels"]), default=None)
    if rarest is None:
        return 1
    return ceil(max_count / rarest)

# 6) Generate & save
OUT = Path("data/paraphrased_with_labels.jsonl")
OUT.parent.mkdir(parents=True, exist_ok=True)   # create notebooks/data/ if needed

# Mode "w" truncates a file left over from a previous run. The handle is opened
# once for the whole loop instead of once per written line.
with OUT.open("w", encoding="utf-8") as f:
    for rec in tqdm(train_recs, desc="Generating synthetics"):
        n = n_variants_for(rec)
        # The pipelines decode deterministically (do_sample=False with beam
        # search), so every call on the same text returns the same paraphrase.
        # Translate once per record instead of once per variant.
        # NOTE(review): all n variants of a record are therefore identical
        # copies; distinct paraphrases would need a different decoding setup.
        para = backtranslate_preserve_tags(rec["text"])
        for i in range(1, n + 1):
            out = {"src": rec["file"], "variant": i, "text": para}
            f.write(json.dumps(out, ensure_ascii=False) + "\n")
        # Push finished records to disk so progress is visible during the long run.
        f.flush()

print(f"✓ Done: wrote synthetic variants to {OUT}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cpu


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cpu


Generating synthetics:   0%|          | 0/120 [00:00<?, ?it/s]

Your input_length: 504 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 504 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 504 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 504 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 504 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 504 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 504 is bigger than 0.9 * max_length: 512. You

✓ Done: wrote synthetic variants to data/paraphrased_with_labels.jsonl


In [24]:
from google.colab import files

# Download the file written by the generation step. The previous target,
# data/synthetic_train.jsonl, only contains the two dummy sanity-check records.
files.download("data/paraphrased_with_labels.jsonl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>