In [None]:
# FOR SAVING IT PROPERLY TO GITHUB
# install nbformat if you haven't already
!pip install -q nbformat

import nbformat
from nbformat import NO_CONVERT

NB_PATH = "02_paraphrasing.ipynb"  # adjust if your file name is different

# 1) load the notebook as stored on disk (as_version=NO_CONVERT)
nb = nbformat.read(NB_PATH, as_version=NO_CONVERT)

# 2) drop the notebook-level "widgets"/"state" entries, then the per-cell
#    "widgets" entry of every cell
for stale_key in ("widgets", "state"):
    nb.metadata.pop(stale_key, None)
for cell in nb.cells:
    cell.metadata.pop("widgets", None)

# 3) write the cleaned notebook back over the same file
with open(NB_PATH, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

print("✅ metadata.widgets stripped. Now save & push to GitHub!")

In [None]:
# Clone the project repository, then make its notebooks/ folder the working
# directory for the cells that follow.
!git clone https://github.com/AnnaGhost2713/daia-eon.git
%cd daia-eon/notebooks

In [None]:
#### PREVIEW OF TXT FILES (WHETHER IT WORKS) ####

# 0) install & imports
!pip install -q transformers sentencepiece tqdm

import re, time, random
from math        import ceil
from pathlib     import Path
from random      import seed
from collections import Counter
from tqdm.auto   import tqdm
from transformers import pipeline, set_seed

# 1) CONFIG + read every .txt file into a record with placeholder spans
DATA_DIR = Path("../../data/original/golden_dataset_anonymized_granular")
all_txt = sorted(DATA_DIR.glob("*.txt"))


def _to_record(path):
    """Read one file and locate every <<LABEL>> placeholder inside it."""
    body = path.read_text("utf-8")
    spans = []
    for hit in re.finditer(r"<<([^>]+)>>", body):
        spans.append({"start": hit.start(), "end": hit.end(), "label": hit.group(1)})
    return {"file": path.name, "text": body, "labels": spans}


records = list(map(_to_record, all_txt))

# 2) train/test split (same IDs as before): files named after a test ID
#    are excluded from the training records
TEST_IDS = {
    0, 2, 3, 18, 19, 20, 32, 34, 40, 45, 52, 57, 61, 65, 66, 70, 71, 73, 75, 78,
    81, 96, 102, 105, 108, 109, 112, 115, 122, 129, 132, 134, 142, 145, 146,
    157, 165, 166, 176, 177,
}
TEST_FILES = {f"{i}.txt" for i in TEST_IDS}
train_recs = list(filter(lambda item: item["file"] not in TEST_FILES, records))

# 3) fix the RNG seed and draw a tiny preview sample
seed(1)
preview = random.sample(train_recs, 5)
print("Previewing:", [item["file"] for item in preview])

# 4) label frequencies within the preview sample drive the variant counts
tag_counts = Counter(
    span["label"]
    for item in preview
    for span in item["labels"]
)
max_cnt = max(tag_counts.values(), default=1)
def n_variants_for(r):
    """Return how many paraphrase variants to generate for record ``r``.

    The count is ``max_cnt`` divided by the frequency of the record's rarest
    label (looked up in ``tag_counts``, unknown labels count as 1), rounded
    up. A record without labels gets exactly one variant.
    """
    if not r["labels"]:
        return 1
    rarest = min(tag_counts.get(span["label"], 1) for span in r["labels"])
    return ceil(max_cnt / max(rarest, 1))

# 5) instantiate the de↔en back-translators plus the pivot-language pairs
# (steps 0–4 as above)

# much more permissive sampling, shared by every pipeline
kw = {
    "device": -1,
    "do_sample": True,
    "top_k": 300,
    "top_p": 0.95,
    "temperature": 1.5,
}


def _translator(task, checkpoint):
    """Build one transformers pipeline using the shared ``kw`` settings."""
    return pipeline(task, model=checkpoint, **kw)


de_en = _translator("translation_de_to_en", "Helsinki-NLP/opus-mt-de-en")
en_de = _translator("translation_en_to_de", "Helsinki-NLP/opus-mt-en-de")
en_fr = _translator("translation_en_to_fr", "Helsinki-NLP/opus-mt-en-fr")
fr_en = _translator("translation_fr_to_en", "Helsinki-NLP/opus-mt-fr-en")
en_es = _translator("translation_en_to_es", "Helsinki-NLP/opus-mt-en-es")
es_en = _translator("translation_es_to_en", "Helsinki-NLP/opus-mt-es-en")
en_it = _translator("translation_en_to_it", "Helsinki-NLP/opus-mt-en-it")
it_en = _translator("translation_it_to_en", "Helsinki-NLP/opus-mt-it-en")

# pivot language code → (English→pivot pipeline, pivot→English pipeline)
pivot_pipes = dict(
    fr=(en_fr, fr_en),
    es=(en_es, es_en),
    it=(en_it, it_en),
)

def bt_super_diverse(text: str, want: int) -> list[str]:
    """Back-translate ``text`` (DE→EN→DE) into ``want`` paraphrase variants.

    ``<<...>>`` placeholders are replaced by ``[TAGn]`` markers before
    translation and restored afterwards. About 30 % of the English
    candidates take an extra round trip through a randomly chosen pivot
    language, a further 20 % through French.

    Args:
        text: Source text, possibly containing ``<<LABEL>>`` placeholders.
        want: Number of variants; passed to the DE→EN pipeline as
            ``num_return_sequences``.

    Returns:
        One string per candidate returned by the DE→EN pipeline.

    NOTE(review): a ``[TAGn]`` marker that a model alters or drops is not
    restored; such a variant is returned unchanged and should be checked
    downstream.
    """
    def translate(pipe, src: str) -> str:
        # Single-sequence call shared by every hop after the first one.
        return pipe(src, max_length=512, truncation=True)[0]["translation_text"]

    # 1) mask placeholders
    tags = re.findall(r"(<<[^>]+>>)", text)
    masked = text
    for i, t in enumerate(tags, 1):
        masked = masked.replace(t, f"[TAG{i}]")

    # 2) German → English, `want` candidates out of twice as many beams
    en_beams = de_en(
        masked,
        max_length=512, truncation=True,
        num_beams=want * 2,
        num_return_sequences=want,
        early_stopping=True,
    )

    # The pipelines run locally, so no throttling sleeps are needed
    # between calls.
    out_variants = []
    for beam in en_beams:
        en = beam["translation_text"]

        # 3) optional detour through a pivot language
        hop = random.random()
        if hop < 0.3:
            e2p, p2e = random.choice(list(pivot_pipes.values()))
            en = translate(p2e, translate(e2p, en))
        elif hop < 0.5:
            # fixed EN→FR→EN round trip
            en_to_fr, fr_to_en = pivot_pipes["fr"]
            en = translate(fr_to_en, translate(en_to_fr, en))

        # 4) English → German
        de = translate(en_de, en)

        # 5) unmask
        for i, t in enumerate(tags, 1):
            de = de.replace(f"[TAG{i}]", t)
        out_variants.append(de)

    return out_variants

# 6) Preview: print every generated variant for each sampled file
for rec in tqdm(preview, desc="Super-Diverse Preview"):
    want = n_variants_for(rec)
    print(f"\n→ {rec['file']} (need {want} variants)")
    generated = bt_super_diverse(rec["text"], want)
    for v in generated:
        print("  ", v)

In [None]:
#### PREVIEW OF JSON FILE (WHETHER IT WORKS) ####

# 0) install & imports
!pip install -q transformers sentencepiece tqdm

import re, time, random
from math        import ceil
from pathlib     import Path
from random      import seed
from collections import Counter
from tqdm.auto   import tqdm
from transformers import pipeline, set_seed
import json

# 1) CONFIG + read every .txt file into a record with placeholder spans
DATA_DIR = Path("../../data/original/golden_dataset_anonymized_granular")
all_txt = sorted(DATA_DIR.glob("*.txt"))


def _to_record(path):
    """Read one file and locate every <<LABEL>> placeholder inside it."""
    body = path.read_text("utf-8")
    spans = []
    for hit in re.finditer(r"<<([^>]+)>>", body):
        spans.append({"start": hit.start(), "end": hit.end(), "label": hit.group(1)})
    return {"file": path.name, "text": body, "labels": spans}


records = list(map(_to_record, all_txt))

# 2) train/test split (same IDs as before): files named after a test ID
#    are excluded from the training records
TEST_IDS = {
    0, 2, 3, 18, 19, 20, 32, 34, 40, 45, 52, 57, 61, 65, 66, 70, 71, 73, 75, 78,
    81, 96, 102, 105, 108, 109, 112, 115, 122, 129, 132, 134, 142, 145, 146,
    157, 165, 166, 176, 177,
}
TEST_FILES = {f"{i}.txt" for i in TEST_IDS}
train_recs = list(filter(lambda item: item["file"] not in TEST_FILES, records))

# 3) fix the RNG seed and draw a tiny preview sample
seed(1)
preview = random.sample(train_recs, 2)
print("Previewing:", [item["file"] for item in preview])

# 4) label frequencies within the preview sample drive the variant counts
tag_counts = Counter(
    span["label"]
    for item in preview
    for span in item["labels"]
)
max_cnt = max(tag_counts.values(), default=1)
def n_variants_for(r):
    """Return how many paraphrase variants to generate for record ``r``.

    The count is ``max_cnt`` divided by the frequency of the record's rarest
    label (looked up in ``tag_counts``, unknown labels count as 1), rounded
    up. A record without labels gets exactly one variant.
    """
    if not r["labels"]:
        return 1
    rarest = min(tag_counts.get(span["label"], 1) for span in r["labels"])
    return ceil(max_cnt / max(rarest, 1))

# 5) instantiate the de↔en back-translators plus the pivot-language pairs
# (steps 0–4 as above)

# much more permissive sampling, shared by every pipeline
kw = {
    "device": -1,
    "do_sample": True,
    "top_k": 300,
    "top_p": 0.95,
    "temperature": 1.5,
}


def _translator(task, checkpoint):
    """Build one transformers pipeline using the shared ``kw`` settings."""
    return pipeline(task, model=checkpoint, **kw)


de_en = _translator("translation_de_to_en", "Helsinki-NLP/opus-mt-de-en")
en_de = _translator("translation_en_to_de", "Helsinki-NLP/opus-mt-en-de")
en_fr = _translator("translation_en_to_fr", "Helsinki-NLP/opus-mt-en-fr")
fr_en = _translator("translation_fr_to_en", "Helsinki-NLP/opus-mt-fr-en")
en_es = _translator("translation_en_to_es", "Helsinki-NLP/opus-mt-en-es")
es_en = _translator("translation_es_to_en", "Helsinki-NLP/opus-mt-es-en")
en_it = _translator("translation_en_to_it", "Helsinki-NLP/opus-mt-en-it")
it_en = _translator("translation_it_to_en", "Helsinki-NLP/opus-mt-it-en")

# pivot language code → (English→pivot pipeline, pivot→English pipeline)
pivot_pipes = dict(
    fr=(en_fr, fr_en),
    es=(en_es, es_en),
    it=(en_it, it_en),
)

def bt_super_diverse(text: str, want: int) -> list[str]:
    """Back-translate ``text`` (DE→EN→DE) into ``want`` paraphrase variants.

    ``<<...>>`` placeholders are replaced by ``[TAGn]`` markers before
    translation and restored afterwards. About 30 % of the English
    candidates take an extra round trip through a randomly chosen pivot
    language, a further 20 % through French.

    Args:
        text: Source text, possibly containing ``<<LABEL>>`` placeholders.
        want: Number of variants; passed to the DE→EN pipeline as
            ``num_return_sequences``.

    Returns:
        One string per candidate returned by the DE→EN pipeline.

    NOTE(review): a ``[TAGn]`` marker that a model alters or drops is not
    restored; such a variant is returned unchanged and should be checked
    downstream.
    """
    def translate(pipe, src: str) -> str:
        # Single-sequence call shared by every hop after the first one.
        return pipe(src, max_length=512, truncation=True)[0]["translation_text"]

    # 1) mask placeholders
    tags = re.findall(r"(<<[^>]+>>)", text)
    masked = text
    for i, t in enumerate(tags, 1):
        masked = masked.replace(t, f"[TAG{i}]")

    # 2) German → English, `want` candidates out of twice as many beams
    en_beams = de_en(
        masked,
        max_length=512, truncation=True,
        num_beams=want * 2,
        num_return_sequences=want,
        early_stopping=True,
    )

    # The pipelines run locally, so no throttling sleeps are needed
    # between calls.
    out_variants = []
    for beam in en_beams:
        en = beam["translation_text"]

        # 3) optional detour through a pivot language
        hop = random.random()
        if hop < 0.3:
            e2p, p2e = random.choice(list(pivot_pipes.values()))
            en = translate(p2e, translate(e2p, en))
        elif hop < 0.5:
            # fixed EN→FR→EN round trip
            en_to_fr, fr_to_en = pivot_pipes["fr"]
            en = translate(fr_to_en, translate(en_to_fr, en))

        # 4) English → German
        de = translate(en_de, en)

        # 5) unmask
        for i, t in enumerate(tags, 1):
            de = de.replace(f"[TAG{i}]", t)
        out_variants.append(de)

    return out_variants


# 6) build & write preview JSON
OUT_FILE = Path("data/preview_paraphrases.json")
# Create the output folder up front: without it the write below raises
# FileNotFoundError, and only after all variants have been generated.
OUT_FILE.parent.mkdir(exist_ok=True, parents=True)
results = []

for rec in tqdm(preview, desc="Building preview JSON"):
    want     = n_variants_for(rec)
    variants = bt_super_diverse(rec["text"], want)
    results.append({
        "file":       rec["file"],
        "n_variants": want,
        "variants":   variants
    })

# ensure_ascii=False keeps non-ASCII characters readable in the file
with OUT_FILE.open("w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"✓ Wrote preview to {OUT_FILE}")

In [None]:
# Colab only: hand the preview JSON written by the previous cell to the
# Colab file-download helper.
from google.colab import files
files.download('data/preview_paraphrases.json')

In [None]:
# 0) install & imports
!pip install -q transformers sentencepiece tqdm

import re, time, random, json
from math          import ceil
from pathlib       import Path
from random        import seed
from collections   import Counter
from tqdm.auto     import tqdm
from transformers  import pipeline, set_seed

# 1) CONFIG + read every .txt file into a record with placeholder spans
DATA_DIR = Path("../../data/original/golden_dataset_anonymized_granular")
all_txt = sorted(DATA_DIR.glob("*.txt"))


def _to_record(path):
    """Read one file and locate every <<LABEL>> placeholder inside it."""
    body = path.read_text("utf-8")
    spans = []
    for hit in re.finditer(r"<<([^>]+)>>", body):
        spans.append({"start": hit.start(), "end": hit.end(), "label": hit.group(1)})
    return {"file": path.name, "text": body, "labels": spans}


records = list(map(_to_record, all_txt))

# 2) train/test split (same IDs as before): files named after a test ID
#    are excluded from the training records
TEST_IDS = {
    0, 2, 3, 18, 19, 20, 32, 34, 40, 45, 52, 57, 61, 65, 66, 70, 71, 73, 75, 78,
    81, 96, 102, 105, 108, 109, 112, 115, 122, 129, 132, 134, 142, 145, 146,
    157, 165, 166, 176, 177,
}
TEST_FILES = {f"{i}.txt" for i in TEST_IDS}
train_recs = list(filter(lambda item: item["file"] not in TEST_FILES, records))

# 3) label frequencies across all training records
tag_counts = Counter(
    span["label"]
    for item in train_recs
    for span in item["labels"]
)
max_cnt = max(tag_counts.values(), default=1)
def n_variants_for(r):
    """Return how many paraphrase variants to generate for record ``r``.

    The count is ``max_cnt`` divided by the frequency of the record's rarest
    label (looked up in ``tag_counts``, unknown labels count as 1), rounded
    up. A record without labels gets exactly one variant.
    """
    if not r["labels"]:
        return 1
    rarest = min(tag_counts.get(span["label"], 1) for span in r["labels"])
    return ceil(max_cnt / max(rarest, 1))

# 4) instantiate pipelines on GPU (device=0) with shared sampling settings
kw = {
    "device": 0,
    "do_sample": True,
    "top_k": 300,
    "top_p": 0.95,
    "temperature": 1.5,
}


def _translator(task, checkpoint):
    """Build one transformers pipeline using the shared ``kw`` settings."""
    return pipeline(task, model=checkpoint, **kw)


de_en = _translator("translation_de_to_en", "Helsinki-NLP/opus-mt-de-en")
en_de = _translator("translation_en_to_de", "Helsinki-NLP/opus-mt-en-de")
en_fr = _translator("translation_en_to_fr", "Helsinki-NLP/opus-mt-en-fr")
fr_en = _translator("translation_fr_to_en", "Helsinki-NLP/opus-mt-fr-en")
en_es = _translator("translation_en_to_es", "Helsinki-NLP/opus-mt-en-es")
es_en = _translator("translation_es_to_en", "Helsinki-NLP/opus-mt-es-en")
en_it = _translator("translation_en_to_it", "Helsinki-NLP/opus-mt-en-it")
it_en = _translator("translation_it_to_en", "Helsinki-NLP/opus-mt-it-en")

# pivot language code → (English→pivot pipeline, pivot→English pipeline)
pivot_pipes = dict(
    fr=(en_fr, fr_en),
    es=(en_es, es_en),
    it=(en_it, it_en),
)

# 5) super‐diverse back‐translator
def bt_super_diverse(text: str, want: int) -> list[str]:
    """Back-translate ``text`` (DE→EN→DE) into ``want`` paraphrase variants.

    ``<<...>>`` placeholders are replaced by ``[TAGn]`` markers before
    translation and restored afterwards. About 30 % of the English
    candidates take an extra round trip through a randomly chosen pivot
    language, a further 20 % through French.

    Args:
        text: Source text, possibly containing ``<<LABEL>>`` placeholders.
        want: Number of variants; passed to the DE→EN pipeline as
            ``num_return_sequences``.

    Returns:
        One string per candidate returned by the DE→EN pipeline.

    NOTE(review): a ``[TAGn]`` marker that a model alters or drops is not
    restored; such a variant is returned unchanged and should be checked
    downstream.
    """
    def translate(pipe, src: str) -> str:
        # Single-sequence call shared by every hop after the first one.
        return pipe(src, max_length=512, truncation=True)[0]["translation_text"]

    # mask placeholders
    tags = re.findall(r"(<<[^>]+>>)", text)
    masked = text
    for i, t in enumerate(tags, 1):
        masked = masked.replace(t, f"[TAG{i}]")

    # German → English: at least 20 beams, more when `want` exceeds that
    en_beams = de_en(
        masked,
        max_length=512,
        truncation=True,
        num_beams=max(want, 20),
        num_return_sequences=want,
        early_stopping=True,
    )

    # The pipelines run locally, so no throttling sleeps are needed
    # between calls.
    out_variants = []
    for beam in en_beams:
        en = beam["translation_text"]

        # optional detour through a pivot language
        hop = random.random()
        if hop < 0.3:
            e2p, p2e = random.choice(list(pivot_pipes.values()))
            en = translate(p2e, translate(e2p, en))
        elif hop < 0.5:
            # fixed EN→FR→EN round trip
            en_to_fr, fr_to_en = pivot_pipes["fr"]
            en = translate(fr_to_en, translate(en_to_fr, en))

        # English → German
        de = translate(en_de, en)

        # unmask
        for i, t in enumerate(tags, 1):
            de = de.replace(f"[TAG{i}]", t)
        out_variants.append(de)

    return out_variants

# 6) Only sample 3 records for now (seeded)
seed(2)
sample_recs = random.sample(train_recs, k=3)

# 7) build & write preview JSON
OUT_FILE = Path("data/option_a_paraphrases_preview.json")
results = []

for rec in tqdm(sample_recs, desc="Building preview JSON"):
    want = n_variants_for(rec)
    variants = bt_super_diverse(rec["text"], want)
    entry_out = {"file": rec["file"], "n_variants": want, "variants": variants}
    results.append(entry_out)

OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
OUT_FILE.write_text(
    json.dumps(results, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

print(f"✓ Wrote preview to {OUT_FILE}")

# 8) Preview output: read the file back and list each variant
data = json.loads(OUT_FILE.read_text(encoding="utf-8"))

for entry in data:
    print(f"\n📂 File: {entry['file']} (generated {entry['n_variants']} variants)")
    i = 0
    for variant in entry['variants']:
        i += 1
        print(f"  Variant {i}: {variant}")

In [None]:
### CODE FOR THE TOTAL OF 120 TRAINING MAILS ###
# 0) install & imports
!pip install -q transformers sentencepiece tqdm

import re, time, random, json
from math          import ceil
from pathlib       import Path
from random        import seed
from collections   import Counter
from tqdm.auto     import tqdm
from transformers  import pipeline, set_seed

# 1) CONFIG + read every .txt file into a record with placeholder spans
DATA_DIR = Path("../../data/original/golden_dataset_anonymized_granular")
all_txt = sorted(DATA_DIR.glob("*.txt"))


def _to_record(path):
    """Read one file and locate every <<LABEL>> placeholder inside it."""
    body = path.read_text("utf-8")
    spans = []
    for hit in re.finditer(r"<<([^>]+)>>", body):
        spans.append({"start": hit.start(), "end": hit.end(), "label": hit.group(1)})
    return {"file": path.name, "text": body, "labels": spans}


records = list(map(_to_record, all_txt))

# 2) train/test split (same IDs as before): files named after a test ID
#    are excluded from the training records
TEST_IDS = {
    0, 2, 3, 18, 19, 20, 32, 34, 40, 45, 52, 57, 61, 65, 66, 70, 71, 73, 75, 78,
    81, 96, 102, 105, 108, 109, 112, 115, 122, 129, 132, 134, 142, 145, 146,
    157, 165, 166, 176, 177,
}
TEST_FILES = {f"{i}.txt" for i in TEST_IDS}
train_recs = list(filter(lambda item: item["file"] not in TEST_FILES, records))

# 3) label frequencies on the full train set (not just a preview sample)
tag_counts = Counter(
    span["label"]
    for item in train_recs
    for span in item["labels"]
)
max_cnt = max(tag_counts.values(), default=1)
def n_variants_for(r):
    """Return how many paraphrase variants to generate for record ``r``.

    The count is ``max_cnt`` divided by the frequency of the record's rarest
    label (looked up in ``tag_counts``, unknown labels count as 1), rounded
    up. A record without labels gets exactly one variant.
    """
    if not r["labels"]:
        return 1
    rarest = min(tag_counts.get(span["label"], 1) for span in r["labels"])
    return ceil(max_cnt / max(rarest, 1))

# 4) instantiate all pipelines on GPU (device=0) with shared sampling settings
kw = {
    "device": 0,
    "do_sample": True,
    "top_k": 300,
    "top_p": 0.95,
    "temperature": 1.5,
}


def _translator(task, checkpoint):
    """Build one transformers pipeline using the shared ``kw`` settings."""
    return pipeline(task, model=checkpoint, **kw)


de_en = _translator("translation_de_to_en", "Helsinki-NLP/opus-mt-de-en")
en_de = _translator("translation_en_to_de", "Helsinki-NLP/opus-mt-en-de")
en_fr = _translator("translation_en_to_fr", "Helsinki-NLP/opus-mt-en-fr")
fr_en = _translator("translation_fr_to_en", "Helsinki-NLP/opus-mt-fr-en")
en_es = _translator("translation_en_to_es", "Helsinki-NLP/opus-mt-en-es")
es_en = _translator("translation_es_to_en", "Helsinki-NLP/opus-mt-es-en")
en_it = _translator("translation_en_to_it", "Helsinki-NLP/opus-mt-en-it")
it_en = _translator("translation_it_to_en", "Helsinki-NLP/opus-mt-it-en")

# pivot language code → (English→pivot pipeline, pivot→English pipeline)
pivot_pipes = dict(
    fr=(en_fr, fr_en),
    es=(en_es, es_en),
    it=(en_it, it_en),
)

# 5) super‐diverse back‐translator (same as before)
def bt_super_diverse(text: str, want: int) -> list[str]:
    """Back-translate ``text`` (DE→EN→DE) into ``want`` paraphrase variants.

    ``<<...>>`` placeholders are replaced by ``[TAGn]`` markers before
    translation and restored afterwards. About 30 % of the English
    candidates take an extra round trip through a randomly chosen pivot
    language, a further 20 % through French.

    Args:
        text: Source text, possibly containing ``<<LABEL>>`` placeholders.
        want: Number of variants; passed to the DE→EN pipeline as
            ``num_return_sequences``.

    Returns:
        One string per candidate returned by the DE→EN pipeline.

    NOTE(review): a ``[TAGn]`` marker that a model alters or drops is not
    restored; such a variant is returned unchanged and should be checked
    downstream.
    """
    def translate(pipe, src: str) -> str:
        # Single-sequence call shared by every hop after the first one.
        return pipe(src, max_length=512, truncation=True)[0]["translation_text"]

    # mask placeholders
    tags = re.findall(r"(<<[^>]+>>)", text)
    masked = text
    for i, t in enumerate(tags, 1):
        masked = masked.replace(t, f"[TAG{i}]")

    num_beams = want + 1  # ensure enough beams to satisfy num_return_sequences
    en_beams = de_en(
        masked,
        max_length=512,
        truncation=True,
        num_beams=num_beams,
        num_return_sequences=want,
        early_stopping=True,
    )

    # The pipelines run locally, so no throttling sleeps are needed
    # between calls.
    out_variants = []
    for beam in en_beams:
        en = beam["translation_text"]

        # optional detour through a pivot language
        hop = random.random()
        if hop < 0.3:
            e2p, p2e = random.choice(list(pivot_pipes.values()))
            en = translate(p2e, translate(e2p, en))
        elif hop < 0.5:
            # fixed EN→FR→EN round trip
            en_to_fr, fr_to_en = pivot_pipes["fr"]
            en = translate(fr_to_en, translate(en_to_fr, en))

        # English → German
        de = translate(en_de, en)

        # unmask
        for i, t in enumerate(tags, 1):
            de = de.replace(f"[TAG{i}]", t)
        out_variants.append(de)

    return out_variants

# 6) generate variants for every training record and write them as one JSON
OUT_FILE = Path("../../data/synthetic/option_a_paraphrases.json")
results = []

for rec in tqdm(train_recs, desc="Building full JSON"):
    want = n_variants_for(rec)
    variants = bt_super_diverse(rec["text"], want)
    entry_out = {"file": rec["file"], "n_variants": want, "variants": variants}
    results.append(entry_out)

# create the target folder if needed, then dump everything in one go
OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
OUT_FILE.write_text(
    json.dumps(results, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

print(f"✓ Wrote all  paraphases to {OUT_FILE}")


In [None]:
# Colab only: download the full paraphrase file. The path must match where
# the generation cell wrote it (../../data/synthetic/), not a local data/ folder.
from google.colab import files
files.download("../../data/synthetic/option_a_paraphrases.json")