In [None]:
#!pip install torch transformers datasets evaluate sacrebleu
# -*- coding: utf-8 -*-
import logging

import evaluate
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# ─── 1) SETUP LOGGING ──────────────────────────────────────────────────────────
# Configure the root logger once: timestamp, severity, logger name, message.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(name)s -   %(message)s"
_LOG_DATE_FORMAT = "%Y/%m/%d %H:%M:%S"

logging.basicConfig(
    level=logging.INFO,
    format=_LOG_FORMAT,
    datefmt=_LOG_DATE_FORMAT,
)

# Logger used by every progress message in this notebook.
logger = logging.getLogger(__name__)

# ─── 2) LOAD & SPLIT EUROPARL EN–ES ────────────────────────────────────────────
# Load the corpus and, when it ships without a validation split, build
# train / validation / test splits that do not share any example.
logger.info("Loading Europarl English–Spanish dataset…")
raw = load_dataset("europarl_bilingual", "en-es")
if "validation" not in raw:
    logger.info("Creating a 10% validation split…")
    holdout = raw["train"].train_test_split(test_size=0.1, seed=42)
    train_part, val_part = holdout["train"], holdout["test"]

    if "test" in raw:
        test_part = raw["test"]
    else:
        # Carve the test set OUT of the remaining training data. Sampling it
        # from the training data while also keeping that data for training
        # would make every test example a training example, so the reported
        # test score would be measured on memorised sentences.
        carved = train_part.train_test_split(test_size=0.2, seed=42)
        train_part, test_part = carved["train"], carved["test"]

    raw = DatasetDict({
        "train": train_part,
        "validation": val_part,
        "test": test_part,
    })

# ─── 3) SUBSAMPLE FOR SPEED ──────────────────────────────────────────────────
# Cap every split that is tokenized and evaluated later. The test split is
# capped as well: left unbounded it would be tokenized in full and then run
# through generation-based evaluation example by example.
max_train, max_val = 30_000, 3_000
max_test = 3_000
raw["train"] = raw["train"].select(range(min(len(raw["train"]), max_train)))
raw["validation"] = raw["validation"].select(range(min(len(raw["validation"]), max_val)))
if "test" in raw:
    raw["test"] = raw["test"].select(range(min(len(raw["test"]), max_test)))

# ─── 4) TOKENIZER & MODEL ─────────────────────────────────────────────────────
# Hub identifier of the pretrained English→Spanish checkpoint; reused below
# when the model weights are loaded.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-es"

logger.info(f"Loading tokenizer and model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)

# ─── 5) PREPROCESS FUNCTION ──────────────────────────────────────────────────
# Maximum number of tokens kept per source / target sentence.
max_len = 128

def preprocess(batch):
    """Tokenize a batch of translation pairs for seq2seq fine-tuning.

    Args:
        batch: A batched dataset slice whose "translation" column holds dicts
            with an "en" (source) and an "es" (target) string.

    Returns:
        The tokenizer output for the English sentences, truncated to
        ``max_len`` tokens, with the token ids of the Spanish sentences
        stored under the "labels" key.

    Sequences are deliberately NOT padded here. Padding labels with the pad
    token id at this stage makes the loss count padding positions, and padding
    everything to ``max_len`` wastes compute. The data collator pads each
    batch to its longest member instead and fills label padding with -100,
    which the loss ignores.
    """
    sources = [pair["en"] for pair in batch["translation"]]
    targets = [pair["es"] for pair in batch["translation"]]
    # `text_target` tokenizes the targets in target-language mode and replaces
    # the deprecated `as_target_tokenizer()` context manager.
    return tokenizer(
        sources,
        text_target=targets,
        max_length=max_len,
        truncation=True,
    )

# ─── 6) FAST TOKENIZATION ─────────────────────────────────────────────────────
logger.info("Tokenizing dataset with fast mapping…")
tokenized = raw.map(
    preprocess,
    batched=True,
    batch_size=2000,
    num_proc=4,
    # Drop the raw text columns so only tokenizer outputs reach the collator.
    remove_columns=raw["train"].column_names,
    load_from_cache_file=True,
)

# ─── 7) DATA COLLATOR ─────────────────────────────────────────────────────────
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Dynamic per-batch padding; label positions added as padding receive the
# collator's default fill value (-100) so they are excluded from the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, padding="longest"
)

# ─── 8) METRICS ───────────────────────────────────────────────────────────────
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    """Compute corpus BLEU (scaled to 0-100) from generated ids and label ids.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of integer arrays as
            passed by the trainer when ``predict_with_generate`` is enabled.

    Returns:
        The dictionary produced by the BLEU metric, with its "bleu" entry
        multiplied by 100.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        # Some models return extra outputs next to the generated ids.
        preds = preds[0]

    # -100 marks ignored / padded positions and is not a valid vocabulary id,
    # so map it back to the pad token before decoding. Work on copies so the
    # arrays owned by the trainer are left untouched.
    pad_id = tokenizer.pad_token_id
    preds = preds.copy()
    preds[preds == -100] = pad_id
    labels = labels.copy()
    labels[labels == -100] = pad_id

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [text.strip() for text in decoded_preds]
    decoded_labels = [text.strip() for text in decoded_labels]

    # The metric expects a list of reference lists, one list per prediction.
    result = bleu.compute(
        predictions=decoded_preds,
        references=[[reference] for reference in decoded_labels],
    )
    result["bleu"] *= 100
    return result

# ─── 9) TRAINING ARGUMENTS ────────────────────────────────────────────────────
# The argument that schedules evaluation is called `evaluation_strategy` in
# older transformers releases and `eval_strategy` in newer ones; use whichever
# name the installed version defines.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in Seq2SeqTrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./checkpoints",
    # turn on training & evaluation
    do_train=True,
    do_eval=True,
    # run validation every N steps (adjust to match ~1 epoch);
    # without an explicit strategy the default is "no" and `eval_steps` is
    # ignored, so validation would never run during training
    **{_eval_strategy_arg: "steps"},
    eval_steps=500,
    # save a checkpoint every N steps
    save_steps=500,
    save_total_limit=3,

    # hardware / performance
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    dataloader_num_workers=4,
    # mixed precision needs a CUDA device; fall back to fp32 elsewhere
    # instead of failing when the arguments are constructed
    fp16=torch.cuda.is_available(),

    # optimizer & schedule
    learning_rate=3e-5,
    weight_decay=0.01,
    optim="adamw_torch",
    num_train_epochs=3,

    # logging
    logging_steps=100,

    # generation: decode with generate() during evaluation so BLEU can be
    # computed on real translations
    predict_with_generate=True,
    generation_max_length=max_len,
)

# ─── 10) TRAINER ─────────────────────────────────────────────────────────────
# Gather everything the trainer needs in one place, then build it.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    # data
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    # decoding + scoring of generated translations
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

# ─── 11) TRAINING ─────────────────────────────────────────────────────────────
# Fine-tune the model with the configuration assembled above.
logger.info("Starting training...")
trainer.train()

# ─── 12) EVALUATION ───────────────────────────────────────────────────────────
# Score the held-out test split; the resulting metrics dict is kept in
# `scores` for the plotting cell further down.
logger.info("Evaluating on test set...")
scores = trainer.evaluate(eval_dataset=tokenized["test"])
logger.info(f"Test results: {scores}")


In [3]:
# Report the key hyper-parameters the trainer is actually using
# (`trainer.args` is the same object as `training_args`).
args = trainer.args

summary_lines = [
    f"Learning rate                : {args.learning_rate}",
    f"Per-device train batch size  : {args.per_device_train_batch_size}",
    f"Weight decay                 : {args.weight_decay}",
    f"Warmup steps                 : {args.warmup_steps}",
    f"Num train epochs             : {args.num_train_epochs}",
]
for line in summary_lines:
    print(line)


Learning rate                : 3e-05
Per-device train batch size  : 16
Weight decay                 : 0.01
Warmup steps                 : 0
Num train epochs             : 3


In [None]:
import matplotlib.pyplot as plt

# 1) Bar chart of final test BLEU.
# Expects `scores` from: scores = trainer.evaluate(tokenized["test"])
# Prefer the trainer-prefixed key, then the bare metric name, else 0.
if "eval_bleu" in scores:
    test_bleu = scores["eval_bleu"]
else:
    test_bleu = scores.get("bleu", 0)

fig, ax = plt.subplots()
ax.bar(["Test BLEU"], [test_bleu])
ax.set_ylim(0, 100)
ax.set_ylabel("BLEU Score")
ax.set_title("Test Set BLEU")
plt.show()


# 2) Line plot of the validation BLEU logged during training:
history = trainer.state.log_history

# Evaluations run after training (such as the test-set evaluation) are logged
# under the same "eval_bleu" key, so they must not be plotted as validation
# points. The end-of-training summary record carries "train_runtime"; only
# records logged before it belong to the training run. If no such record is
# found, every evaluation record is used.
train_end = next(
    (idx for idx, record in enumerate(history) if "train_runtime" in record),
    len(history),
)
val_records = [record for record in history[:train_end] if "eval_bleu" in record]

# collect BLEU at each evaluation step, with the epoch it was logged at
# (fall back to a running index when a record carries no epoch)
val_bleus = [record["eval_bleu"] for record in val_records]
epochs = [
    record.get("epoch", position)
    for position, record in enumerate(val_records, start=1)
]

if val_bleus:
    plt.figure()
    plt.plot(epochs, val_bleus, marker="o")
    plt.xlabel("Epoch")
    plt.ylabel("BLEU")
    plt.title("Validation BLEU per Epoch")
    # Keep a non-degenerate y-range even if every score is 0.
    plt.ylim(0, max(val_bleus) * 1.1 or 1)
    plt.show()
else:
    # Nothing was evaluated during training, so there is no curve to draw.
    print("No validation BLEU was logged during training; nothing to plot.")
