# Translation Service

### Authors: Mitchell Mahnke, Carter Shavitz, Kaden Young

# Version 1: BERT (bad)

Our first pass at a translation service using BERT. This is a simple implementation that uses the BERT model to translate text from English to Spanish. The model is not very accurate and often produces nonsensical translations. Additionally, note that Optuna is used to tune the model's hyperparameters.

In [None]:
import logging

from datasets import load_dataset, DatasetDict
from transformers import (
    BertTokenizerFast,
    EncoderDecoderModel,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
import optuna

# ─── 1) SETUP LOGGING ──────────────────────────────────────────────────────────
# Record layout: timestamp, level, logger name, then the message itself.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(name)s -   %(message)s"
LOG_DATE_FORMAT = "%Y/%m/%d %H:%M:%S"
logging.basicConfig(level=logging.INFO, datefmt=LOG_DATE_FORMAT, format=LOG_FORMAT)

# Logger shared by every step of this cell.
logger = logging.getLogger(__name__)

# ─── 2) LOAD & SPLIT EUROPARL EN–ES ────────────────────────────────────────────
# Load the corpus and, when it ships without a validation split, derive
# train/validation/test splits that do not share rows.
logger.info("Loading Europarl English–Spanish dataset…")
raw = load_dataset("europarl_bilingual", "en-es")
if "validation" not in raw:
    logger.info("Creating a 10% validation split…")
    split = raw["train"].train_test_split(test_size=0.1, seed=42)
    train_ds, val_ds = split["train"], split["test"]
    if "test" in raw:
        test_ds = raw["test"]
    else:
        # Carve the test set out of the remaining training rows and keep only
        # the complement for training. Previously the test rows were sampled
        # from `split["train"]` while the full `split["train"]` was still used
        # for training, so every test example was also a training example.
        holdout = train_ds.train_test_split(test_size=0.2, seed=42)
        train_ds, test_ds = holdout["train"], holdout["test"]
    raw = DatasetDict({
        "train": train_ds,
        "validation": val_ds,
        "test": test_ds,
    })

# ─── 3) SUBSAMPLE FOR SPEED ──────────────────────────────────────────────────
# Cap train/validation at a fixed number of leading rows; a split that is
# already within its cap is left untouched.
max_train, max_val = 30_000, 3_000
for split_name, row_cap in (("train", max_train), ("validation", max_val)):
    if len(raw[split_name]) > row_cap:
        raw[split_name] = raw[split_name].select(range(row_cap))

# ─── 4) TOKENIZATION ──────────────────────────────────────────────────────────
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
max_len = 128

def preprocess(batch, idxs):
    """Tokenize a batch of EN→ES pairs into encoder inputs and decoder labels.

    Args:
        batch: Batched rows with a "translation" column; each entry is a dict
            holding the "en" and "es" strings.
        idxs: Dataset indices of the rows in ``batch``; used only for the
            progress log line.

    Returns:
        The encoding of the English text (padded/truncated to ``max_len``) with
        an added "labels" entry: the Spanish token ids, where every padding
        position is replaced by -100.
    """
    logger.info(f"Tokenizing examples {idxs[0]}–{idxs[-1]}…")
    inputs  = [t["en"] for t in batch["translation"]]
    targets = [t["es"] for t in batch["translation"]]
    enc = tokenizer(inputs, max_length=max_len, truncation=True, padding="max_length")
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    lbl = tokenizer(
        text_target=targets, max_length=max_len, truncation=True, padding="max_length"
    )
    # Labels are padded to a fixed length here, so the padding must be masked
    # explicitly: -100 is the ignore index of the cross-entropy loss. Without
    # this, the loss is dominated by trivially predictable [PAD] positions.
    enc["labels"] = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(lbl["input_ids"], lbl["attention_mask"])
    ]
    return enc

# Tokenize every split in large batches and drop the raw text columns so that
# only the columns produced by `preprocess` remain.
source_columns = raw["train"].column_names
tokenized = raw.map(
    preprocess,
    with_indices=True,   # `preprocess` logs the index range of each batch
    batched=True,
    batch_size=5000,
    remove_columns=source_columns,
)

# ─── 5) DATA COLLATOR ─────────────────────────────────────────────────────────
# Batches are padded to the longest sequence they contain.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None, padding="longest")

# ─── 6) MODEL INIT ────────────────────────────────────────────────────────────
def model_init():
    """Create a new BERT-to-BERT encoder–decoder model.

    Handed to the trainer as its model factory, so each call builds a fresh
    model from the multilingual BERT checkpoint with tied encoder/decoder
    weights, then applies the decoder flags, special-token ids and generation
    limits set below.

    Returns:
        The configured ``EncoderDecoderModel`` instance.
    """
    checkpoint = "bert-base-multilingual-cased"
    m = EncoderDecoderModel.from_encoder_decoder_pretrained(
        checkpoint, checkpoint, tie_encoder_decoder=True
    )
    cfg = m.config

    # Decoder side: mark it as a decoder that attends to the encoder output.
    cfg.decoder.is_decoder = True
    cfg.decoder.add_cross_attention = True

    # BERT has no dedicated BOS/EOS tokens, so [CLS] starts and [SEP] ends
    # the target sequence.
    cfg.decoder_start_token_id = tokenizer.cls_token_id
    cfg.eos_token_id = tokenizer.sep_token_id
    cfg.pad_token_id = tokenizer.pad_token_id

    # Generation limits.
    cfg.max_length = 128
    cfg.min_length = 10
    cfg.no_repeat_ngram_size = 3

    return m

# ─── 7) HYPERPARAMETER SPACE ─────────────────────────────────────────────────
def hp_space(trial: optuna.Trial):
    """Define the Optuna search space for the hyperparameter search.

    Uses ``suggest_float`` in place of the deprecated ``suggest_loguniform`` /
    ``suggest_uniform`` helpers; the sampled ranges are unchanged.

    Args:
        trial: The Optuna trial to draw hyperparameter values from.

    Returns:
        A dict mapping training-argument names to the values sampled for this
        trial.
    """
    return {
        # learning rate is sampled on a log scale
        "learning_rate":               trial.suggest_float("learning_rate", 1e-6, 5e-5, log=True),
        # smaller batch‐size choices to avoid OOM
        "per_device_train_batch_size": trial.suggest_categorical(
            "per_device_train_batch_size", [4, 8, 16]
        ),
        "weight_decay":                trial.suggest_float("weight_decay", 0.0, 0.3),
        "warmup_steps":                trial.suggest_int("warmup_steps", 0, 1000),
        "num_train_epochs":            trial.suggest_categorical("num_train_epochs", [2, 3, 4]),
    }

# ─── 8) TUNING ARGS ────────────────────────────────────────────────────────────
# Base arguments shared by every HPO trial; values returned by `hp_space`
# replace the corresponding fields per trial.
tuning_args = Seq2SeqTrainingArguments(
    output_dir="./hp_tuning",
    per_device_train_batch_size=8,      # default, overridden in hp_space
    per_device_eval_batch_size=8,
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument (and the one used later in this file).
    eval_strategy="steps",
    eval_steps=500,
    # Checkpoint on the same cadence as evaluation so the best model
    # (lowest eval_loss) can be reloaded at the end.
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Use HF’s Torch AdamW
    optim="adamw_torch",

    # Temporarily disable mixed precision until HPO+AMP bug is fixed
    fp16=False,
)

# ─── 9) TRAINER & HPO RUN ─────────────────────────────────────────────────────
# The trainer receives a model factory instead of a model instance.
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=tuning_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    compute_metrics=None,  # replace with your BLEU fn if desired
)

# Run 20 sequential Optuna trials that minimise the objective.
median_pruner = optuna.pruners.MedianPruner()
best = trainer.hyperparameter_search(
    hp_space=hp_space,
    backend="optuna",
    direction="minimize",
    n_trials=20,
    n_jobs=1,
    study_name="bert_translation_hp",
    pruner=median_pruner,
)

print("Best hyperparameters:", best.hyperparameters)


# Version 2: Encoder + Decoder transition

In this second attempt, a more sophisticated approach is taken. The model is now a sequence-to-sequence model that uses an encoder-decoder architecture. The encoder processes the input text and generates a contextual representation, which is then passed to the decoder to generate the output text. This approach allows for more accurate translations and better handling of long sentences. Here we fine-tune the pretrained `Helsinki-NLP/opus-mt-en-es` model on a subsample of the Europarl English-Spanish sentence pairs; the model uses attention mechanisms to focus on relevant parts of the input when generating the output.

In [None]:
import logging
import math
import matplotlib.pyplot as plt

from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
import evaluate


## Set up logging

In [None]:
# Record layout: timestamp, level, logger name, then the message itself.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(name)s -   %(message)s"
LOG_DATE_FORMAT = "%Y/%m/%d %H:%M:%S"
logging.basicConfig(level=logging.INFO, datefmt=LOG_DATE_FORMAT, format=LOG_FORMAT)

# Logger shared by the cells below.
logger = logging.getLogger(__name__)

## Load & split

In [None]:
# Load the corpus and, when it ships without a validation split, derive
# train/validation/test splits that do not share rows.
logger.info("Loading Europarl English-Spanish dataset…")
raw = load_dataset("europarl_bilingual", "en-es")
if "validation" not in raw:
    logger.info("Creating a 10% validation split…")
    split = raw["train"].train_test_split(test_size=0.1, seed=42)
    train_ds, val_ds = split["train"], split["test"]
    if "test" in raw:
        test_ds = raw["test"]
    else:
        # Carve the test set out of the remaining training rows and keep only
        # the complement for training. Previously the test rows were sampled
        # from `split["train"]` while the full `split["train"]` was still used
        # for training, so the reported test loss was measured on rows the
        # model could have been trained on.
        holdout = train_ds.train_test_split(test_size=0.2, seed=42)
        train_ds, test_ds = holdout["train"], holdout["test"]
    raw = DatasetDict({
        "train": train_ds,
        "validation": val_ds,
        "test": test_ds,
    })

## Subsample the data

In [None]:
frac = 0.01   # keep only 1% of each split
seed = 42
for split_name in ("train", "validation", "test"):
    # Shuffle with a fixed seed so the retained subset is reproducible.
    ds = raw[split_name].shuffle(seed=seed)
    # Keep at least one row from a non-empty split, but never ask for more
    # rows than the split holds (an empty split stays empty).
    n = min(len(ds), max(1, int(len(ds) * frac)))
    logger.info(
        f"Subsampling {n} examples ({frac*100:.2f}%) from '{split_name}' "
        f"({len(ds)} total)…"
    )
    raw[split_name] = ds.select(range(n))


## Tokenizer

In [None]:
# Checkpoint identifier used for the tokenizer in this cell.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-es"
# Only the tokenizer is instantiated in this cell, although the log line
# mentions the model as well.
logger.info(f"Loading tokenizer and model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Preprocess the data

In [None]:
max_len = 128
def preprocess(batch):
    """Tokenize a batch of EN→ES pairs into model inputs and labels.

    Args:
        batch: Batched rows with a "translation" column; each entry is a dict
            holding the "en" and "es" strings.

    Returns:
        The tokenizer encoding of the English text with a "labels" entry
        holding the Spanish token ids. Both sides are truncated and padded to
        ``max_len``.
    """
    inputs = [pair["en"] for pair in batch["translation"]]
    targets = [pair["es"] for pair in batch["translation"]]
    # `text_target=` tokenizes the targets in target-language mode and stores
    # their ids under "labels"; it replaces the deprecated
    # `as_target_tokenizer()` context manager with the same result.
    # NOTE(review): labels keep the tokenizer's pad ids (they are padded to
    # max_len here), so padded positions presumably count towards the loss.
    # Masking them to -100 would also require compute_metrics to map -100 back
    # to the pad id before decoding — confirm and change both together.
    encodings = tokenizer(
        inputs,
        text_target=targets,
        max_length=max_len,
        truncation=True,
        padding="max_length",
    )
    return encodings

## Tokenization

In [None]:
# Tokenize all splits with four worker processes, reusing cached results when
# available, and drop the raw text columns afterwards.
logger.info("Tokenizing dataset with fast mapping…")
source_columns = raw["train"].column_names
tokenized = raw.map(
    preprocess,
    remove_columns=source_columns,
    load_from_cache_file=True,
    num_proc=4,
    batched=True,
    batch_size=2000,
)

## Compute the dynamic evaluation interval

In [None]:
# Batch configuration reused by the training arguments.
train_bs, grad_accum, epochs = 16, 2, 3

# Rows consumed per optimizer step (per-device batch × accumulation steps).
effective_batch = train_bs * grad_accum
num_samples = len(tokenized["train"])
steps_per_epoch = math.ceil(num_samples / effective_batch)
total_steps = epochs * steps_per_epoch

# Aim for roughly 20 evaluations over the run, never less than one step apart.
eval_interval = max(1, total_steps // 20)
logger.info(f"Total training steps ≃ {total_steps}, will eval every {eval_interval} steps")


## Model & data collator

In [None]:
# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# The collator is given the model and pads each batch to its longest sequence.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest")

# Metrics

In [None]:
bleu = evaluate.load("bleu")
def compute_metrics(eval_pred):
    """Compute corpus BLEU for generated translations.

    Args:
        eval_pred: Pair ``(predictions, label_ids)`` of token-id arrays as
            passed in by the trainer; predictions may also arrive as a tuple
            whose first element holds the generated ids.

    Returns:
        The dict produced by the BLEU metric, with "bleu" rescaled to 0–100.
    """
    import numpy as np  # local import: numpy is not imported at file level

    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the padding/ignore sentinel for label ids (and can be used when
    # batches of different width are concatenated); it is not a valid token id,
    # so map it back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(np.asarray(preds) != -100, preds, pad_id)
    labels = np.where(np.asarray(labels) != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels],  # one reference each
    )
    result["bleu"] *= 100
    return result

## Training & args

In [None]:

# ─── 9) TRAINING ARGUMENTS (modern eval + slower LR + dynamic eval + initial eval) ─
training_args = Seq2SeqTrainingArguments(
    # run modes and checkpoint location
    output_dir="./checkpoints",
    do_train=True,
    do_eval=True,
    # optimisation: small learning rate, short warm-up, linear schedule
    optim="adamw_torch",
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_steps=50,
    weight_decay=0.01,
    num_train_epochs=epochs,
    # batch sizes and gradient accumulation
    per_device_train_batch_size=train_bs,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=grad_accum,
    # half-precision training
    fp16=True,
    # evaluate once before training, then every `eval_interval` steps
    eval_on_start=True,
    eval_strategy="steps",
    eval_steps=eval_interval,
    # log at the same interval as evaluation
    logging_strategy="steps",
    logging_steps=eval_interval,
    # keep at most three checkpoints, written every 500 steps
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    # evaluate on generated sequences of at most `max_len` tokens
    predict_with_generate=True,
    generation_max_length=max_len,
)

In [None]:
# Wire the model, data, collator and BLEU metric into a single trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized["validation"],
    train_dataset=tokenized["train"],
)

In [None]:
# Run the training loop configured by `training_args`.
logger.info("Starting training...")
trainer.train()

## Loss metrics from logs

In [None]:
# Split the trainer's log history into training-loss and evaluation-loss
# records. An entry with "eval_loss" is an evaluation record; an entry with
# "loss" and "step" (and no "eval_loss") is a training record.
log_history = trainer.state.log_history
train_logs, eval_logs = [], []
for entry in log_history:
    if "eval_loss" in entry:
        eval_logs.append(entry)
    elif "loss" in entry and "step" in entry:
        train_logs.append(entry)

# Parallel step/loss series for plotting.
train_steps = [rec["step"] for rec in train_logs]
train_losses = [rec["loss"] for rec in train_logs]
eval_steps = [rec["step"] for rec in eval_logs]
val_losses = [rec["eval_loss"] for rec in eval_logs]

# Evaluation

In [None]:
# Evaluate on the held-out test split and pull out its loss, preferring
# "eval_loss", then "loss", and defaulting to 0.0 when neither is reported.
test_scores = trainer.evaluate(tokenized["test"])
if "eval_loss" in test_scores:
    test_loss = test_scores["eval_loss"]
else:
    test_loss = test_scores.get("loss", 0.0)
logger.info(f"Test Loss: {test_loss:.4f}")


In [None]:
# Report every recorded validation loss together with its training step.
for step, loss in zip(eval_steps, val_losses):
    logger.info(f"Validation loss at step {step}: {loss:.4f}")


# Plots

In [None]:
# Draw train and validation loss against the step count, with the test loss
# as a dashed horizontal reference line, then save and display the figure.
fig, ax = plt.subplots()
ax.plot(train_steps, train_losses, marker="o", label="Train Loss")
ax.plot(eval_steps, val_losses, marker="o", label="Validation Loss")
ax.axhline(test_loss, linestyle="--", label=f"Test Loss ({test_loss:.4f})")
ax.set_title("Train, Validation, and Test Loss Over Steps")
ax.set_xlabel("Step")
ax.set_ylabel("Loss")
ax.legend()
ax.grid(True)
fig.savefig("all_loss_curves.png")
plt.show()
plt.close(fig)
logger.info("Saved all_loss_curves.png")