## 1. Install & Imports

In [5]:
!pip install -q transformers datasets sentencepiece tf-keras

# %%
import logging
import os

from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    EncoderDecoderModel,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch

# Configure logging so we can see progress in the notebook output
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%Y/%m/%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


## 2. Load the Europarl English–Spanish Dataset

In [10]:
logger.info("Loading Europarl English–Spanish dataset…")
# The `europarl_bilingual` builder provides configs like "en-de", "en-fr", "en-es", etc.
raw_datasets = load_dataset("europarl_bilingual", "en-es")

# Inspect which splits are available
available_splits = list(raw_datasets.keys())
logger.info(f"Available splits: {available_splits}")

from datasets import DatasetDict

if "validation" not in raw_datasets:
    logger.info("No validation split found — creating one from the training set (10%)…")
    # Split off 10% of train as validation
    split = raw_datasets["train"].train_test_split(test_size=0.1, seed=42)
    # Rename the 'test' portion of that split to 'validation'
    split["validation"] = split.pop("test")
    # Reassemble into a new DatasetDict, preserving the original 'test' if present
    new_datasets = {"train": split["train"], "validation": split["validation"]}
    if "test" in raw_datasets:
        new_datasets["test"] = raw_datasets["test"]
    else:
        new_datasets["test"] = raw_datasets["train"].train_test_split(test_size=0.2, seed=42).pop('test')
    raw_datasets = DatasetDict(new_datasets)
    logger.info(
        f"Post‑split sizes -> train: {len(raw_datasets['train'])}, "
        f"validation: {len(raw_datasets['validation'])}"
        + (f", test: {len(raw_datasets['test'])}" if "test" in raw_datasets else "")
    )
else:
    logger.info(
        f"Train size: {len(raw_datasets['train'])}, "
        f"Validation size: {len(raw_datasets['validation'])}"
    )


2025/04/23 17:04:40 - INFO - __main__ -   Loading Europarl English–Spanish dataset…
2025/04/23 17:04:42 - INFO - __main__ -   Available splits: ['train']
2025/04/23 17:04:42 - INFO - __main__ -   No validation split found — creating one from the training set (10%)…
2025/04/23 17:04:42 - INFO - __main__ -   Post‑split sizes -> train: 1808165, validation: 200908, test: 401815


## 3. Initialize Tokenizer & Model

In [11]:
logger.info("Loading multilingual BERT tokenizer and initializing Encoder–Decoder model…")
# We use the multilingual cased BERT so it has vocab for both English & German
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")

# Build an encoder-decoder from two copies of mBERT
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-multilingual-cased", "bert-base-multilingual-cased"
)

# Tie encoder & decoder embeddings (recommended)
model.config.tie_encoder_decoder = True

# Set generation parameters
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# (Optional) limit length of generated sequences
model.config.max_length = 128
model.config.min_length = 10
model.config.no_repeat_ngram_size = 3


2025/04/23 17:04:50 - INFO - __main__ -   Loading multilingual BERT tokenizer and initializing Encoder–Decoder model…
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.den

In [12]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1808165
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 200908
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 401815
    })
})

## 4. Preprocess (Tokenize) the Data (limit to first 500 000 examples)

In [13]:
# -- 4.1: Optionally trim each split to at most 500 000 examples
max_train_samples = 30_000
if len(raw_datasets["train"]) > max_train_samples:
    logger.info(f"Selecting first {max_train_samples} examples from the TRAIN split…")
    raw_datasets["train"] = raw_datasets["train"].select(range(max_train_samples))
    
# If you also want to cap validation, uncomment and adjust:
max_val_samples = 3_000
if len(raw_datasets["validation"]) > max_val_samples:
    logger.info(f"Selecting first {max_val_samples} examples from the VALIDATION split…")
    raw_datasets["validation"] = raw_datasets["validation"].select(range(max_val_samples))

logger.info(
    f"Post‑trim sizes -> train: {len(raw_datasets['train'])}, "
    f"validation: {len(raw_datasets['validation'])}"
)

# -- 4.2: Tokenization with per‑batch logging
max_input_length  = 128
max_target_length = 128
batch_size        = 5000  # adjust up/down to suit your RAM/GPU

def preprocess_function(examples, idx):
    # idx is list of the raw indices for this batch
    start_idx, end_idx = idx[0], idx[-1]
    logger.info(f"Tokenizing examples {start_idx}–{end_idx}…")
    
    # extract English <> Spanish texts
    translations = examples["translation"]
    inputs  = [t["en"] for t in translations]
    targets = [t["es"] for t in translations]
    
    # tokenize source (English)
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    # tokenize target (Spanish)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_target_length,
            truncation=True,
            padding="max_length",
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

logger.info("Starting tokenization of train/validation sets…")
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    batch_size=batch_size,
    with_indices=True,                          # feed indices to preprocess_function
    remove_columns=raw_datasets["train"].column_names,
)
logger.info("Tokenization complete for all batches.")


2025/04/23 17:11:37 - INFO - __main__ -   Selecting first 30000 examples from the TRAIN split…
2025/04/23 17:11:37 - INFO - __main__ -   Selecting first 3000 examples from the VALIDATION split…
2025/04/23 17:11:37 - INFO - __main__ -   Post‑trim sizes -> train: 30000, validation: 3000
2025/04/23 17:11:37 - INFO - __main__ -   Starting tokenization of train/validation sets…


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

2025/04/23 17:11:37 - INFO - __main__ -   Tokenizing examples 0–4999…
2025/04/23 17:11:38 - INFO - __main__ -   Tokenizing examples 5000–9999…
2025/04/23 17:11:38 - INFO - __main__ -   Tokenizing examples 10000–14999…
2025/04/23 17:11:39 - INFO - __main__ -   Tokenizing examples 15000–19999…
2025/04/23 17:11:40 - INFO - __main__ -   Tokenizing examples 20000–24999…
2025/04/23 17:11:41 - INFO - __main__ -   Tokenizing examples 25000–29999…


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

2025/04/23 17:11:42 - INFO - __main__ -   Tokenizing examples 0–2999…


Map:   0%|          | 0/401815 [00:00<?, ? examples/s]

2025/04/23 17:11:42 - INFO - __main__ -   Tokenizing examples 0–4999…
2025/04/23 17:11:43 - INFO - __main__ -   Tokenizing examples 5000–9999…
2025/04/23 17:11:44 - INFO - __main__ -   Tokenizing examples 10000–14999…
2025/04/23 17:11:44 - INFO - __main__ -   Tokenizing examples 15000–19999…
2025/04/23 17:11:45 - INFO - __main__ -   Tokenizing examples 20000–24999…
2025/04/23 17:11:46 - INFO - __main__ -   Tokenizing examples 25000–29999…
2025/04/23 17:11:47 - INFO - __main__ -   Tokenizing examples 30000–34999…
2025/04/23 17:11:48 - INFO - __main__ -   Tokenizing examples 35000–39999…
2025/04/23 17:11:49 - INFO - __main__ -   Tokenizing examples 40000–44999…
2025/04/23 17:11:49 - INFO - __main__ -   Tokenizing examples 45000–49999…
2025/04/23 17:11:51 - INFO - __main__ -   Tokenizing examples 50000–54999…
2025/04/23 17:11:51 - INFO - __main__ -   Tokenizing examples 55000–59999…
2025/04/23 17:11:52 - INFO - __main__ -   Tokenizing examples 60000–64999…
2025/04/23 17:11:53 - INFO - __m

In [14]:
pip install accelerate>=0.26.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## 5. Prepare Data Collator & Training Arguments

In [15]:
logger.info("Setting up data collator and training arguments…")

# Data collator will dynamically pad to the longest sequence in each batch
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="longest",
    return_tensors="pt",
)

# Seq2SeqTrainingArguments for fine‑tuning:
# - use `do_eval=True` to enable evaluation
# - drop both `evaluation_strategy` and `evaluate_during_training` so it's compatible
training_args = Seq2SeqTrainingArguments(
    output_dir="./bert2bert-europarl-en-es",    # where checkpoints & final model are saved
    per_device_train_batch_size=16,              # batch size per GPU/CPU for training
    per_device_eval_batch_size=16,               # batch size per GPU/CPU for evaluation
    predict_with_generate=True,                 # run generate() during evaluation
    do_eval=True,                               # perform evaluation every `eval_steps`
    eval_steps=500,                             # evaluate every 500 training steps
    logging_steps=100,                          # log training loss & lr every 100 steps
    save_steps=500,                             # checkpoint every 500 steps
    save_total_limit=3,                         # keep only the 3 most recent checkpoints
    num_train_epochs=3,                         # total number of training epochs
    learning_rate=5e-5,                         # initial learning rate
    weight_decay=0.01,                          # weight decay to apply
    warmup_steps=500,                           # number of warmup steps for LR scheduler
    logging_dir="./logs",                       # tensorboard log directory
)

logger.info(f"Training arguments:\n{training_args}")


2025/04/23 17:13:26 - INFO - __main__ -   Setting up data collator and training arguments…
2025/04/23 17:13:26 - INFO - __main__ -   Training arguments:
Seq2SeqTrainingArguments(
_n_gpu=2,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation

## 6. Initialize Trainer & Start Training

In [16]:
logger.info("Initializing Trainer…")
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

logger.info("Beginning training…")
train_result = trainer.train()
logger.info("Training finished.")


2025/04/23 17:13:27 - INFO - __main__ -   Initializing Trainer…
  trainer = Seq2SeqTrainer(
2025/04/23 17:13:28 - INFO - __main__ -   Beginning training…
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
100,6.1328


KeyboardInterrupt: 

In [None]:
tokenized_datasets