In [1]:
# Install required libraries
!pip install datasets transformers sentencepiece transformers[torch] sacrebleu evaluate accelerate -U gradio

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting gradio
  Downloading gradio-4.42.0-py3-none-any.whl.metadata (15 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64

In [19]:
# Import necessary libraries
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import numpy as np
import evaluate
import torch
import evaluate


In [3]:
# Load the "Helsinki-NLP/europarl" dataset for Czech to English translation
data = load_dataset("Helsinki-NLP/europarl", 'cs-en')

# Define maximum sequence length for tokenization
max_length = 256

# Load pre-trained tokenizer and model for Czech to English translation
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-cs-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-cs-en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/76.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/647095 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/830k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/789k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.60M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [4]:
# Split the dataset into training, validation, and test sets
train_test = data['train'].train_test_split(test_size=0.005, seed=42)
valid_test = train_test['test'].train_test_split(test_size=0.5, seed=42)

dataset_dict = DatasetDict({
    'train': train_test['train'],
    'valid': valid_test['train'],
    'test': valid_test['test']
})

In [5]:
# Print dataset samples for verification
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 643859
    })
    valid: Dataset({
        features: ['translation'],
        num_rows: 1618
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1618
    })
})


In [6]:
# Get a sample article from the test dataset
article = dataset_dict['test'][2]['translation']['cs']

In [7]:
# Tokenize the sample article
inputs = tokenizer(article, return_tensors="pt")

In [8]:
# Generate translations using the model
translated_tokens = model.generate(**inputs, max_length=max_length)

In [9]:
# Decode the generated tokens to text
translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

In [10]:
# Print the translated text
print(translation)

It is absolutely necessary to concentrate on the construction of infrastructure, which, by the way, my colleague, Mr Deva, also stressed during the last debate.


In [11]:
# Get the reference translation from the test dataset
_translation = dataset_dict['test'][2]['translation']['en']
print(_translation)

An absolutely fundamental matter is to concentrate on building infrastructure, which, by the way, my colleague Mr Deva emphasised during the last debate.


In [12]:
# Define the preprocessing function for tokenization
def preprocess_function(examples):
    """
    Preprocess and tokenize the dataset examples for training and evaluation.

    This function extracts the source (Czech) and target (English) translations from the examples,
    tokenizes them, and formats them into model inputs and labels.

    Args:
        examples (dict): A dictionary containing 'translation' key with source and target texts.

    Returns:
        dict: A dictionary containing tokenized inputs and labels ready for model training.
            - 'input_ids': Token IDs for model inputs.
            - 'attention_mask': Attention masks for model inputs.
            - 'labels': Token IDs for model outputs (labels).
    """
    # Extract source and target texts
    inputs = [ex["cs"] for ex in examples["translation"]]
    targets = [ex["en"] for ex in examples["translation"]]

    # Tokenize the source and target texts
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)
    labels = tokenizer(targets, max_length=max_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [13]:
# Define batch size for tokenization
batch_size = 2

In [14]:
# Tokenize the training, validation, and test datasets
tokenized_datasets_train = dataset_dict['train'].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict['train'].column_names,
    batch_size=batch_size
)

tokenized_datasets_validation = dataset_dict['valid'].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict['valid'].column_names,
    batch_size=batch_size
)

tokenized_datasets_test = dataset_dict['test'].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict['test'].column_names,
    batch_size=batch_size
)

Map:   0%|          | 0/643859 [00:00<?, ? examples/s]

Map:   0%|          | 0/1618 [00:00<?, ? examples/s]

Map:   0%|          | 0/1618 [00:00<?, ? examples/s]

In [15]:
# Create a data collator for dynamic padding and batch formatting
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [16]:
# Set the device to GPU if available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the selected device
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62509, 512, padding_idx=62508)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62509, 512, padding_idx=62508)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [17]:
# Define the training arguments for the Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir="finetuned-nlp-en-",  # Directory for saving the model and checkpoints
    gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
    per_device_train_batch_size=32,  # Batch size for training on each device (e.g., GPU)
    learning_rate=1e-5,  # Learning rate for the optimizer
    warmup_steps=2,  # Number of warmup steps for learning rate scheduler
    max_steps=2000,  # Total number of training steps
    fp16=True,  # Use mixed precision training to speed up training and reduce memory usage
    optim='adafactor',  # Optimizer to use (Adafactor optimizer)
    per_device_eval_batch_size=16,  # Batch size for evaluation on each device
    metric_for_best_model="bleu",  # Metric to use for selecting the best model
    predict_with_generate=True,  # Generate predictions during evaluation
    push_to_hub=False,  # Do not push the model to the Hugging Face Hub
)

In [20]:
# Define the compute_metrics function for evaluating model performance


metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """
    Compute evaluation metrics for the model predictions.

    Args:
        eval_preds (tuple): A tuple containing predictions and labels. Predictions are the model outputs, and labels are the true labels.

    Returns:
        dict: A dictionary with the BLEU score for the predictions.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process decoded predictions and labels
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    # Compute BLEU score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [21]:
# Initialize the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets_train,  # Tokenized training dataset
    eval_dataset=tokenized_datasets_validation,  # Tokenized validation dataset
    data_collator=data_collator,  # Data collator for dynamic padding and batch formatting
    tokenizer=tokenizer,  # Tokenizer for encoding and decoding text
    compute_metrics=compute_metrics,  # Function for computing evaluation metrics
)

max_steps is given, it will override any value given in num_train_epochs


In [22]:
# Start the training process
trainer.train()



Step,Training Loss
500,3.0927
1000,2.0892
1500,1.8248
2000,1.7431


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62508]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62508]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62508]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62508]], 'forced_eos_token_id': 0}


TrainOutput(global_step=2000, training_loss=2.187461395263672, metrics={'train_runtime': 723.2953, 'train_samples_per_second': 88.484, 'train_steps_per_second': 2.765, 'total_flos': 1232663193059328.0, 'train_loss': 2.187461395263672, 'epoch': 0.09939863823865613})

In [23]:
# Evaluate the model on the test dataset
results = trainer.evaluate(eval_dataset=tokenized_datasets_test)

In [24]:
# Print the evaluation results
print(results)

{'eval_loss': 1.5187643766403198, 'eval_bleu': 28.618106360299766, 'eval_runtime': 285.3572, 'eval_samples_per_second': 5.67, 'eval_steps_per_second': 0.357, 'epoch': 0.09939863823865613}
