In [1]:
!pip install -q datasets==3.2.0 bitsandbytes==0.45.1 accelerate==1.2.1 evaluate==0.4.3 \
    transformers==4.47.1 torch==2.5.1+cu124 numpy==1.26.4 peft==0.14.0 sacrebleu

In [2]:
import os
import torch
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training
)

In [3]:
# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

In [4]:
# Configurations
MODEL_NAME = "google-t5/t5-small"
TRANSLATION_PREFIX = "translate from English to French: "
MAX_LENGTH = 128
TRAIN_BATCH_SIZE = 4
VAL_BATCH_SIZE = 2
NUM_TRAIN_EPOCHS = 1
LEARNING_RATE = 2e-4

In [5]:
# Load dataset
books = load_dataset("opus_books", "en-fr", split = "train[:2200]")
books = books.train_test_split(test_size = 0.1, seed = 42)
books

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 1980
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 220
    })
})

In [6]:
# translation sample example
books["train"]["translation"][0]

{'en': 'And even when he decided to confide everything to me, during days of anguish of which I shall speak later, it remained for a long time the great secret of our youth.',
 'fr': 'Et même lorsqu’il se fut décidé à me tout confier, durant des jours de détresse dont je reparlerai, ce resta longtemps le grand secret de nos adolescences.'}

In [7]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [8]:
def preprocessing_function(examples):

    """
    Preprocess the translation dataset by tokenizing inputs and targets.

    Args:
        examples (dict): Batch of translation examples

    Returns:
        dict: Tokenized model inputs
    """

    inputs = [TRANSLATION_PREFIX + ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]

    model_inputs = tokenizer(
        inputs,
        text_target = targets,
        max_length = MAX_LENGTH,
        truncation = True
    )

    return model_inputs

In [9]:
# Tokenize dataset
tokenized_books = books.map(
    preprocessing_function,
    batched = True,
    remove_columns = books["train"].column_names
)

Map:   0%|          | 0/1980 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

In [10]:
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = MODEL_NAME,
    return_tensors = "pt"
)

In [11]:
# Load evaluation metric
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):

    """
    Postprocess predictions and labels for metric computation.

    Args:
        preds (list): Model predictions
        labels (list): Ground truth labels

    Returns:
        tuple: Processed predictions and labels
    """

    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

In [12]:
def compute_metrics(eval_pred):
    """
    Compute translation metrics.

    Args:
        eval_pred (EvalPrediction): Model predictions and labels

    Returns:
        dict: Evaluation metrics
    """
    preds, labels = eval_pred

    # Decode predictions and labels
    if isinstance(preds, tuple):
        preds = preds[0]

    # Convert to token IDs
    decoded_preds = tokenizer.batch_decode(
        preds.argmax(-1),
        skip_special_tokens=True
    )

    # Handle labels (replace -100 with pad token)
    decoded_labels = tokenizer.batch_decode(
        np.where(labels != -100, labels, tokenizer.pad_token_id), # where labels equal -100 replace with pad token
        skip_special_tokens=True
    )

    # Postprocess
    decoded_preds, decoded_labels = postprocess_text(
        decoded_preds, decoded_labels
    )

    # Compute BLEU score
    result = metric.compute(
        predictions = decoded_preds,
        references = decoded_labels
    )

    # Add generation length metric
    prediction_lens = [
        np.count_nonzero(pred) for pred in preds.argmax(-1)
    ]

    return {
        "bleu": result["score"],
        "gen_len": np.mean(prediction_lens)
    }

In [13]:
# Quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True ,
    bnb_4bit_compute_dtype = 'bfloat16',
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True ,
)

In [14]:
# Load base model with quantization
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    quantization_config = quantization_config,
)

# insted of storing activations in ram to be used in backword pass
# save some and calculate the other during backword pass, this saves some memory
base_model.gradient_checkpointing_enable()

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(base_model)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [15]:
# Find target modules for LoRA
target_modules = set()
for name, module in base_model.named_modules():
    if isinstance(module, torch.nn.Linear):
        module_name = name.split('.')[-1]
        target_modules.add(module_name)
target_modules = list(target_modules)
target_modules

['o', 'v', 'q', 'lm_head', 'wo', 'k', 'wi']

In [16]:
# Configure LoRA
lora_config = LoraConfig(
    r = 4,
    lora_alpha = 8,
    lora_dropout = 0.1,
    bias = 'none',
    task_type = TaskType.SEQ_2_SEQ_LM,
    target_modules = ["k", "q", "v"]
)

# Get PEFT model
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 221,184 || all params: 60,727,808 || trainable%: 0.3642


In [17]:
# Training arguments
training_args = TrainingArguments(
    output_dir = "./translation_results",
    overwrite_output_dir = True,
    num_train_epochs = NUM_TRAIN_EPOCHS,
    per_device_train_batch_size = TRAIN_BATCH_SIZE,
    per_device_eval_batch_size = VAL_BATCH_SIZE,
    learning_rate = LEARNING_RATE,
    warmup_ratio = 0.1,
    weight_decay = 0.01, # L2 regularization
    logging_dir = './logs',
    logging_strategy = 'steps',
    logging_steps = 50,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    metric_for_best_model = 'bleu',
    dataloader_pin_memory = True, # data intially loadded into CPU then copied to GPU, set to True for transfer directly to GPU, it's faster
    bf16 = True,
    push_to_hub = False,  # Set to True if you want to push to HuggingFace Hub
)



In [18]:
# Initialize Trainer
trainer = Trainer(
    model = lora_model,
    args = training_args,
    train_dataset = tokenized_books['train'],
    eval_dataset = tokenized_books['test'],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    processing_class = tokenizer
)

# Train the model
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mabdulrahman-ahmed20072[0m ([33mabdulrahman-ahmed20072-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.9271,1.659729,14.319504,54.7


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


TrainOutput(global_step=495, training_loss=2.00308925744259, metrics={'train_runtime': 140.189, 'train_samples_per_second': 14.124, 'train_steps_per_second': 3.531, 'total_flos': 31575001989120.0, 'train_loss': 2.00308925744259, 'epoch': 1.0})

In [19]:
# Inference function
def translate_text(text, max_length=MAX_LENGTH):

    """
    Translate text from English to French.

    Args:
        text (str): Input text to translate
        max_length (int): Maximum length of generated translation

    Returns:
        str: Translated text
    """

    # Prepare input
    inputs = tokenizer(
        TRANSLATION_PREFIX + text,
        return_tensors = "pt",
        max_length = max_length,
        truncation = True
    ).to(lora_model.device)

    # Set model to evaluation mode
    lora_model.eval()

    # Generate translation
    with torch.no_grad():
        outputs = lora_model.generate(
            **inputs,
            max_length = max_length,
            num_return_sequences = 1,
            do_sample = False  # Use greedy decoding
        )

    # Decode and return translation
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [20]:
# Example usage
example_text = "Hello, how are you?"

print(f"Original: {example_text}")
print(f"Translation: {translate_text(example_text)}")

Original: Hello, how are you?
Translation: Bonjour, comment êtes-vous?
