# A full training

Install the Transformers, Datasets, Evaluate, and Accelerate libraries to run this notebook.

In [None]:
# Install required libraries for Transformers, datasets, evaluation, and accelerate
# These are shell commands (leading `!`), so the `uv` executable must be available in the
# environment the notebook runs in. No versions are pinned here.
!uv pip install datasets evaluate transformers[sentencepiece]
!uv pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# NOTE(review): the line below pins torch 1.9.0 and a cp37 torch_xla wheel; presumably outdated
# for current runtimes. Confirm before uncommenting.
# !uv pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

In [None]:
# End-to-end data preparation for MRPC: load the dataset, tokenize it, build the collator
# (the same preprocessing pipeline used earlier)
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
raw_datasets = load_dataset("glue", "mrpc")


def tokenize_function(example):
    """Tokenize `sentence1` and `sentence2` together as a pair, with truncation enabled."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Run the tokenizer over every split, in batches
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Collator used by the DataLoaders to pad each batch dynamically
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Get the tokenized splits ready for a hand-written PyTorch loop:
#   - drop the raw text columns and the example index
#   - rename 'label' to 'labels' (the name PyTorch models require)
#   - switch the format to 'torch' so items come back as tensors instead of lists
tokenized_datasets = tokenized_datasets.remove_columns(
    ["sentence1", "sentence2", "idx"]
).rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

In [None]:
# Presumably the expected output of the previous cell (the remaining training columns);
# verify against an actual run.
["attention_mask", "input_ids", "labels", "token_type_ids"]

In [None]:
# Build the training and evaluation DataLoaders
# Both use batches of 8 and the data collator for per-batch dynamic padding;
# only the training loader shuffles its examples
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
# Sanity-check the DataLoader: pull a single batch and display the shape of each tensor in it
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
# Instantiate the pre-trained checkpoint with a two-label sequence classification head
# This is the model that gets fine-tuned on MRPC below
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

In [None]:
# Test the model with one batch to verify it works
# `batch` is the single batch pulled from train_dataloader in the earlier inspection cell
# This shows the loss value and output shape (8 examples, 2 classes)
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

In [None]:
# Set up the AdamW optimizer
# lr=5e-5 is a good learning rate for fine-tuning BERT models
# AdamW is imported from PyTorch: the `transformers.AdamW` class was deprecated and is no longer
# available in recent Transformers releases, so `from transformers import AdamW` fails on a
# fresh (unpinned) install.
# Note: torch.optim.AdamW uses weight_decay=0.01 by default.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# Learning-rate schedule: "linear" scheduler with zero warmup steps
# The schedule needs the total number of optimizer steps, i.e. one per training batch per epoch
from transformers import get_scheduler

num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs  # batches per epoch x 3 epochs
lr_scheduler = get_scheduler(
    "linear",
    num_training_steps=num_training_steps,
    num_warmup_steps=0,  # no warmup phase
    optimizer=optimizer,
)
print(num_training_steps)

In [None]:
# Pick the GPU when CUDA is available, fall back to the CPU otherwise, then move the model there
# Training is significantly faster when a GPU is available
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
# The full hand-written PyTorch training loop, with a progress bar
# One progress-bar tick per optimizer step, num_training_steps ticks in total
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()  # training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Tensors must live on the same device as the model
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()  # backpropagate

        optimizer.step()  # apply the parameter update
        lr_scheduler.step()  # advance the learning-rate schedule
        optimizer.zero_grad()  # reset gradients before the next batch
        progress_bar.update(1)

In [None]:
# Score the fine-tuned model on the validation split
# Predictions are accumulated batch by batch into the GLUE/MRPC metric from the evaluate library
import evaluate

metric = evaluate.load("glue", "mrpc")
model.eval()  # evaluation mode
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():  # no gradients needed for inference
        outputs = model(**batch)

    logits = outputs.logits
    predictions = logits.argmax(dim=-1)  # predicted class = index of the largest logit
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()  # final metric values over the whole validation set

In [None]:
# Alternative training approach with different learning rate
# This demonstrates how changing hyperparameters affects training
# Relies on `checkpoint`, `train_dataloader`, `torch` and `tqdm` from earlier cells.
# AdamW is imported from PyTorch: `transformers.AdamW` was deprecated and is no longer available
# in recent Transformers releases, so importing it from `transformers` fails on a fresh install.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

# Start again from a fresh copy of the pre-trained checkpoint
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)  # Slightly lower learning rate

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Same training setup as before
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move the batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Using 🤗 Accelerate for distributed training and mixed precision
# Accelerate simplifies multi-GPU training and other optimizations
# Relies on `checkpoint`, `train_dataloader`, `eval_dataloader` and `tqdm` from earlier cells.
# AdamW is imported from PyTorch: `transformers.AdamW` was deprecated and is no longer available
# in recent Transformers releases, so importing it from `transformers` fails on a fresh install.
from accelerate import Accelerator
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

accelerator = Accelerator()  # Initialize accelerator

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

# Prepare all objects for accelerated training (handles device placement automatically),
# which is why there is no explicit `.to(device)` in this cell
train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

# The step count is computed from the prepared dataloader, not the original one
num_epochs = 3
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)  # Use accelerator's backward method instead of loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Launch training function with notebook_launcher
# This enables distributed training across multiple GPUs if available
# notebook_launcher needs a callable, and no `training_function` is defined anywhere earlier in
# this notebook, so the 🤗 Accelerate training loop is wrapped in one here.
# NOTE(review): outside Colab/Kaggle, notebook_launcher may require an explicit
# `num_processes=...`, and multi-GPU launches may need a kernel where CUDA and Accelerator have
# not been initialized yet. Confirm against the Accelerate documentation.
from accelerate import Accelerator, notebook_launcher
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, get_scheduler


def training_function():
    """Fine-tune a fresh copy of the checkpoint for 3 epochs using 🤗 Accelerate.

    Reads `checkpoint`, `train_dataloader` and `eval_dataloader` from the notebook namespace
    (defined in earlier cells). Everything else is created inside the function.
    """
    accelerator = Accelerator()

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    optimizer = AdamW(model.parameters(), lr=3e-5)

    # Let Accelerate handle device placement for the dataloaders, model and optimizer
    train_dl, eval_dl, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, model, optimizer
    )

    num_epochs = 3
    num_training_steps = num_epochs * len(train_dl)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dl:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)  # Accelerate's replacement for loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)


notebook_launcher(training_function)