# Training with Hugging Face Transformers on GPU and Accelerators

Training machine learning models, especially deep learning models, can be computationally intensive. Utilizing GPUs (Graphics Processing Units) and accelerators (like TPUs or custom hardware) significantly speeds up this process. Hugging Face Transformers provides robust support for training models on such hardware.



## Data Processing

In [None]:
import warnings
# Suppress all warnings so the cell outputs below stay uncluttered
warnings.filterwarnings(action="ignore")

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

In [None]:
# Name of the pre-trained checkpoint (BERT-base, uncased) used for both tokenizer and model
checkpoint = "bert-base-uncased"

# Fetch the MRPC task of the GLUE benchmark through the Hugging Face datasets library
raw_dataset = load_dataset(path="glue", name="mrpc")

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
# Build the tokenizer that belongs to the chosen pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def tokenize_function(sample):
    """Tokenize the `sentence1`/`sentence2` pair(s) held in `sample`.

    The two sentence fields are passed to the module-level `tokenizer`
    together, with truncation enabled, and the tokenizer's result is
    returned unchanged.
    """
    first, second = sample["sentence1"], sample["sentence2"]
    return tokenizer(first, second, truncation=True)

In [None]:
# Collator that pads each batch on the fly to the length of its longest sequence
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Run tokenize_function over every split, handing it batches of examples instead of single rows
tokenized_dataset = raw_dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

## Postprocessing

* Remove columns that are not needed by the model (`sentence1`, `sentence2`, and `idx`)

* Rename the column `label` to `labels` to match the expected format for the model

* Set the format of the dataset to return PyTorch tensors

In [None]:
# Drop the raw-text and index columns, then rename the target column to `labels`
tokenized_dataset = tokenized_dataset.remove_columns(
    ["sentence1", "sentence2", "idx"]
).rename_column("label", "labels")

# Have the dataset hand back PyTorch tensors
tokenized_dataset.set_format(type="torch")

# Show which columns remain for the model
tokenized_dataset["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

## Define Dataloaders

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Training batches: reshuffled on every pass, 8 examples each, padded by the collator
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation batches: same batch size and collator, no shuffling
valid_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

Inspecting a batch:

In [None]:
# Pull a single batch from the training loader; next(iter(...)) states the intent
# directly and raises StopIteration right away if the loader is empty.
batch = next(iter(train_dataloader))

# Map every field of the batch to the shape of its tensor
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 66]),
 'token_type_ids': torch.Size([8, 66]),
 'attention_mask': torch.Size([8, 66])}

## Model Instantiation

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the pre-trained checkpoint with a sequence-classification head that outputs two labels
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Sanity check: run the inspected batch through the model. The batch contains
# `labels`, so the output exposes a loss in addition to the logits.
output = model(**batch)
print(output.loss, output.logits.shape)

tensor(1.2856, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


## Optimizer

In [None]:
# transformers' own AdamW was deprecated and has been removed from recent releases;
# PyTorch's implementation is the supported replacement.
# NOTE: torch.optim.AdamW defaults to weight_decay=0.01, whereas the transformers class
# defaulted to 0.0 -- pass weight_decay explicitly if the old behaviour is required.
from torch.optim import AdamW

In [None]:
# Optimise all model parameters with a learning rate of 1e-5
optimizer = AdamW(params=model.parameters(), lr=1e-5)

## Scheduler

In [None]:
from transformers import get_scheduler

In [None]:
# Five passes over the training data, one optimizer step per batch
num_epochs = 5
training_steps = len(train_dataloader) * num_epochs

In [None]:
# Linear learning-rate schedule spanning every training step, without a warm-up phase
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=training_steps,
)

In [None]:
# Total number of optimizer steps: num_epochs * number of batches in train_dataloader
print(training_steps)

2295


## Training Loop

In [None]:
import torch

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Move the model to the selected device; as the last expression of the cell,
# the module's structure is echoed as output.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Confirm which device was selected
print(device)

cuda


In [None]:
from tqdm.auto import tqdm

In [None]:
# One progress-bar tick per optimizer step across all epochs
progress_bar = tqdm(range(training_steps))

# Put the model in training mode before the first step
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the same device as the model
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass, then back-propagate the loss returned by the model
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, and clear gradients for the next step
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/2295 [00:00<?, ?it/s]

## Evaluation Loop

In [None]:
import evaluate

In [None]:
# Metric for the GLUE MRPC task
metric = evaluate.load("glue", "mrpc")

# Put the model in evaluation mode
model.eval()

# Evaluation never back-propagates, so gradient tracking is disabled for the whole loop
with torch.no_grad():
    for batch in valid_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit along the last dimension
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

In [None]:
# Compute the final metric values from every batch added during the evaluation loop
metric.compute()

{'accuracy': 0.8259803921568627, 'f1': 0.8826446280991737}

## Training Loop with Accelerate

In [None]:
from accelerate import Accelerator

In [None]:
# Create the Accelerator with its default configuration
accelerator = Accelerator()

In [None]:
# Reload the pre-trained checkpoint so this run starts with a newly initialised classifier head
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# New optimizer bound to the reloaded model's parameters
optimizer = AdamW(params=model.parameters(), lr=1e-5)

In [None]:
# Hand the loaders, model and optimizer to Accelerate; the prepared objects come back
# in the same order and replace the originals.
(
    train_dataloader,
    valid_dataloader,
    model,
    optimizer,
) = accelerator.prepare(
    train_dataloader,
    valid_dataloader,
    model,
    optimizer,
)

In [None]:
# Recompute the step count from the prepared training loader
num_epochs = 5
training_steps = len(train_dataloader) * num_epochs

# Linear learning-rate schedule over all steps, without warm-up
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=training_steps,
)

In [None]:
# One progress-bar tick per optimizer step across all epochs
progress_bar = tqdm(range(training_steps))

# Put the model in training mode before the first step
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Batches from the prepared loader are fed to the model as-is (no manual .to(device))
        outputs = model(**batch)
        loss = outputs.loss

        # Back-propagate through the accelerator instead of calling loss.backward() directly
        accelerator.backward(loss)

        # Apply the update, advance the LR schedule, and clear gradients for the next step
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/2295 [00:00<?, ?it/s]