# Fine-tuning a language model
- PyTorch (with [GPU-Acceleration](https://towardsdatascience.com/gpu-acceleration-comes-to-pytorch-on-m1-macs-195c399efcc1))

In [1]:
import evaluate
import numpy as np
import torch
from accelerate import Accelerator, notebook_launcher
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    get_scheduler,
)

# Use the Apple-silicon GPU (MPS) only when it is usable at runtime.
# `is_built()` only says this PyTorch binary was compiled with MPS support;
# `is_available()` also checks that the current machine can actually run on it.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

accelerator = Accelerator()
print(accelerator.device)  # Device picked by Accelerate (mps on an M1 Mac)

  from .autonotebook import tqdm as notebook_tqdm


mps


In [2]:
checkpoint = "bert-base-uncased"
# `num_labels` sizes the classification head, so it is passed to the model only;
# the tokenizer has no use for it.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` fields of `example` as a pair, with truncation.

    Written to be applied through `Dataset.map`, so the data is processed
    piece by piece rather than loaded into RAM all at once.
    """
    first_sentence, second_sentence = example["sentence1"], example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

def compute_metrics(eval_preds):
    """Score evaluation predictions with the GLUE/MRPC metric.

    `eval_preds` unpacks into `(logits, labels)`; the predicted class of each
    example is the arg-max over the last axis of the logits.
    Returns whatever the metric's `compute` returns.
    """
    glue_mrpc = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    return glue_mrpc.compute(predictions=np.argmax(logits, axis=-1), references=labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load and preprocess dataset

In [3]:
# MRPC (from GLUE) is the fine-tuning data; the result holds the train, validation and test splits
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_train_dataset = raw_datasets["train"]
print(raw_train_dataset[0])  # First training example
print(raw_train_dataset.features)  # Type of each column

# Preprocessing: tokenize every split in batches; map() adds the tokenizer's fields to the dataset
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

# Dynamic padding: the collate function assembles the samples of a batch and pads each one
# only to the longest sample in that batch, which reduces padding and speeds up training
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)}


## Train model using the 'Trainer' class

In [4]:
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",  # Evaluate once per epoch
    use_cpu=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
metrics = trainer.train()  # Holds the value returned by Trainer.train()

# Reference runs (last evaluation log, then training summary)
#
# Without mps: Time 71min 36.8s
# {'eval_loss': 0.644670844078064, 'eval_accuracy': 0.8455882352941176, 'eval_f1': 0.8941176470588236, 'eval_runtime': 8.1069, 'eval_samples_per_second': 50.327, 'eval_steps_per_second': 6.291, 'epoch': 3.0}
# {'train_runtime': 4296.356, 'train_samples_per_second': 2.561, 'train_steps_per_second': 0.321, 'train_loss': 0.2788682742316219, 'epoch': 3.0}
#
# With mps: 13min 58.1s
# {'eval_loss': 0.5484141111373901, 'eval_accuracy': 0.8799019607843137, 'eval_f1': 0.9147826086956522, 'eval_runtime': 6.244, 'eval_samples_per_second': 65.343, 'eval_steps_per_second': 8.168, 'epoch': 3.0}
# {'train_runtime': 837.7688, 'train_samples_per_second': 13.135, 'train_steps_per_second': 1.644, 'train_loss': 0.3463223221168712, 'epoch': 3.0}

  0%|          | 0/1377 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                  
 33%|███▎      | 459/1377 [04:40<10:25,  1.47it/s]

{'eval_loss': 0.41232964396476746, 'eval_accuracy': 0.8186274509803921, 'eval_f1': 0.8762541806020068, 'eval_runtime': 7.01, 'eval_samples_per_second': 58.203, 'eval_steps_per_second': 7.275, 'epoch': 1.0}


 36%|███▋      | 500/1377 [05:04<08:23,  1.74it/s]

{'loss': 0.5159, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


                                                  
 67%|██████▋   | 918/1377 [09:17<05:06,  1.50it/s]

{'eval_loss': 0.5764635801315308, 'eval_accuracy': 0.8480392156862745, 'eval_f1': 0.8973509933774835, 'eval_runtime': 5.9732, 'eval_samples_per_second': 68.305, 'eval_steps_per_second': 8.538, 'epoch': 2.0}


 73%|███████▎  | 1000/1377 [10:05<03:34,  1.76it/s]

{'loss': 0.3109, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


                                                   
100%|██████████| 1377/1377 [13:57<00:00,  1.64it/s]

{'eval_loss': 0.5484141111373901, 'eval_accuracy': 0.8799019607843137, 'eval_f1': 0.9147826086956522, 'eval_runtime': 6.244, 'eval_samples_per_second': 65.343, 'eval_steps_per_second': 8.168, 'epoch': 3.0}
{'train_runtime': 837.7688, 'train_samples_per_second': 13.135, 'train_steps_per_second': 1.644, 'train_loss': 0.3463223221168712, 'epoch': 3.0}





## Train without the `Trainer` class
### Preprocess data

In [4]:
# Keep only the columns the model consumes. The membership checks make this cell safe
# to re-run: once the columns have been dropped/renamed the steps become no-ops instead
# of raising on column names that no longer exist.
present_columns = tokenized_datasets["train"].column_names
unused_columns = [name for name in ("sentence1", "sentence2", "idx") if name in present_columns]
if unused_columns:
    tokenized_datasets = tokenized_datasets.remove_columns(unused_columns)
if "label" in present_columns:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
print('Columns to the model:', tokenized_datasets["train"].column_names)

# Data loaders to iterate over batches; the collator pads each batch dynamically
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=32, collate_fn=data_collator)

# Select optimizer and learning rate scheduler.
# NOTE: `model` is the same object the Trainer section trains; if that section was run in
# this kernel the weights are already fine-tuned, so reload the checkpoint to compare runs.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)  # number of epochs * number of batches
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

progress_bar = tqdm(range(num_training_steps))

Columns to the model: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']


  0%|          | 0/1377 [00:00<?, ?it/s]

### Train model using PyTorch

In [5]:
# Train: manual PyTorch loop over the notebook-level dataloader, optimizer and scheduler
model.to(device)  # Move the model to `device` (mps to run on the M1 GPU, otherwise cpu)

model.train()  # Put the model in training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss  # The batch carries a `labels` column; presumably that is why the output has a loss
        loss.backward()

        # Apply the gradients, advance the learning-rate schedule, then clear the
        # gradients before the next batch
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# With mps, batch size of 8: 24 mins 3.5s , {'accuracy': 0.8553921568627451, 'f1': 0.899488926746167}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 1377/1377 [24:06<00:00,  2.63s/it]

### Train with Accelerate

In [None]:
# Train using Accelerate
def training_function():
    """Run the fine-tuning loop through Accelerate.

    Reads the notebook-level `train_dataloader`, `model`, `optimizer`,
    `lr_scheduler` and `num_epochs`. The prepared objects are bound to new local
    names on purpose: assigning to e.g. `model` inside this function would make
    that name local, and the `prepare(...)` call would then raise
    `UnboundLocalError` instead of reading the notebook-level object.

    NOTE: the optimizer and scheduler are the notebook-level instances. If the
    manual PyTorch loop has already run in this kernel, the linear schedule has
    been stepped through all of its training steps; recreate them (and reload
    the model) before launching to get a meaningful run.
    """
    accelerator = Accelerator()
    train_dl, prepared_model, prepared_optimizer, prepared_scheduler = accelerator.prepare(
        train_dataloader, model, optimizer, lr_scheduler
    )

    # Fresh bar for this run; the notebook-level one may already be complete.
    progress = tqdm(
        range(num_epochs * len(train_dl)),
        disable=not accelerator.is_local_main_process,
    )

    prepared_model.train()
    for _ in range(num_epochs):
        # No manual .to(device) here: device placement is left to Accelerate.
        for batch in train_dl:
            loss = prepared_model(**batch).loss
            accelerator.backward(loss)

            prepared_optimizer.step()
            prepared_scheduler.step()
            prepared_optimizer.zero_grad()
            progress.update(1)


notebook_launcher(training_function, num_processes=1)  # Or put in a file and run: accelerate launch train.py

### Evaluate model

In [6]:
# Score the model on the validation dataloader with the GLUE/MRPC metric.
# NOTE(review): the original left a reminder about `accelerator.gather_for_metrics`;
# presumably it is needed before add_batch in a multi-process Accelerate run - confirm.
metric = evaluate.load("glue", "mrpc")
model.eval()  # Evaluation mode
with torch.no_grad():  # No gradients are needed for scoring
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        predictions = model(**batch).logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8553921568627451, 'f1': 0.899488926746167}