# Fine-tuning a model with the Trainer API or Keras

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [3]:
!pip install datasets evaluate transformers[sentencepiece]




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name used for the tokenizer here and for the model loaded later.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset(path="glue", name="mrpc")


def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` pair(s) found in ``example``.

    Both fields are passed together to the module-level ``tokenizer`` with
    ``truncation=True``; the tokenizer's output is returned unchanged.
    Padding is not requested here.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Collator built around the same tokenizer that is used for tokenization.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Apply the tokenization to the datasets in batched mode.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [5]:
# Display the collator's repr to inspect the tokenizer and padding settings it holds.
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

See the [`TrainingArguments` documentation](https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/trainer#transformers.TrainingArguments) for the full list of available options.

In [6]:
from transformers import TrainingArguments

# First run: 5 epochs with the "cosine_with_restarts" learning-rate schedule;
# outputs are written under "test-trainer".
training_args = TrainingArguments(
    output_dir="test-trainer",
    lr_scheduler_type="cosine_with_restarts",
    num_train_epochs=5,
)

In [7]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import Trainer

# Assemble the Trainer from the model, arguments, tokenized splits,
# collator and tokenizer defined in the earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [9]:
# Run fine-tuning; the value returned by train() is shown as the cell result.
trainer.train()

  0%|          | 0/2295 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 22%|██▏       | 500/2295 [00:22<01:20, 22.16it/s]

{'loss': 0.4996, 'learning_rate': 4.4369285612260874e-05, 'epoch': 1.09}


 44%|████▎     | 1000/2295 [00:45<00:57, 22.38it/s]

{'loss': 0.2784, 'learning_rate': 3.001353801034688e-05, 'epoch': 2.18}


 65%|██████▌   | 1500/2295 [01:09<00:36, 22.05it/s]

{'loss': 0.1211, 'learning_rate': 1.339940635976592e-05, 'epoch': 3.27}


 87%|████████▋ | 2000/2295 [01:32<00:13, 22.12it/s]

{'loss': 0.0399, 'learning_rate': 2.0108450704954348e-06, 'epoch': 4.36}


100%|██████████| 2295/2295 [01:46<00:00, 21.55it/s]

{'train_runtime': 106.5187, 'train_samples_per_second': 172.176, 'train_steps_per_second': 21.546, 'train_loss': 0.20613847485295048, 'epoch': 5.0}





TrainOutput(global_step=2295, training_loss=0.20613847485295048, metrics={'train_runtime': 106.5187, 'train_samples_per_second': 172.176, 'train_steps_per_second': 21.546, 'train_loss': 0.20613847485295048, 'epoch': 5.0})

In [10]:
import numpy as np
import evaluate

# Run the fine-tuned model over the validation split, then print the shapes of
# the returned predictions and label ids.
predictions = trainer.predict(test_dataset=tokenized_datasets["validation"])
print(f"{predictions.predictions.shape} {predictions.label_ids.shape}")


100%|██████████| 51/51 [00:00<00:00, 132.12it/s]

(408, 2) (408,)





In [11]:
# Load the GLUE/MRPC metric, take the argmax over the last axis of the
# predictions, and score the result against the reference label ids.
metric = evaluate.load("glue", "mrpc")
preds = predictions.predictions.argmax(axis=-1)
metric.compute(references=predictions.label_ids, predictions=preds)

{'accuracy': 0.8553921568627451, 'f1': 0.8998302207130731}

In [12]:
def compute_metrics(eval_preds):
    """Score evaluation predictions with the GLUE/MRPC metric.

    Args:
        eval_preds: A two-item iterable that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the argmax (over the last
        axis) of ``logits`` compared against ``labels``.
    """
    glue_metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    return glue_metric.compute(predictions=predicted_classes, references=labels)

In [13]:
# Second run: evaluate at every epoch, starting again from a freshly loaded
# checkpoint and reporting metrics through compute_metrics.
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Run fine-tuning with the current trainer; the value returned by train() is shown as the cell result.
trainer.train()

                                                  
 33%|███▎      | 460/1377 [00:23<04:33,  3.35it/s]

{'eval_loss': 0.5414108037948608, 'eval_accuracy': 0.7892156862745098, 'eval_f1': 0.8599348534201955, 'eval_runtime': 2.5174, 'eval_samples_per_second': 162.074, 'eval_steps_per_second': 20.259, 'epoch': 1.0}


 36%|███▋      | 500/1377 [00:25<00:42, 20.85it/s]

{'loss': 0.5851, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


                                                  
 67%|██████▋   | 918/1377 [00:47<00:20, 22.86it/s]

{'eval_loss': 0.5383989214897156, 'eval_accuracy': 0.8382352941176471, 'eval_f1': 0.8918032786885245, 'eval_runtime': 2.4851, 'eval_samples_per_second': 164.182, 'eval_steps_per_second': 20.523, 'epoch': 2.0}


 73%|███████▎  | 1000/1377 [00:51<00:16, 22.73it/s]

{'loss': 0.3924, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


                                                   
100%|██████████| 1377/1377 [01:11<00:00, 19.26it/s]

{'eval_loss': 0.5481570959091187, 'eval_accuracy': 0.8651960784313726, 'eval_f1': 0.9056603773584906, 'eval_runtime': 2.492, 'eval_samples_per_second': 163.722, 'eval_steps_per_second': 20.465, 'epoch': 3.0}
{'train_runtime': 71.5005, 'train_samples_per_second': 153.901, 'train_steps_per_second': 19.259, 'train_loss': 0.42891574546575717, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.42891574546575717, metrics={'train_runtime': 71.5005, 'train_samples_per_second': 153.901, 'train_steps_per_second': 19.259, 'train_loss': 0.42891574546575717, 'epoch': 3.0})