# Fine-tuning BERT using Transformers Trainer API

In [1]:
!pip install --quiet transformers[torch]
!pip install --quiet datasets

Collecting transformers[torch]
[?25l  Downloading https://files.pythonhosted.org/packages/00/92/6153f4912b84ee1ab53ab45663d23e7cf3704161cb5ef18b0c07e207cef2/transformers-4.7.0-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 8.6MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 49.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |

## Import modules

In [2]:
import torch # pytorch
import transformers # the transformers library from huggingface
from datasets import load_dataset # this is used to load dataset of our choice

## Load the data

In [3]:
# Load the IMDB dataset through the `datasets` library. The first call
# downloads and prepares it; later calls reuse the local cache (see the log
# output below). The result is indexed by split name ("train", "test") later on.
raw_datasets = load_dataset("imdb")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1916.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1054.0, style=ProgressStyle(description…


Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=84125825.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


## Tokenize each review in the IMDB dataset

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer belonging to the "bert-base-cased" checkpoint. The same
# checkpoint name is used for the model further down, so the token ids
# produced here match the vocabulary the model was pre-trained with.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
def tokenize(data):
  """Tokenize the "text" column of a batch of examples.

  `data` is a batch taken from the dataset (the function is applied with
  `batched=True`), so `data["text"]` holds several texts at once.

  padding="max_length" pads every example to a fixed length. No explicit
  `max_length` is passed, so the tokenizer falls back to the model's maximum
  input length. truncation=True cuts longer texts down to that same limit.
  """
  texts = data["text"]
  return tokenizer(texts, padding="max_length", truncation=True)

# Apply the tokenizer to every split of the raw dataset, one batch at a time.
tokenized_dataset = raw_datasets.map(tokenize, batched=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-e2e173b3844ced56.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-8050c3774d2e0901.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-45c3647589346488.arrow


## Collect a small sample of the whole dataset for experimentation purposes

In [None]:
# For each split: shuffle with a fixed seed (42) so the selection is
# reproducible, then keep the first 1000 examples of the shuffled split.
# The train subset is used for training, the test subset for evaluation.
small_train, small_test = (
    tokenized_dataset[split].shuffle(seed=42).select(range(1000))
    for split in ("train", "test")
)

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-606a21fdee6a6e32.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-ce2c5c066df987ba.arrow


## Fine-tuning using the Trainer API

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained "bert-base-cased" checkpoint into a sequence
# classification model with two output labels (transfer learning).
# The log printed below reports that some checkpoint weights are unused and
# some model weights are not initialized from the checkpoint, so the model
# needs the fine-tuning performed in the following cells before it is useful.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# later `model.to(device)` calls do not fail on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from transformers import TrainingArguments  # holds the hyperparameters used by the Trainer

# All hyperparameters keep their defaults except:
#   output_dir          -> "test_trainer", the directory the Trainer writes to
#   evaluation_strategy -> "epoch", i.e. evaluate at the end of every epoch
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

In [None]:
from transformers import Trainer

# Build the Trainer from the BERT model (moved to `device`), the training
# arguments defined above, the training subset and the evaluation subset.
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test,
)

In [None]:
# Fine-tune the model on `small_train`. Because the training arguments use
# evaluation_strategy="epoch", the validation loss on `small_test` is reported
# after every epoch (see the table below). The call returns a TrainOutput
# summarising the run.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.49836
2,No log,0.303827
3,No log,0.487729


TrainOutput(global_step=375, training_loss=0.4004019368489583, metrics={'train_runtime': 426.6241, 'train_samples_per_second': 0.879, 'total_flos': 998201640960000.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 524288, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 134565888, 'train_mem_gpu_alloc_delta': 1361708544, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 6505104384})

## Evaluating our model

In [None]:
import numpy as np
from datasets import load_metric

# Accuracy is the metric used to score the model's predictions.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
  """Turn evaluation predictions into an accuracy score.

  `eval_pred` unpacks into (logits, labels): the per-class scores produced by
  the model and the reference labels. The predicted class is the index of the
  largest score along the last axis. No softmax is applied here; it is not
  needed, because softmax does not change which entry is the largest.

  Returns whatever `metric.compute` returns for these predictions/references.
  """
  scores, references = eval_pred
  predicted_labels = np.argmax(scores, axis=-1)
  accuracy = metric.compute(predictions=predicted_labels, references=references)
  return accuracy

In [None]:
# Build a second Trainer around the same (already fine-tuned) model. It gets the
# same arguments and data splits as before, plus `compute_metrics`, so that
# evaluation reports accuracy in addition to the loss.
trainer = Trainer(
    model=model.to(device), 
    args=training_args, 
    train_dataset=small_train, 
    eval_dataset=small_test, 
    compute_metrics=compute_metrics # called with the evaluation predictions; its result is included in the returned metrics
)

# Evaluate the model on the evaluation split (`small_test`); returns a dict of metrics
trainer.evaluate()

{'eval_accuracy': 0.876,
 'eval_loss': 0.48772919178009033,
 'eval_mem_cpu_alloc_delta': 323584,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 390205440,
 'eval_runtime': 36.0661,
 'eval_samples_per_second': 27.727,
 'init_mem_cpu_alloc_delta': 0,
 'init_mem_cpu_peaked_delta': 0,
 'init_mem_gpu_alloc_delta': 0,
 'init_mem_gpu_peaked_delta': 0}

## Predictions

In [None]:
# Two hand-written reviews used to sanity-check the fine-tuned model:
# the first is clearly positive, the second clearly negative.
pred_sentences = ['This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good',
                  'One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie']

In [None]:
# Encode both sentences as one batch of PyTorch tensors (return_tensors="pt").
# padding=True pads the shorter sentence to the length of the longer one (see
# the trailing 0s in the output below); truncation=True with max_length=128
# caps the length of any sentence at 128 tokens.
batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors="pt")

In [None]:
# Show the encoded batch: input_ids, token_type_ids and attention_mask.
batch

{'input_ids': tensor([[  101,  1188,  1108,  1126, 14918,  2523,   119,   146,  2824,  1122,
          3059,  1139,  1159,  2903,  1142,  2712,  2523,  1191,   146,  1138,
          1227,  1122,  1108,  1142,  1363,   102],
        [  101,  1448,  1104,  1103,  4997,  5558,  1104,  1155,  1159,   119,
           146,  2834,  2059,   146, 15445,  1160,  2005,  1104,  1139,  1297,
          1111,  1142,  2523,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0]])}

In [None]:
# Decode every row of token ids back into text, to check what the tokenizer
# produced (special tokens such as [CLS], [SEP] and [PAD] are shown as well).
for decoded in map(tokenizer.decode, batch["input_ids"]):
  print(decoded)

[CLS] This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good [SEP]
[CLS] One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie [SEP] [PAD] [PAD]


In [None]:
# Run the fine-tuned model on the encoded sentences.
# Every tensor of the batch is passed on (input_ids, token_type_ids and
# attention_mask), not only input_ids: without the attention mask the model
# would also attend to the [PAD] tokens appended to the shorter sentence.
# A new dict is built so that `batch` itself stays untouched.
model.eval()              # inference mode: disables dropout
with torch.no_grad():     # no gradients are needed for prediction
  val = model(**{name: tensor.to(device) for name, tensor in batch.items()})

In [None]:
# The predicted class of each sentence is the index of its largest logit.
predicted_labels = torch.argmax(val.logits, axis=-1)
print(f'Sentiment for Sentence 1: {predicted_labels[0]}')
print(f'Sentiment for Sentence 2: {predicted_labels[1]}')

Sentiment for Sentence 1: 1
Sentiment for Sentence 2: 0


## Clear memory

In [None]:
# Drop the notebook's references to the model and the trainer, then ask
# PyTorch to release the GPU memory it is holding in its cache.
del model, trainer
torch.cuda.empty_cache()