In [None]:
from urllib.request import urlopen
from PIL import Image
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
from transformers import DataCollatorWithPadding
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer


In [4]:
# !pip install huggingface_hub==0.23.5

In [3]:
from setfit import sample_dataset
from setfit import SetFitModel
from setfit import TrainingArguments as SetFitTrainingArguments
from setfit import Trainer as SetFitTrainer

In this chapter, we will go through several methods and applications for fine-tuning BERT models

So far we have used frozen models to do classification:
- Using task-specific models trained on classification tasks
- Using an embedding model to train a LR on top to classify


<img src="imgs/pretrainedmodels.png" alt="Hugging Face" height=400 width=600>

**In this section, we will take a similar approach but allow both the model and the classification head to be updated during training.**

## Training a BERT and a Classification Head

As illustrated below, we will use a pretrained BERT model and add a neural network as a classification head, both of which will be fine-tuned for classification.

<img src="imgs/tunebert.png" alt="Hugging Face" height=400 width=600>

### Fine-Tuning a Pretrained BERT Model

We will be using the same dataset we used in Chapter 4 to fine-tune our model, namely the Rotten Tomatoes dataset, which contains 5,331 positive and 5,331 negative movie reviews from Rotten Tomatoes:

In [5]:
# Prepare data and splits
tomatoes = load_dataset("rotten_tomatoes")
train_data, test_data = tomatoes["train"], tomatoes["test"]

The first step in our classification task is to select the underlying model we want to use. 

We use "bert-base-cased", which was pretrained on the English Wikipedia as well as a large dataset consisting of unpublished books

We define the number of labels that we want to predict beforehand. This is necessary to create the feedforward neural network that is applied on top of our pretrained model:

In [6]:
# Load Model and Tokenizer
model_id = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Pad to the longest sequence in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
def preprocess_function(examples):
   """Tokenize input data"""
   return tokenizer(examples["text"], truncation=True)


In [9]:
# Tokenize train/test data
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [10]:
train_data[0]

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1}

In [11]:
tokenized_train[0]["input_ids"][:4]

[101, 1103, 2067, 1110]

In [12]:
def compute_metrics(eval_pred):
    """Calculate F1 score"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    load_f1 = evaluate.load("f1")
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"f1": f1}

With compute_metrics we can define any number of metrics that we are interested in and that can be printed out or logged during training.

In [13]:
# Training arguments for parameter tuning
training_args = TrainingArguments(
   "model",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=1,
   weight_decay=0.01,
   save_strategy="epoch",
   report_to="none"
)

In [14]:
# Trainer which executes the training process
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [15]:
import torch

# Ensure the model is on the same device as the data
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Create a dummy batch with tokenized inputs
dummy_input = torch.randint(0, tokenizer.vocab_size, (4, 10)).to(device)  # Batch of 4 sequences, each of length 10
dummy_labels = torch.tensor([0, 1, 0, 1]).to(device)  # Binary classification labels

# Forward pass
output = model(input_ids=dummy_input, labels=dummy_labels)

# Inspect the output
print("Loss:", output.loss)
print("Logits:", output.logits)

Loss: tensor(0.7173, device='cuda:0', grad_fn=<NllLossBackward0>)
Logits: tensor([[0.5500, 0.2029],
        [0.7816, 0.3994],
        [0.8093, 0.2740],
        [0.7848, 0.2894]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [16]:
output.logits.shape

torch.Size([4, 2])

In [33]:
trainer.train()

  0%|          | 0/534 [00:00<?, ?it/s]

{'loss': 0.419, 'grad_norm': 7.954373359680176, 'learning_rate': 1.2734082397003748e-06, 'epoch': 0.94}
{'train_runtime': 19.1303, 'train_samples_per_second': 445.889, 'train_steps_per_second': 27.914, 'train_loss': 0.413470564710067, 'epoch': 1.0}


TrainOutput(global_step=534, training_loss=0.413470564710067, metrics={'train_runtime': 19.1303, 'train_samples_per_second': 445.889, 'train_steps_per_second': 27.914, 'total_flos': 227605451772240.0, 'train_loss': 0.413470564710067, 'epoch': 1.0})

In [34]:
trainer.evaluate()

  0%|          | 0/67 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

{'eval_loss': 0.37388619780540466,
 'eval_f1': 0.8382213812677389,
 'eval_runtime': 1.6045,
 'eval_samples_per_second': 664.364,
 'eval_steps_per_second': 41.756,
 'epoch': 1.0}

### Freezing Layers

We will freeze the main BERT model and allow only updates to pass through the classification head. 


In [17]:
# Load Model and Tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Print layer names
for name, param in model.named_parameters():
    print(name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

There are 12 (0–11) encoder blocks consisting of attention heads, dense networks, and layer normalization. 

<img src="imgs/bertarch.png" alt="Hugging Face" height=400 width=600>

Generally, we want frozen layers to be followed by trainable layers.

In [37]:
for name, param in model.named_parameters():

     # Trainable classification head
     if name.startswith("classifier"):
        param.requires_grad = True

      # Freeze everything else
     else:
        param.requires_grad = False

In [38]:
# Check
# We can check whether the model was correctly updated
for name, param in model.named_parameters():
     print(f"Parameter: {name} ----- {param.requires_grad}")

Parameter: bert.embeddings.word_embeddings.weight ----- False
Parameter: bert.embeddings.position_embeddings.weight ----- False
Parameter: bert.embeddings.token_type_embeddings.weight ----- False
Parameter: bert.embeddings.LayerNorm.weight ----- False
Parameter: bert.embeddings.LayerNorm.bias ----- False
Parameter: bert.encoder.layer.0.attention.self.query.weight ----- False
Parameter: bert.encoder.layer.0.attention.self.query.bias ----- False
Parameter: bert.encoder.layer.0.attention.self.key.weight ----- False
Parameter: bert.encoder.layer.0.attention.self.key.bias ----- False
Parameter: bert.encoder.layer.0.attention.self.value.weight ----- False
Parameter: bert.encoder.layer.0.attention.self.value.bias ----- False
Parameter: bert.encoder.layer.0.attention.output.dense.weight ----- False
Parameter: bert.encoder.layer.0.attention.output.dense.bias ----- False
Parameter: bert.encoder.layer.0.attention.output.LayerNorm.weight ----- False
Parameter: bert.encoder.layer.0.attention.output

In [39]:
# Trainer which executes the training process
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)
trainer.train()

  0%|          | 0/534 [00:00<?, ?it/s]

{'loss': 0.7015, 'grad_norm': 3.9625017642974854, 'learning_rate': 1.2734082397003748e-06, 'epoch': 0.94}
{'train_runtime': 5.21, 'train_samples_per_second': 1637.248, 'train_steps_per_second': 102.496, 'train_loss': 0.7005022348982565, 'epoch': 1.0}


TrainOutput(global_step=534, training_loss=0.7005022348982565, metrics={'train_runtime': 5.21, 'train_samples_per_second': 1637.248, 'train_steps_per_second': 102.496, 'total_flos': 227605451772240.0, 'train_loss': 0.7005022348982565, 'epoch': 1.0})

In [40]:
trainer.evaluate()

  0%|          | 0/67 [00:00<?, ?it/s]

{'eval_loss': 0.6876858472824097,
 'eval_f1': 0.6636971046770601,
 'eval_runtime': 1.2351,
 'eval_samples_per_second': 863.059,
 'eval_steps_per_second': 54.245,
 'epoch': 1.0}

### Freezing up until a given block of Encoders (1-5 frozen)

In [41]:
# Load model
model_id = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Encoder block 10 starts at index 165 and
# we freeze everything before that block
for index, (name, param) in enumerate(model.named_parameters()):
    if index < 165:
        param.requires_grad = False

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Trainer which executes the training process
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()

  0%|          | 0/534 [00:00<?, ?it/s]

{'loss': 0.4739, 'grad_norm': 2.742690086364746, 'learning_rate': 1.2734082397003748e-06, 'epoch': 0.94}
{'train_runtime': 6.7426, 'train_samples_per_second': 1265.095, 'train_steps_per_second': 79.198, 'train_loss': 0.4699943199586333, 'epoch': 1.0}


  0%|          | 0/67 [00:00<?, ?it/s]

{'eval_loss': 0.41191136837005615,
 'eval_f1': 0.8180076628352491,
 'eval_runtime': 1.1766,
 'eval_samples_per_second': 905.988,
 'eval_steps_per_second': 56.943,
 'epoch': 1.0}

 It demonstrates that although we generally want to train as many layers as possible, you can get away with training less if you do not have the necessary computing power.

 If we iteratively do this and plot the F1 evolution:

 <img src="imgs/frozen.png" alt="Hugging Face" height=400 width=600>

# Few-Shot Classification

Few-shot classification is a technique within supervised classification where you have a classifier learn target labels based on only a few labeled examples. 

## SetFit: Efficient Fine-Tuning with Few Training Examples

It generate high-quality textual representations that are updated during training. 

Only a few labeled examples are needed for this framework to be competitive with fine-tuning a BERT-like model on a large, labeled dataset as we explored in the previous example.

1. Sampling training data
    - Based on in-class and out-class selection of labeled data it generates positive (similar) and negative (dissimilar) pairs of sentences

2. Fine-tuning embeddings
    - Fine-tuning a pretrained embedding model based on the previously generated training data

3. Training a classifier
    - Create a classification head on top of the embedding model and train it using the previously generated training data


### Generate Training Data

Before fine-tuning an embedding model, we need to generate training data. The model assumes the training data to be samples of positive (similar) and negative (dissimilar) pairs of sentences. 

#### STEP 1

Say, for example, we have the training dataset in Figure 11-9 that classifies text into two categories: text about programming languages, and text about pets.

In step 1, SetFit handles this problem by generating the necessary data based on in-class and out-class selection

 when we have 16 sentences about sports, we can create 16 * (16 – 1) / 2 = 120 pairs that we label as positive pairs. We can use this process to generate negative pairs by collecting pairs from different classes.

 <img src="imgs/dataset.png" alt="Hugging Face" height=200 width=500>

#### STEP 2

In step 2, we can use the generated sentence pairs to fine-tune the embedding model. This leverages a method called contrastive learning to fine-tune a pretrained BERT model.

Since we generated these pairs in the previous step, we can use them to fine-tune a SentenceTransformers model. 

 <img src="imgs/ftsentenceemb.png" alt="Hugging Face" height=400 width=500>

The goal of fine-tuning this embedding model is that it can create embeddings that are tuned to the classification task.

#### STEP 3

In step 3, we generate embeddings for all sentences and use those as the input of a classifier.

 <img src="imgs/step3.png" alt="Hugging Face" height=400 width=500>

#### All STEP


 <img src="imgs/steps.png" alt="Hugging Face" height=400 width=600>

In [21]:
from collections import Counter

In [18]:
# We simulate a few-shot setting by sampling 16 examples per class
sampled_train_data = sample_dataset(tomatoes["train"], num_samples=16)

  df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed))


In [19]:
sampled_train_data

Dataset({
    features: ['text', 'label'],
    num_rows: 32
})

In [22]:
Counter(sampled_train_data["label"])

Counter({1: 16, 0: 16})

However, since this is a few-shot setting, we will only sample 16 examples per class. With two classes, we will only have 32 documents to train on compared to the 8,500 movie reviews we used before!

After sampling the data, we choose a pretrained SentenceTransformer model to fine-tune.

In [23]:
# Load a pre-trained SentenceTransformer model
model = SetFitModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


By default, a logistic regression model is chosen as the classifier to train.

Similar to what we did with Hugging Face Transformers, we can use the trainer to define and play around with relevant parameters. For example, we set the num_epochs to 3 so that contrastive learning will be performed for three epochs:

In [24]:
args = SetFitTrainingArguments(
    num_epochs=3, # The number of epochs to use for contrastive learning
    num_iterations=20  # The number of text pairs to generate
)
args.eval_strategy = args.evaluation_strategy

# Create trainer
trainer = SetFitTrainer(
    model=model,
    args=args,
    train_dataset=sampled_train_data,
    eval_dataset=test_data,
    metric="f1"
)

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [25]:
# Training loop
trainer.train()

***** Running training *****
  Num unique pairs = 1280
  Batch size = 16
  Num epochs = 3
  Total optimization steps = 240


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

{'embedding_loss': 0.2271, 'learning_rate': 8.333333333333333e-07, 'epoch': 0.01}
{'embedding_loss': 0.0016, 'learning_rate': 1.7592592592592595e-05, 'epoch': 0.62}
{'embedding_loss': 0.0003, 'learning_rate': 1.2962962962962964e-05, 'epoch': 1.25}
{'embedding_loss': 0.0002, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.88}
{'embedding_loss': 0.0002, 'learning_rate': 3.7037037037037037e-06, 'epoch': 2.5}
{'train_runtime': 12.385, 'train_samples_per_second': 310.052, 'train_steps_per_second': 19.378, 'epoch': 3.0}


In [26]:
# Evaluate the model on our test data
trainer.evaluate()

***** Running evaluation *****


{'f1': 0.8373764600179695}

In [27]:
model.model_head

# Continued Pretraining with Masked Language Modeling

The pretrained model is often trained on very general data, like Wikipedia pages, and might not be tuned to your domain-specific words.

<img src="imgs/ft.png" alt="Hugging Face" height=400 width=600>

Instead of adopting this two-step approach, we can squeeze another step between them, namely continue pretraining an already pretrained BERT model. 

In other words, we can simply continue training the BERT model using masked language modeling (MLM) but instead use data from our domain. 

It is like going from a general BERT model to a BioBERT model specialized for the medical domain, to a fine-tuned BioBERT model to classify medication.

This will update the subword representations to be more tuned toward words it would not have seen before.

<img src="imgs/continuedpretrain.png" alt="Hugging Face" height=400 width=600>

Instead of having to pretrain an entire model from scratch, we can simply continue pretraining before fine-tuning it for classification

**In this example, we will demonstrate how to apply step 2 and continue pretraining an already pretrained BERT model**

In [38]:
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline

In [31]:
# Load model for Masked Language Modeling (MLM)
model = AutoModelForMaskedLM.from_pretrained("bert-base-cased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architect

In [32]:
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True)

# Tokenize data
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_train = tokenized_train.remove_columns("label")
tokenized_test = test_data.map(preprocess_function, batched=True)
tokenized_test = tokenized_test.remove_columns("label")

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

**Instead, we will have a DataCollator that will perform the masking of tokens for us.**

- token masking (15% tokens randomly remove) (CONVERGES FASTER, BUT THE MODEL LEARNS LESS)
- whole-word masking

In [None]:

# Masking Tokens
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)
# Training arguments for parameter tuning
training_args = TrainingArguments(
   "model",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=10,
   weight_decay=0.01,
   save_strategy="epoch",
   report_to="none"
)


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [36]:
# Save pre-trained tokenizer
tokenizer.save_pretrained("mlm")

# Train model
trainer.train()

# Save updated model
model.save_pretrained("mlm")

  0%|          | 0/5340 [00:00<?, ?it/s]

{'loss': 2.5959, 'grad_norm': 13.989809036254883, 'learning_rate': 1.812734082397004e-05, 'epoch': 0.94}
{'loss': 2.3715, 'grad_norm': 17.446483612060547, 'learning_rate': 1.6254681647940076e-05, 'epoch': 1.87}
{'loss': 2.3091, 'grad_norm': 23.203062057495117, 'learning_rate': 1.4382022471910113e-05, 'epoch': 2.81}
{'loss': 2.1879, 'grad_norm': 13.093008041381836, 'learning_rate': 1.250936329588015e-05, 'epoch': 3.75}
{'loss': 2.151, 'grad_norm': 14.838418006896973, 'learning_rate': 1.0636704119850187e-05, 'epoch': 4.68}
{'loss': 2.088, 'grad_norm': 22.59937858581543, 'learning_rate': 8.764044943820226e-06, 'epoch': 5.62}
{'loss': 2.0599, 'grad_norm': 18.066375732421875, 'learning_rate': 6.891385767790263e-06, 'epoch': 6.55}
{'loss': 1.9871, 'grad_norm': 13.970683097839355, 'learning_rate': 5.0187265917603005e-06, 'epoch': 7.49}
{'loss': 1.9874, 'grad_norm': 18.521894454956055, 'learning_rate': 3.146067415730337e-06, 'epoch': 8.43}
{'loss': 1.9633, 'grad_norm': 14.674639701843262, 'lea

The tokenizer is not updated during training so there is no need to save it after training.

To evaluate its performance we would normally fine-tune the model on a variety of tasks. For our purposes, however, we can run some masking tasks to see if it has learned from its continued training.

In [39]:
# Load and create predictions
mask_filler = pipeline("fill-mask", model="bert-base-cased")
preds = mask_filler("What a horrible [MASK]!")

# Print results
for pred in preds:
    print(f">>> {pred['sequence']}")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


>>> What a horrible idea!
>>> What a horrible dream!
>>> What a horrible thing!
>>> What a horrible day!
>>> What a horrible thought!


The output demonstrates concepts like “idea,” “dream,” and “day,” which definitely make sense. Next, let’s see what our updated model predicts

In [41]:
# Load and create predictions
mask_filler = pipeline("fill-mask", model="mlm")
preds = mask_filler("What a horrible [MASK]!")

# Print results
for pred in preds:
    print(f">>> {pred['sequence']}")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


>>> What a horrible movie!
>>> What a horrible film!
>>> What a horrible mess!
>>> What a horrible story!
>>> What a horrible comedy!


**A horrible movie, film, mess, etc. clearly shows us that the model is more biased toward the data that we fed it compared to the pretrained model.**

The next step would be to fine-tune this model on the classification task that we did at the beginning of this chapter. Simply load the model as follows and you are good to go:

In [None]:
# Fine-tune for classification
model = AutoModelForSequenceClassification.from_pretrained("mlm", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("mlm")

# Repeat the same process as before

# Named Entity Recognition

Fine-tuning the pretrained BERT model follows a similar architecture akin to what we observed with document classification. However, there is a fundamental shift in the classification approach. 

 Rather than relying on the aggregation or pooling of token embeddings, the model now makes predictions for individual tokens in a sequence. 

 # <img src="imgs/ner.png" alt="Patching" width="600" height="300">

In [55]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
import evaluate

In [44]:
# The CoNLL-2003 dataset for NER
dataset = load_dataset("conll2003", trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [45]:
example = dataset["train"][848]
example

{'id': '848',
 'tokens': ['Dean',
  'Palmer',
  'hit',
  'his',
  '30th',
  'homer',
  'for',
  'the',
  'Rangers',
  '.'],
 'pos_tags': [22, 22, 38, 29, 16, 21, 15, 12, 23, 7],
 'chunk_tags': [11, 12, 21, 11, 12, 12, 13, 11, 12, 0],
 'ner_tags': [1, 2, 0, 0, 0, 0, 0, 0, 3, 0]}

In [46]:
label2id = {
    'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4,
    'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8
}
id2label = {index: label for label, index in label2id.items()}
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

Take a look at Datasets with NER for Restaurants and Dish: https://huggingface.co/datasets/tner/mit_restaurant

These entities correspond to specific categories: a person (PER), organization (ORG), location (LOC), miscellaneous entities (MISC), and no entity (O). Note that these entities are prefixed with either a B (beginning) or an I (inside). 

 If two tokens that follow each other are part of the same phrase, then the start of that phrase is indicated with B, which is followed by an I to show that they belong to each other and are not independent entities.

<img src="imgs/nertags.png" alt="Patching" width="500" height="200">

Our data is preprocessed and split up into words but not yet tokens. To do so, we will tokenize it further with the tokenizer of the pretrained model 

In [47]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Load model
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# Split individual tokens into sub-tokens
token_ids = tokenizer(example["tokens"], is_split_into_words=True)["input_ids"]
sub_tokens = tokenizer.convert_ids_to_tokens(token_ids)
sub_tokens

['[CLS]',
 'Dean',
 'Palmer',
 'hit',
 'his',
 '30th',
 'home',
 '##r',
 'for',
 'the',
 'Rangers',
 '.',
 '[SEP]']

The tokenizer added the [CLS] and [SEP]

This creates a bit of a problem for us since we have labeled data at the word level but not at the token level. 

This can be resolved by aligning the labels with their subtoken counterparts during tokenization.

Let’s consider the word 'Maarten', which has the label B-PER to signal that this is a person. If we pass that word through the tokenizer, it splits the word up into the tokens 'Ma', '##arte', and '##n'. We cannot use the B-PER entity for all tokens as that would signal that the three tokens are all independent people. Whenever an entity is split into tokens, the first token should have B (for beginning) and the following should be I (for inner).

In [49]:
def align_labels(examples):
    token_ids = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = examples["ner_tags"]

    updated_labels = []
    for index, label in enumerate(labels):

        # Map tokens to their respective word
        word_ids = token_ids.word_ids(batch_index=index)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:

            # The start of a new word
            if word_idx != previous_word_idx:

                previous_word_idx = word_idx
                updated_label = -100 if word_idx is None else label[word_idx]
                label_ids.append(updated_label)

            # Special token is -100
            elif word_idx is None:
                label_ids.append(-100)

            # If the label is B-XXX we change it to I-XXX
            else:
                updated_label = label[word_idx]
                if updated_label % 2 == 1:
                    updated_label += 1
                label_ids.append(updated_label)

        updated_labels.append(label_ids)

    token_ids["labels"] = updated_labels
    return token_ids

tokenized = dataset.map(align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [51]:
# Difference between original and updated labels
print(f"Original: {example['ner_tags']}")
print(f"Updated: {tokenized['train'][848]['labels']}")


Original: [1, 2, 0, 0, 0, 0, 0, 0, 3, 0]
Updated: [-100, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, -100]


Now that we have tokenized and aligned the labels, we can start thinking about defining our **evaluation metrics.** 

In [53]:
# Load sequential evaluation
seqeval = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    # Create predictions
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []

    # Document-level iteration
    for prediction, label in zip(predictions, labels):

      # token-level iteration
      for token_prediction, token_label in zip(prediction, label):

        # We ignore special tokens
        if token_label != -100:
          true_predictions.append([id2label[token_prediction]])
          true_labels.append([id2label[token_label]])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {"f1": results["overall_f1"]}

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

### Fine-Tuning for Named-Entity Recognition

We are nearly there. Instead of DataCollatorWithPadding, we need a collator that works with classification on a token level, namely DataCollatorForTokenClassification:

In [56]:
# Token-classification Data Collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [57]:
# Training arguments for parameter tuning
training_args = TrainingArguments(
   "model",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=1,
   weight_decay=0.01,
   save_strategy="epoch",
   report_to="none"
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

  0%|          | 0/878 [00:00<?, ?it/s]

{'loss': 0.2311, 'grad_norm': 1.5110145807266235, 'learning_rate': 8.610478359908885e-06, 'epoch': 0.57}
{'train_runtime': 29.6735, 'train_samples_per_second': 473.183, 'train_steps_per_second': 29.589, 'train_loss': 0.16675895438922023, 'epoch': 1.0}


TrainOutput(global_step=878, training_loss=0.16675895438922023, metrics={'train_runtime': 29.6735, 'train_samples_per_second': 473.183, 'train_steps_per_second': 29.589, 'total_flos': 351240792638148.0, 'train_loss': 0.16675895438922023, 'epoch': 1.0})

In [58]:
# Evaluate the model on our test data
trainer.evaluate()

  0%|          | 0/216 [00:00<?, ?it/s]

{'eval_loss': 0.14443840086460114,
 'eval_f1': 0.9053670966714444,
 'eval_runtime': 2.3446,
 'eval_samples_per_second': 1472.721,
 'eval_steps_per_second': 92.125,
 'epoch': 1.0}

In [59]:
from transformers import pipeline

# Save our fine-tuned model
trainer.save_model("training_ner_model")

# Run inference on the fine-tuned model
token_classifier = pipeline(
    "token-classification",
    model="training_ner_model",
)
token_classifier("My name is Maarten.")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'B-PER',
  'score': 0.98807156,
  'index': 4,
  'word': 'Ma',
  'start': 11,
  'end': 13},
 {'entity': 'I-PER',
  'score': 0.9644239,
  'index': 5,
  'word': '##arte',
  'start': 13,
  'end': 17},
 {'entity': 'I-PER',
  'score': 0.9787199,
  'index': 6,
  'word': '##n',
  'start': 17,
  'end': 18}]