# Simple NER Training

There are 2 main ways to train a Named Entity Recognition (NER) model using transformers:
1. Using the predefined `Trainer` class and `.train()` method
2. Using the Trainer class with a `custom training loop`

In this notebook we explore the first option

## Prepare Data

As we did in [NER/Intro.ipynb](NER/Intro.ipynb):
0. Load data using, e.g. `datasets.load_dataset()`
1. Chunk text into, e.g. sentences
2. Pre-tokenize sentences to words and assign then the NER labels
3. Tokenize words and align old labels to tokens (e.g. Parlament [B-LOC] → Parl## [B-LOC] + ##ament [B-LOC])
4. Convert everything to I-CLASS tags (i.e. remove B-tags) except O’s. Parl## [I-LOC] + ##ament [I-LOC].

In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")
label_names = raw_datasets["train"].features["ner_tags"].feature.names

print(raw_datasets)
print(label_names)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [2]:
from transformers import AutoTokenizer
from NER.utils.loaders import tokenize_and_align_labels

model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenized_datasets = raw_datasets.map(lambda x: tokenize_and_align_labels(x, tokenizer),
                                      batched=True,
                                      remove_columns=raw_datasets["train"].column_names)
print(tokenized_datasets)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})


We can’t just use a DataCollatorWithPadding like in Chapter 3 because that only pads the inputs (input IDs, attention mask, and token type IDs). Here our labels should be padded the exact same way as the inputs so that they stay the same size, using -100 as a value so that the corresponding predictions are ignored in the loss computation.

In [3]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="pt")
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

# Define the metrics

In [4]:
import evaluate

# This metric compares list of string labels so decoding is necessary
metric = evaluate.load("seqeval")

# Example
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
print("Ground Truth:", labels)

predictions = labels.copy()
predictions[2] = "O"
print("Prediction:", predictions)

ex_metric = metric.compute(predictions=[predictions], references=[labels])
ex_metric

Ground Truth: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
Prediction: ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


{'MISC': {'precision': np.float64(1.0),
  'recall': np.float64(0.5),
  'f1': np.float64(0.6666666666666666),
  'number': np.int64(2)},
 'ORG': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'overall_precision': np.float64(1.0),
 'overall_recall': np.float64(0.6666666666666666),
 'overall_f1': np.float64(0.8),
 'overall_accuracy': 0.8888888888888888}

We have to do that for the whole predictions and labels.

In [5]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Define the model

In [6]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
print(id2label)
print(label2id)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}


In [7]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
)

print("Important to check that the number of labels is correct")
print(model.config.num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Important to check that the number of labels is correct
9


# Training (easy way)

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    batch_size=16,
    push_to_hub=False,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  trainer = Trainer(
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.21,0.068273,0.879557,0.921744,0.900156,0.979838
2,0.0607,0.058143,0.91488,0.940592,0.927558,0.984194
3,0.0344,0.056726,0.922054,0.945641,0.933699,0.985253


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=1758, training_loss=0.09113581948177264, metrics={'train_runtime': 130.7668, 'train_samples_per_second': 322.123, 'train_steps_per_second': 13.444, 'total_flos': 1120174042177170.0, 'train_loss': 0.09113581948177264, 'epoch': 3.0})

In [12]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "bert-finetuned-ner/checkpoint-1758"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple", device="cuda"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

[{'entity_group': 'PER',
  'score': np.float32(0.99724674),
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': np.float32(0.7249463),
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': np.float32(0.9909299),
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]