In [1]:
# Reference: https://github.com/entbappy/NLP-Projects-Notebooks/blob/master/Fine-Tuning-BERT-for-NER.ipynb

In [2]:
!pip install datasets



In [3]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification # Can use BertForTokenClassification ; Auto.. infers the model
import torch



In [4]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:

# Load the "conll2003" dataset through Hugging Face `datasets`; the "train" and
# "validation" splits of the result are used further down.
conll2003 = datasets.load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Inspect the NER tag feature: a sequence of ClassLabel ids whose `names` list
# is the tag vocabulary (O, B-PER, I-PER, ...).
conll2003["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [7]:
# Tokenizer matching the "bert-base-uncased" checkpoint. The label-alignment
# function below calls `word_ids()` on this tokenizer's output.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [8]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and expand word-level tags to token level.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and an "ner_tags" entry (one list of tag ids per
            sentence, parallel to "tokens").
        label_all_tokens: When truthy, every sub-token of a word receives that
            word's tag. Otherwise only the first sub-token is tagged and the
            remaining ones get -100.

    Returns:
        The tokenizer output for the batch with an added "labels" entry
        holding one list of tag ids per sentence, aligned with the tokens.
    """
    encodings = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        token_tags = []
        prev_word = None
        for word in encodings.word_ids(batch_index=batch_idx):
            if word is None:
                # Token with no source word: mark it with -100, the value that
                # compute_metrics filters out (also the default ignore_index of
                # PyTorch's cross-entropy loss).
                tag = -100
            elif word != prev_word or label_all_tokens:
                # First sub-token of a word, or a continuation sub-token while
                # labelling every sub-token.
                tag = word_tags[word]
            else:
                # Continuation sub-token with label_all_tokens disabled.
                tag = -100
            token_tags.append(tag)
            prev_word = word
        aligned_labels.append(token_tags)

    encodings["labels"] = aligned_labels
    return encodings

In [9]:
# Tokenize the dataset and attach token-aligned labels; `batched=True` hands
# the function a batch of examples per call.
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

In [10]:
# Size the classification head from the dataset's tag vocabulary instead of a
# hard-coded 9, so the head always matches the labels being trained on.
num_labels = len(conll2003["train"].features["ner_tags"].feature.names)

# Pretrained BERT encoder with a freshly initialised token-classification head,
# moved to the selected device.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased", num_labels=num_labels
).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
! pip install accelerate -U



In [12]:
from transformers import TrainingArguments, Trainer


# Training configuration: outputs go under "test-ner" and evaluation runs once
# per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)

In [16]:
# Tag names indexed by class id (e.g. 0 -> 'O', 1 -> 'B-PER'); used to turn
# predicted and reference ids back into tag strings.
label_list = conll2003["train"].features["ner_tags"].feature.names

In [17]:
def compute_metrics(eval_preds):
    """Score token-level predictions with the module-level `metric`.

    Args:
        eval_preds: Pair ``(logits, labels)``. The logits are arg-maxed over
            axis 2 (assumed to be the class axis — TODO confirm), and the
            labels use -100 for positions that must not be scored.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's "overall_*" entries.
    """
    logits, labels = eval_preds
    pred_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for pred_row, label_row in zip(pred_ids, labels):
        # Keep only positions with a real label, then map ids to tag names.
        scored = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        predictions.append([label_list[p] for p, _ in scored])
        true_labels.append([label_list[l] for _, l in scored])

    results = metric.compute(predictions=predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [21]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=a1ed1349bffb4047ce81e5bdc1eb43d4b3b404ea1481b9bbbd08647803d952c7
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [22]:
# Batch collator for token classification, built from the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# seqeval metric consumed by compute_metrics.
# NOTE(review): this cell's output warns that loading this metric will change
# in the next major `datasets` release — confirm `load_metric` still exists in
# the installed version.
metric = datasets.load_metric("seqeval")

# Wire the model, training configuration, data splits and metric together.
trainer = Trainer(
    model=model.to(device),
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [23]:
# Fine-tune the model; with the arguments above this runs 2 epochs and reports
# validation metrics after each one.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0173,0.073021,0.930556,0.936906,0.93372,0.984336
2,0.0193,0.065907,0.935298,0.944401,0.939827,0.985305


Checkpoint destination directory test-ner/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1756, training_loss=0.0173758958086609, metrics={'train_runtime': 344.035, 'train_samples_per_second': 81.625, 'train_steps_per_second': 5.104, 'total_flos': 680885981018658.0, 'train_loss': 0.0173758958086609, 'epoch': 2.0})

In [24]:
from transformers import pipeline

In [25]:
# Wrap the fine-tuned model and its tokenizer in an "ner" pipeline for inference.
nlp = pipeline("ner", model=model.to(device), tokenizer=tokenizer)


In [35]:
# Sample sentence mixing organisation and location mentions.
example = "Sydney University and University of New South Wales are in the city of Sydney"

# One dict per token: entity label, score, token index, word and character span.
ner_results = nlp(example)

# Raw pipeline output; entities appear as generic "LABEL_<id>" strings here.
print(ner_results)

[{'entity': 'LABEL_3', 'score': 0.99746144, 'index': 1, 'word': 'sydney', 'start': 0, 'end': 6}, {'entity': 'LABEL_4', 'score': 0.9837022, 'index': 2, 'word': 'university', 'start': 7, 'end': 17}, {'entity': 'LABEL_0', 'score': 0.9994636, 'index': 3, 'word': 'and', 'start': 18, 'end': 21}, {'entity': 'LABEL_3', 'score': 0.9937377, 'index': 4, 'word': 'university', 'start': 22, 'end': 32}, {'entity': 'LABEL_4', 'score': 0.996897, 'index': 5, 'word': 'of', 'start': 33, 'end': 35}, {'entity': 'LABEL_4', 'score': 0.9972976, 'index': 6, 'word': 'new', 'start': 36, 'end': 39}, {'entity': 'LABEL_4', 'score': 0.9952148, 'index': 7, 'word': 'south', 'start': 40, 'end': 45}, {'entity': 'LABEL_4', 'score': 0.994017, 'index': 8, 'word': 'wales', 'start': 46, 'end': 51}, {'entity': 'LABEL_0', 'score': 0.9998388, 'index': 9, 'word': 'are', 'start': 52, 'end': 55}, {'entity': 'LABEL_0', 'score': 0.9998481, 'index': 10, 'word': 'in', 'start': 56, 'end': 58}, {'entity': 'LABEL_0', 'score': 0.9998041, '

In [36]:
# The model was created with only `num_labels`, so the pipeline reports generic
# "LABEL_<id>" entities; map them back to the dataset's tag names.
id2label = {f"LABEL_{idx}": label for idx, label in enumerate(label_list)}

for token_result in ner_results:
    # `.get` with the current value as fallback keeps this cell re-runnable:
    # entities already translated by a previous run are left unchanged instead
    # of raising KeyError.
    token_result['entity'] = id2label.get(token_result['entity'], token_result['entity'])
    print(token_result['word'], token_result['entity'])

sydney B-ORG
university I-ORG
and O
university B-ORG
of I-ORG
new I-ORG
south I-ORG
wales I-ORG
are O
in O
the O
city O
of O
sydney B-LOC
