In [20]:
try:
    import evaluate
except:
    !pip -q install evaluate seqeval
    import evaluate

import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
                          AutoModelForTokenClassification, 
                          TrainingArguments,
                          DataCollatorForTokenClassification,
                          Trainer,
                          AutoTokenizer,
                          pipeline
                         )

In [2]:
# The conll2003 repository ships a loading script. Without trust_remote_code=True,
# load_dataset stops at an interactive "[y/N]" prompt, which blocks an unattended
# Restart & Run All. The flag is the opt-in the prompt itself recommends.
dataset = load_dataset("conll2003", trust_remote_code=True)

# Display the available splits and their columns.
dataset

README.md: 0.00B [00:00, ?B/s]

conll2003.py: 0.00B [00:00, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
# Tag inventory in id order: "O" is 0, and every entity type contributes a
# B- (odd id) tag immediately followed by its I- (even id) tag.
label2id = {
    label: index
    for index, label in enumerate(
        [
            "O",
            "B-PER", "I-PER",
            "B-ORG", "I-ORG",
            "B-LOC", "I-LOC",
            "B-MISC", "I-MISC",
        ]
    )
}

# Inverse lookup: integer id -> tag string.
id2label = dict(zip(label2id.values(), label2id.keys()))

In [4]:
# Tokenizer and model both start from the same cased BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Token-classification model with one output per tag; the label maps are passed
# along so they are attached to the model's configuration.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def align_labels(data):
    """Tokenize a batch of pre-split sentences and align NER tags to word pieces.

    Args:
        data: A batch with a "tokens" entry (one list of words per example)
            and an "ner_tags" entry (one list of integer tag ids per example).

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding
        one tag id per produced token:
          * positions whose word id is None get -100;
          * the first piece of a word gets that word's tag;
          * later pieces of the same word get the word's tag, with an odd id
            (a B- tag in label2id) bumped by one to the matching I- tag.
    """
    encoded = tokenizer(
        data["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    aligned = []
    for row, word_tags in enumerate(data["ner_tags"]):
        row_labels = []
        last_word = None

        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # No source word behind this position.
                tag = -100
            elif word != last_word:
                # First piece of a new word keeps the word's own tag.
                tag = word_tags[word]
            else:
                # Continuation piece: B-XXX (odd id) becomes I-XXX (next id).
                tag = word_tags[word]
                if tag % 2 == 1:
                    tag += 1

            last_word = word
            row_labels.append(tag)

        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [6]:
# Apply align_labels to every split; batched=True matches align_labels, which
# indexes its input per example within a batch.
tokenized = dataset.map(align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [7]:
# Inspect the mapped splits: the original columns are kept and the tokenizer
# outputs plus "labels" are added alongside them.
tokenized

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [8]:
# Load the seqeval metric; ComputeMetrics calls seqeval.compute on tag strings.
seqeval = evaluate.load("seqeval")

Downloading builder script: 0.00B [00:00, ?B/s]

In [9]:
def ComputeMetrics(eval_pred):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair. The predicted tag id for each
            position is the argmax of the logits over axis 2.

    Returns:
        A dict with "precision", "recall", "f1" and "accuracy", taken from the
        corresponding "overall_*" entries of the seqeval result.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        # Keep only positions with a real label; -100 marks positions to skip.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, label_row)
            if gold != -100
        ]
        true_predictions.append([id2label[predicted] for predicted, _ in kept])
        true_labels.append([id2label[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    precision = results["overall_precision"]
    recall = results["overall_recall"]
    f1 = results["overall_f1"]
    accuracy = results["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

In [10]:
# Batch collator built from the tokenizer and handed to the Trainer.
# NOTE(review): presumably pads input ids and labels to a common length per
# batch. Confirm against the DataCollatorForTokenClassification docs.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [13]:
# Training configuration: checkpoints go to "ner_model"; the model is evaluated
# and saved once per epoch; external experiment reporting is disabled.
training_arg = TrainingArguments(
    output_dir="ner_model",
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
)

# Train on the "train" split, validate on "validation", and score each
# evaluation pass with ComputeMetrics.
trainer = Trainer(
    model=model,
    args=training_arg,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    compute_metrics=ComputeMetrics,
)

In [14]:
# Fine-tune for the configured 10 epochs, evaluating on the validation split
# after each one. In the recorded run the validation loss is lowest at epoch 3
# and drifts upward afterwards, while F1 peaks at epoch 8.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.058819,0.905744,0.931505,0.918444,0.98387
2,0.048600,0.06114,0.917939,0.941266,0.929456,0.98493
3,0.035700,0.0549,0.923077,0.945136,0.933976,0.986033
4,0.025000,0.057859,0.922333,0.947324,0.934662,0.985857
5,0.017000,0.05962,0.925226,0.947492,0.936227,0.986416
6,0.013100,0.059955,0.922648,0.947492,0.934905,0.986313
7,0.010300,0.06437,0.924036,0.947829,0.935781,0.986107
8,0.007400,0.068436,0.929125,0.94867,0.938796,0.986652
9,0.007400,0.066111,0.926633,0.947997,0.937193,0.986446
10,0.006000,0.066296,0.924049,0.947997,0.93587,0.986607


TrainOutput(global_step=4390, training_loss=0.01905335728291227, metrics={'train_runtime': 867.6443, 'train_samples_per_second': 161.829, 'train_steps_per_second': 5.06, 'total_flos': 3891613834801746.0, 'train_loss': 0.01905335728291227, 'epoch': 10.0})

In [21]:
# Score the held-out test split and show the metrics dict as a one-row table
# (the recorded output shows the keys with an "eval_" prefix).
pd.DataFrame([trainer.evaluate(tokenized["test"])])

Unnamed: 0,eval_loss,eval_precision,eval_recall,eval_f1,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.207945,0.881391,0.915722,0.898229,0.971557,6.4527,535.128,16.737,10.0


In [26]:
# Persist the fine-tuned model to "ner_model".
trainer.save_model("ner_model")

# The Trainer above was built without a tokenizer, so whether save_model also
# writes tokenizer files depends on the installed transformers version. Save
# the tokenizer explicitly so the directory is self-contained for the pipeline
# that loads from "ner_model" alone. The trailing ";" suppresses the returned
# file paths in the cell output.
tokenizer.save_pretrained("ner_model");

In [28]:
# Build an inference pipeline from the saved "ner_model" directory. No
# aggregation option is passed, so results are reported per token piece
# (the sample output contains entries such as '##if').
token_classifier = pipeline(task="token-classification", model="ner_model")

Device set to use cuda:0


In [36]:
# Smoke test on a hand-written sentence; each result carries the predicted tag,
# its score, and the character span of the matched piece.
token_classifier("My name is Edifon Emmanuel Jimmy, I work at Amazon.")

[{'entity': 'B-PER',
  'score': 0.99901354,
  'index': 4,
  'word': 'Ed',
  'start': 11,
  'end': 13},
 {'entity': 'I-PER',
  'score': 0.99924374,
  'index': 5,
  'word': '##if',
  'start': 13,
  'end': 15},
 {'entity': 'I-PER',
  'score': 0.9990858,
  'index': 6,
  'word': '##on',
  'start': 15,
  'end': 17},
 {'entity': 'I-PER',
  'score': 0.99887115,
  'index': 7,
  'word': 'Emmanuel',
  'start': 18,
  'end': 26},
 {'entity': 'I-PER',
  'score': 0.99887997,
  'index': 8,
  'word': 'Jimmy',
  'start': 27,
  'end': 32},
 {'entity': 'B-ORG',
  'score': 0.99864286,
  'index': 13,
  'word': 'Amazon',
  'start': 44,
  'end': 50}]