<a href="https://colab.research.google.com/github/Eddy-Emmanuel/NER-Transformer/blob/main/NER_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install datasets

In [2]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

In [3]:
conllpp = load_dataset("ZihanWangKi/conllpp")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
conllpp

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [5]:
conllpp_cpy = conllpp.map(lambda batch: {"ner_tags_str": [conllpp["train"].features["ner_tags"].feature.int2str(i) for i in batch["ner_tags"]]}, batched=True)

In [6]:
model_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
conllpp_cpy["train"].to_pandas().head()

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags,ner_tags_str
0,0,"[EU, rejects, German, call, to, boycott, Briti...","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[11, 21, 11, 12, 21, 22, 11, 12, 0]","[3, 0, 7, 0, 0, 0, 7, 0, 0]","[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]"
1,1,"[Peter, Blackburn]","[22, 22]","[11, 12]","[1, 2]","[B-PER, I-PER]"
2,2,"[BRUSSELS, 1996-08-22]","[22, 11]","[11, 12]","[5, 0]","[B-LOC, O]"
3,3,"[The, European, Commission, said, on, Thursday...","[12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 3...","[11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 1...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ...","[O, B-ORG, I-ORG, O, O, O, O, O, O, B-MISC, O,..."
4,4,"[Germany, 's, representative, to, the, Europea...","[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 2...","[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 1...","[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ...","[B-LOC, O, O, O, O, B-ORG, I-ORG, O, O, O, B-P..."


In [8]:
model_tokenizer(conllpp_cpy["train"]["tokens"], is_split_into_words=True).word_ids(1)

[None, 0, 1, None]

In [29]:
def align_labels_with_tokens(batch):
    tokenized_inputs = model_tokenizer(batch["tokens"], is_split_into_words=True, truncation=True)

    aligned_labels = []

    for i, word_ids in enumerate(tokenized_inputs.word_ids(batch_index=i) for i in range(len(batch["tokens"]))):
        previous_word = None
        labels = []

        for word_id in word_ids:
            if word_id is None:
                labels.append(-100)
            elif word_id != previous_word:
                labels.append(batch["ner_tags"][i][word_id])
            else:
                labels.append(batch["ner_tags"][i][word_id])

            previous_word = word_id

        aligned_labels.append(labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [30]:
prep_conllpp_cpy = conllpp_cpy.map(align_labels_with_tokens, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [31]:
filt_prep_conllpp_cpy = prep_conllpp_cpy.remove_columns(['id', 'tokens', 'pos_tags',
                                                         'chunk_tags', 'ner_tags', 'ner_tags_str'])

filt_prep_conllpp_cpy

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [78]:
filt_prep_conllpp_cpy["train"].to_pandas().head()

Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 7327, 19164, 2446, 2655, 2000, 17757, 23...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]"
1,"[101, 2848, 13934, 102]","[1, 1, 1, 1]","[-100, 1, 2, -100]"
2,"[101, 9371, 2727, 1011, 5511, 1011, 2570, 102]","[1, 1, 1, 1, 1, 1, 1, 1]","[-100, 5, 0, 0, 0, 0, 0, -100]"
3,"[101, 1996, 2647, 3222, 2056, 2006, 9432, 2009...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, ..."
4,"[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, ..."


In [32]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=model_tokenizer)

In [13]:
!pip -q install seqeval
!pip -q install evaluate

In [33]:
import evaluate, numpy as np

In [34]:
metrics = evaluate.load("seqeval")

In [35]:
conllpp["train"].features["ner_tags"].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [99]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [conllpp["train"].features["ner_tags"].feature.names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [conllpp["train"].features["ner_tags"].feature.names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metrics.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
}

In [100]:
from transformers import AutoModelForTokenClassification

id2label = {idx:lbl for idx,lbl in enumerate(conllpp["train"].features["ner_tags"].feature.names)}
label2id = {lbl:idx for idx,lbl in enumerate(conllpp["train"].features["ner_tags"].feature.names)}

In [101]:
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased",
                                                        num_labels=len(id2label),
                                                        id2label=id2label,
                                                        label2id=label2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [102]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments("ner_bert",
                                  eval_strategy="epoch",
                                  logging_strategy="epoch",
                                  learning_rate=2e-5,
                                  num_train_epochs=20,
                                  weight_decay=0.01,
                                  per_device_train_batch_size=64,
                                  per_device_eval_batch_size=64,)

trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=filt_prep_conllpp_cpy["train"],
                  eval_dataset=filt_prep_conllpp_cpy["validation"],
                  data_collator=data_collator,
                  processing_class=model_tokenizer,
                  compute_metrics=compute_metrics)

In [103]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.335,0.086105,0.877853,0.894843,0.886267,0.975487
2,0.071,0.065062,0.911925,0.919678,0.915785,0.980571
3,0.0436,0.059995,0.923486,0.931648,0.927549,0.982827
4,0.0291,0.061629,0.920348,0.934556,0.927398,0.982795
5,0.0218,0.06277,0.922358,0.938248,0.930235,0.983558
6,0.0164,0.067694,0.932835,0.93534,0.934086,0.983875
7,0.0123,0.070641,0.923761,0.940709,0.932158,0.983875
8,0.01,0.069572,0.929513,0.938248,0.93386,0.984066
9,0.008,0.073,0.92649,0.939031,0.932718,0.984018
10,0.0073,0.072471,0.932247,0.940486,0.936348,0.984606


TrainOutput(global_step=4400, training_loss=0.029541979445652528, metrics={'train_runtime': 1929.6117, 'train_samples_per_second': 145.532, 'train_steps_per_second': 2.28, 'total_flos': 4122506573911152.0, 'train_loss': 0.029541979445652528, 'epoch': 20.0})

In [106]:
import pandas as pd
pd.DataFrame([trainer.evaluate(filt_prep_conllpp_cpy["test"])])

Unnamed: 0,eval_loss,eval_precision,eval_recall,eval_f1,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.122908,0.906949,0.905776,0.906362,0.978351,35.1956,98.109,1.534,20.0
