<a href="https://colab.research.google.com/github/Eddy-Emmanuel/NER-Transformer/blob/main/NER_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

In [None]:
# Load the "ZihanWangKi/conllpp" dataset; later cells index it by split name
# ("train", "validation") and read its "ner_tags" feature.
conllpp = load_dataset("ZihanWangKi/conllpp")

In [None]:
# Display the loaded dataset object to inspect what it contains.
conllpp

In [None]:
# Resolve the "ner_tags" label feature once, instead of looking it up again
# for every row of every batch.
ner_tag_feature = conllpp["train"].features["ner_tags"].feature


def add_ner_tag_names(batch):
    """Build a "ner_tags_str" column with the string form of each example's tags.

    Args:
        batch: Batched rows; ``batch["ner_tags"]`` holds one sequence of
            integer tag ids per example.

    Returns:
        A dict with the single key "ner_tags_str", holding one converted
        sequence per example.
    """
    return {
        "ner_tags_str": [ner_tag_feature.int2str(tag_ids) for tag_ids in batch["ner_tags"]]
    }


conllpp_cpy = conllpp.map(add_ner_tag_names, batched=True)

In [None]:
# Tokenizer for the "distilbert-base-cased" checkpoint; the same checkpoint
# name is used when the token-classification model is loaded further down.
model_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

In [None]:
# Preview the first five training rows. Selecting the rows before converting
# avoids materialising the whole split as a DataFrame just to show its head.
conllpp_cpy["train"].select(range(5)).to_pandas()

In [None]:
# Show the token -> word index mapping for training example 1. Only that one
# example is tokenized; tokenizing the full split to inspect a single row is
# unnecessary work and yields the same mapping.
model_tokenizer(conllpp_cpy["train"][1]["tokens"], is_split_into_words=True).word_ids()

In [None]:
def align_labels_with_tokens(batch, label_all_tokens=False):
    """Tokenize pre-split words and build token-level labels aligned to them.

    Args:
        batch: Batched rows with "tokens" (one list of words per example) and
            "ner_tags" (one list of integer tags per example, parallel to the
            words).
        label_all_tokens: When False (default), only the first sub-word piece
            of each word carries the word's tag and the remaining pieces are
            labelled -100. When True, every piece of a word receives the
            word's tag (the labelling this function used to produce; note it
            repeats ``B-`` tags on continuation pieces).

    Returns:
        The tokenizer output with an extra "labels" entry: one list per
        example, the same length as that example's token sequence.
    """
    # truncation=True keeps over-long examples within the tokenizer's model
    # maximum length; word_ids() then only covers the tokens that were kept.
    tokenized_inputs = model_tokenizer(
        batch["tokens"], is_split_into_words=True, truncation=True
    )

    aligned_labels = []
    for example_index, word_tags in enumerate(batch["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=example_index)
        previous_word = None
        labels = []

        for word_id in word_ids:
            if word_id is None:
                # Special tokens have no source word: -100 is the label index
                # conventionally ignored by the loss.
                labels.append(-100)
            elif word_id != previous_word or label_all_tokens:
                # First piece of a word (or every piece, when requested).
                labels.append(word_tags[word_id])
            else:
                # Continuation piece of the same word: do not score it, so
                # each word contributes exactly one label.
                labels.append(-100)

            previous_word = word_id

        aligned_labels.append(labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [None]:
# Run tokenization + label alignment over the dataset in batches; this adds
# the tokenizer outputs and a "labels" column next to the original columns.
prep_conllpp_cpy = conllpp_cpy.map(align_labels_with_tokens, batched=True)

In [None]:
# Drop the raw dataset columns so only the tokenizer outputs and "labels"
# remain for training.
columns_to_drop = ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str']
filt_prep_conllpp_cpy = prep_conllpp_cpy.remove_columns(columns_to_drop)

filt_prep_conllpp_cpy

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator passed to the Trainer further down; it is built from the same
# tokenizer that produced the model inputs.
data_collator = DataCollatorForTokenClassification(tokenizer=model_tokenizer)

In [None]:
!pip -q install seqeval
!pip -q install evaluate

In [None]:
import evaluate, numpy as np

In [None]:
# Load the "seqeval" metric; compute_metrics calls metrics.compute(...) and
# reads its "overall_*" entries.
metrics = evaluate.load("seqeval")

In [None]:
def compute_metrics(p):
    """Turn raw evaluation outputs into overall precision/recall/F1/accuracy.

    Args:
        p: Evaluation output exposing ``predictions`` (per-token class scores,
            expected as ``(batch, seq_len, num_labels)``) and ``label_ids``
            (integer labels, ``(batch, seq_len)``, with -100 at positions that
            must not be scored).

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's "overall_*" results.
    """
    label_names = conllpp["train"].features["ner_tags"].feature.names

    # The class scores live on the LAST axis; reducing over axis 1 would pick
    # a sequence position per class instead of a class per token.
    predicted_ids = np.argmax(p.predictions, axis=-1)
    label_ids = p.label_ids

    true_predictions = []
    true_labels = []
    for pred_seq, label_seq in zip(predicted_ids, label_ids):
        # Skip -100 positions (special tokens and any other ignored/padded
        # slots): they have no label name and must not be scored.
        kept = [
            (label_names[pred], label_names[label])
            for pred, label in zip(pred_seq, label_seq)
            if label != -100
        ]
        true_predictions.append([pred_name for pred_name, _ in kept])
        true_labels.append([label_name for _, label_name in kept])

    results = metrics.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
from transformers import AutoModelForTokenClassification

# Two-way mapping between integer label ids and label names, taken from the
# dataset's "ner_tags" feature; label2id is simply id2label inverted.
id2label = dict(enumerate(conllpp["train"].features["ner_tags"].feature.names))
label2id = {label_name: label_id for label_id, label_name in id2label.items()}

In [None]:
# Token-classification model on the same checkpoint as the tokenizer, with the
# label mappings attached to its configuration.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-cased",
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and log once per epoch; outputs are written
# under "ner_bert".
# NOTE(review): recent transformers releases appear to rename
# `evaluation_strategy` (and Trainer's `tokenizer` argument) — confirm against
# the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="ner_bert",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=20,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
)

# Wire the model, data splits, collator, tokenizer and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=filt_prep_conllpp_cpy["train"],
    eval_dataset=filt_prep_conllpp_cpy["validation"],
    data_collator=data_collator,
    tokenizer=model_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()