### Installing dependencies



In [None]:
! pip install transformers[torch] sentencepiece datasets evaluate seqeval huggingface_hub wandb

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wandb
  Downloading wandb-0.16.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [None]:
# Interactive logins. The Hugging Face CLI prompts for an access token (see the
# recorded output); training later uses push_to_hub=True and report_to="wandb",
# so both services need credentials. Tokens are typed at the prompt, never
# stored in the notebook.
! huggingface-cli login
! wandb login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.c

### Importing dependencies and setting paths

In [None]:
from google.colab import drive
import os

# Mount Google Drive and make the project folder stored on it the working directory.
drive.mount("/content/drive/")
os.chdir("/content/drive/MyDrive/Colab Notebooks/NER/")

# Weights & Biases settings, supplied through environment variables.
os.environ.update({
    "WANDB_PROJECT": "DeBERTa-v3-NER",
    "WANDB_LOG_MODEL": "true",
    "WANDB_WATCH": "false",
})

Mounted at /content/drive/


In [None]:
import numpy as np
from tqdm.notebook import tqdm, trange
from spacy import displacy
import typing as tp
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from transformers import DataCollatorForTokenClassification
import evaluate
import torch

from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [None]:
DATASET_NAME = "Babelscape/multinerd"
BASE_MODEL_NAME = "microsoft/deberta-v3-base"
OUTPUT_DIR = "./deberta-v3-base-ner-A"

# Label vocabulary: "O" has id 0, then every entity type contributes a
# consecutive (B-<type>, I-<type>) pair. Every B- id is therefore odd and the
# matching I- id is the next even number; the tag-alignment code in this
# notebook relies on that layout.
label_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in (
        "PER", "ORG", "LOC", "ANIM", "BIO", "CEL", "DIS", "EVE",
        "FOOD", "INST", "MEDIA", "MYTH", "PLANT", "TIME", "VEHI",
    )
    for prefix in ("B", "I")
]

label2id = {label: idx for idx, label in enumerate(label_list)}

id2label = {idx: label for label, idx in label2id.items()}

### Loading and filtering dataset

In [None]:
# Load every split of the dataset, then keep only the rows whose "lang" field is "en".
dataset = load_dataset(DATASET_NAME).filter(
    lambda example: example["lang"] == "en",
    num_proc=10,
)
# Optional on-disk cache of the filtered dataset:
# dataset.save_to_disk("./data/multinerd_en")
# dataset = load_from_disk("./data/multinerd_en")
dataset

Downloading readme:   0%|          | 0.00/5.66k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/32.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/50.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.15M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.04M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.01M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.18M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.86M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.65M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.85M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.51M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.47M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Filter (num_proc=10):   0%|          | 0/2678400 [00:00<?, ? examples/s]

Filter (num_proc=10):   0%|          | 0/334800 [00:00<?, ? examples/s]

Filter (num_proc=10):   0%|          | 0/335986 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 262560
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 32820
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 32908
    })
})

### Preprocessing

In [None]:
# Fast tokenizer for the base checkpoint; tokenize_and_preserve_tags calls
# .word_ids() on its output to map sub-word tokens back to words.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, use_fast=True)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [None]:
def tokenize_and_preserve_tags(example, tokenizer, label_field):
    """Tokenize a pre-split example and project its word-level tags onto tokens.

    Args:
        example: Mapping with a "tokens" entry (list of words) and a
            ``label_field`` entry (one integer tag id per word). All of its
            entries are copied into the returned encoding.
        tokenizer: Callable invoked as ``tokenizer(words, is_split_into_words=True)``
            whose result supports ``update``, item assignment and ``word_ids()``.
        label_field: Key in ``example`` holding the word-level tag ids.

    Returns:
        The encoding with two extra entries: "labels" (one id per token) and
        "word_ids". Tokens without a word (``word_ids()`` gives ``None``) get
        -100; the first token of a word keeps the word's tag; further tokens of
        the same word get ``(tag + 1) // 2 * 2``, which maps an odd B- id to
        the following even I- id and leaves I- ids and 0 unchanged. The entries
        "input_ids", "attention_mask", "token_type_ids" and "labels" are
        converted to ``torch.LongTensor``.
    """
    encoded = tokenizer(example["tokens"], is_split_into_words=True)
    encoded.update(example)

    word_ids = encoded.word_ids()
    word_labels = example[label_field]
    labels = []
    # Track the previous token's word id explicitly. Indexing word_ids[i - 1]
    # at i == 0 would wrap around to the *last* token and could mislabel the
    # first token as a continuation when no leading special token is present.
    previous_word_id = None
    for word_id in word_ids:
        label = None if word_id is None else word_labels[word_id]
        if label is None:
            labels.append(-100)
        elif word_id != previous_word_id:
            labels.append(label)
        else:
            labels.append((label + 1) // 2 * 2)
        previous_word_id = word_id
    encoded["labels"] = labels
    encoded["word_ids"] = word_ids
    for key in ["input_ids", "attention_mask", "token_type_ids", "labels"]:
        encoded[key] = torch.LongTensor(encoded[key])
    return encoded

In [None]:
# Tokenize every split and attach token-level labels derived from "ner_tags".
dataset = dataset.map(
    tokenize_and_preserve_tags,
    fn_kwargs={"tokenizer": tokenizer, "label_field": "ner_tags"},
)
# Return the model inputs as torch tensors while keeping the remaining columns available.
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "token_type_ids", "labels"],
    output_all_columns=True,
)
dataset

Map:   0%|          | 0/262560 [00:00<?, ? examples/s]

Map:   0%|          | 0/32820 [00:00<?, ? examples/s]

Map:   0%|          | 0/32908 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'lang', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_ids'],
        num_rows: 262560
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'lang', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_ids'],
        num_rows: 32820
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'lang', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_ids'],
        num_rows: 32908
    })
})

### Loading pretrained model

In [None]:
# Base checkpoint with a token-classification head sized to the label vocabulary.
model = AutoModelForTokenClassification.from_pretrained(
    BASE_MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collator passed to the Trainer below as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Metric object used by compute_metrics (via seqeval.compute).
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
def compute_metrics(p):
    """Compute overall seqeval scores from predicted and reference tag ids.

    Args:
        p: Pair ``(predictions, labels)`` of parallel sequences of per-token
            ids. Positions whose reference id is -100 are ignored.

    Returns:
        dict: "precision", "recall", "f1" and "accuracy", taken from the
        ``overall_*`` entries of the seqeval result.
    """
    predictions, labels = p

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        # Keep only the positions that carry a real reference label.
        kept_pairs = [
            (pred_id, label_id)
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept_pairs])
        true_labels.append([label_list[label_id] for _, label_id in kept_pairs])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw logits to predicted class ids before metrics are computed.

    Args:
        logits: Tensor whose last dimension indexes the classes, or a tuple
            whose first element is such a tensor (some models return extra
            outputs alongside the logits).
        labels: Unused; accepted so the function matches the two-argument
            callback signature it is registered under in the Trainer.

    Returns:
        torch.Tensor: Index of the largest logit along the last dimension.
    """
    # Only the logits are needed; drop any additional model outputs.
    if isinstance(logits, tuple):
        logits = logits[0]
    return torch.argmax(logits, dim=-1)

### Fine-tuning

In [None]:
# Training configuration. With a per-device batch of 16 and 8 gradient
# accumulation steps, each optimizer step covers 16 * 8 = 128 examples per device.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    report_to="wandb",
    logging_steps=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="steps",
    # Float below 1: presumably read as a fraction of the total training steps,
    # i.e. 8 evaluations per epoch over 3 epochs (the recorded log evaluates
    # every 257 of 6153 steps) -- confirm against the installed transformers version.
    eval_steps=(1 / 3 / 8),
    fp16=True,
    # Same convention as eval_steps: two checkpoints per epoch, i.e. one every
    # fourth evaluation.
    save_steps=(1 / 3 / 2),
    save_strategy="steps",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# Trainer wiring: validation split for periodic evaluation, compute_metrics for
# the seqeval scores, and preprocess_logits_for_metrics to reduce logits to
# class ids before they are handed to compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

In [None]:
import wandb

# Start a Weights & Biases run before training; the project name is taken from
# the WANDB_PROJECT environment variable set earlier (presumably -- confirm in the W&B UI).
wandb.init()

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
257,0.0668,0.070336,0.896227,0.837737,0.865995,0.976668
514,0.0511,0.065183,0.834769,0.921144,0.875832,0.976003
771,0.0536,0.054134,0.879959,0.899812,0.889775,0.980201
1028,0.0392,0.055197,0.871154,0.922622,0.89615,0.979699
1285,0.0433,0.053816,0.871057,0.924158,0.896822,0.979937
1542,0.0411,0.050207,0.88502,0.925752,0.904928,0.980724
1799,0.0341,0.047285,0.902111,0.916574,0.909285,0.982765
2056,0.042,0.047532,0.911136,0.915422,0.913274,0.982716
2313,0.0277,0.048622,0.913246,0.916612,0.914926,0.982772
2570,0.026,0.048361,0.905628,0.924964,0.915194,0.983061


TrainOutput(global_step=6153, training_loss=0.034111085311050594, metrics={'train_runtime': 8152.5551, 'train_samples_per_second': 96.618, 'train_steps_per_second': 0.755, 'total_flos': 2.1005260134144e+16, 'train_loss': 0.034111085311050594, 'epoch': 3.0})

### Testing

In [None]:
# Token-level evaluation on the test split, scored by compute_metrics.
trainer.evaluate(dataset["test"])

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.04336942359805107,
 'eval_precision': 0.9409321291109333,
 'eval_recall': 0.9627997660362644,
 'eval_f1': 0.951740353852677,
 'eval_accuracy': 0.9879600576718854,
 'eval_runtime': 99.1271,
 'eval_samples_per_second': 331.978,
 'eval_steps_per_second': 20.751}

In [None]:
def align_tags_with_words(token_tags, word_indices):
    """Collapse per-token tag ids into one tag id per word and repair orphan I- tags.

    Tag ids follow this notebook's label map: 0 is "O", odd ids are B- tags and
    the following even id is the matching I- tag.

    Args:
        token_tags: Sequence of tag ids, one per token. Entries equal to -100
            are skipped.
        word_indices: Sequence parallel to ``token_tags`` giving the index of
            the word each token belongs to.

    Returns:
        list: One tag id per word. A word takes the tag of its first token; if
        that tag is 0 and a later token of the same word is non-zero, the
        later tag is used instead. Any I- tag that does not directly follow
        the B- or I- tag of the same type, including one on the very first
        word, is converted to the corresponding B- tag.
    """
    word_tags = []
    for i in range(len(token_tags)):
        if token_tags[i] == -100:
            continue
        elif not word_tags or word_indices[i] != word_indices[i - 1]:
            # First token of a new word.
            word_tags.append(token_tags[i])
        elif word_tags[-1] == 0 and token_tags[i] != 0:
            # Word was "O" so far, but a later piece carries an entity tag.
            word_tags[-1] = token_tags[i]

    # Treat the position before the first word as "O" so that a leading I- tag
    # is repaired as well (starting the loop at index 1 would skip it).
    previous_tag = 0
    for i in range(len(word_tags)):
        if (
            word_tags[i] > 0 and word_tags[i] % 2 == 0 and
            word_tags[i] - previous_tag not in [0, 1]
        ):
            word_tags[i] -= 1
        previous_tag = word_tags[i]
    return word_tags

In [None]:
preds = trainer.predict(dataset["test"])

pred_ner_tags = []
true_ner_tags = []
for i in trange(len(dataset["test"])):
    # Materialise the row once; indexing the dataset twice per iteration
    # formats the same example twice.
    example = dataset["test"][i]
    # Drop the first and last positions (presumably the special tokens the
    # tokenizer adds at both ends -- their word id is None).
    word_ids = example["word_ids"][1:-1]
    # Predictions cover the padded batch length; keep only this example's tokens.
    pred = preds.predictions[i][1:len(word_ids) + 1]
    pred_ner_tags.append([id2label[x] for x in align_tags_with_words(pred, word_ids)])
    true_ner_tags.append([id2label[x] for x in example["ner_tags"]])

  0%|          | 0/32908 [00:00<?, ?it/s]

In [None]:
# Per-entity-type scores on word-level tags (after align_tags_with_words),
# as opposed to the token-level scores from trainer.evaluate.
print(classification_report(true_ner_tags, pred_ner_tags, digits=3))

              precision    recall  f1-score   support

        ANIM      0.740     0.808     0.773      3208
         BIO      0.636     0.875     0.737        16
         CEL      0.708     0.829     0.764        82
         DIS      0.776     0.848     0.811      1518
         EVE      0.953     0.980     0.966       704
        FOOD      0.684     0.761     0.721      1132
        INST      0.900     0.750     0.818        24
         LOC      0.995     0.995     0.995     24048
       MEDIA      0.953     0.972     0.962       916
        MYTH      0.824     0.875     0.848        64
         ORG      0.983     0.988     0.986      6618
         PER      0.996     0.997     0.996     10530
       PLANT      0.633     0.791     0.703      1788
        TIME      0.821     0.858     0.839       578
        VEHI      0.871     0.844     0.857        64

   micro avg      0.943     0.963     0.953     51290
   macro avg      0.832     0.878     0.852     51290
weighted avg      0.948   

### Inference

In [None]:
import numpy as np
from spacy import displacy
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Model identifier loaded for inference below; presumably the Hub repository
# produced by the fine-tuning run (its name matches OUTPUT_DIR) -- confirm.
FINETUNED_MODEL = "alexeyak/deberta-v3-base-ner-A"

In [None]:
def align_tags_with_words(token_tags, word_indices):
    """Collapse per-token tag ids into one tag id per word and repair orphan I- tags.

    Tag ids follow this notebook's label map: 0 is "O", odd ids are B- tags and
    the following even id is the matching I- tag.

    Args:
        token_tags: Sequence of tag ids, one per token. Entries equal to -100
            are skipped.
        word_indices: Sequence parallel to ``token_tags`` giving the index of
            the word each token belongs to.

    Returns:
        list: One tag id per word. A word takes the tag of its first token; if
        that tag is 0 and a later token of the same word is non-zero, the
        later tag is used instead. Any I- tag that does not directly follow
        the B- or I- tag of the same type, including one on the very first
        word, is converted to the corresponding B- tag.
    """
    word_tags = []
    for i in range(len(token_tags)):
        if token_tags[i] == -100:
            continue
        elif not word_tags or word_indices[i] != word_indices[i - 1]:
            # First token of a new word.
            word_tags.append(token_tags[i])
        elif word_tags[-1] == 0 and token_tags[i] != 0:
            # Word was "O" so far, but a later piece carries an entity tag.
            word_tags[-1] = token_tags[i]

    # Treat the position before the first word as "O" so that a leading I- tag
    # is repaired as well (starting the loop at index 1 would skip it).
    previous_tag = 0
    for i in range(len(word_tags)):
        if (
            word_tags[i] > 0 and word_tags[i] % 2 == 0 and
            word_tags[i] - previous_tag not in [0, 1]
        ):
            word_tags[i] -= 1
        previous_tag = word_tags[i]
    return word_tags

def ner_render(tokens, ner_tags, title=None, **kwargs):
    """Render word-level NER tags as highlighted entity spans with displaCy.

    The displayed text is the words joined by single spaces; entity offsets are
    character positions in that joined text.

    Args:
        tokens: List of words.
        ner_tags: Tag ids parallel to ``tokens``; each is looked up in the
            notebook-level ``id2label`` mapping.
        title: Optional title passed to displaCy with the document.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        None. The rendering is displayed in the notebook.
    """
    pos = 0
    ents = []
    open_ent = None  # entity span currently being extended, if any
    for word, tag_idx in zip(tokens, ner_tags):
        tag = id2label[tag_idx]
        if tag.startswith('B') or tag.startswith('I'):
            label = tag.split("-")[1]
            continues_open_ent = (
                tag.startswith('I')
                and open_ent is not None
                and open_ent["label"] == label
            )
            if continues_open_ent:
                open_ent["end"] = pos + len(word)
            else:
                # A B- tag, or an I- tag with nothing valid to continue (no
                # entity yet, a gap of "O" words, or a different type): start
                # a new span instead of failing on ents[-1] or stretching an
                # unrelated entity.
                open_ent = {
                    "start": pos,
                    "end": pos + len(word),
                    "label": label
                }
                ents.append(open_ent)
        else:
            open_ent = None
        # +1 accounts for the space that " ".join inserts between words.
        pos += (len(word) + 1)
    displacy.render({
        "text": " ".join(tokens),
        "ents": ents,
        "title": title
    }, style="ent", manual=True, jupyter=True)

In [None]:
# Load the fine-tuned tokenizer and model; these rebind the `tokenizer` and
# `model` names used during training.
tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL)
model = AutoModelForTokenClassification.from_pretrained(FINETUNED_MODEL)

In [None]:
# Example sentence; the commented-out line is an alternative input.
text = "Four days later, he scored his first Premier League goal of the season, assisting Fernando Torres for the opener and then scoring directly from a free-kick in a 2–1 away win against Arsenal at the Emirates Stadium."
# text = "Takuya Takagi scored the winner in the 88th minute, rising to head a Hiroshige Yanagimoto cross towards the Syrian goal which goalkeeper Salem Bitar appeared to have covered but then allowed to slip into the net."
# Words are obtained by whitespace splitting, so punctuation stays attached to the neighbouring word.
inputs = tokenizer(text.split(), is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    # Single sentence in the batch: take row 0 and pick the highest-scoring tag per token.
    token_tags = model(**inputs).logits[0].numpy().argmax(-1)

# Drop the first and last positions (presumably the special tokens at both
# ends -- confirm for this tokenizer) before mapping tokens back to words.
ner_tags = align_tags_with_words(token_tags[1:-1], inputs.word_ids()[1:-1])

In [None]:
# Visualise the predicted entities; the same whitespace split is used so words line up with ner_tags.
ner_render(text.split(), ner_tags)