### Installing dependencies

In [None]:
! pip install peft transformers[torch] sentencepiece datasets evaluate seqeval huggingface_hub wandb

Collecting peft
  Downloading peft-0.7.0-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.4 

In [None]:
# Authenticate with the Hugging Face Hub and with Weights & Biases.
# Both commands are interactive and prompt for a token / API key.
! huggingface-cli login
! wandb login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key i

### Importing dependencies and setting paths

In [None]:
from google.colab import drive
import os

# Mount Google Drive and run the rest of the notebook from the project folder,
# so relative paths (e.g. the output directory) resolve inside Drive.
drive.mount("/content/drive/")
os.chdir("/content/drive/MyDrive/Colab Notebooks/NER/")

# Weights & Biases settings, supplied through environment variables.
os.environ.update(
    {
        "WANDB_PROJECT": "DeBERTa-v3-large-NER-lora",
        "WANDB_LOG_MODEL": "true",
        "WANDB_WATCH": "false",
    }
)

Mounted at /content/drive/


In [None]:
import numpy as np
from tqdm.notebook import tqdm, trange
from spacy import displacy
import typing as tp
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from transformers import DataCollatorForTokenClassification
import evaluate
import torch
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType


from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [None]:
# Hub identifiers and the local directory that receives checkpoints.
DATASET_NAME = "Babelscape/multinerd"
BASE_MODEL_NAME = "microsoft/deberta-v3-large"
OUTPUT_DIR = "./deberta-v3-large-ner-lora"

# Entity types in label-id order. "O" is id 0; the k-th type (0-based) owns
# id 2k+1 for its "B-" tag and id 2k+2 for its "I-" tag.
_ENTITY_TYPES = (
    "PER", "ORG", "LOC", "ANIM", "BIO", "CEL", "DIS", "EVE",
    "FOOD", "INST", "MEDIA", "MYTH", "PLANT", "TIME", "VEHI",
)

label2id = {
    "O": 0,
    **{
        f"{prefix}-{entity}": 2 * rank + offset
        for rank, entity in enumerate(_ENTITY_TYPES)
        for offset, prefix in ((1, "B"), (2, "I"))
    },
}

# Inverse mapping and the tag strings listed by ascending id.
id2label = dict((index, tag) for tag, index in label2id.items())

label_list = [id2label[index] for index in range(len(id2label))]

### Loading and filtering dataset

In [None]:
# Download the dataset named in the config cell and keep only the English examples.
dataset = load_dataset(DATASET_NAME)
dataset = dataset.filter(lambda example: example["lang"] == "en", num_proc=10)
# Optional on-disk cache of the filtered data (disabled); uncomment to save once
# and reload on later runs instead of filtering again:
# dataset.save_to_disk("./data/multinerd_en")
# dataset = load_from_disk("./data/multinerd_en")
dataset

Downloading readme:   0%|          | 0.00/5.66k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/32.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/50.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.15M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.04M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.01M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.18M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.86M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.65M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.85M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.51M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.47M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Filter (num_proc=10):   0%|          | 0/2678400 [00:00<?, ? examples/s]

Filter (num_proc=10):   0%|          | 0/334800 [00:00<?, ? examples/s]

Filter (num_proc=10):   0%|          | 0/335986 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 262560
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 32820
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 32908
    })
})

### Preprocessing

In [None]:
# Tokenizer of the base checkpoint. The fast implementation is requested explicitly;
# the preprocessing function below calls `word_ids()` on the encoding it returns.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, use_fast=True)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [None]:
def tokenize_and_preserve_tags(example, tokenizer, label_field):
    """Tokenize one pre-split example and project its word-level tags onto sub-tokens.

    Args:
        example: mapping with a "tokens" list of words and, under ``label_field``,
            one integer tag id per word.
        tokenizer: callable invoked as ``tokenizer(words, is_split_into_words=True)``;
            its result must be a mutable mapping that also exposes ``word_ids()``.
        label_field: key of ``example`` holding the word-level tag ids.

    Returns:
        The tokenizer output, extended with every field of ``example`` plus
        "labels" (one id per sub-token) and "word_ids". Positions without a word
        (``word_ids()`` gives None) get label -100. The first sub-token of a word
        keeps the word's tag; later sub-tokens get ``(tag + 1) // 2 * 2``, which
        maps an odd "B-" id to the following even "I-" id and leaves 0 and "I-"
        ids unchanged.
    """
    encoded = tokenizer(example["tokens"], is_split_into_words=True)
    encoded.update(example)

    word_ids = encoded.word_ids()
    word_tags = example[label_field]
    labels = []
    for position, word_id in enumerate(word_ids):
        label = None if word_id is None else word_tags[word_id]
        # Position 0 has no predecessor. Indexing with -1 would wrap around to the
        # LAST sub-token and could mislabel the first piece as a continuation.
        previous_word_id = word_ids[position - 1] if position > 0 else None
        if label is None:
            labels.append(-100)
        elif word_id != previous_word_id:
            labels.append(label)
        else:
            labels.append((label + 1) // 2 * 2)
    encoded["labels"] = labels
    encoded["word_ids"] = word_ids
    for key in ("input_ids", "attention_mask", "token_type_ids", "labels"):
        # Not every tokenizer emits token_type_ids; convert only what is present.
        if key in encoded:
            encoded[key] = torch.LongTensor(encoded[key])
    return encoded

In [None]:
# Tokenize every split and attach sub-token labels derived from the "ner_tags" column.
dataset = dataset.map(
    lambda example: tokenize_and_preserve_tags(example, tokenizer, "ner_tags")
)
# Format the model-input columns as torch tensors; output_all_columns=True keeps
# the remaining columns available when rows are read.
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "token_type_ids", "labels"],
    output_all_columns=True,
)
dataset

Map:   0%|          | 0/262560 [00:00<?, ? examples/s]

Map:   0%|          | 0/32820 [00:00<?, ? examples/s]

Map:   0%|          | 0/32908 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'lang', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_ids'],
        num_rows: 262560
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'lang', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_ids'],
        num_rows: 32820
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'lang', 'input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_ids'],
        num_rows: 32908
    })
})

In [None]:
# Batch collator for token classification, built around the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

### Loading pretrained model

In [None]:
# Base checkpoint with a token-classification head sized to the tag set;
# the id <-> tag mappings are passed along to the model config.
model = AutoModelForTokenClassification.from_pretrained(
    BASE_MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# LoRA settings for token classification: rank 4, alpha 8, dropout 0.1,
# applied to the modules named "query_proj" and "key_proj"; bias terms untouched.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=4,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=["query_proj", "key_proj"],
    bias="none",
)

In [None]:
# Wrap the base model with the LoRA configuration. NOTE: this rebinds `model`,
# so re-running the cell without reloading the base model wraps it a second time.
model = get_peft_model(model, peft_config)
# Report how many parameters remain trainable after wrapping.
model.print_trainable_parameters()

trainable params: 171,295 || all params: 184,026,686 || trainable%: 0.0930816088271024


In [None]:
# Load the "seqeval" metric through `evaluate`; compute_metrics calls it via this name.
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
def compute_metrics(p):
    """Score one evaluation pass with the seqeval metric.

    ``p`` unpacks into ``(predictions, labels)``: per-sequence predicted tag ids
    and reference tag ids. Positions whose reference id is -100 are dropped from
    both sides, the remaining ids are mapped to tag strings through
    ``label_list``, and seqeval's overall precision, recall, F1 and accuracy are
    returned in a dict.
    """
    predictions, labels = p

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predictions, labels):
        # Keep only positions that carry a real reference label.
        kept_pairs = [
            (predicted_id, reference_id)
            for predicted_id, reference_id in zip(predicted_row, reference_row)
            if reference_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([label_list[reference_id] for _, reference_id in kept_pairs])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    report_keys = ("precision", "recall", "f1", "accuracy")
    return {key: results["overall_" + key] for key in report_keys}

def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted class ids (argmax over the last dimension).

    ``labels`` is part of the expected signature but is not used here.
    """
    return torch.argmax(logits, dim=-1)

In [None]:
# --- Schedule ---------------------------------------------------------------
n_epochs = 5
n_eval_per_epoch = 8
n_save_per_epoch = 2
train_batch_size = 8
eval_batch_size = 128
grad_acc_steps = 16
# Estimated number of optimizer steps: examples * epochs / (batch size * accumulation steps).
# NOTE(review): this assumes a single training device.
n_train_steps = dataset["train"].shape[0] * n_epochs / train_batch_size / grad_acc_steps
# Warm up over the first 5% of the estimated steps.
n_warmup_steps = round(n_train_steps * 0.05)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    report_to="wandb",
    logging_steps=10,
    learning_rate=1e-4,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=grad_acc_steps,
    gradient_checkpointing=True,
    # eval_accumulation_steps=1_000,
    warmup_steps=n_warmup_steps,
    lr_scheduler_type="cosine",
    num_train_epochs=n_epochs,
    weight_decay=0.01,
    evaluation_strategy="steps",
    # Values below 1 are given as a fraction of the whole run
    # (see the TrainingArguments docs): n_eval_per_epoch evaluations per epoch.
    eval_steps=(1 / n_epochs / n_eval_per_epoch),
    fp16=True,
    # Same fractional convention: n_save_per_epoch checkpoints per epoch.
    save_steps=(1 / n_epochs / n_save_per_epoch),
    save_strategy="steps",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    # `model` is the LoRA-wrapped model returned by get_peft_model above.
    # The name `peft_model` is not bound until the Inference section, so
    # referencing it here fails with a NameError on a fresh kernel.
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Reduce logits to class ids before they are handed to compute_metrics.
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

In [None]:
import wandb

# Start the Weights & Biases run explicitly before training.
wandb.init()

In [None]:
# Launch fine-tuning with the trainer configured above.
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
257,0.7075,0.596488,0.07473,0.008103,0.01462,0.861986
514,0.1723,0.150141,0.769727,0.685515,0.725184,0.960752
771,0.1142,0.100669,0.811527,0.789447,0.800335,0.969581
1028,0.0897,0.085664,0.832448,0.821051,0.82671,0.972429
1285,0.0807,0.078811,0.828553,0.846895,0.837624,0.973524
1542,0.0747,0.073963,0.840025,0.855267,0.847577,0.974532
1799,0.0665,0.071003,0.844199,0.858512,0.851295,0.975384


In [None]:
# Resume training from a previously saved checkpoint instead of starting over
# (only needed when the previous training cell was interrupted).
trainer.train(resume_from_checkpoint=True)

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
2056,0.0763,0.068758,0.846045,0.863849,0.854855,0.975624
2313,0.066,0.068258,0.843367,0.870377,0.85666,0.97569
2570,0.0687,0.066431,0.84721,0.873123,0.859971,0.976392
2827,0.0611,0.064979,0.850116,0.874371,0.862073,0.976553
3084,0.0696,0.064398,0.843125,0.882301,0.862268,0.97648
3341,0.0633,0.063248,0.85063,0.881629,0.865852,0.977122
3598,0.0595,0.061938,0.853514,0.883127,0.868068,0.977458
3855,0.0588,0.062022,0.856552,0.886007,0.87103,0.977562


### Testing

In [None]:
# Score the model on the test split; the metrics come from compute_metrics.
trainer.evaluate(dataset["test"])

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.04365752637386322,
 'eval_precision': 0.9069042400423248,
 'eval_recall': 0.9357964515500098,
 'eval_f1': 0.9211238413265012,
 'eval_accuracy': 0.9842004178269255,
 'eval_runtime': 223.0074,
 'eval_samples_per_second': 147.565,
 'eval_steps_per_second': 1.157}

In [None]:
def align_tags_with_words(token_tags, word_indices):
    """Collapse sub-token tag ids to one id per word and repair IOB2 prefixes.

    The tag-id scheme is the notebook's ``label2id``: 0 is "O", odd ids are
    "B-" tags and the following even id is the matching "I-" tag.

    Args:
        token_tags: tag ids, one per sub-token; entries equal to -100 are skipped.
        word_indices: parallel sequence giving the word each sub-token belongs to.

    Returns:
        A list with one tag id per word that has at least one non-skipped
        sub-token. A word takes the tag of its first sub-token; if that tag is
        "O" and a later sub-token of the same word is not, the later tag wins.
        Any "I-" tag that does not continue the same entity (including one on
        the very first word) is rewritten to the corresponding "B-" tag.
    """
    word_tags = []
    for position, tag in enumerate(token_tags):
        if tag == -100:
            continue
        starts_new_word = (
            not word_tags or word_indices[position] != word_indices[position - 1]
        )
        if starts_new_word:
            word_tags.append(tag)
        elif word_tags[-1] == 0 and tag != 0:
            # The word was tagged "O" so far but a later piece carries an entity tag.
            word_tags[-1] = tag

    for position, tag in enumerate(word_tags):
        # The first word has no predecessor; treat it as following "O" so that a
        # leading "I-" tag is promoted to "B-" like any other dangling "I-".
        previous = word_tags[position - 1] if position > 0 else 0
        if tag > 0 and tag % 2 == 0 and tag - previous not in (0, 1):
            word_tags[position] = tag - 1
    return word_tags

In [None]:
# Predict on the whole test split, then rebuild word-level tag strings for every example.
preds = trainer.predict(dataset["test"])

test_split = dataset["test"]
pred_ner_tags, true_ner_tags = [], []
for row_idx in trange(len(test_split)):
    example = test_split[row_idx]
    # Drop the first and last positions (presumably the special tokens added by
    # the tokenizer) and slice the predictions to the same window.
    inner_word_ids = example["word_ids"][1:-1]
    inner_preds = preds.predictions[row_idx][1:len(inner_word_ids) + 1]
    word_level_ids = align_tags_with_words(inner_preds, inner_word_ids)
    pred_ner_tags.append([id2label[tag_id] for tag_id in word_level_ids])
    true_ner_tags.append([id2label[tag_id] for tag_id in example["ner_tags"]])

  0%|          | 0/32908 [00:00<?, ?it/s]

In [None]:
# Per-entity-type precision / recall / F1 on the word-level tags, three decimals.
print(classification_report(true_ner_tags, pred_ner_tags, digits=3))

              precision    recall  f1-score   support

        ANIM      0.660     0.760     0.706      3208
         BIO      0.000     0.000     0.000        16
         CEL      0.688     0.805     0.742        82
         DIS      0.686     0.773     0.727      1518
         EVE      0.889     0.935     0.911       704
        FOOD      0.557     0.443     0.494      1132
        INST      0.250     0.250     0.250        24
         LOC      0.991     0.989     0.990     24048
       MEDIA      0.888     0.950     0.918       916
        MYTH      0.719     0.719     0.719        64
         ORG      0.950     0.977     0.963      6618
         PER      0.992     0.996     0.994     10530
       PLANT      0.533     0.742     0.620      1788
        TIME      0.747     0.696     0.720       578
        VEHI      0.694     0.781     0.735        64

   micro avg      0.916     0.941     0.929     51290
   macro avg      0.683     0.721     0.699     51290
weighted avg      0.923   

### Inference

In [None]:
import numpy as np
from spacy import displacy
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from peft import PeftModel, PeftConfig

# Base checkpoint and the fine-tuned adapter repository that the loading cell below combines.
BASE_MODEL_NAME = "microsoft/deberta-v3-large"
FINETUNED_MODEL = "alexeyak/deberta-v3-large-ner-lora"

In [None]:
def align_tags_with_words(token_tags, word_indices):
    """Collapse sub-token tag ids to one id per word and repair IOB2 prefixes.

    The tag-id scheme is the notebook's ``label2id``: 0 is "O", odd ids are
    "B-" tags and the following even id is the matching "I-" tag.

    Args:
        token_tags: tag ids, one per sub-token; entries equal to -100 are skipped.
        word_indices: parallel sequence giving the word each sub-token belongs to.

    Returns:
        A list with one tag id per word that has at least one non-skipped
        sub-token. A word takes the tag of its first sub-token; if that tag is
        "O" and a later sub-token of the same word is not, the later tag wins.
        Any "I-" tag that does not continue the same entity (including one on
        the very first word) is rewritten to the corresponding "B-" tag.
    """
    word_tags = []
    for position, tag in enumerate(token_tags):
        if tag == -100:
            continue
        starts_new_word = (
            not word_tags or word_indices[position] != word_indices[position - 1]
        )
        if starts_new_word:
            word_tags.append(tag)
        elif word_tags[-1] == 0 and tag != 0:
            # The word was tagged "O" so far but a later piece carries an entity tag.
            word_tags[-1] = tag

    for position, tag in enumerate(word_tags):
        # The first word has no predecessor; treat it as following "O" so that a
        # leading "I-" tag is promoted to "B-" like any other dangling "I-".
        previous = word_tags[position - 1] if position > 0 else 0
        if tag > 0 and tag % 2 == 0 and tag - previous not in (0, 1):
            word_tags[position] = tag - 1
    return word_tags

def ner_render(tokens, ner_tags, title=None, **kwargs):
    """Render word-level NER tags with displaCy's manual entity view.

    Args:
        tokens: list of words; the displayed text is the words joined by single spaces.
        ner_tags: tag ids parallel to ``tokens``, looked up in ``id2label``.
        title: optional title forwarded to displaCy.
        **kwargs: accepted for call compatibility; currently ignored.

    A "B-" tag opens an entity and an "I-" tag extends the entity of the
    previous word when the type matches. An "I-" tag with nothing to extend
    (first word, after "O", or after a different type) opens a new entity
    instead of failing on an empty list or stretching an unrelated span.
    """
    ents = []
    pos = 0
    open_label = None  # entity type the previous word belonged to, if any
    for word, tag_idx in zip(tokens, ner_tags):
        tag = id2label[tag_idx]
        end = pos + len(word)
        if tag.startswith("I") and open_label == tag.split("-")[1]:
            ents[-1]["end"] = end
        elif tag.startswith(("B", "I")):
            open_label = tag.split("-")[1]
            ents.append({"start": pos, "end": end, "label": open_label})
        else:
            open_label = None
        # +1 accounts for the single space that joins consecutive words below.
        pos = end + 1
    displacy.render({
        "text": " ".join(tokens),
        "ents": ents,
        "title": title
    }, style="ent", manual=True, jupyter=True)

In [None]:
# Rebuild the base model with the same classification head, then attach the fine-tuned adapter.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(BASE_MODEL_NAME, num_labels=len(label2id), id2label=id2label, label2id=label2id)
# .eval() puts every module, including the adapter's dropout layers, in inference
# mode so predictions are deterministic; it returns the model itself.
peft_model = PeftModel.from_pretrained(model, FINETUNED_MODEL).eval()

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Example sentence to tag (an alternative is kept commented out below).
text = "Four days later, he scored his first Premier League goal of the season, assisting Fernando Torres for the opener and then scoring directly from a free-kick in a 2–1 away win against Arsenal at the Emirates Stadium."
# text = "Takuya Takagi scored the winner in the 88th minute, rising to head a Hiroshige Yanagimoto cross towards the Syrian goal which goalkeeper Salem Bitar appeared to have covered but then allowed to slip into the net."

# Whitespace-split words go in as a pre-split sequence so word ids line up with text.split().
inputs = tokenizer(text.split(), is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    first_sequence_logits = peft_model(**inputs).logits[0]
    token_tags = first_sequence_logits.numpy().argmax(-1)

# Drop the first and last positions on both sides before collapsing to word level.
inner_token_tags = token_tags[1:-1]
inner_word_ids = inputs.word_ids()[1:-1]
ner_tags = align_tags_with_words(inner_token_tags, inner_word_ids)

In [None]:
# Show the predicted entities inline, using the same whitespace split as the model input.
ner_render(text.split(), ner_tags)