<a href="https://colab.research.google.com/github/Batmobil/deberta_ner/blob/main/DeBERTa_NER_pretrained.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Fine Tuning deberta base model for NER Task

##### Install needed packages

In [1]:
!pip install transformers
!pip install datasets
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Load wikiann dataset that contains connll2003 NER data

In [2]:
from datasets import load_dataset

In [3]:
# raw_datasets = load_dataset("conll2003")
raw_datasets = load_dataset("wikiann", "en")
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

In [4]:
raw_datasets["train"][0]["tokens"]

['R.H.',
 'Saunders',
 '(',
 'St.',
 'Lawrence',
 'River',
 ')',
 '(',
 '968',
 'MW',
 ')']

In [5]:
raw_datasets["train"][0]["ner_tags"]

[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]

In [6]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)

### Load Tokenizer


In [7]:
from transformers import AutoTokenizer

In [8]:
# adding prefix space for deberta model
model_name = "microsoft/deberta-base"
model_tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True) 

In [9]:
# checking fast tokenizer
model_tokenizer.is_fast

True

In [10]:
# tokenize pre-tokenized inputs
inputs = model_tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'ĠR',
 '.',
 'H',
 '.',
 'ĠSaunders',
 'Ġ(',
 'ĠSt',
 '.',
 'ĠLawrence',
 'ĠRiver',
 'Ġ)',
 'Ġ(',
 'Ġ9',
 '68',
 'ĠMW',
 'Ġ)',
 '[SEP]']

In [11]:
inputs.word_ids()

[None, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, None]

In [12]:
raw_datasets["train"][0]["ner_tags"]

[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]

### Tokenize dataset and align labels

In [13]:
# function to align labels with tokens 
# --> special tokens: -100 label id (ignored by cross entropy),
# --> if tokens are inside a word, replace 'B-' with 'I-' 
def align_labels_with_tokens(labels, word_ids):
  aligned_label_ids = []
  previous_word_id = None
  for word_id in word_ids:
    if word_id is None:
      aligned_label_ids.append(-100)
    elif word_id != previous_word_id:
      # new word!
      label_id = labels[word_id]
      aligned_label_ids.append(label_id)
      previous_word_id = word_id
    else:
      # inside of word
      label = labels[previous_word_id]
      # if label starts with B- change it to I-
      # all B- label ids have an odd index in dataset features
      if label % 2 == 1:
        label += 1
      aligned_label_ids.append(label)

  return aligned_label_ids

# test on first sentence
test_labels = raw_datasets["train"][0]["ner_tags"]
test_word_ids = inputs.word_ids()
print(test_labels)
print(align_labels_with_tokens(test_labels, test_word_ids))

[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]
[-100, 3, 4, 4, 4, 4, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, -100]


In [14]:
# define tokenize and align labels in one function to use on Dataset with map
def tokenize_and_align_labels(examples):
  tokenized_inputs = model_tokenizer(examples["tokens"], truncation=True,
                                     is_split_into_words=True)
  all_labels = examples["ner_tags"]
  new_labels = []
  for i, labels in enumerate(all_labels):
    word_ids = tokenized_inputs.word_ids(i)
    new_labels.append(align_labels_with_tokens(labels, word_ids))

  tokenized_inputs["labels"] = new_labels
  return tokenized_inputs
# note: inputs are noter padded, will be done dynamically with data collator

In [15]:
# Now we can apply it on the whole dataset the optimized way with map
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)



  0%|          | 0/10 [00:00<?, ?ba/s]



In [17]:
# using specific data collator to pad labels (specific to toekn classification task)
from transformers import DataCollatorForTokenClassification

In [18]:
 data_collator = DataCollatorForTokenClassification(tokenizer= model_tokenizer)
 # test
 batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
 batch["labels"]

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    3,    4,    4,    4,    4,    0,    3,    4,    4,    4,    0,
            0,    0,    0,    0,    0, -100],
        [-100,    0,    0,    0,    1,    2,    2,    2,    2,    0,    0, -100,
         -100, -100, -100, -100, -100, -100]])

In [19]:
# test original (no padding)
tokenized_datasets["train"][:2]["labels"]

[[-100, 3, 4, 4, 4, 4, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, -100],
 [-100, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, -100]]

### Define Evaluation Metrics

In [20]:
from datasets import load_metric

In [21]:
metric = load_metric("seqeval")

In [22]:
label_names =  ner_feature.feature.names

labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O']

In [23]:
# test seqeval with manul predictions
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [24]:
# Define metrics function with overall scores
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

### Define mapping for indices and labels

In [25]:
### Define the model with labels
# define mappings from ID to labels and back
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}


In [26]:
from transformers import AutoModelForTokenClassification

In [27]:
deberta_model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                                id2label=id2label,
                                                                label2id=label2id)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForTokenClassification: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'deberta.embeddings.position_embeddings.weight']
- This IS expected if you are initializing DebertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initial

In [28]:
# check
deberta_model.config.num_labels

7

In [29]:
# fine tuning!
# Login to push model to huggingface hub
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Define training arguments

In [30]:
## define training argument
from transformers import TrainingArguments

In [31]:
args = TrainingArguments(
    "deberta-finetuned-ner-connll-late-stop",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=7,
    weight_decay=0.01,
    push_to_hub=True
)

### Training Model

In [32]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

In [33]:
# Launch training
from transformers import Trainer
trainer = Trainer(
    model=deberta_model,
    args = args,
    train_dataset = tokenized_datasets["train"].shuffle(42).select(range(15000)),
    eval_dataset = tokenized_datasets["validation"],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer=model_tokenizer,
)
trainer.train()

/content/deberta-finetuned-ner-connll-late-stop is already a clone of https://huggingface.co/baptiste/deberta-finetuned-ner-connll-late-stop. Make sure you pull the latest changes with `repo.git_pull()`.
***** Running training *****
  Num examples = 15000
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13125
  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3408,0.363929,0.746238,0.788704,0.766883,0.896579
2,0.2435,0.293257,0.810437,0.833239,0.82168,0.917845
3,0.1822,0.303414,0.814749,0.838824,0.826611,0.922096
4,0.1402,0.366728,0.827501,0.847448,0.837355,0.923486
5,0.1013,0.429006,0.828492,0.844832,0.836583,0.92275
6,0.0677,0.491432,0.825937,0.847307,0.836485,0.923059
7,0.0439,0.525879,0.830193,0.847095,0.838558,0.922886


***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
Saving model checkpoint to deberta-finetuned-ner-connll-late-stop/checkpoint-1875
Configuration saved in deberta-finetuned-ner-connll-late-stop/checkpoint-1875/config.json
Model weights saved in deberta-finetuned-ner-connll-late-stop/checkpoint-1875/pytorch_model.bin
tokenizer config file saved in deberta-finetuned-ner-connll-late-stop/checkpoint-1875/tokenizer_config.json
Special tokens file saved in deberta-finetuned-ner-connll-late-stop/checkpoint-1875/special_tokens_map.json
tokenizer config file saved in deberta-finetuned-ner-connll-late-stop/tokenizer_config.json
Special tokens file saved in deberta-finetuned-ner-connll-late-stop/special_tokens_map.json
  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
Saving 

TrainOutput(global_step=13125, training_loss=0.16967850872221446, metrics={'train_runtime': 2213.0598, 'train_samples_per_second': 47.446, 'train_steps_per_second': 5.931, 'total_flos': 1592797346146032.0, 'train_loss': 0.16967850872221446, 'epoch': 7.0})

In [35]:
trainer.push_to_hub(commit_message="Training complete")

Saving model checkpoint to deberta-finetuned-ner-connll-late-stop
Configuration saved in deberta-finetuned-ner-connll-late-stop/config.json
Model weights saved in deberta-finetuned-ner-connll-late-stop/pytorch_model.bin
tokenizer config file saved in deberta-finetuned-ner-connll-late-stop/tokenizer_config.json
Special tokens file saved in deberta-finetuned-ner-connll-late-stop/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/529M [00:00<?, ?B/s]

Upload file runs/Sep18_15-43-50_d1971ae9aa49/events.out.tfevents.1663515839.d1971ae9aa49.860.0:  29%|##8      …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/baptiste/deberta-finetuned-ner-connll-late-stop
   acb478b..2d0ab97  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/baptiste/deberta-finetuned-ner-connll-late-stop
   acb478b..2d0ab97  main -> main

To https://huggingface.co/baptiste/deberta-finetuned-ner-connll-late-stop
   2d0ab97..8bbf305  main -> main

   2d0ab97..8bbf305  main -> main



'https://huggingface.co/baptiste/deberta-finetuned-ner-connll-late-stop/commit/2d0ab971c07bfc963233b67206cab3d20129126d'