<a href="https://colab.research.google.com/github/Batmobil/deberta_ner/blob/main/DeBERTa_NER_pretrained.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
### Load dataset

In [None]:
from datasets import load_dataset

In [None]:
# Load the "en" configuration of the WikiANN NER dataset.
# Alternative corpus that was tried: load_dataset("conll2003")
raw_datasets = load_dataset("wikiann", "en")
# Display the available splits and their columns.
raw_datasets

Reusing dataset wikiann (/root/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

In [None]:
# Peek at the pre-split words ("tokens") of the first training example.
raw_datasets["train"][0]["tokens"]

['R.H.',
 'Saunders',
 '(',
 'St.',
 'Lawrence',
 'River',
 ')',
 '(',
 '968',
 'MW',
 ')']

In [None]:
# Integer NER tag ids of the same example (one id per word).
raw_datasets["train"][0]["ner_tags"]

[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]

In [None]:
# Feature descriptor of the tag column; its inner ClassLabel holds the
# tag names, whose list positions are the integer ids stored in "ner_tags".
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)

In [None]:
### Load tokenizer

In [None]:
from transformers import AutoTokenizer

In [None]:
# Adding a prefix space for the DeBERTa tokenizer, so that every pre-split word
# is tokenized as if preceded by a space (see the 'Ġ' markers in the tokens below).
model_name = "microsoft/deberta-base"
model_tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True) 

loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

In [None]:
# Check that a fast tokenizer was loaded; the cells below rely on
# word_ids() of the encoding (presumably only available with fast tokenizers — confirm).
model_tokenizer.is_fast

True

In [None]:
# Tokenize an input that is already split into words (is_split_into_words=True)
# and show the resulting sub-word tokens, including the special tokens.
inputs = model_tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'ĠR',
 '.',
 'H',
 '.',
 'ĠSaunders',
 'Ġ(',
 'ĠSt',
 '.',
 'ĠLawrence',
 'ĠRiver',
 'Ġ)',
 'Ġ(',
 'Ġ9',
 '68',
 'ĠMW',
 'Ġ)',
 '[SEP]']

In [None]:
# For each token, the index of the word it came from (None for special tokens).
inputs.word_ids()

[None, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, None]

In [None]:
# Word-level tags again, for comparison with the longer token-level word ids above.
raw_datasets["train"][0]["ner_tags"]

[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]

In [None]:
# function to align labels with tokens 
# --> special tokens: -100 label id (ignored by cross entropy),
# --> if tokens are inside a word, replace 'B-' with 'I-' 
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level NER label ids to one label id per sub-word token.

  Args:
    labels: label ids, one per word of the sentence.
    word_ids: for every token, the index of the word it belongs to, or None
      for special tokens (as returned by `word_ids()` on an encoding).

  Returns:
    A list with one entry per element of `word_ids`:
      * -100 for special tokens (ignored by the cross-entropy loss),
      * the word's own label for the first token of a word,
      * the corresponding I- label for the following tokens of that word.
  """
  aligned_label_ids = []
  previous_word_id = None
  for word_id in word_ids:
    if word_id is None:
      # special token
      aligned_label_ids.append(-100)
    elif word_id != previous_word_id:
      # first token of a new word keeps the word's label
      aligned_label_ids.append(labels[word_id])
    else:
      # token inside a word: a B- label becomes the matching I- label
      # (all B- label ids have an odd index in the dataset features and the
      # matching I- label id is the next integer)
      label = labels[word_id]
      if label % 2 == 1:
        label += 1
      aligned_label_ids.append(label)
    # Track every token, special ones included, so that the first token after
    # a special token always counts as the start of a word even when the
    # word index repeats.
    previous_word_id = word_id

  return aligned_label_ids

# test on first sentence: print the word-level labels, then the token-level
# labels produced by the alignment (special tokens show up as -100)
test_labels = raw_datasets["train"][0]["ner_tags"]
test_word_ids = inputs.word_ids()
print(test_labels)
print(align_labels_with_tokens(test_labels, test_word_ids))

[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]
[-100, 3, 4, 4, 4, 4, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, -100]


In [None]:
# define tokenize and align labels in one function to use on Dataset with map
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and attach token-level labels.

  Args:
    examples: a batch with a "tokens" column (lists of words) and a
      "ner_tags" column (lists of word-level label ids).

  Returns:
    The tokenizer output for the batch, with an added "labels" entry holding
    the aligned label ids of every sentence.
  """
  encoding = model_tokenizer(
      examples["tokens"], truncation=True, is_split_into_words=True
  )
  # One aligned label list per sentence, in batch order.
  encoding["labels"] = [
      align_labels_with_tokens(word_labels, encoding.word_ids(batch_idx))
      for batch_idx, word_labels in enumerate(examples["ner_tags"])
  ]
  return encoding
# note: inputs are not padded yet; padding is done dynamically by the data collator

In [None]:
# Now we can apply it to the whole dataset the optimized way with map:
# batched=True hands batches of examples to the function, and the original
# columns are dropped so only the tokenizer outputs and "labels" remain.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [None]:
### Fine Tuning

In [None]:
# using a specific data collator that also pads the labels (specific to the token classification task)
from transformers import DataCollatorForTokenClassification

In [None]:
 # Collator that pads a list of examples into one batch, labels included.
 data_collator = DataCollatorForTokenClassification(tokenizer= model_tokenizer)
 # test: collate the first two training examples; the shorter one gets its
 # labels padded with -100 (see the output below)
 batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
 batch["labels"]

tensor([[-100,    3,    4,    4,    4,    4,    0,    3,    4,    4,    4,    0,
            0,    0,    0,    0,    0, -100],
        [-100,    0,    0,    0,    1,    2,    2,    2,    2,    0,    0, -100,
         -100, -100, -100, -100, -100, -100]])

In [None]:
# test original (no padding): the same two examples straight from the dataset,
# to compare with the collated batch above
tokenized_datasets["train"][:2]["labels"]

[[-100, 3, 4, 4, 4, 4, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, -100],
 [-100, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, -100]]

In [None]:
### Metrics
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_metric

In [None]:
metric = load_metric("seqeval")

In [None]:
# Tag names, positioned so that label_names[i] is the name of label id i.
label_names =  ner_feature.feature.names

# Convert the first training example's label ids to their tag names.
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O']

In [None]:
# test seqeval with manual predictions: corrupt one entity tag so that the
# scores are no longer perfect. Index 1 is 'I-ORG' for this sample; index 2 is
# already 'O', so overwriting it would leave predictions identical to labels.
predictions = labels.copy()
predictions[1] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'ORG': {'f1': 1.0, 'number': 2, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

In [None]:
# Define metrics function with overall scores
import numpy as np


def compute_metrics(eval_preds):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        eval_preds: a (logits, labels) pair; the predicted label id of a token
            is the argmax of its logits over the last axis.

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    ignore_index = -100  # label id of positions excluded from scoring
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Reference tag names, skipping ignored positions.
    reference_tags = []
    for gold_row in labels:
        reference_tags.append(
            [label_names[gold] for gold in gold_row if gold != ignore_index]
        )

    # Predicted tag names at exactly the positions kept in the references.
    predicted_tags = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        predicted_tags.append(
            [
                label_names[pred]
                for pred, gold in zip(pred_row, gold_row)
                if gold != ignore_index
            ]
        )

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    return {
        name: scores["overall_" + name]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
### Define the model with labels
# define mappings from ID to labels and back; ids are kept as integers so that
# label2id maps each tag name to the same integer class index used in the
# dataset labels (string ids would end up as "5" instead of 5 in the config)
id2label = dict(enumerate(label_names))
label2id = {label: i for i, label in id2label.items()}


In [None]:
from transformers import AutoModelForTokenClassification

In [None]:
# Load the pretrained checkpoint for token classification, passing the label
# mappings so they are stored in the model config.
deberta_model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                                id2label=id2label,
                                                                label2id=label2id)

loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": "5",
    "B-ORG": "3",
    "B-PER": "1",
    "I-LOC": "6",
    "I-ORG": "4",
    "I-PER": "2",
    "O": "0"
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_

In [None]:
# check: the number of labels in the config should match the number of tag names (7 here)
deberta_model.config.num_labels

7

In [None]:
# fine tuning!
# Log in to the Hugging Face Hub first; the training arguments below set
# push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
## define training argument
from transformers import TrainingArguments

In [None]:
# Training configuration; the first argument is the output directory
# (checkpoints are saved under it, see the training log).
args = TrainingArguments(
    "deberta-finetuned-ner-wikiann",
    evaluation_strategy="epoch",  # run evaluation once per epoch
    save_strategy="epoch",        # save a checkpoint once per epoch
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True              # requires the Hub login done above
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Free memory before training: run Python's garbage collector, then ask
# PyTorch to release its cached CUDA memory.
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Launch training
from transformers import Trainer
trainer = Trainer(
    model=deberta_model,
    args = args,
    # train on 15000 examples picked from the shuffled training split
    # (42 is passed positionally to shuffle — presumably the seed; confirm)
    train_dataset = tokenized_datasets["train"].shuffle(42).select(range(15000)),
    eval_dataset = tokenized_datasets["validation"],
    data_collator = data_collator,      # pads inputs and labels per batch
    compute_metrics = compute_metrics,  # overall precision/recall/f1/accuracy
    tokenizer=model_tokenizer,
)
trainer.train()

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e/cache-7cbe5ca8351fe251.arrow
/content/deberta-finetuned-ner-wikiann is already a clone of https://huggingface.co/baptiste/deberta-finetuned-ner-wikiann. Make sure you pull the latest changes with `repo.git_pull()`.
***** Running training *****
  Num examples = 15000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9375


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2051,0.306252,0.811351,0.832744,0.821908,0.919625


***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
Saving model checkpoint to deberta-finetuned-ner-wikiann/checkpoint-1875
Configuration saved in deberta-finetuned-ner-wikiann/checkpoint-1875/config.json
Model weights saved in deberta-finetuned-ner-wikiann/checkpoint-1875/pytorch_model.bin
tokenizer config file saved in deberta-finetuned-ner-wikiann/checkpoint-1875/tokenizer_config.json
Special tokens file saved in deberta-finetuned-ner-wikiann/checkpoint-1875/special_tokens_map.json
tokenizer config file saved in deberta-finetuned-ner-wikiann/tokenizer_config.json
Special tokens file saved in deberta-finetuned-ner-wikiann/special_tokens_map.json


RuntimeError: ignored

In [None]:
# Push the result to the Hugging Face Hub with a final commit message.
trainer.push_to_hub(commit_message="Training complete")