# Preparing the data

* Named entity recognition (NER): Find the entities (such as persons, locations, or organizations) in a sentence. This can be formulated as attributing a label to each token by having one class per entity and one class for “no entity.”

In [1]:
# install datasets
!pip install datasets -q

In [2]:
# import  
from datasets import load_dataset

# load the 'wikiann' dataset, the english version
raw_dataset = load_dataset("wikiann", 'en')
# check the data
raw_dataset



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

In [3]:
# shape
raw_dataset.shape

{'validation': (10000, 4), 'test': (10000, 4), 'train': (20000, 4)}

In [4]:
# see the tokens and the ner_tags for an specific example
print(raw_dataset["train"][7]["tokens"])
print(raw_dataset["train"][7]["ner_tags"])

['Antonín', 'Rýgr', '(', '1970–72', ')']
[1, 2, 0, 0, 0]


In [5]:
# create label names
label_names = raw_dataset["train"].features["ner_tags"].feature.names
# check the labels
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

* O means the word doesn’t correspond to any entity.

* B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.

* B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.

* B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.

In [6]:
words = raw_dataset["train"][1]["tokens"]
labels = raw_dataset["train"][1]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

; ' '' Anders Lindström '' ' 
O O O  B-PER  I-PER     O  O 


# Processing the data

As usual, our texts need to be converted to token IDs before the model can make sense of them. A big difference in the case of token classification tasks is that we have pre-tokenized inputs. Fortunately, the tokenizer API can deal with that pretty easily; we just need to warn the tokenizer with a special flag.

To begin, let’s create our tokenizer object. As we said before, we will be using a BERT pretrained model, so we’ll start by downloading and caching the associated tokenizer:

In [7]:
# install
!pip install -U transformers huggingface_hub -qq

In [8]:
# import
from transformers import AutoTokenizer

# load the tokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

To tokenize a pre-tokenized input, we can use our tokenizer as usual and just add is_split_into_words=True:

In [9]:
# create inputs
inputs = tokenizer(raw_dataset["train"][0]["tokens"], is_split_into_words=True)
# check the inputs
inputs.tokens()

['[CLS]',
 'R',
 '.',
 'H',
 '.',
 'Saunders',
 '(',
 'St',
 '.',
 'Lawrence',
 'River',
 ')',
 '(',
 '96',
 '##8',
 'MW',
 ')',
 '[SEP]']

As we can see, the tokenizer added the special tokens used by the model ([CLS] at the beginning and [SEP] at the end) and left most of the words untouched. The word lamb, however, was tokenized into two subwords, la and ##mb. This introduces a mismatch between our inputs and the labels: the list of labels has only 9 elements, whereas our input now has 18 tokens. Accounting for the special tokens is easy (we know they are at the beginning and the end), but we also need to make sure we align all the labels with the proper words.

In [10]:
inputs.word_ids()

[None, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, None]

We can then expand our label list to match the tokens. The first rule we’ll apply is that special tokens get a label of -100. This is because by default -100 is an index that is ignored in the loss function we will use (cross entropy). Then, each token gets the same label as the token that started the word it’s inside, since they are part of the same entity. For tokens inside a word but not at the beginning, we replace the B- with I- (since the token does not begin the entity):

In [11]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [12]:
# create arguments for the function above
labels = raw_dataset["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(word_ids)

[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]
[None, 0, 0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, None]


In [13]:
# call the function
print(align_labels_with_tokens(labels, word_ids))

[-100, 3, 4, 4, 4, 4, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, -100]


To preprocess our whole dataset, we need to tokenize all the inputs and apply align_labels_with_tokens() on all the labels. To take advantage of the speed of our fast tokenizer, it’s best to tokenize lots of texts at the same time, so we’ll write a function that processes a list of examples and use the Dataset.map() method with the option batched=True. The only thing that is different from our previous example is that the word_ids() function needs to get the index of the example we want the word IDs of when the inputs to the tokenizer are lists of texts (or in our case, list of lists of words), so we add that too:

In [14]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

We can now apply all that preprocessing in one go on the other splits of our dataset:

In [15]:
tokenized_datasets = raw_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_dataset["train"].column_names,
)



Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

# Fine-tuning the model with the Trainer API

## Data collation

Here our labels should be padded the exact same way as the inputs so that they stay the same size, using -100 as a value so that the corresponding predictions are ignored in the loss computation.

This is all done by a DataCollatorForTokenClassification. Like the DataCollatorWithPadding, it takes the tokenizer used to preprocess the inputs:

In [16]:
# import
from transformers import DataCollatorForTokenClassification

# load the data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

To test this on a few samples, we can just call it on a list of examples from our tokenized training set:

In [17]:
# create batch with data collator
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
# check the batch
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    3,    4,    4,    4,    4,    0,    3,    4,    4,    4,    0,
            0,    0,    0,    0,    0, -100],
        [-100,    0,    0,    0,    0,    1,    2,    2,    2,    0,    0,    0,
         -100, -100, -100, -100, -100, -100]])

Let’s compare this to the labels for the first and second elements in our dataset:

In [18]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 3, 4, 4, 4, 4, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, -100]
[-100, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, -100]


As we can see, the second set of labels has been padded to the length of the first one using -100s.

## Metrics

To have the Trainer compute a metric every epoch, we will need to define a compute_metrics() function that takes the arrays of predictions and labels, and returns a dictionary with the metric names and values.

The traditional framework used to evaluate token classification prediction is seqeval. To use this metric, we first need to install the seqeval library:

In [19]:
# install
!pip install seqeval 
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
# import
import evaluate

# load the metric
metric = evaluate.load("seqeval")

This metric does not behave like the standard accuracy: it will actually take the lists of labels as strings, not integers, so we will need to fully decode the predictions and labels before passing them to the metric. Let’s see how it works. First, we’ll get the labels for our first training example:

In [21]:
# create labels
labels = raw_dataset["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
# check the labels
labels

['B-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O']

We can then create fake predictions for those by just changing the value at index 2:

In [22]:
predictions = labels.copy()
predictions[2] = "B-ORG"
metric.compute(predictions=[predictions], references=[labels])

{'ORG': {'precision': 0.6666666666666666,
  'recall': 1.0,
  'f1': 0.8,
  'number': 2},
 'overall_precision': 0.6666666666666666,
 'overall_recall': 1.0,
 'overall_f1': 0.8,
 'overall_accuracy': 0.9090909090909091}

This compute_metrics() function first takes the argmax of the logits to convert them to predictions (as usual, the logits and the probabilities are in the same order, so we don’t need to apply the softmax). Then we have to convert both labels and predictions from integers to strings. We remove all the values where the label is -100, then pass the results to the metric.compute() method:

In [23]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

Now that this is done, we are almost ready to define our Trainer. We just need a model to fine-tune!

## Defining the model

Since we are working on a token classification problem, we will use the AutoModelForTokenClassification class. The main thing to remember when defining this model is to pass along some information on the number of labels we have. The easiest way to do this is to pass that number with the num_labels argument, but if we want a nice inference widget working like the one we saw at the beginning of this section, it’s better to set the correct label correspondences instead.

They should be set by two dictionaries, id2label and label2id, which contain the mappings from ID to label and vice versa:

In [24]:
# dictionaries
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

Now we can just pass them to the AutoModelForTokenClassification.from_pretrained() method, and they will be set in the model’s configuration and then properly saved and uploaded to the Hub:

In [25]:
# import
from transformers import AutoModelForTokenClassification

# load the model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [26]:
# check the model has the right number of labels
model.config.num_labels

7

## Fine-tuning the model

We are now ready to train our model! We just need to do two last things before we define our Trainer: log in to Hugging Face and define our training arguments.

In [27]:
# import
from huggingface_hub import notebook_login
from transformers import pipeline

# Hugging Face log-in (use the second token)
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [63]:
# import
from transformers import TrainingArguments

# define the arguments
args = TrainingArguments(
    "Endika99/NLP-TokenClass-NER",
    push_to_hub=True,
    hub_model_id="Endika99/NLP-TokenClass-NER",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [64]:
# import 
from transformers import Trainer

# define the trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# run the train
trainer.train()

Cloning https://huggingface.co/Endika99/NLP-TokenClass-NER into local empty directory.


Download file pytorch_model.bin:   0%|          | 1.40k/411M [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/411M [00:00<?, ?B/s]

***** Running training *****
  Num examples = 20000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2500
  Number of trainable parameters = 107725063


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0902,0.374085,0.811963,0.8425,0.82695,0.925476


***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
Saving model checkpoint to Endika99/NLP-TokenClass-NER/checkpoint-2500
Configuration saved in Endika99/NLP-TokenClass-NER/checkpoint-2500/config.json
Model weights saved in Endika99/NLP-TokenClass-NER/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in Endika99/NLP-TokenClass-NER/checkpoint-2500/tokenizer_config.json
Special tokens file saved in Endika99/NLP-TokenClass-NER/checkpoint-2500/special_tokens_map.json
tokenizer config file saved in Endika99/NLP-TokenClass-NER/tokenizer_config.json
Special tokens file saved in Endika99/NLP-TokenClass-NER/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2500, training_loss=0.10983698272705078, metrics={'train_runtime': 307.3304, 'train_samples_per_second': 65.077, 'train_steps_per_second': 8.135, 'total_flos': 263590935604656.0, 'train_loss': 0.10983698272705078, 'epoch': 1.0})

In [65]:
model_checkpoint = "Endika99/NLP-TokenClass-NER"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint
)

loading configuration file Endika99/NLP-TokenClass-NER/config.json
Model config BertConfig {
  "_name_or_path": "Endika99/NLP-TokenClass-NER",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,

In [85]:
# check if the model is functional
token_classifier("Spain is close to France")

[{'entity': 'B-LOC',
  'score': 0.9983228,
  'index': 1,
  'word': 'Spain',
  'start': 0,
  'end': 5},
 {'entity': 'B-LOC',
  'score': 0.99837095,
  'index': 5,
  'word': 'France',
  'start': 18,
  'end': 24}]

# Upload the fine-tuned model on HuggingFace

In [72]:
# pushing the trainer to the Hugging Face Hub
trainer.push_to_hub('Endika99/NLP-TokenClass-NER')

# pushing the model to the Hugging Face Hub
model.push_to_hub("Endika99/NLP-TokenClass-NER")

Saving model checkpoint to Endika99/NLP-TokenClass-NER
Configuration saved in Endika99/NLP-TokenClass-NER/config.json
Model weights saved in Endika99/NLP-TokenClass-NER/pytorch_model.bin
tokenizer config file saved in Endika99/NLP-TokenClass-NER/tokenizer_config.json
Special tokens file saved in Endika99/NLP-TokenClass-NER/special_tokens_map.json
remote: Scanning LFS files of refs/heads/main for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/Endika99/NLP-TokenClass-NER
   dfcfd4b..6065835  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Endika99/NLP-TokenClass-NER
   dfcfd4b..6065835  main -> main

To https://huggingface.co/Endika99/NLP-TokenClass-NER
   6065835..ae7952a  main -> main

   6065835..ae7952a  main -> main

Configuration saved in Endika99/NLP-TokenClass-NER/config.json
Model weights saved in Endika99/NLP-TokenClass-NER/pytorch_model.bin
Uploading the following files to Endika99/NLP-TokenClass-NER: pytor

CommitInfo(commit_url='https://huggingface.co/Endika99/NLP-TokenClass-NER/commit/78ba8f08f65e8a007df0d8aa9d8f9df112e5cc4b', commit_message='Upload BertForTokenClassification', commit_description='', oid='78ba8f08f65e8a007df0d8aa9d8f9df112e5cc4b', pr_url=None, pr_revision=None, pr_num=None)

In [77]:
# defining the path to the trained model checkpoint on the Hugging Face Hub
model_ckpt = 'Endika99/NLP-TokenClass-NER'

# creating a pipeline for token classification using the fine-tuned model from the Hugging Face Hub
pipe = pipeline('token-classification', model=model_ckpt)

loading configuration file Endika99/NLP-TokenClass-NER/config.json
Model config BertConfig {
  "_name_or_path": "Endika99/NLP-TokenClass-NER",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,

# Gradio 

In [79]:
# instal
!pip install gradio -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.3/14.3 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.9/140.9 KB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 KB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 KB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.5/84.5 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.5/71.5 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 KB[0m [31m7.0 MB/s[0m

In [84]:
# import
import gradio as gr

# defining the classify function which takes tokens as input and returns the label classification
def classify(text):
  # initializing the pipeline for sentiment analysis
  cls = pipe
  
  # predicting the token label for the input text
  return cls(text)[0]['label']

#Creating the Gradio interface with input textbox and output text
gr.Interface(fn=classify, inputs=["textbox"], outputs="text").launch()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

