In [8]:
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    BertForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# CoNLL-2003 dataset
# Later cells index this object by split name ("train", "validation").
DATASET_NAME = "conll2003"
datasets = load_dataset(DATASET_NAME)

# tokenize
# The tokenizer checkpoint is the same one the model cell loads weights from.
TOKENIZER_CHECKPOINT = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to the tokens.

    Args:
        examples: A batch mapping with a "tokens" entry (one list of words per
            sentence) and a parallel "ner_tags" entry (one tag id per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. For
        each sentence, "labels" has one value per produced token:
          * the word's tag id on the first token of every word;
          * -100 on tokens with no word id and on the remaining tokens of a
            word (presumably so the loss skips them; `compute_metrics` in this
            notebook filters out -100 as well).
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=False
    )

    def aligned_tags(sentence_idx):
        """Build the per-token label list for one sentence of the batch."""
        tags = []
        last_seen_word = None
        for word_idx in encoded.word_ids(batch_index=sentence_idx):
            starts_new_word = word_idx is not None and word_idx != last_seen_word
            if starts_new_word:
                tags.append(examples["ner_tags"][sentence_idx][word_idx])
            else:
                # No word id, or a continuation piece of the previous word.
                tags.append(-100)
            last_seen_word = word_idx
        return tags

    batch_size = len(encoded["input_ids"])
    encoded["labels"] = [aligned_tags(idx) for idx in range(batch_size)]
    return encoded

# Drop every original column so that only the tokenizer outputs and the
# aligned "labels" produced by tokenize_and_align_labels remain.
raw_column_names = datasets["train"].column_names
tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    remove_columns=raw_column_names,
    batched=True,
)

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [9]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [5]:
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=63662386429c2e9ea2426a6071b6a101f51b8e501b27c7fbc549f1c05048ded8
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [10]:
# model
# Seed the RNGs *before* building the model: the token-classification head is
# newly initialized (not loaded from the checkpoint), so without a seed each
# run starts from different classifier weights.
SEED = 42
set_seed(SEED)

label_list = datasets["train"].features["ner_tags"].feature.names

# Store the tag names in the model config so saved checkpoints report real
# label names instead of generic placeholders.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Pads each batch at collation time (the tokenize step uses padding=False).
data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir="./ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    save_steps=500,
    seed=SEED,
)

# evaluation
metric = evaluate.load("seqeval")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 to get one predicted label id per position (presumably
            per-token class scores; confirm against the Trainer output), and
            ``labels`` holds the reference ids with -100 on positions to skip.

    Returns:
        A dict with "precision", "recall", "f1" and "accuracy" taken from the
        metric's overall_* results ("accuracy" is None when the metric does
        not report it).
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Reference tag names, skipping the -100 positions.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([label_list[tag] for tag in gold_row if tag != -100])

    # Predicted tag names at exactly the positions kept above.
    true_preds = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [
            label_list[pred]
            for pred, tag in zip(pred_row, gold_row)
            if tag != -100
        ]
        true_preds.append(kept)

    results = metric.compute(predictions=true_preds, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results.get("overall_accuracy")
    return summary

In [15]:
# train
# Collect everything the Trainer needs in one place, then build it.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune, then score the validation split with compute_metrics.
trainer.train()
eval_results = trainer.evaluate()

print("Evaluation results:", eval_results)


Epoch 1/3: 100%|██████████| 4063/4063 [12:34<00:00,  5.38it/s, loss=0.3456, lr=2.0e-05]
Evaluation   : 100%|██████████| 625/625 [00:45<00:00, 13.69it/s]
Saving model checkpoint to ./ner_model/checkpoint-4063
Configuration saved in ./ner_model/checkpoint-4063/config.json
Model weights saved in ./ner_model/checkpoint-4063/pytorch_model.bin
tokenizer config file saved in ./ner_model/checkpoint-4063/tokenizer_config.json
Special tokens file saved in ./ner_model/checkpoint-4063/special_tokens_map.json
Epoch 2/3: 100%|██████████| 4063/4063 [12:32<00:00,  5.40it/s, loss=0.2124, lr=2.0e-05]
Evaluation   : 100%|██████████| 625/625 [00:45<00:00, 13.73it/s]
Saving model checkpoint to ./ner_model/checkpoint-8126
Epoch 3/3: 100%|██████████| 4063/4063 [12:30<00:00,  5.41it/s, loss=0.1589, lr=2.0e-05]
Evaluation   : 100%|██████████| 625/625 [00:45<00:00, 13.80it/s]

Evaluation results: {'eval_loss': 0.1423,
                     'eval_precision': 0.9104,
                     'eval_recall': 0.8897,
  