In [15]:
## Import necessary packages
## (install first if missing: datasets, transformers[torch] / accelerate>=0.26.0, seqeval, numpy)
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import accuracy_score, f1_score, classification_report

In [None]:
## Load the CoNLL-2003 dataset and show which splits/columns it provides
dataset = load_dataset(path="conll2003")

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [None]:
## Model creation
model_checkpoint = 'bert-base-cased'

# Take the tag inventory from the dataset itself instead of hard-coding the
# number of labels, so the classification head always matches `ner_tags`.
ner_label_names = dataset["train"].features["ner_tags"].feature.names

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels = len(ner_label_names),
    # Store readable tag names in the model config rather than generic ids.
    id2label = dict(enumerate(ner_label_names)),
    label2id = {name: idx for idx, name in enumerate(ner_label_names)},
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from pprint import pprint

# Peek at the first training example to see its raw fields and tag ids.
pprint(dataset["train"][0])

{'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'id': '0',
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'tokens': ['EU',
            'rejects',
            'German',
            'call',
            'to',
            'boycott',
            'British',
            'lamb',
            '.']}


In [8]:
## Prepare labels and tokenize the dataset
# Tag names for the `ner_tags` column; later cells index this list with the
# integer label ids to recover the string tags.
label_list = dataset["train"].features["ner_tags"].feature.names

def align_labels(input, label_all_tokens=False):
    """Tokenize a batch of pre-split sentences and align NER labels to tokens.

    Uses the module-level ``tokenizer``. Each sentence in ``input["tokens"]``
    is tokenized with ``is_split_into_words=True`` and its word-level
    ``input["ner_tags"]`` are expanded to one label per produced token.

    Args:
        input: Batch mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of per-word label id lists).
        label_all_tokens: When False (default) only the first sub-word token
            of each word keeps the word's label and the remaining sub-words
            get -100. When True every sub-word token repeats the word's
            label.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        list of label ids per sentence.
    """
    tokenized_inputs = tokenizer(input["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(input["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # Token does not belong to any word: mask it out.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of a new word carries the word's label.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-word. Repeating the tag here would turn one
                # "B-X" word into several "B-X" tokens, which sequence-level
                # metrics read as several entities, so it is masked by default.
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenize-and-align step to every split, one batch at a time.
tokenized_datasets = dataset.map(
    align_labels,
    batched=True,
)

Map: 100%|██████████| 14041/14041 [00:02<00:00, 6169.29 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 8888.79 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 10256.49 examples/s]


In [18]:
def compute_metrics(p):
    """Turn raw model outputs into accuracy and F1 over string tag sequences.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is arg-maxed over
            axis 2, so it is assumed to be per-token class scores shaped
            (batch, sequence, classes) -- TODO confirm against the caller.
            ``labels`` holds label ids, with -100 marking positions to skip.

    Returns:
        Dict with ``"accuracy"`` and ``"f1"`` computed on the tag sequences
        after dropping every position whose label is -100.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Reference tags: every non-masked position of every label row.
    true_labels = []
    for gold_row in label_ids:
        gold_tags = []
        for gold_id in gold_row:
            if gold_id != -100:
                gold_tags.append(label_list[gold_id])
        true_labels.append(gold_tags)

    # Predicted tags at exactly the positions where the reference is kept.
    true_preds = []
    for pred_row, gold_row in zip(predicted_ids, label_ids):
        pred_tags = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id != -100:
                pred_tags.append(label_list[pred_id])
        true_preds.append(pred_tags)

    metrics = {}
    metrics["accuracy"] = accuracy_score(true_labels, true_preds)
    metrics["f1"] = f1_score(true_labels, true_preds)
    return metrics

# Evaluate and checkpoint once per epoch. `load_best_model_at_end` needs
# checkpoints that were both saved and scored; with both strategies set to
# "no" there was nothing to restore and `compute_metrics` was never invoked.
training_args = TrainingArguments(
    "ner-bert-base",
    eval_strategy = "epoch",
    save_strategy = "epoch",  # must match eval_strategy for best-model loading
    learning_rate = 2e-5,
    logging_strategy = "epoch",
    per_device_train_batch_size = 2,
    num_train_epochs = 1,
    load_best_model_at_end = True,
    metric_for_best_model = "f1",  # key returned by compute_metrics
)

# Collator that assembles tokenized examples into token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Wire the model, arguments, data splits and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`