CoNLL-2003 dataset

In [16]:
from datasets import load_dataset

# Load the "conll2003" dataset; the result is indexed by split name
# ("train" is the split inspected throughout this notebook).
raw_datasets = load_dataset("conll2003")

# Peek at the first training example: its "tokens" field is the sentence
# already split into a list of words.
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [15]:
# The "ner_tags" column stores integer class ids; its feature metadata
# carries the human-readable tag name for each id.
ner_feature = raw_datasets["train"].features["ner_tags"]
# label_names[i] is the tag string for class id i (it is indexed by the
# integer tags when pretty-printing an example further down).
label_names = ner_feature.feature.names

# Display the id -> name table.
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [17]:
# Pretty-print the first training sentence with each NER tag lined up
# underneath the word it belongs to.
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1, line2 = "", ""

for word, label in zip(words, labels):
    full_label = label_names[label]
    # Each column is as wide as the longer of (word, tag) ...
    max_length = max(len(word), len(full_label))
    # ... plus one trailing space that separates it from the next column.
    line1 += word.ljust(max_length + 1)
    line2 += full_label.ljust(max_length + 1)

print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


Processing

In [20]:
def align_labels_with_tokens(labels, word_ids, ignore_index=-100):
    """Expand word-level NER labels to one label per tokenizer token.

    Args:
        labels: Sequence of integer tag ids, one per original word.
        word_ids: For each produced token, the index of the word it came
            from, or ``None`` for a special token that belongs to no word.
        ignore_index: Label assigned to special tokens. The default of
            ``-100`` is the default ``ignore_index`` of PyTorch's
            cross-entropy loss, so those positions are skipped by the loss.

    Returns:
        A new list with ``len(word_ids)`` labels:
          * special tokens get ``ignore_index``;
          * the first token of a word gets that word's label;
          * every following token of the same word gets the word's label,
            with a ``B-XXX`` tag turned into the matching ``I-XXX`` tag.

    Note:
        The B -> I conversion relies on the tag-id layout used by this
        dataset's ``label_names``: ``O`` is 0, every ``B-XXX`` id is odd and
        the corresponding ``I-XXX`` id is the next (even) integer.
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: never contributes to the loss.
            new_labels.append(ignore_index)
        elif word_id != current_word:
            # Start of a new word: keep its label unchanged.
            new_labels.append(labels[word_id])
        else:
            # Continuation of the previous token's word.
            label = labels[word_id]
            # Odd id == B-XXX; the continuation must be tagged I-XXX (id + 1).
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
        current_word = word_id

    return new_labels

In [24]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used for the rest of the notebook.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# The example is already a list of words, so tell the tokenizer not to
# split the input itself (is_split_into_words=True).
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
labels = raw_datasets["train"][0]["ner_tags"]
# One entry per produced token: the index of the source word, or None for
# the special tokens added at the start and end of the sequence.
word_ids = inputs.word_ids()

# Sanity check: word-level labels, the token -> word mapping, and the
# resulting token-level labels, in that order.
print(labels)
print(word_ids)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [25]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (one word list per
            example) and a ``"ner_tags"`` entry (one tag-id list per example).

    Returns:
        The tokenizer output for the batch, with an added ``"labels"`` entry
        holding, for each example, the tags realigned to its tokens by
        ``align_labels_with_tokens``.
    """
    encoding = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(idx) gives the token -> word mapping of the idx-th example,
    # which is what the alignment helper needs alongside that example's tags.
    encoding["labels"] = [
        align_labels_with_tokens(tags, encoding.word_ids(idx))
        for idx, tags in enumerate(examples["ner_tags"])
    ]
    return encoding

In [27]:
# Apply the preprocessing to every split in batches. The original columns
# are dropped so that only the tokenizer outputs and "labels" remain.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map: 100%|██████████| 14041/14041 [00:00<00:00, 28230.24 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 36304.07 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 40690.96 examples/s]
