In [1]:
from transformers import RobertaTokenizerFast, RobertaForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
import torch
from tqdm.notebook import tqdm
import json

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
tokenizer_fast = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)

In [4]:
# https://huggingface.co/datasets/tner/ontonotes5
ds = load_dataset("tner/ontonotes5")

!wget -nc https://huggingface.co/datasets/tner/ontonotes5/resolve/main/dataset/label.json

with open('label.json') as f:
    ds_label_tag_mapping = json.load(f)
    ds_tag_label_mapping = dict((v,k) for k,v in ds_label_tag_mapping.items())
    assert len(ds_label_tag_mapping) == len(ds_tag_label_mapping)

[31mNothing to do - goodbye
[m[m[m[m[m

In [9]:
def tokenize_and_align_labels(examples, idx):
    tokenized_inputs = tokenizer_fast(examples["tokens"], truncation=True, is_split_into_words=True)

    input_ids = tokenized_inputs['input_ids']
    word_ids: list[int] = tokenized_inputs.word_ids()
    labels: list[int] = [0] * len(input_ids)

    assert len(input_ids) == len(word_ids) == len(labels)

    for i, word_id in enumerate(word_ids):
        # word_id is none on first and last padding tokens which are automatically introduced by the tokenizer, setting those to -100
        if word_id == None:
            #assert i == 0 or i == (len(input_ids) - 1)
            labels[i] = -100
        # if this is a continuation of a previous word and the word has an actual meaning
        elif word_id == word_ids[i-1] and examples['tags'][word_id] != 0:
            prev_word_tag: int = examples['tags'][word_ids[i-1]]
            prev_word_label: str = ds_tag_label_mapping[prev_word_tag]

            if prev_word_label.startswith('I-'):
                labels[i] = prev_word_tag
            elif prev_word_label.startswith('B-'):
                labels[i] = ds_label_tag_mapping[prev_word_label.replace('B-', 'I-')]
            else:
                raise Exception(f"Cannot determine label for word_id {word_id} and dataset row {idx}")
        else:
            labels[i] = examples['tags'][word_id]

    tokenized_inputs['labels'] = labels

    return tokenized_inputs
    
train_tokenized_dataset = ds['train'].map(tokenize_and_align_labels, with_indices=True, remove_columns=['tokens', 'tags']).shuffle(seed=5473)

Map:   0%|          | 0/59924 [00:00<?, ? examples/s]

In [13]:
tokenizer_fast([["Hello", "who", "is", "this", "?"], ["Hii", "I", "like", "you"]], truncation=True, is_split_into_words=True)

{'input_ids': [[0, 20920, 54, 2], [0, 289, 4132, 2]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1]]}

In [12]:
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=len(ds_label_tag_mapping))

training_args = TrainingArguments(
    output_dir="./results",
    #eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    tokenizer=tokenizer_fast
)

trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/22473 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).