In [None]:
import json
import os
import urllib.request

import pandas as pd
import torch
from datasets import load_dataset
from tqdm.notebook import tqdm
from transformers import RobertaTokenizerFast, RobertaForTokenClassification, TrainingArguments, Trainer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# The dataset rows are already split into words (the tokenizer is later called
# with is_split_into_words=True), so the tokenizer is created with
# add_prefix_space=True.
tokenizer_fast = RobertaTokenizerFast.from_pretrained(
    'roberta-base',
    add_prefix_space=True,
)

In [None]:
# https://huggingface.co/datasets/tner/ontonotes5
ds = load_dataset("tner/ontonotes5", keep_in_memory=True, num_proc=os.cpu_count())

# Label-name -> integer-tag mapping published next to the dataset.
LABEL_JSON_URL = "https://huggingface.co/datasets/tner/ontonotes5/resolve/main/dataset/label.json"
LABEL_JSON_PATH = "label.json"

# Download only when the file is missing (same effect as `wget -nc`) using the
# standard library, so the cell does not depend on a `wget` binary being installed.
# The download goes to a temporary name first so an interrupted transfer can never
# leave a truncated label.json that later runs would silently reuse.
if not os.path.exists(LABEL_JSON_PATH):
    partial_path = LABEL_JSON_PATH + ".part"
    urllib.request.urlretrieve(LABEL_JSON_URL, partial_path)
    os.replace(partial_path, LABEL_JSON_PATH)

with open(LABEL_JSON_PATH, encoding="utf-8") as f:
    ds_label_tag_mapping = json.load(f)

# Inverse lookup: integer tag -> label name.
ds_tag_label_mapping = {tag: label for label, tag in ds_label_tag_mapping.items()}
# If two labels shared a tag the inverse mapping would silently drop one of them.
assert len(ds_label_tag_mapping) == len(ds_tag_label_mapping), "label.json maps several labels to the same tag"

In [None]:
def tokenize_and_align_labels(input, idx):
    """Tokenize one pre-split example and assign a tag to every sub-word token.

    Args:
        input: dataset row with 'tokens' (list of words) and 'tags' (one integer
            tag per word).
        idx: index of the row in the dataset; only used in the error message.

    Returns:
        The tokenizer output with an added 'labels' list holding one entry per
        token:
          * -100 for tokens that do not belong to any word (special tokens),
          * the word's own tag for the first sub-token of a word and for every
            sub-token of a word tagged 0,
          * for later sub-tokens of a 'B-X' word the tag of 'I-X', and for later
            sub-tokens of an 'I-X' word the same 'I-X' tag.

    Raises:
        ValueError: if a word with a non-zero tag has a label that starts with
            neither 'B-' nor 'I-'.
    """
    tokenized_inputs = tokenizer_fast(input["tokens"], truncation=True, is_split_into_words=True)

    input_ids = tokenized_inputs['input_ids']
    # One entry per token: the index of the source word, or None when the token
    # has no source word.
    word_ids: list[int | None] = tokenized_inputs.word_ids()
    word_tags = input['tags']
    labels: list[int] = [0] * len(input_ids)

    assert len(input_ids) == len(word_ids) == len(labels)

    for i, word_id in enumerate(word_ids):
        # Tokens without a source word are excluded from the labels with -100.
        if word_id is None:
            labels[i] = -100
            continue

        word_tag: int = word_tags[word_id]
        # `i > 0` keeps the look-behind from wrapping around to the last token.
        is_continuation = i > 0 and word_id == word_ids[i - 1]

        # NOTE(review): tag 0 is treated as "no entity" here - presumably the 'O'
        # label in label.json; confirm against the mapping.
        if is_continuation and word_tag != 0:
            word_label: str = ds_tag_label_mapping[word_tag]

            if word_label.startswith('I-'):
                labels[i] = word_tag
            elif word_label.startswith('B-'):
                # Only the prefix changes; the entity type itself is left untouched.
                labels[i] = ds_label_tag_mapping['I-' + word_label[len('B-'):]]
            else:
                raise ValueError(f"Cannot determine label for word_id {word_id} and dataset row {idx}")
        else:
            labels[i] = word_tag

    tokenized_inputs['labels'] = labels

    return tokenized_inputs

In [None]:
def group_entries(input, max_list_size=512, pad_token_id=1):
    """Pack consecutive tokenized examples into rows of ``max_list_size`` tokens.

    Examples are concatenated in order into the current row for as long as the
    row stays within ``max_list_size``; the first example that does not fit
    starts a new row. Every row is then right-padded to ``max_list_size``.

    Args:
        input: mapping with the parallel columns 'input_ids', 'attention_mask'
            and 'labels', each a list of per-example lists of equal length.
        max_list_size: target length of every output row.
        pad_token_id: value used to pad 'input_ids' (defaults to 1, the value
            that was previously hard-coded).

    Returns:
        A dict with the same three columns. 'attention_mask' is padded with 0
        and 'labels' with -100. An empty batch produces empty columns. A single
        example longer than ``max_list_size`` is kept whole in its own row and
        is not padded.

    The lists inside ``input`` are never modified.
    """
    pad_values = {'input_ids': pad_token_id, 'attention_mask': 0, 'labels': -100}
    output: dict[str, list[list[int]]] = {column: [] for column in pad_values}

    for ids, mask, labels in zip(input['input_ids'], input['attention_mask'], input['labels']):
        assert len(ids) == len(mask) == len(labels)

        rows = output['input_ids']
        # `<=` lets a row be filled to exactly max_list_size. Rows are created
        # lazily, so no empty row can be left behind and padded into an
        # all-padding example.
        if rows and len(rows[-1]) + len(ids) <= max_list_size:
            output['input_ids'][-1].extend(ids)
            output['attention_mask'][-1].extend(mask)
            output['labels'][-1].extend(labels)
        else:
            # Copy so that later extends/padding do not mutate the caller's lists.
            output['input_ids'].append(list(ids))
            output['attention_mask'].append(list(mask))
            output['labels'].append(list(labels))

    for column, pad_value in pad_values.items():
        for row in output[column]:
            # A non-positive difference multiplies to an empty list: no padding.
            row.extend((max_list_size - len(row)) * [pad_value])

    return output

In [None]:
def process_dataset(split: str):
    """Tokenize, shuffle and pack one split of ``ds``.

    The split is first mapped through ``tokenize_and_align_labels`` (dropping the
    raw 'tokens' and 'tags' columns), then shuffled with a fixed seed, and
    finally passed through ``group_entries``.
    """
    worker_count = os.cpu_count()

    aligned = ds[split].map(
        tokenize_and_align_labels,
        with_indices=True,
        remove_columns=['tokens', 'tags'],
        num_proc=worker_count,
    )
    shuffled = aligned.shuffle(seed=5473)

    # batched=True with batch_size=None: group_entries is applied to the split
    # in batched mode without a batch-size limit.
    return shuffled.map(group_entries, batched=True, batch_size=None)


# Prepare the two splits used for training and evaluation.
train_tokenized_dataset = process_dataset('train')
validation_tokenized_dataset = process_dataset('validation')

In [None]:
from sklearn.metrics import accuracy_score

# Store the label names in the model config so checkpoints and predictions carry
# human-readable labels instead of anonymous LABEL_<n> ids.
model = RobertaForTokenClassification.from_pretrained(
    'roberta-base',
    num_labels=len(ds_label_tag_mapping),
    id2label=ds_tag_label_mapping,
    label2id=ds_label_tag_mapping,
)

def compute_metrics(eval_pred):
    """Token-level accuracy over the positions that carry a real label.

    Args:
        eval_pred: pair ``(logits, labels)``; logits have one score per label for
            every token position, labels hold the tag id or -100 per position.

    Returns:
        ``{"accuracy": float}`` - the Trainer expects a dict of named metrics.
    """
    logits, labels = eval_pred
    # The Trainer hands over array-likes rather than tensors; convert before
    # calling tensor methods.
    predictions = torch.as_tensor(logits).argmax(dim=-1)
    labels = torch.as_tensor(labels)
    # Positions labelled -100 (special tokens and padding) must not be scored.
    scored = labels != -100
    # Boolean indexing flattens to 1-D, which is what accuracy_score expects.
    return {"accuracy": accuracy_score(labels[scored].numpy(), predictions[scored].numpy())}

batch_size = 8
num_train_epochs = 3
# len(dataset) is the row count; no need to load the whole 'input_ids' column to count rows.
steps_per_epoch = -(-len(train_tokenized_dataset) // batch_size)  # ceiling division
# Estimate only: assumes a single device and no gradient accumulation.
training_steps = steps_per_epoch * num_train_epochs
# Evaluate / checkpoint / log roughly every 10% of training. At least 1, because
# 0 is not a usable step interval on very small datasets.
steps_per_actions = max(1, training_steps // 10)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    save_steps=steps_per_actions,
    eval_steps=steps_per_actions,
    logging_steps=steps_per_actions
)

# Initialize Trainer
# NOTE(review): recent transformers releases accept `processing_class` in place of
# `tokenizer` - confirm against the installed version before renaming.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=validation_tokenized_dataset,
    tokenizer=tokenizer_fast,
    # Without this the metric function above is never called during evaluation.
    compute_metrics=compute_metrics,
)

trainer.train()