In [9]:
from transformers import RobertaTokenizerFast, RobertaForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset
from tqdm.notebook import tqdm
import utils

import importlib
importlib.reload(utils)

import torch, os, pandas as pd

In [10]:
# Fast RoBERTa tokenizer for the 'roberta-base' checkpoint.
# add_prefix_space=True is what RobertaTokenizerFast needs when it is fed
# pre-split words — presumably how utils.process_dataset calls it (confirm there).
tokenizer = RobertaTokenizerFast.from_pretrained(
    'roberta-base',
    add_prefix_space=True,
)

In [11]:
# OntoNotes 5 NER corpus: https://huggingface.co/datasets/tner/ontonotes5
# Loaded fully into RAM, using one worker process per CPU core.
ds = load_dataset(
    "tner/ontonotes5",
    num_proc=os.cpu_count(),
    keep_in_memory=True,
)

# Pair of lookup tables provided by the local utils module
# (presumably label->tag and tag->label, going by the names — verify in utils).
(ds_label_tag_mapping, ds_tag_label_mapping) = utils.load_label_mapping()

In [12]:
# Run the project's preprocessing helper over each split with the shared tokenizer.
train_tokenized_dataset = utils.process_dataset(
    ds['train'],
    tokenizer,
)
validation_tokenized_dataset = utils.process_dataset(
    ds['validation'],
    tokenizer,
)

Map (num_proc=8):   0%|          | 0/59924 [00:00<?, ? examples/s]

Map:   0%|          | 0/59924 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/8528 [00:00<?, ? examples/s]

Map:   0%|          | 0/8528 [00:00<?, ? examples/s]

In [None]:
from sklearn.metrics import accuracy_score

# Pretrained 'roberta-base' encoder with a token-classification head that has
# one output per entry in ds_label_tag_mapping.
model = RobertaForTokenClassification.from_pretrained(
    'roberta-base',
    num_labels=len(ds_label_tag_mapping),
)

def compute_metrics(eval_pred):
    """Compute token-level accuracy for a `Trainer` evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` carries the class
            scores on its last axis; ``labels`` has the same shape minus that
            axis. Either may be a numpy array (what `Trainer` supplies) or a
            torch tensor.

    Returns:
        dict: ``{"accuracy": float}`` — the fraction of scored positions whose
        arg-max class equals the label. `Trainer` requires a dict of named
        metrics, so a bare float would fail. ``0.0`` is returned when no
        position is scored.

    NOTE(review): for this to take effect the function must be passed to
    `Trainer(..., compute_metrics=compute_metrics)`.
    """
    logits, labels = eval_pred

    # Trainer hands over numpy arrays; torch.argmax rejects those, so convert
    # first (as_tensor is a no-op for inputs that are already tensors).
    logits = torch.as_tensor(logits)
    labels = torch.as_tensor(labels)
    predictions = logits.argmax(dim=-1)

    # -100 is PyTorch's default cross-entropy ignore_index, conventionally used
    # to mark special / continuation tokens that carry no label. Those
    # positions must not count toward accuracy. (If the preprocessing never
    # emits -100 this mask simply selects everything.)
    scored = labels != -100
    total = int(scored.sum())
    if total == 0:
        return {"accuracy": 0.0}

    correct = int((predictions[scored] == labels[scored]).sum())
    return {"accuracy": correct / total}

# Core hyper-parameters. The epoch count is defined once so the step arithmetic
# below and TrainingArguments cannot drift apart.
batch_size = 8
num_epochs = 3

# Approximate number of optimizer steps for the whole run.
# Assumes a single device and no gradient accumulation (neither is configured
# here) — revisit if that changes.
training_steps = len(train_tokenized_dataset['input_ids']) * num_epochs / batch_size

# Checkpoint / evaluate / log roughly every 10% of the run. Clamp to at least 1:
# on a very small dataset the product truncates to 0, and the step-based
# strategies reject a zero interval.
steps_per_actions = max(1, int(training_steps * 0.10))

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    save_steps=steps_per_actions,
    eval_steps=steps_per_actions,
    logging_steps=steps_per_actions
)

# Build the Trainer from the model, arguments, datasets and tokenizer prepared
# above, then launch fine-tuning.
# NOTE(review): compute_metrics is defined in this cell but is not passed to the
# Trainer, so evaluation reports loss only. Add `compute_metrics=compute_metrics`
# once that function is confirmed to accept the numpy arrays Trainer supplies
# and to return a dict of metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=validation_tokenized_dataset,
    train_dataset=train_tokenized_dataset,
)

trainer.train()