In [None]:
import sys
sys.path.append("../src")
from datasets import Dataset
from ner.load_conll import read_conll_file
from ner.tokenize_align import tokenize_and_align_labels
from ner.model_setup import load_model_and_tokenizer
from ner.trainer_setup import setup_trainer

In [4]:
# Load data
# Read the labeled CoNLL file into parallel lists (one entry per sentence)
# and wrap them in a Hugging Face Dataset with a train/test split.
SPLIT_SEED = 42  # fixed so the 80/20 split is identical on every Restart & Run All
sentences, tags = read_conll_file("../data/labeled/ner_data.conll")
data = {"tokens": sentences, "ner_tags": tags}
dataset = Dataset.from_dict(data).train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [6]:
# Prepare labels
# Collect every distinct tag across all sequences; sorting makes the
# tag -> id assignment deterministic regardless of the order in the file.
unique_tags = sorted({tag for seq in tags for tag in seq})
# Forward map (tag string -> integer id) and its inverse (id -> tag string).
label_to_id = dict(zip(unique_tags, range(len(unique_tags))))
id_to_label = dict(enumerate(unique_tags))

In [None]:
# Load model
# The classification head is sized to the number of distinct tags found in
# the labeled data.
model_name = "xlm-roberta-base"
tokenizer, model = load_model_and_tokenizer(model_name, num_labels=len(unique_tags))
# The helper only receives the label count, so record the actual tag names on
# the model config; otherwise the saved checkpoint would expose generic
# LABEL_<n> names at inference time.
# NOTE(review): assumes `model` is a Hugging Face model exposing `.config` — confirm in ner.model_setup.
model.config.id2label = id_to_label
model.config.label2id = label_to_id

In [None]:

# Tokenize
def _tokenize_batch(batch):
    """Run the project tokenizer/label-alignment helper on one batch of examples."""
    return tokenize_and_align_labels(batch, tokenizer, label_to_id)


# batched=True hands the helper a batch of examples per call rather than one row.
tokenized_dataset = dataset.map(_tokenize_batch, batched=True)

In [None]:
# Train
# Build the trainer through the project helper (ner.trainer_setup) and run
# fine-tuning. `tokenized_dataset` is the tokenized form of the train/test
# split created earlier.
# NOTE(review): presumably the helper picks the train/eval splits and training
# arguments itself — confirm in ner.trainer_setup.
trainer = setup_trainer(model, tokenizer, tokenized_dataset)
# Left as the last expression so the training result is shown as the cell output.
trainer.train()

In [None]:
# Save
# Write the fine-tuned model and its tokenizer to the same directory so the
# checkpoint can be reloaded from a single path.
# NOTE(review): this path is relative to the notebook's working directory,
# while src/ and data/ are referenced with a "../" prefix — confirm that
# saving under the notebook folder is intended.
save_dir = "models/xlm-roberta-ner"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)