In [6]:
from datasets import Dataset
import pandas as pd

def read_conll_data(path):
    """Read a CoNLL-style file into parallel lists of tokens and tags.

    Every non-blank line describes one token as whitespace-separated columns.
    The first column is used as the token and the last column as its tag, so
    files carrying extra columns between the two are accepted. Blank lines
    (including runs of several) end the current sentence.

    Args:
        path: Path of the text file to read. It is decoded as UTF-8; a leading
            byte-order mark, if present, is dropped.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of tokens
        of the i-th sentence and ``labels[i]`` the list of tags aligned with it.

    Raises:
        ValueError: If a non-blank line has fewer than two columns, i.e. a
            token without a tag. The message names the offending line.
    """
    sentences, labels = [], []
    sentence, label = [], []

    # 'utf-8-sig' behaves like 'utf-8' but strips a BOM, which would otherwise
    # end up glued to the first token of the file.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            columns = line.split()

            if not columns:
                # Blank line: close the sentence collected so far, if any.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue

            if len(columns) < 2:
                raise ValueError(
                    f"{path}: line {line_no} has no tag column: {line.rstrip()!r}"
                )

            sentence.append(columns[0])
            label.append(columns[-1])

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

# Parse the annotated corpus once; `tokens[i]` and `ner_tags[i]` describe the same sentence.
tokens, ner_tags = read_conll_data(
    path="amharic_ner_labels.txt",
)


In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Pretrained checkpoint that is fine-tuned below.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Size the classification head from the tags that actually occur in the corpus
# instead of a hard-coded count, so the head always matches the label ids
# produced from `ner_tags`.
num_ner_labels = len({tag for sentence_tags in ner_tags for tag in sentence_tags})
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_ner_labels)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/xlm-roberta-base/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


KeyboardInterrupt: 

In [None]:
from datasets import Dataset

# One row per sentence, with the token list and its aligned tag list as columns.
dataset = Dataset.from_dict(dict(tokens=tokens, ner_tags=ner_tags))

# Collect the distinct tags in sorted order. Iterating a plain `set` of strings
# can yield a different order on every kernel start, which would silently change
# which integer id each tag receives between runs.
label_list = sorted({tag for sentence_tags in ner_tags for tag in sentence_tags})

# Two-way lookup between tag names and the integer ids the model is trained on.
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


In [None]:
def tokenize_and_align_labels(example, label_all_tokens=True):
    """Tokenize one pre-split sentence and align its word-level tags to sub-word tokens.

    Relies on the module-level ``tokenizer`` and ``label2id`` objects.

    Args:
        example: Mapping with a ``'tokens'`` list of words and a ``'ner_tags'``
            list holding one tag name per word.
        label_all_tokens: When true (the default, matching the previous
            behaviour) every sub-word piece of a word receives that word's
            label id. When false only the first piece of each word is
            labelled and the remaining pieces get ``-100``.

    Returns:
        The tokenizer output with an added ``'labels'`` entry containing one
        integer per produced token. Tokens that belong to no word (``word_id``
        is ``None``) are labelled ``-100``.
        NOTE(review): ``-100`` is presumably the index the training loss
        ignores — confirm against the model/loss in use.
    """
    tokenized_inputs = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
    word_tags = example['ner_tags']

    labels = []
    prev_word_id = None
    for word_id in tokenized_inputs.word_ids():
        if word_id is None:
            # Token not backed by any input word.
            labels.append(-100)
        elif label_all_tokens or word_id != prev_word_id:
            # First piece of a word, or any piece when every piece is labelled.
            labels.append(label2id[word_tags[word_id]])
        else:
            # Continuation piece that is deliberately left unlabelled.
            labels.append(-100)
        prev_word_id = word_id

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Apply the tokenizer/label alignment to every sentence of the dataset.
tokenized_dataset = dataset.map(function=tokenize_and_align_labels)


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

import inspect


def _pick_kwarg(target, preferred, legacy):
    """Return `preferred` if `target` accepts a parameter of that name, else `legacy`."""
    return preferred if preferred in inspect.signature(target).parameters else legacy


# Hold out part of the sentences for evaluation. Evaluating on the very data the
# model was trained on reports how well it memorised, not how well it generalises.
# The fixed seed keeps the split identical across runs.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    # The per-epoch evaluation switch was renamed from `evaluation_strategy` to
    # `eval_strategy` in newer transformers releases; use whichever is accepted.
    **{_pick_kwarg(TrainingArguments, "eval_strategy", "evaluation_strategy"): "epoch"},
)

# Pads inputs and labels of each batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
    data_collator=data_collator,
    # Same story for the tokenizer argument (`tokenizer` -> `processing_class`).
    **{_pick_kwarg(Trainer, "processing_class", "tokenizer"): tokenizer},
)


In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate with the trainer's configured evaluation dataset and show the result.
metrics = trainer.evaluate()
print(metrics)

In [None]:
# Store the tag names in the model config before saving. Without them the saved
# checkpoint only knows anonymous class indices, and the id -> tag mapping built
# in this session would be lost once the kernel is gone.
# Merging into the existing mapping keeps one entry per output class even if the
# classification head has more classes than tags seen in the data.
model_config = trainer.model.config
model_config.id2label = {**model_config.id2label, **id2label}
model_config.label2id = {name: idx for idx, name in model_config.id2label.items()}

# Write model weights/config and the tokenizer files to the same directory.
trainer.save_model("amharic-ner-model")
tokenizer.save_pretrained("amharic-ner-model")