In [20]:
!pip install evaluate seqeval
# 2. Imports
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
from evaluate import load
import numpy as np
import json



In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Parse CoNLL
def read_conll_file(file_path):
    sentences, labels = [], []
    sentence, label_seq = [], []

    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    labels.append(label_seq)
                    sentence, label_seq = [], []
            else:
                token, tag = line.split()
                sentence.append(token)
                label_seq.append(tag)
    if sentence:
        sentences.append(sentence)
        labels.append(label_seq)
    return sentences, labels

tokens, ner_tags = read_conll_file("drive/MyDrive/amharic-ner/ner_auto_labels.conll")

label_list = sorted(set(tag for seq in ner_tags for tag in seq))
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in enumerate(label_list)}
ner_ids = [[label2id[tag] for tag in seq] for seq in ner_tags]

In [24]:
from sklearn.model_selection import train_test_split
_, test_tokens, _, test_labels = train_test_split(tokens, ner_ids, test_size=0.2, random_state=42)

test_dataset = Dataset.from_dict({
    "tokens": test_tokens,
    "ner_tags": test_labels
})

# 5. Tokenization function
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    word_ids = tokenized_inputs.word_ids()
    prev_word_id = None
    for word_id in word_ids:
        if word_id is None:
            labels.append(-100)
        elif word_id != prev_word_id:
            labels.append(example["ner_tags"][word_id])
        else:
            labels.append(example["ner_tags"][word_id])
        prev_word_id = word_id

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [25]:
# 6. Define metrics
metric = load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [27]:
# 7. Evaluate all models
model_paths = {
    "xlm-roberta": "drive/MyDrive/models/xlm-roberta/final",
    "bert-base-multilingual-cased": "drive/MyDrive/models/bert-base-multilingual-cased/final",
    "afroxlmr": "drive/MyDrive/models/afroxlmr/final"
}

results = {}

for name, path in model_paths.items():
    print(f"🔍 Evaluating {name}...")

    tokenizer = AutoTokenizer.from_pretrained(path)
    tokenized_test = test_dataset.map(tokenize_and_align_labels, batched=False)

    model = AutoModelForTokenClassification.from_pretrained(
        path,
        id2label=id2label,
        label2id=label2id
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        compute_metrics=compute_metrics,
        eval_dataset=tokenized_test
    )

    eval_result = trainer.evaluate()
    results[name] = eval_result

    print(f"✅ {name} Evaluation:")
    for k, v in eval_result.items():
        print(f"{k}: {v:.4f}")
    print("-" * 40)

🔍 Evaluating xlm-roberta...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  trainer = Trainer(


  _warn_prf(average, modifier, msg_start, len(result))


✅ xlm-roberta Evaluation:
eval_loss: 0.2308
eval_model_preparation_time: 0.0028
eval_precision: 0.2000
eval_recall: 0.0909
eval_f1: 0.1250
eval_accuracy: 0.9186
eval_runtime: 0.2873
eval_samples_per_second: 69.6040
eval_steps_per_second: 10.4410
----------------------------------------
🔍 Evaluating bert-base-multilingual-cased...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  trainer = Trainer(


  _warn_prf(average, modifier, msg_start, len(result))


✅ bert-base-multilingual-cased Evaluation:
eval_loss: 0.4389
eval_model_preparation_time: 0.0027
eval_precision: 0.2500
eval_recall: 0.0625
eval_f1: 0.1000
eval_accuracy: 0.8576
eval_runtime: 0.3562
eval_samples_per_second: 56.1520
eval_steps_per_second: 8.4230
----------------------------------------
🔍 Evaluating afroxlmr...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory drive/MyDrive/models/afroxlmr/final.

In [None]:
# 8. Save results
with open("drive/MyDrive/model_comparison_results.json", "w") as f:
    json.dump(results, f, indent=2)