In [None]:
!pip install evaluate seqeval
# 2. Imports
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
from evaluate import load
import numpy as np
import json

In [None]:
# 3. Define label mappings (must match training!)
# The position of each tag in label_list is its integer class id.
label_list = ['O', 'B-Product', 'I-Product', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC']

# Tag string -> integer id (used to encode the gold tags read from the CoNLL file).
label2id = {label: i for i, label in enumerate(label_list)}

# Integer id -> tag string. enumerate() yields (index, label) pairs, so the
# index must be unpacked first; swapping the two produces a second copy of
# label2id instead of its inverse.
id2label = {i: label for i, label in enumerate(label_list)}

# 4. Make the labelled CoNLL file available in the working directory.
# The upload dialog is only opened when the file is not already on disk, so a
# "Restart & Run All" does not block on an interactive prompt every time.
import os

if os.path.exists("amharic_ner_labels.txt"):
    # Nothing was uploaded during this run; keep `uploaded` defined with the
    # same dict type that files.upload() returns.
    uploaded = {}
else:
    # google.colab only exists inside Colab, hence the import is kept local
    # to the branch that actually needs it.
    from google.colab import files
    uploaded = files.upload()  # Upload amharic_ner_labels.txt

In [None]:
# Parse CoNLL
def read_conll_file(file_path):
    """Read a CoNLL-style file into parallel token and tag sequences.

    Every non-blank line describes one token as whitespace-separated columns:
    the first column is the token and the last column is its tag (any columns
    in between are ignored). One or more blank lines end the current sentence.

    Args:
        file_path: Path to a UTF-8 encoded text file (a leading BOM is tolerated).

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of tokens
        of the i-th sentence and ``labels[i]`` is the list of tag strings of
        the same length.

    Raises:
        ValueError: If a non-blank line has fewer than two columns. The message
            names the file and the 1-based line number.
    """
    sentences, labels = [], []
    sentence, label_seq = [], []

    # 'utf-8-sig' strips a byte-order mark if present, which would otherwise
    # be glued to the first token of the file.
    with open(file_path, encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()

            if not fields:
                # Blank line: close the sentence in progress, if any.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label_seq)
                    sentence, label_seq = [], []
                continue

            if len(fields) < 2:
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<token> <tag>', got {line.strip()!r}"
                )

            sentence.append(fields[0])
            label_seq.append(fields[-1])

    # The file may not end with a blank line; flush the trailing sentence.
    if sentence:
        sentences.append(sentence)
        labels.append(label_seq)
    return sentences, labels

# Parse the labelled file, then encode every tag string as its integer id.
# An unknown tag raises KeyError from the label2id lookup.
tokens, ner_tags = read_conll_file("amharic_ner_labels.txt")
ner_ids = [list(map(label2id.__getitem__, seq)) for seq in ner_tags]

In [None]:
from sklearn.model_selection import train_test_split
# Keep only the held-out 20% of sentences; the training portions are discarded.
# NOTE(review): presumably test_size/random_state must equal the values used
# when the models were trained so that the same sentences are held out — confirm.
_, test_tokens, _, test_labels = train_test_split(
    tokens,
    ner_ids,
    test_size=0.2,
    random_state=42,
)

# One row per sentence: the word list and the parallel list of integer tag ids.
test_dataset = Dataset.from_dict(dict(tokens=test_tokens, ner_tags=test_labels))

# 5. Tokenization function
def tokenize_and_align_labels(example, label_all_tokens=True):
    """Tokenize one pre-split sentence and align its word-level tags to sub-tokens.

    Relies on the module-level ``tokenizer`` being bound before the call, and on
    the tokenizer output exposing ``word_ids()``.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"ner_tags"``
            (list of integer tag ids, one per word).
        label_all_tokens: If True (default, the original behavior), every
            sub-token of a word receives that word's tag. If False, only the
            first sub-token of each word is labelled and the remaining ones
            get -100, so each word is scored once.

    Returns:
        The tokenizer output with an added ``"labels"`` list, one entry per
        sub-token. Positions that map to no word get -100, the value
        ``compute_metrics`` filters out.
    """
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_labels = example["ner_tags"]

    labels = []
    prev_word_id = None
    for word_id in tokenized_inputs.word_ids():
        if word_id is None:
            # Position with no source word (word_ids() returned None).
            labels.append(-100)
        elif label_all_tokens or word_id != prev_word_id:
            # First sub-token of a word, or every sub-token when requested.
            labels.append(word_labels[word_id])
        else:
            # Continuation sub-token, masked out when label_all_tokens is False.
            labels.append(-100)
        prev_word_id = word_id

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# 6. Define metrics
# Load the "seqeval" metric through evaluate's load(); compute_metrics below
# calls metric.compute(...) on sequences of label strings.
metric = load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: A ``(predictions, labels)`` pair. The predicted class is taken as
            the argmax over axis 2 of ``predictions``; ``labels`` holds the
            gold ids, with -100 marking positions to ignore.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        copied from the metric's ``overall_*`` entries.
    """
    scores_per_class, gold_ids = p
    pred_ids = np.argmax(scores_per_class, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions whose gold label is not the ignore value.
        kept_pairs = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, gold_id in kept_pairs])
        true_labels.append([label_list[gold_id] for pred_id, gold_id in kept_pairs])

    scores = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [None]:
# 7. Evaluate all models
# Display name -> directory the fine-tuned checkpoint is loaded from.
model_paths = {
    "xlm-roberta": "models/xlm-roberta/final",
    "bert-tiny-amharic": "models/bert-tiny-amharic/final",
    "afroxlmr": "models/afroxlmr/final"
}

# Display name -> metrics dict returned by trainer.evaluate().
results = {}

for name in model_paths:
    path = model_paths[name]
    print(f"🔍 Evaluating {name}...")

    # `tokenizer` has to be rebound before the map call below:
    # tokenize_and_align_labels reads it as a module-level name.
    tokenizer = AutoTokenizer.from_pretrained(path)
    tokenized_test = test_dataset.map(tokenize_and_align_labels, batched=False)

    model = AutoModelForTokenClassification.from_pretrained(
        path, id2label=id2label, label2id=label2id
    )

    # No training arguments are supplied, so the Trainer runs with its defaults.
    collator = DataCollatorForTokenClassification(tokenizer)
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
        eval_dataset=tokenized_test,
    )

    eval_result = trainer.evaluate()
    results[name] = eval_result

    print(f"✅ {name} Evaluation:")
    for metric_key in eval_result:
        metric_value = eval_result[metric_key]
        print(f"{metric_key}: {metric_value:.4f}")
    print("-" * 40)

In [None]:
# 8. Save results
# Write the per-model metrics collected in `results` as indented JSON.
with open("model_comparison_results.json", "w") as out_file:
    out_file.write(json.dumps(results, indent=2))