# Task 4: Model Comparison & Selection

In [46]:
# %pip install transformers datasets seqeval
# %pip install -U transformers datasets seqeval
# !pip install evaluate

In [47]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import random
from datasets import load_dataset
import numpy as np
import pandas as pd
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [48]:
def read_conll(path):
    sentences, tags = [], []
    with open(path, 'r', encoding='utf-8') as f:
        tokens, labels = [], []
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append(tokens)
                    tags.append(labels)
                    tokens, labels = [], []
            else:
                token, label = line.split()[0], line.split()[-1]
                tokens.append(token)
                labels.append(label)
        if tokens:
            sentences.append(tokens)
            tags.append(labels)
    return sentences, tags

tokens, ner_tags = read_conll("labeled_amharic_data.conll")


In [49]:
# Unique labels
unique_labels = sorted(set(tag for seq in ner_tags for tag in seq))
label2id = {l: i for i, l in enumerate(unique_labels)}
id2label = {i: l for l, i in label2id.items()}

# Encode tags
encoded_tags = [[label2id[tag] for tag in seq] for seq in ner_tags]
dataset = Dataset.from_dict({"tokens": tokens, "ner_tags": encoded_tags})

# Split
split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
val_dataset = split["test"]

In [50]:
def align_labels(examples, tokenizer):
    tokenized = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    all_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        aligned = []
        prev = None
        for word_id in word_ids:
            if word_id is None:
                aligned.append(-100)
            elif word_id != prev:
                aligned.append(labels[word_id])
            else:
                aligned.append(labels[word_id])
            prev = word_id
        all_labels.append(aligned)
    tokenized["labels"] = all_labels
    return tokenized

In [52]:
seq_metric = evaluate.load("seqeval")

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    true_preds, true_labels = [], []
    for pred, label in zip(preds, p.label_ids):
        true_preds.append([id2label[p] for (p, l) in zip(pred, label) if l != -100])
        true_labels.append([id2label[l] for (p, l) in zip(pred, label) if l != -100])
    results = seq_metric.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [63]:
model_names = {
    "XLM-Roberta": "xlm-roberta-base",
    "mBERT": "bert-base-multilingual-cased",
    "DistilBERT": "distilbert-base-multilingual-cased"
}

best_model = None
best_f1 = 0.0

for name, model_ckpt in model_names.items():
    print(f"\n🚀 Fine-tuning {name}...")

    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    model = AutoModelForTokenClassification.from_pretrained(model_ckpt, num_labels=len(unique_labels))

    # Preprocess
    tokenized_train = train_dataset.map(lambda x: align_labels(x, tokenizer), batched=True)
    tokenized_val = val_dataset.map(lambda x: align_labels(x, tokenizer), batched=True)

    training_args = TrainingArguments(
        output_dir=f"./{name.replace(' ', '_')}_ner",
        eval_strategy="steps",  # evaluate every N steps
        save_strategy="steps",        # save every N steps
        save_steps=50,               # for example
        eval_steps=50,               # same number of steps
        load_best_model_at_end=True,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_steps=10,
        logging_dir=f"./logs/{name.replace(' ', '_')}",
        metric_for_best_model="f1",
        report_to = "none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_result = trainer.evaluate()
    print(f" {name} F1-Score: {eval_result['eval_f1']:.4f}")

    if eval_result["eval_f1"] > best_f1:
        best_f1 = eval_result["eval_f1"]
        best_model = name


🚀 Fine-tuning XLM-Roberta...


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/93 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 XLM-Roberta F1-Score: 0.0000

🚀 Fine-tuning mBERT...


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/93 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


 mBERT F1-Score: 0.0000

🚀 Fine-tuning DistilBERT...


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/93 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss


 DistilBERT F1-Score: 0.0000


In [64]:
print(f" Best model: {best_model} with F1-score: {best_f1:.4f}")

 Best model: None with F1-score: 0.0000


In [None]:
from nbformat import read, write, NO_CONVERT
import json

# Load notebook
with open("Fine_Tune_NER_Model.ipynb", "r", encoding="utf-8") as f:
    nb = read(f, as_version=NO_CONVERT)

# Remove widgets metadata
nb.metadata.pop("widgets", None)

# Save cleaned notebook
with open("Fine_Tune_NER_Model_CLEANED.ipynb", "w", encoding="utf-8") as f:
    write(nb, f)