# Task 4: Model Comparison & Selection

In [2]:
# %pip install transformers datasets seqeval

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import load_dataset
import numpy as np
import pandas as pd
from seqeval.metrics import f1_score
import warnings

warnings.filterwarnings("ignore")

In [3]:
# 1. Load the CoNLL-2003 NER dataset and derive the tag <-> id lookup tables
dataset = load_dataset("conll2003", trust_remote_code=True)

# Tag names come straight from the dataset's own feature metadata, so their
# list positions are the integer ids stored in the "ner_tags" column.
label_list = dataset["train"].features["ner_tags"].feature.names
label2id = dict(zip(label_list, range(len(label_list))))
id2label = {tag_id: tag for tag, tag_id in label2id.items()}

# 2. Tokenization and label alignment
def tokenize_and_align(example, tokenizer):
    """Tokenize pre-split words and align word-level NER tags to sub-tokens.

    Works both for a single example and for a batch (as passed by
    ``Dataset.map(..., batched=True)``). The previous version only handled a
    single example: on a batch it read the word ids of the first row only and
    produced a ``labels`` column of the wrong shape.

    Args:
        example: Mapping with ``"tokens"`` and ``"ner_tags"``. Either one
            example (list of words / list of int tags) or a batch (list of
            such lists).
        tokenizer: Callable tokenizer whose result supports item assignment
            and ``word_ids()`` / ``word_ids(batch_index=i)``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. The first
        sub-token of every word carries that word's tag; positions without a
        word id and continuation sub-tokens carry ``-100``.
    """
    words = example["tokens"]
    tags = example["ner_tags"]
    # A batch is a list of word lists; a single example is a flat list of words.
    is_batched = len(words) > 0 and isinstance(words[0], (list, tuple))

    tokenized = tokenizer(words, truncation=True, is_split_into_words=True)

    def _align(word_ids, word_tags):
        # Label only the first sub-token of each word; mask everything else.
        aligned = []
        prev_word = None
        for idx in word_ids:
            if idx is None or idx == prev_word:
                aligned.append(-100)
            else:
                aligned.append(word_tags[idx])
            prev_word = idx
        return aligned

    if is_batched:
        tokenized["labels"] = [
            _align(tokenized.word_ids(batch_index=row), row_tags)
            for row, row_tags in enumerate(tags)
        ]
    else:
        tokenized["labels"] = _align(tokenized.word_ids(), tags)
    return tokenized

# 3. Models to compare
# Display name (shown in the summary table) -> Hugging Face checkpoint id.
model_names = dict([
    ("XLM-Roberta", "xlm-roberta-base"),
    ("DistilBERT", "distilbert-base-multilingual-cased"),
    ("mBERT", "bert-base-multilingual-cased"),
])

# Collects one {"Model", "F1 Score"} record per candidate model.
results = list()

# 4. Train and evaluate each model
# Quick-demo slice sizes: every model is trained and scored on the same rows.
TRAIN_SAMPLES = 200
EVAL_SAMPLES = 100
RAW_COLUMNS = ['id', 'pos_tags', 'chunk_tags', 'ner_tags']


def compute_metrics(p):
    """Return the seqeval F1 score for one evaluation pass.

    Args:
        p: Prediction object exposing ``predictions`` (per-token class scores,
            arg-maxed over axis 2) and ``label_ids``.

    Returns:
        ``{"f1": <score>}`` computed over positions whose label is not -100.
    """
    preds = np.argmax(p.predictions, axis=2)
    true_preds, true_labels = [], []
    for p_row, l_row in zip(preds, p.label_ids):
        # Keep only positions carrying a real tag (-100 marks masked sub-tokens).
        kept = [(id2label[p_], id2label[l_]) for p_, l_ in zip(p_row, l_row) if l_ != -100]
        true_preds.append([pred for pred, _ in kept])
        true_labels.append([gold for _, gold in kept])
    return {"f1": f1_score(true_labels, true_preds)}


# Slice once, before tokenizing: the rows are model-independent, and this way
# each model only tokenizes the rows it will actually use.
train_raw = dataset["train"].select(range(TRAIN_SAMPLES))
eval_raw = dataset["validation"].select(range(EVAL_SAMPLES))

for name, model_ckpt in model_names.items():
    try:
        print(f" Fine-tuning {name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        model = AutoModelForTokenClassification.from_pretrained(
            model_ckpt, num_labels=len(label_list), id2label=id2label, label2id=label2id
        )

        # Map one example at a time (no batched=True): each call receives a
        # single sentence, so labels line up with that sentence's word ids.
        def encode(ex):
            return tokenize_and_align(ex, tokenizer)

        train_ds = train_raw.map(encode, remove_columns=RAW_COLUMNS)
        eval_ds = eval_raw.map(encode, remove_columns=RAW_COLUMNS)

        # No per-epoch evaluation is configured: the single explicit
        # trainer.evaluate() below already yields the score after the one
        # training epoch, and omitting the option avoids the
        # evaluation_strategy / eval_strategy rename across library versions.
        args = TrainingArguments(
            output_dir=f"./results_{name.replace(' ', '_')}",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=1,
            weight_decay=0.01,
            save_strategy="no",
            logging_dir="./logs"
        )

        # The tokenizer is handed to the collator only; nothing is saved
        # (save_strategy="no"), so the Trainer itself does not need it.
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=eval_ds,
            data_collator=DataCollatorForTokenClassification(tokenizer),
            compute_metrics=compute_metrics
        )

        trainer.train()
        eval_result = trainer.evaluate()
        results.append({"Model": name, "F1 Score": round(eval_result['eval_f1'], 4)})

    except Exception as e:
        # Best effort: one failing model must not abort the comparison, but
        # surface the failure immediately instead of only in the final table.
        print(f" {name} failed: {e}")
        results.append({"Model": name, "F1 Score": f"Error: {str(e)}"})

# 5. Show summary
# Rank by score, best first. Entries whose "F1 Score" is an error message are
# coerced to NaN by the sort key and therefore end up at the bottom.
summary = pd.DataFrame(results).sort_values(
    by="F1 Score",
    ascending=False,
    key=lambda scores: pd.to_numeric(scores, errors='coerce'),
)
print("Model Comparison Summary:\n")
print(summary.to_string(index=False))

 Fine-tuning XLM-Roberta...
 Fine-tuning DistilBERT...


: 