# Task 3: Fine-Tune NER Model

In this task, we fine-tune a Named Entity Recognition (NER) model to extract products, prices, and locations from Amharic Telegram messages. We use a pre-trained multilingual model (e.g., XLM-RoBERTa, bert-tiny-amharic, or afro-xlmr) and Hugging Face's Trainer API for training and evaluation.

## Steps
1. Set up environment and install required libraries
2. Load and parse the labeled CoNLL dataset
3. Tokenize data and align labels
4. Configure model and training arguments
5. Fine-tune using Hugging Face Trainer
6. Evaluate and save the model

In [None]:
# Install required libraries (run in Colab or a GPU environment)
!pip install transformers datasets seqeval

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
import numpy as np
import pandas as pd


In [None]:
# Load and parse the labeled CoNLL dataset

def parse_conll(filepath, default_tag='O'):
    """Read a whitespace-separated CoNLL file into parallel token/tag lists.

    Every non-blank line carries a token in its first column and the token's
    tag in its second column. One or more blank lines end the current
    sentence.

    Args:
        filepath: Path to the CoNLL file to read.
        default_tag: Tag assigned to a token whose line has no tag column.
            Without it, such a line would abort parsing with an IndexError.

    Returns:
        A ``(sentences, labels)`` tuple of equally long lists, where
        ``sentences[i]`` is the token list of the i-th sentence and
        ``labels[i]`` is the matching tag list.
    """
    sentences = []
    labels = []
    tokens = []
    tags = []
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise be glued to the first token of the file.
    with open(filepath, encoding='utf-8-sig') as f:
        for line in f:
            columns = line.split()
            if not columns:
                # Blank line: close the sentence collected so far, if any.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens = []
                    tags = []
                continue
            tokens.append(columns[0])
            # Keep tokens and tags aligned even when the tag column is missing.
            tags.append(columns[1] if len(columns) > 1 else default_tag)
    # The file may end without a trailing blank line.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)
    return sentences, labels

# Parse the labeled subset into parallel lists of token sequences and tag sequences
conll_path = '../data/conll/labeled_subset.conll'
sentences, ner_tags = parse_conll(conll_path)

# Preview the first sentence next to its tags as a quick sanity check
first_tokens, first_tags = sentences[0], ner_tags[0]
print(first_tokens, first_tags)

In [None]:
# Tokenize the sentences and build subword-aligned label ids for the Trainer
MODEL_NAME = 'xlm-roberta-base'  # or 'Davlan/bert-tiny-amharic', 'Davlan/afro-xlmr-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Label vocabulary: every distinct tag in the data, sorted so the ids are stable
label_list = sorted({tag for sentence_tags in ner_tags for tag in sentence_tags})
label2id = {tag: idx for idx, tag in enumerate(label_list)}
id2label = {idx: tag for tag, idx in label2id.items()}

# Sentences are already split into words, so the tokenizer only splits words into subwords
encodings = tokenizer(
    sentences,
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
)


def _align_word_labels(word_ids, word_tags):
    """Map one sentence's word-level tags onto its subword positions.

    Positions that belong to no word (``word_id`` is None) get -100, the value
    that compute_metrics filters out. The first subword of a word gets the
    word's label id. Later subwords of the same word get the label id only
    when the word's tag starts with 'I-'; otherwise they get -100.
    """
    aligned = []
    previous_word = None
    for current_word in word_ids:
        if current_word is None:
            aligned.append(-100)
        else:
            tag = word_tags[current_word]
            starts_new_word = current_word != previous_word
            if starts_new_word or tag.startswith('I-'):
                aligned.append(label2id[tag])
            else:
                aligned.append(-100)
        previous_word = current_word
    return aligned


labels = [
    _align_word_labels(encodings.word_ids(batch_index=row), word_tags)
    for row, word_tags in enumerate(ner_tags)
]
encodings['labels'] = labels

In [None]:
# Wrap the encoded features in a Hugging Face Dataset and hold out 20% for validation
from datasets import Dataset

feature_names = ('input_ids', 'attention_mask', 'labels')
dataset = Dataset.from_dict({name: encodings[name] for name in feature_names})

# A fixed seed is passed so the split can be reproduced
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, val_dataset = dataset['train'], dataset['test']

In [None]:
# Set up model and training arguments
# The classification head is sized to the label vocabulary, and both label
# maps are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    # load_best_model_at_end requires checkpoints to be saved on the same
    # schedule as evaluation; with the default step-based saving,
    # TrainingArguments raises a ValueError.
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Matches the 'f1' key returned by compute_metrics.
    metric_for_best_model='f1',
    logging_dir='./logs',
    logging_steps=10,
)

In [None]:
from seqeval.metrics import classification_report, f1_score

def compute_metrics(p):
    """Score predictions with seqeval, skipping positions labeled -100.

    Args:
        p: A ``(predictions, labels)`` pair. The predictions are reduced to
            label ids with an argmax over axis 2.

    Returns:
        A dict with the seqeval F1 score under 'f1' and the seqeval
        classification report under 'report'.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_labels.append([id2label[gold] for _, gold in kept])
        true_predictions.append([id2label[predicted] for predicted, _ in kept])

    f1 = f1_score(true_labels, true_predictions)
    report = classification_report(true_labels, true_predictions)
    return {'f1': f1, 'report': report}

# Collect everything the Trainer needs: model, arguments, both data splits,
# the metric function, and the tokenizer.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': val_dataset,
    'compute_metrics': compute_metrics,
    'tokenizer': tokenizer,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Train the model on train_dataset using the settings in training_args
trainer.train()

# Evaluate the model on the validation split (the eval_dataset given to the Trainer).
# compute_metrics returns the report under 'report'; it is read back here
# under the key 'eval_report'.
results = trainer.evaluate()
print(results['eval_report'])

# Save the model to a local directory
trainer.save_model('./fine_tuned_ner_model')