# Fine-Tune NER Model for Amharic Telegram Messages (Evaluate Fix)

This notebook demonstrates how to fine-tune a multilingual NER model (XLM-RoBERTa or similar) on Amharic Telegram messages labeled in CoNLL format, using modular utility functions and the new `evaluate` library.

**Steps:**
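1. Import libraries (install `transformers`, `datasets`, `evaluate`, and `seqeval` beforehand; this notebook has no install cell)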
2. Load and parse CoNLL data using modules
3. Tokenize and align labels
4. Model setup and training
5. Evaluation
6. Save model


In [1]:
# 2. Import libraries
import os
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
)
import torch
from sklearn.model_selection import train_test_split
import numpy as np
import random
import sys
sys.path.append('../src/utils')
from ner_data_utils import parse_conll, build_label_maps
import evaluate


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 3. Set seed for reproducibility
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer handed, in order, to Python's `random`, NumPy's legacy
            global RNG, PyTorch's default generator and all CUDA generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(42)


In [3]:
# 4. Load and parse CoNLL data using utility module
# `parse_conll` and `build_label_maps` are imported from the project-local
# `ner_data_utils` module (found via the `../src/utils` entry added to sys.path).
conll_path = '../data/raw/labeled_cnll_manual.txt'  # adjust if needed
# presumably returns two parallel lists (tokens per sentence, tags per sentence)
# — verify against ner_data_utils.parse_conll
sentences, ner_tags = parse_conll(conll_path)
print(f'Loaded {len(sentences)} sentences.')
# NOTE(review): the recorded output of this cell reports only 1 sentence and a
# label set containing 'I-PRICE' but no 'B-PRICE'. Check that the file uses the
# sentence separator parse_conll expects and that the PRICE annotations are
# consistent before trusting any training result.
# The two maps are later used as label2id (tag string -> id) and id2label (id -> tag string).
label2id, id2label = build_label_maps(ner_tags)
print(label2id)


Loaded 1 sentences.
{'B-LOC': 0, 'B-Product': 1, 'I-LOC': 2, 'I-PRICE': 3, 'I-Product': 4, 'O': 5}


In [5]:
# 5. Convert to Hugging Face Dataset
# Two columns: `tokens` and `ner_tags` (presumably one row per parsed sentence
# — verify against parse_conll).
data = pd.DataFrame({'tokens': sentences, 'ner_tags': ner_tags})
dataset = Dataset.from_pandas(data)
if len(dataset) > 1:
    # Hold out 20% of the rows for evaluation; the fixed seed keeps the split
    # identical across re-runs.
    train_test = dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = train_test['train']
    eval_dataset = train_test['test']
else:
    # Not enough rows to split: fall back to using the same data for training
    # and evaluation so the rest of the notebook still runs. Say so loudly,
    # because every metric reported later (and the "best model" selection) is
    # then a training-set score, not a measure of generalisation.
    print(
        f'WARNING: only {len(dataset)} sentence(s) available; '
        'the evaluation set is identical to the training set, '
        'so reported metrics are training-set metrics.'
    )
    train_dataset = dataset
    eval_dataset = dataset


In [8]:
# 6. Load tokenizer and model (choose one)
# NOTE(review): the recorded run of this cell failed with a ConnectionError
# while downloading model.safetensors (DNS resolution / read timeout), i.e. a
# network problem rather than a code problem. Re-run with connectivity or a
# pre-populated local cache.
model_checkpoint = 'xlm-roberta-base'  # or 'Davlan/bert-tiny-amharic-ner', 'Davlan/afro-xlmr-mini'
# Tokenizer matching the chosen checkpoint; also used later for alignment and saving.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Token-classification model sized to the label set derived from the data;
# the id<->label maps are passed along so they become part of the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/xlm-roberta-base/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /xlm-roberta-base/resolve/main/model.safetensors (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001F7540056A0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: b6ffa32b-4b49-4ba6-a911-0037eed8377b)')

In [None]:
# 7. Tokenize and align labels
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and build per-token label ids.

    Args:
        examples: Batch mapping with 'tokens' (list of word lists) and
            'ner_tags' (list of string-tag lists, parallel to 'tokens').

    Returns:
        The tokenizer output with an added 'labels' entry holding one list of
        label ids per sentence:
          * special / padding positions (no source word) get -100,
          * the first sub-token of a word gets the word's own tag,
          * later sub-tokens of the same word get the 'I-' variant of a 'B-'
            tag when that variant exists in `label2id`, otherwise the word's
            own tag.
    """
    encoded = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        # Fixed-length padding keeps every `labels` row the same length as
        # its input row (128 positions).
        padding='max_length',
        max_length=128,
    )

    aligned = []
    for row, word_tags in enumerate(examples['ner_tags']):
        row_ids = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # Special or padding token: excluded from loss and metrics.
                row_ids.append(-100)
                last_word = word
                continue

            tag = word_tags[word]
            if word == last_word and tag.startswith('B-'):
                # Continuation piece of an entity-initial word.
                inside_tag = 'I-' + tag[2:]
                row_ids.append(label2id.get(inside_tag, label2id[tag]))
            else:
                row_ids.append(label2id[tag])
            last_word = word
        aligned.append(row_ids)

    encoded['labels'] = aligned
    return encoded


# Apply the alignment to both splits in batched mode.
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)


In [None]:
# 8. Training arguments
from transformers import TrainingArguments

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (the old name is deprecated). TrainingArguments is a dataclass, so pick
# whichever keyword the installed version actually defines; this keeps the cell
# working on both older and newer releases.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    # Checkpoint every epoch; must match the evaluation schedule because
    # load_best_model_at_end is enabled.
    save_strategy='epoch',
    load_best_model_at_end=True,
    # 'f1' is the key returned by compute_metrics.
    metric_for_best_model='f1',
    logging_dir='./logs',
    logging_steps=50,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    report_to='none',
    # Evaluate once per epoch (keyword name resolved above).
    **{_eval_strategy_key: 'epoch'},
)


In [None]:
# 9. Metrics
metric = evaluate.load('seqeval')


def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: Pair `(predictions, labels)`; `predictions` is reduced with argmax
            over axis 2 and `labels` holds label ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with 'precision', 'recall', 'f1' and 'accuracy', taken from the
        corresponding 'overall_*' entries of the seqeval result.
    """
    logits, gold_ids = p
    pred_ids = np.argmax(logits, axis=2)

    # Reference tag strings, skipping ignored (-100) positions.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id2label[gold] for gold in gold_row if gold != -100])

    # Predicted tag strings at exactly the positions kept above.
    true_predictions = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = []
        for pred, gold in zip(pred_row, gold_row):
            if gold != -100:
                kept.append(id2label[pred])
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary['precision'] = scores['overall_precision']
    summary['recall'] = scores['overall_recall']
    summary['f1'] = scores['overall_f1']
    summary['accuracy'] = scores['overall_accuracy']
    return summary


In [None]:
# 10. Trainer
# Bundles the model, the training configuration, the tokenized train/eval
# splits and the metric callback defined in the earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # NOTE(review): newer transformers releases deprecate `tokenizer=` in
    # favour of `processing_class=` — confirm against the installed version.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# Run fine-tuning with the settings in `training_args`.
trainer.train()


In [None]:
# 11. Evaluate
# No dataset is passed here, so this presumably scores the `eval_dataset` given
# to the Trainer above — confirm. If the split cell fell back to
# eval_dataset == train_dataset, these are training-set scores.
metrics = trainer.evaluate()
print(metrics)


In [None]:
# 12. Save model
# Single source of truth for the export directory, so the model, the tokenizer
# and the confirmation message can never point at different paths.
MODEL_OUTPUT_DIR = './amharic_ner_model'
# Model weights + config.
trainer.save_model(MODEL_OUTPUT_DIR)
# Tokenizer files go into the same directory as the model.
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)
print(f'Model and tokenizer saved to {MODEL_OUTPUT_DIR}')
