<a href="https://colab.research.google.com/github/Addisu22/Amharic_Ecommerce_Extractor/blob/main/Fine_Tune_NER_Model_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 3: Fine Tune NER Model

In [1]:
# %pip install transformers datasets seqeval accelerate
# %pip install evaluate
# %pip install hf_xet
# %pip install huggingface_hub[hf_xet]

In [2]:
import numpy as np
from datasets import Dataset, ClassLabel, Sequence
import pandas as pd
from transformers import TrainingArguments, Trainer
# from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
sys.path.append(os.path.abspath("../scripts"))
import warnings
warnings.filterwarnings('ignore')

In [27]:
os.chdir("..")

In [3]:
import evaluate
metric = evaluate.load("seqeval")

In [8]:
def read_conll(filepath):
    sentences = []
    labels = []
    with open(filepath, "r", encoding="utf-8") as f:
        tokens = []
        tags = []
        for line in f:
            line = line.strip()
            if line == "":
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens = []
                    tags = []
            else:
                splits = line.split()
                tokens.append(splits[0])
                tags.append(splits[-1])
        if tokens:
            sentences.append(tokens)
            labels.append(tags)
    return sentences, labels

# Read full dataset
sentences, labels = read_conll("labeled_amharic_data.conll")

# Restrict to first 30 samples
sentences = sentences[:30]
labels = labels[:30]

# Create Hugging Face Dataset from lists
data_dict = {"tokens": sentences, "ner_tags": labels}
dataset = Dataset.from_dict(data_dict)

# Get unique labels and map to ids
unique_labels = list({tag for doc in labels for tag in doc})
unique_labels.sort()
label_to_id = {l: i for i, l in enumerate(unique_labels)}

# Convert labels to ids
def encode_tags(tags):
    return [label_to_id[tag] for tag in tags]

# Apply label encoding
dataset = dataset.map(lambda x: {"ner_tags": encode_tags(x["ner_tags"])})

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [9]:
print(dataset[0])

{'tokens': ['ፎርኤቨር', 'ብራይት፦', 'ለጥርስዎ', 'ጥንካሬ', 'ፅዳት', 'እና', 'ንጣት', 'በአንድ', 'የያዘ', 'ከሬት', 'እና', 'ማር', 'የተቀመመ', 'የ', 'ምርት', 'ነው'], 'ner_tags': [4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4]}


In [10]:
from transformers import AutoTokenizer

label_list = ['O', 'B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE']
label_to_id = {label: i for i, label in enumerate(label_list)}

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [11]:
model_name = "xlm-roberta-base"  # or 'bert-base-multilingual-cased', 'bert-tiny-amharic', 'afroxmlr'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(unique_labels)
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # map tokens to words
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # ignore token for loss calculation
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                # For tokens inside a word, assign the same label or set to -100 to ignore
                label_ids.append(label[word_idx] if unique_labels[label[word_idx]].startswith("I-") else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [13]:
print(tokenized_dataset[0].keys())

dict_keys(['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'])


In [14]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = tokenized_dataset["train"]
val_dataset = tokenized_dataset["test"]

Set up training arguments and Trainer

In [15]:
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)
    true_predictions = [
        [unique_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [unique_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

training_args = TrainingArguments(
    output_dir="./amharic_ner_model",
    eval_strategy="epoch",
    eval_steps=50,
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    do_train=True,
    do_eval=True,
    logging_steps=100,
    save_total_limit=1,
    report_to="none" # ✅ disables wandb logging
)

Initialize Trainer and train

In [16]:
# using Hugging Face's Trainer class to fine-tune a token classification model (like NER with XLM-Roberta) on Amharic-labeled text.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.640042,0.0,0.0,0.0,0.81768
2,No log,0.509813,0.0,0.0,0.0,0.839779
3,No log,0.447155,0.5,0.095238,0.16,0.895028


TrainOutput(global_step=72, training_loss=0.6484393543667264, metrics={'train_runtime': 864.2859, 'train_samples_per_second': 0.083, 'train_steps_per_second': 0.083, 'total_flos': 4703469189120.0, 'train_loss': 0.6484393543667264, 'epoch': 3.0})

In [17]:
trainer.evaluate()

{'eval_loss': 0.4471549689769745,
 'eval_precision': 0.5,
 'eval_recall': 0.09523809523809523,
 'eval_f1': 0.16,
 'eval_accuracy': 0.8950276243093923,
 'eval_runtime': 3.4876,
 'eval_samples_per_second': 1.72,
 'eval_steps_per_second': 1.72,
 'epoch': 3.0}

Evaluate and save the model

In [18]:
metrics = trainer.evaluate()
print(metrics)

trainer.save_model("./amharic_ner_model")
tokenizer.save_pretrained("./amharic_ner_model")

{'eval_loss': 0.4471549689769745, 'eval_precision': 0.5, 'eval_recall': 0.09523809523809523, 'eval_f1': 0.16, 'eval_accuracy': 0.8950276243093923, 'eval_runtime': 3.8092, 'eval_samples_per_second': 1.575, 'eval_steps_per_second': 1.575, 'epoch': 3.0}


('./amharic_ner_model/tokenizer_config.json',
 './amharic_ner_model/special_tokens_map.json',
 './amharic_ner_model/sentencepiece.bpe.model',
 './amharic_ner_model/added_tokens.json',
 './amharic_ner_model/tokenizer.json')

In [19]:
print(train_dataset[0])
print(tokenizer.convert_ids_to_tokens(train_dataset[2]['input_ids']))
print(train_dataset[0]['labels'])  # Check if non -100 labels are properly aligned

{'tokens': ['የቤትውን', 'ውበት', 'አንድ', 'ደረጃ', 'ከፍ', 'የሚያደርግ', 'ውሀ', 'የማያስገባ', 'ቅባት', 'ዘይት', 'ነገሮች', 'እሳት', 'እና', 'ተቀጣጣይ', 'ነገሮችን', 'የሚቋቋም', 'ለኪችን', 'ካቢኔት', 'ለኪችንዎ', 'ግርግዳ', 'ለባኞ', 'ቤት', 'ግድግዳ', 'ለቤትዎ', 'ማስዋቢያ', 'በ', 'የቀለም', 'አማራጭ', 'የቀረበ', 'ኪችንዎን', 'እና', 'ቤትዎን', 'ውብ', 'እና', 'ፅዱ', 'ለማድረግ', 'ተመራጭ', 'ዋጋ፦', 'ብር', 'ፒያሳ', 'ጣይቱ', 'ሆቴል', 'ጊቢ', 'ውስጥ', 'ቢሮ', 'ቁ', 'መገናኛ', 'መተባበር', 'ሕንፃ', 'ኛ', 'ፎቅ', 'ቢሮ', 'ቁ', 'ለማዘዝ', 'እነዚን', 'ቁጥሮች', 'ይጠቀሙ', 'በቴሌግራም', 'ለማዘዝ', 'ቤተሰብ', 'ይሁኑ'], 'ner_tags': [2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 2, 2, 2, 4, 4, 2, 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 4, 4, 4, 1, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4], 'input_ids': [0, 198417, 5554, 6, 1178, 4088, 9577, 32966, 82591, 13262, 61860, 6, 1178, 21608, 161408, 2095, 21080, 37751, 38569, 19104, 78953, 75868, 4708, 36821, 2302, 64054, 213445, 3841, 75868, 548, 2627, 182243, 2237, 29654, 23374, 15181, 13942, 11844, 1437, 2237, 29654, 23374, 29597, 6, 85342, 4799, 5040, 2237, 5657, 176354, 11834