In [1]:
# =============================================================================
# Task 3: Fine-Tuning a Transformer Model for Amharic NER
# This version is updated to handle all specified entities, including optional ones.
# =============================================================================

import numpy as np
from datasets import load_dataset, Dataset, DatasetDict, Features, Sequence, ClassLabel, Value
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from seqeval.metrics import classification_report
import os
import torch

# --- Step 1: Define Comprehensive Label List and Load Dataset ---
# The label_list is expanded to include all possible entities from the project brief.
# This is the key change to fix the previous "Invalid string class label" error.
label_list = [
    "O", 
    "B-PRODUCT", "I-PRODUCT", 
    "B-PRICE", "I-PRICE", 
    "B-LOC", "I-LOC",
    "B-CONTACT_INFO", "I-CONTACT_INFO",
    "B-DELIVERY_FEE", "I-DELIVERY_FEE"
]
labeled_file_path = '../data/labeled_data.txt'

if not os.path.exists(labeled_file_path) or os.path.getsize(labeled_file_path) == 0:
    raise FileNotFoundError(f"Labeled data file not found or is empty at: {labeled_file_path}.")

# --- Step 2: Robustly Parse the CoNLL File into a Dataset ---
def read_conll_file(file_path):
    """Reads a CoNLL-formatted file and converts it into a Hugging Face Dataset."""
    with open(file_path, 'r', encoding='utf-8') as f:
        tokens_list, ner_tags_list = [], []
        tokens, ner_tags = [], []
        for line in f:
            line = line.strip()
            if line == "":
                if tokens:
                    tokens_list.append(tokens)
                    ner_tags_list.append(ner_tags)
                    tokens, ner_tags = [], []
            else:
                parts = line.split()
                if len(parts) == 2 and parts[1] in label_list:
                    tokens.append(parts[0])
                    ner_tags.append(parts[1])
                # Silently ignore lines with invalid labels to prevent crashes
        if tokens: # Add the last sentence
            tokens_list.append(tokens)
            ner_tags_list.append(ner_tags)
    return Dataset.from_dict({'tokens': tokens_list, 'ner_tags': ner_tags_list})

# Create and structure the dataset with the correct features
raw_dataset = read_conll_file(labeled_file_path)
features = Features({
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=label_list))
})
raw_dataset = raw_dataset.cast(features)

# Split into training and test sets
final_datasets = raw_dataset.train_test_split(test_size=0.2, seed=42)
print("Dataset processed and split successfully:\n", final_datasets)
print("\nSample from training data:\n", final_datasets['train'][0])

# --- Step 3: Tokenization and Label Alignment ---
model_checkpoint = "Davlan/afro-xlmr-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    """Tokenizes sentences and aligns labels to the new sub-word tokens."""
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        is_split_into_words=True
    )
    all_aligned_labels = []
    for i, labels_for_one_sentence in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        current_aligned_labels = []
        for word_idx in word_ids:
            if word_idx is None:
                current_aligned_labels.append(-100)
            elif word_idx != previous_word_idx:
                current_aligned_labels.append(labels_for_one_sentence[word_idx])
            else:
                current_aligned_labels.append(-100)
            previous_word_idx = word_idx
        all_aligned_labels.append(current_aligned_labels)
    tokenized_inputs["labels"] = all_aligned_labels
    return tokenized_inputs

tokenized_datasets = final_datasets.map(tokenize_and_align_labels, batched=True)

# --- Step 4: Model Training ---
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list), id2label={i: l for i, l in enumerate(label_list)}, label2id={l: i for i, l in enumerate(label_list)}
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

args = TrainingArguments(
    output_dir="../saved_models/amharic-ner-afro-xlmr",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=25, # Increased epochs for more complex entities
    weight_decay=0.01,
    logging_steps=10,
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def compute_metrics(p):
    """Computes precision, recall, and F1-score for the evaluation set."""
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]
    report = classification_report(true_labels, true_predictions, output_dict=True, zero_division=0)
    return {
        "precision": report["micro avg"]["precision"],
        "recall": report["micro avg"]["recall"],
        "f1-score": report["micro avg"]["f1-score"],
    }

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Starting model training with expanded entity list...")
trainer.train()

# --- Step 5: Final Evaluation and Saving ---
print("\nEvaluating final model on the test set...")
eval_results = trainer.evaluate()
print(f"Final Evaluation results: {eval_results}")

model_save_path = "../saved_models/amharic-ner-afro-xlmr"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"\nModel and tokenizer saved successfully to '{model_save_path}'")

# --- Step 6: Test the Final Model on a Complex Example ---
print("\n--- Testing the fine-tuned model with a pipeline ---")
from transformers import pipeline

ner_pipeline = pipeline("ner", model=model_save_path, aggregation_strategy="simple")
text = "1 pairs Sneaker Crease Protector ዋጋ፦ 400 ብር አድራሻ መገናኛ ቢሮ ቁ. S05/S06 0902660722"
results = ner_pipeline(text)
print(f"Test text: {text}")
for entity in results:
    print(entity)

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
Casting the dataset: 100%|██████████| 37/37 [00:00<00:00, 18683.99 examples/s]


Dataset processed and split successfully:
 DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 29
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 8
    })
})

Sample from training data:
 {'tokens': ['HP', 'PAVILION', 'Price', 'አድራሻ:', '-', 'መገናኛ', 'ማራቶን', 'የ', 'ገበያ', 'ማእከል'], 'ner_tags': [1, 2, 0, 0, 0, 5, 6, 6, 6, 6]}


Map: 100%|██████████| 29/29 [00:00<00:00, 3639.47 examples/s]
Map: 100%|██████████| 8/8 [00:00<00:00, 1963.05 examples/s]
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting model training with expanded entity list...


                                               
  4%|▍         | 4/100 [00:13<04:52,  3.04s/it]

{'eval_loss': 2.0826213359832764, 'eval_precision': 0.4857142857142857, 'eval_recall': 0.425, 'eval_f1-score': 0.45333333333333337, 'eval_runtime': 0.2947, 'eval_samples_per_second': 27.145, 'eval_steps_per_second': 3.393, 'epoch': 1.0}


                                               
  8%|▊         | 8/100 [00:26<04:34,  2.99s/it]

{'eval_loss': 1.5904695987701416, 'eval_precision': 0.6122448979591837, 'eval_recall': 0.75, 'eval_f1-score': 0.6741573033707865, 'eval_runtime': 0.4467, 'eval_samples_per_second': 17.91, 'eval_steps_per_second': 2.239, 'epoch': 2.0}


 10%|█         | 10/100 [00:32<04:55,  3.28s/it]

{'loss': 2.0509, 'grad_norm': 5.453810691833496, 'learning_rate': 2.7000000000000002e-05, 'epoch': 2.5}


                                                
 12%|█▏        | 12/100 [00:39<04:30,  3.07s/it]

{'eval_loss': 1.196598768234253, 'eval_precision': 0.7560975609756098, 'eval_recall': 0.775, 'eval_f1-score': 0.7654320987654322, 'eval_runtime': 0.4105, 'eval_samples_per_second': 19.488, 'eval_steps_per_second': 2.436, 'epoch': 3.0}


                                                
 16%|█▌        | 16/100 [00:52<04:24,  3.15s/it]

{'eval_loss': 0.8953021764755249, 'eval_precision': 0.7560975609756098, 'eval_recall': 0.775, 'eval_f1-score': 0.7654320987654322, 'eval_runtime': 0.4124, 'eval_samples_per_second': 19.399, 'eval_steps_per_second': 2.425, 'epoch': 4.0}


 20%|██        | 20/100 [01:06<04:22,  3.28s/it]

{'loss': 1.0779, 'grad_norm': 3.873598575592041, 'learning_rate': 2.4e-05, 'epoch': 5.0}


                                                
 20%|██        | 20/100 [01:06<04:22,  3.28s/it]

{'eval_loss': 0.7161499261856079, 'eval_precision': 0.7441860465116279, 'eval_recall': 0.8, 'eval_f1-score': 0.7710843373493975, 'eval_runtime': 0.423, 'eval_samples_per_second': 18.91, 'eval_steps_per_second': 2.364, 'epoch': 5.0}


                                                
 24%|██▍       | 24/100 [01:20<04:08,  3.26s/it]

{'eval_loss': 0.5566896200180054, 'eval_precision': 0.8095238095238095, 'eval_recall': 0.85, 'eval_f1-score': 0.8292682926829269, 'eval_runtime': 0.6102, 'eval_samples_per_second': 13.111, 'eval_steps_per_second': 1.639, 'epoch': 6.0}


                                                
 28%|██▊       | 28/100 [01:32<03:42,  3.10s/it]

{'eval_loss': 0.43937015533447266, 'eval_precision': 0.8780487804878049, 'eval_recall': 0.9, 'eval_f1-score': 0.888888888888889, 'eval_runtime': 0.3918, 'eval_samples_per_second': 20.42, 'eval_steps_per_second': 2.552, 'epoch': 7.0}


 30%|███       | 30/100 [01:39<03:47,  3.25s/it]

{'loss': 0.5857, 'grad_norm': 2.8827106952667236, 'learning_rate': 2.1e-05, 'epoch': 7.5}


                                                
 32%|███▏      | 32/100 [01:45<03:25,  3.03s/it]

{'eval_loss': 0.3294372260570526, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.4964, 'eval_samples_per_second': 16.114, 'eval_steps_per_second': 2.014, 'epoch': 8.0}


                                                
 36%|███▌      | 36/100 [01:57<03:11,  2.99s/it]

{'eval_loss': 0.25700685381889343, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.3886, 'eval_samples_per_second': 20.589, 'eval_steps_per_second': 2.574, 'epoch': 9.0}


 40%|████      | 40/100 [02:09<02:58,  2.98s/it]

{'loss': 0.2803, 'grad_norm': 2.309680461883545, 'learning_rate': 1.8e-05, 'epoch': 10.0}


                                                
 40%|████      | 40/100 [02:10<02:58,  2.98s/it]

{'eval_loss': 0.20938891172409058, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.4635, 'eval_samples_per_second': 17.261, 'eval_steps_per_second': 2.158, 'epoch': 10.0}


                                                
 44%|████▍     | 44/100 [02:22<02:47,  3.00s/it]

{'eval_loss': 0.18921521306037903, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.4651, 'eval_samples_per_second': 17.202, 'eval_steps_per_second': 2.15, 'epoch': 11.0}


                                                
 48%|████▊     | 48/100 [02:35<02:33,  2.95s/it]

{'eval_loss': 0.18794341385364532, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.3895, 'eval_samples_per_second': 20.539, 'eval_steps_per_second': 2.567, 'epoch': 12.0}


 50%|█████     | 50/100 [02:41<02:36,  3.12s/it]

{'loss': 0.1468, 'grad_norm': 2.207854747772217, 'learning_rate': 1.5e-05, 'epoch': 12.5}


                                                
 52%|█████▏    | 52/100 [02:47<02:24,  3.01s/it]

{'eval_loss': 0.1768486499786377, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.3791, 'eval_samples_per_second': 21.103, 'eval_steps_per_second': 2.638, 'epoch': 13.0}


                                                
 56%|█████▌    | 56/100 [03:00<02:14,  3.06s/it]

{'eval_loss': 0.17175722122192383, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.3836, 'eval_samples_per_second': 20.854, 'eval_steps_per_second': 2.607, 'epoch': 14.0}


 60%|██████    | 60/100 [03:12<01:58,  2.95s/it]

{'loss': 0.0954, 'grad_norm': 0.6211458444595337, 'learning_rate': 1.2e-05, 'epoch': 15.0}


                                                
 60%|██████    | 60/100 [03:12<01:58,  2.95s/it]

{'eval_loss': 0.17363359034061432, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.3946, 'eval_samples_per_second': 20.272, 'eval_steps_per_second': 2.534, 'epoch': 15.0}


                                                
 64%|██████▍   | 64/100 [03:25<01:47,  2.99s/it]

{'eval_loss': 0.17316783964633942, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.3952, 'eval_samples_per_second': 20.241, 'eval_steps_per_second': 2.53, 'epoch': 16.0}


                                                
 68%|██████▊   | 68/100 [03:37<01:35,  2.99s/it]

{'eval_loss': 0.175831601023674, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.384, 'eval_samples_per_second': 20.831, 'eval_steps_per_second': 2.604, 'epoch': 17.0}


 70%|███████   | 70/100 [03:44<01:36,  3.22s/it]

{'loss': 0.0821, 'grad_norm': 0.9661248326301575, 'learning_rate': 9e-06, 'epoch': 17.5}


                                                
 72%|███████▏  | 72/100 [03:50<01:25,  3.07s/it]

{'eval_loss': 0.18116715550422668, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.3977, 'eval_samples_per_second': 20.117, 'eval_steps_per_second': 2.515, 'epoch': 18.0}


                                                
 76%|███████▌  | 76/100 [04:02<01:10,  2.95s/it]

{'eval_loss': 0.18500234186649323, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.3888, 'eval_samples_per_second': 20.576, 'eval_steps_per_second': 2.572, 'epoch': 19.0}


 80%|████████  | 80/100 [04:14<00:58,  2.92s/it]

{'loss': 0.0527, 'grad_norm': 1.0591944456100464, 'learning_rate': 6e-06, 'epoch': 20.0}


                                                
 80%|████████  | 80/100 [04:15<00:58,  2.92s/it]

{'eval_loss': 0.18638241291046143, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.3789, 'eval_samples_per_second': 21.116, 'eval_steps_per_second': 2.64, 'epoch': 20.0}


                                                
 84%|████████▍ | 84/100 [04:27<00:48,  3.03s/it]

{'eval_loss': 0.18689016997814178, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.4381, 'eval_samples_per_second': 18.259, 'eval_steps_per_second': 2.282, 'epoch': 21.0}


                                                
 88%|████████▊ | 88/100 [04:40<00:37,  3.14s/it]

{'eval_loss': 0.1885034590959549, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.4669, 'eval_samples_per_second': 17.135, 'eval_steps_per_second': 2.142, 'epoch': 22.0}


 90%|█████████ | 90/100 [04:48<00:34,  3.41s/it]

{'loss': 0.0494, 'grad_norm': 0.41516947746276855, 'learning_rate': 3e-06, 'epoch': 22.5}


                                                
 92%|█████████▏| 92/100 [04:54<00:25,  3.16s/it]

{'eval_loss': 0.18892742693424225, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.3994, 'eval_samples_per_second': 20.031, 'eval_steps_per_second': 2.504, 'epoch': 23.0}


                                                
 96%|█████████▌| 96/100 [05:08<00:12,  3.24s/it]

{'eval_loss': 0.1882058084011078, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.4298, 'eval_samples_per_second': 18.611, 'eval_steps_per_second': 2.326, 'epoch': 24.0}


100%|██████████| 100/100 [05:20<00:00,  3.11s/it]

{'loss': 0.0432, 'grad_norm': 0.46382230520248413, 'learning_rate': 0.0, 'epoch': 25.0}


                                                 
100%|██████████| 100/100 [05:21<00:00,  3.21s/it]


{'eval_loss': 0.18778850138187408, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.5301, 'eval_samples_per_second': 15.093, 'eval_steps_per_second': 1.887, 'epoch': 25.0}
{'train_runtime': 321.1992, 'train_samples_per_second': 2.257, 'train_steps_per_second': 0.311, 'train_loss': 0.4464517691731453, 'epoch': 25.0}

Evaluating final model on the test set...


100%|██████████| 1/1 [00:00<00:00, 184.43it/s]


Final Evaluation results: {'eval_loss': 0.18778850138187408, 'eval_precision': 0.9743589743589743, 'eval_recall': 0.95, 'eval_f1-score': 0.9620253164556962, 'eval_runtime': 0.4634, 'eval_samples_per_second': 17.263, 'eval_steps_per_second': 2.158, 'epoch': 25.0}

Model and tokenizer saved successfully to '../saved_models/amharic-ner-afro-xlmr'

--- Testing the fine-tuned model with a pipeline ---
Test text: 1 pairs Sneaker Crease Protector ዋጋ፦ 400 ብር አድራሻ መገናኛ ቢሮ ቁ. S05/S06 0902660722
{'entity_group': 'PRODUCT', 'score': np.float32(0.99399006), 'word': '1 pairs Sneaker Crease Protector', 'start': 0, 'end': 32}
{'entity_group': 'PRICE', 'score': np.float32(0.9858272), 'word': '400 ብር', 'start': 37, 'end': 43}
{'entity_group': 'LOC', 'score': np.float32(0.87952137), 'word': 'መገናኛ ቢሮ ቁ. S05/S06', 'start': 49, 'end': 67}
{'entity_group': 'CONTACT_INFO', 'score': np.float32(0.99241734), 'word': '09', 'start': 68, 'end': 70}
{'entity_group': 'CONTACT_INFO', 'score': np.float32(0.9698542), 'w