# **Mount Google Drive:**

In [107]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Install Necessary Libraries:**

In [108]:
!pip install transformers datasets accelerate peft seqeval



# **Import Libraries:**

In [109]:
import os
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
from seqeval.metrics import precision_score, recall_score, f1_score

# **Load and Parse CoNLL Data**

## **Load CoNLL Data:**

In [110]:
def load_conll(file_path):
    sentences = []
    current_sentence = []
    current_labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # Non-empty line
                # Split the line by the last space or tab to separate token and label
                parts = line.rsplit(None, 1) # Split by whitespace, at most 1 time from the right
                token = parts[0]
                label = parts[1] if len(parts) > 1 else 'O' # Assign 'O' if no label is found (single part)

                current_sentence.append(token)
                current_labels.append(label)
            else:  # Empty line indicates new sentence
                if current_sentence:  # Save non-empty sentences
                    sentences.append({
                        'tokens': current_sentence,
                        'labels': current_labels
                    })
                    current_sentence = []
                    current_labels = []

    # Save the last sentence if it exists
    if current_sentence:
        sentences.append({
            'tokens': current_sentence,
            'labels': current_labels
        })

    return sentences

conll_file = '/content/amharic_labeled.conll.txt'
dataset = load_conll(conll_file)
print(f"Loaded {len(dataset)} sentences from CoNLL file")

Loaded 2359 sentences from CoNLL file


## **Split Data:**

In [111]:
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)
print(f"Training set: {len(train_data)} sentences")
print(f"Validation set: {len(val_data)} sentences")

Training set: 1887 sentences
Validation set: 472 sentences


# **Tokenize and Align Labels**

## **Define Tokenization Function:**

In [112]:
def tokenize_and_align_labels(examples, tokenizer, label2id, max_length=128):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        is_split_into_words=True,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_offsets_mapping=False # We don't need offsets mapping for this approach
    )

    labels = []
    word_ids_list = [] # List to store word_ids for each example

    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        word_ids_list.append(word_ids) # Store word_ids
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens (CLS, SEP, PAD)
            elif word_idx != previous_word_idx:
                # Check if the label exists in label2id before accessing
                if label[word_idx] in label2id:
                     label_ids.append(label2id[label[word_idx]])  # First subword
                else:
                     # Handle cases where a label might not be in label2id (e.g., during inference with unseen labels)
                    label_ids.append(-100) # Or map to 'O' if appropriate, depending on the task
            else:
                # Convert B- to I- for subword tokens
                # Check if the label exists in label2id before accessing
                if label[word_idx].startswith('B-') and 'I-' + label[word_idx][2:] in label2id:
                    label_ids.append(label2id['I-' + label[word_idx][2:]])
                elif label[word_idx] in label2id:
                    label_ids.append(label2id[label[word_idx]])
                else:
                    label_ids.append(-100) # Handle unseen labels


            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    tokenized_inputs['word_ids'] = word_ids_list # Add word_ids to the output
    return tokenized_inputs

## **Convert Data to Hugging Face Dataset:**

In [113]:
models_to_compare = [
    "xlm-roberta-base",
    # "bert-tiny-amharic", # Removed as it caused an error
    # "afro-xlmr-base", # Removed as it caused an error
    "distilbert-base-multilingual-cased"
]

In [114]:
# Define label list and mappings
# Generate label_list from the entire dataset to ensure all labels are included
label_list = sorted(list(set(label for example in dataset for label in example['labels'])))
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

# Initialize tokenizer using the first model checkpoint
model_checkpoint = models_to_compare[0]  # Use the first model to initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

tokenized_train = train_dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer, label2id),
    batched=True,
    remove_columns=['tokens'] # Remove original tokens, but keep original labels
)
tokenized_val = val_dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer, label2id),
    batched=True,
    remove_columns=['tokens'] # Remove original tokens, but keep original labels
)

Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

## **Define Models to Compare:**

## **Define Data Collator:**

In [115]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## **Define Training Arguments:**

In [116]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    eval_strategy="epoch",     # evaluate every epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for logs
    logging_steps=10,
)

## **Define Metrics:**

In [117]:
import numpy as np
from seqeval.scheme import IOB2

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (where label is -100) and map empty string to 'O'
    true_labels = [[id2label[l] if id2label[l] != '' else 'O' for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] if id2label[p] != '' else 'O' for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Add print statements to inspect true_labels and true_predictions
    print("\n--- Debugging compute_metrics ---")
    print("Sample True Labels (first 5):", true_labels[:5])
    print("Sample Predicted Labels (first 5):", true_predictions[:5])
    print("---------------------------------")

    # Ensure all predicted labels are in true_labels for seqeval to work correctly
    # This step can help avoid errors if the model predicts a label that never appears in the true labels.
    # However, the primary fix is mapping '' to 'O'.

    return {
        "precision": precision_score(true_labels, true_predictions, scheme=IOB2),
        "recall": recall_score(true_labels, true_predictions, scheme=IOB2),
        "f1": f1_score(true_labels, true_predictions, scheme=IOB2),
    }

# **Fine-Tune Multiple Models**

## **Fine-Tune Each Model:**

In [118]:
results = []

for model_checkpoint in models_to_compare:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    tokenized_train = train_dataset.map(
        lambda x: tokenize_and_align_labels(x, tokenizer, label2id),
        batched=True,
        remove_columns=['tokens', 'labels']
    )
    tokenized_val = val_dataset.map(
        lambda x: tokenize_and_align_labels(x, tokenizer, label2id),
        batched=True,
        remove_columns=['tokens', 'labels']
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_results = trainer.evaluate()
    # Store the trained model along with its results
    results.append((model_checkpoint, eval_results, model))

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0352,0.022605,0.976004,0.985463,0.980711
2,0.0089,0.010935,0.980066,0.990564,0.985287
3,0.0057,0.007331,0.985331,0.993624,0.98946



--- Debugging compute_metrics ---
Sample True Labels (first 5): [['B-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRICE', 'I-PRICE', 'I-PRICE', 'I-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC',


--- Debugging compute_metrics ---
Sample True Labels (first 5): [['B-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRICE', 'I-PRICE', 'I-PRICE', 'I-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC',

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1887 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0662,0.039853,0.981616,0.974996,0.978295
2,0.0222,0.015296,0.988191,0.991117,0.989652
3,0.0127,0.012474,0.989358,0.994078,0.991712



--- Debugging compute_metrics ---
Sample True Labels (first 5): [['B-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRICE', 'I-PRICE', 'I-PRICE', 'I-PRICE', '


--- Debugging compute_metrics ---
Sample True Labels (first 5): [['B-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRICE', 'I-PRICE', 'I-PRICE', 'I-PRICE', '

# **Evaluate and Compare Models**

## **Print Evaluation Results:**

In [119]:
for model_checkpoint, eval_results, trained_model in results:
    print(f"Model: {model_checkpoint}")
    print(f"Evaluation Results: {eval_results}")

Model: xlm-roberta-base
Evaluation Results: {'eval_loss': 0.00733127212151885, 'eval_precision': 0.9853313100657562, 'eval_recall': 0.9936240754909462, 'eval_f1': 0.9894603174603174, 'eval_runtime': 3.4263, 'eval_samples_per_second': 137.757, 'eval_steps_per_second': 8.756, 'epoch': 3.0}
Model: distilbert-base-multilingual-cased
Evaluation Results: {'eval_loss': 0.012473872862756252, 'eval_precision': 0.989358218729535, 'eval_recall': 0.9940779733508801, 'eval_f1': 0.991712480512021, 'eval_runtime': 1.9155, 'eval_samples_per_second': 246.409, 'eval_steps_per_second': 15.662, 'epoch': 3.0}


## **Select the Best Model:**

In [120]:
best_model = max(results, key=lambda x: x[1]['eval_f1'])
print(f"Best Model: {best_model[0]}")
print(f"Best Evaluation Results: {best_model[1]}")

Best Model: distilbert-base-multilingual-cased
Best Evaluation Results: {'eval_loss': 0.012473872862756252, 'eval_precision': 0.989358218729535, 'eval_recall': 0.9940779733508801, 'eval_f1': 0.991712480512021, 'eval_runtime': 1.9155, 'eval_samples_per_second': 246.409, 'eval_steps_per_second': 15.662, 'epoch': 3.0}


## **Save the Best Model:**

In [121]:
best_model_checkpoint, best_eval_results, best_trained_model = max(results, key=lambda x: x[1]['eval_f1'])
best_model_dir = f'/content/drive/MyDrive/Amharic-Ecommerce-Extractor/models/{best_model_checkpoint}/final'
os.makedirs(best_model_dir, exist_ok=True)

# Use the stored model object to save the model
best_trained_model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

('/content/drive/MyDrive/Amharic-Ecommerce-Extractor/models/distilbert-base-multilingual-cased/final/tokenizer_config.json',
 '/content/drive/MyDrive/Amharic-Ecommerce-Extractor/models/distilbert-base-multilingual-cased/final/special_tokens_map.json',
 '/content/drive/MyDrive/Amharic-Ecommerce-Extractor/models/distilbert-base-multilingual-cased/final/vocab.txt',
 '/content/drive/MyDrive/Amharic-Ecommerce-Extractor/models/distilbert-base-multilingual-cased/final/added_tokens.json',
 '/content/drive/MyDrive/Amharic-Ecommerce-Extractor/models/distilbert-base-multilingual-cased/final/tokenizer.json')

# Project Report: Amharic E-commerce Named Entity Recognition

## 1. Project Goal
The main objective of this project was to build and evaluate different transformer-based models for Named Entity Recognition (NER) on Amharic e-commerce text data. The goal was to identify and classify entities within the text, such as product names, prices, or other relevant e-commerce information.

## 2. Data
The project utilized labeled data in CoNLL format, stored in a file named `conll_labelled_data.conll`. This data contained sentences with tokens and their corresponding labels, indicating the type of named entity they represent. The dataset was loaded and parsed to extract sentences and their labels.

## 3. Data Preparation
- The loaded CoNLL data was split into training and validation sets using `train_test_split` from `sklearn.model_selection` to evaluate model performance on unseen data.
- A critical step was tokenizing the text and aligning the labels for each token. The `tokenize_and_align_labels` function was defined to handle the tokenization using a pre-trained tokenizer and ensure that the labels were correctly aligned with the generated tokens, including handling subword tokens and special tokens.
- The data was converted into Hugging Face `Dataset` objects for compatibility with the `transformers` library's `Trainer`.

## 4. Models Evaluated
Initially, the plan was to compare three transformer models:
- `"xlm-roberta-base"`
- `"bert-tiny-amharic"`
- `"afro-xlmr-base"`
- `"distilbert-base-multilingual-cased"`

However, during the process, two models were removed from the comparison list due to `OSError`s indicating that their identifiers were not found on the Hugging Face Model Hub under the specified names. This left `"xlm-roberta-base"` and `"distilbert-base-multilingual-cased"` for the fine-tuning and evaluation process.

## 5. Fine-tuning Process
- The `transformers.Trainer` API was used for fine-tuning the models.
- `TrainingArguments` were defined to configure the training process, including output directory, evaluation strategy (epoch-based), learning rate, batch sizes, number of epochs (3), weight decay, and logging.
- A `DataCollatorForTokenClassification` was used to prepare batches of data for training, handling padding and other necessary pre-processing steps.
- A custom `compute_metrics` function was defined using `seqeval` to calculate precision, recall, and F1-score, which are standard metrics for NER tasks. This function was crucial for evaluating the performance of the fine-tuned models.

## 6. Challenges Faced and Solutions
- **`FileNotFoundError`:** Initially encountered when loading the CoNLL file. This was resolved by verifying and correcting the file path.
- **`ValueError: not enough values to unpack`:** Occurred during CoNLL parsing due to lines with missing labels. This was fixed by modifying the `load_conll` function to handle lines with only tokens and assign an empty string as the label.
- **`NameError: name 'tokenizer' is not defined`:** Happened when running the tokenization cell independently. This was fixed by moving the tokenizer initialization and label-to-ID mapping definition outside the model training loop.
- **`NameError: name 'Trainer' is not defined`:** Encountered because the `Trainer` class was not imported. This was fixed by adding the necessary import statement from the `transformers` library.
- **`TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'`:** This indicated an incorrect argument name in `TrainingArguments`. The argument name was corrected to `eval_strategy`.
- **`KeyError: ''`:** Occurred in the `tokenize_and_align_labels` function because the empty string label was not included in the `label2id` mapping. This was fixed by ensuring the empty string was included in the `label_list` and mapping.
- **`IndexError: string index out of range`:** Arose in the `compute_metrics` function when `seqeval` encountered the empty string label. This was fixed by mapping the empty string label to 'O' (Outside) before passing the labels to `seqeval` metrics.
- **`OSError: [model_name] is not a local folder and is not a valid model identifier...`:** Encountered for "bert-tiny-amharic" and "afro-xlmr-base". These models were removed from the comparison list as their identifiers were not found on the public Hugging Face Hub.
- **`AttributeError: 'dict' object has no attribute 'save_pretrained'`:** Occurred when trying to save the model because the evaluation results dictionary was being used instead of the trained model object. This was fixed by modifying the training loop to store the trained model object and using that object for saving.

## 7. Evaluation Results
After fine-tuning and evaluating the remaining models (`"xlm-roberta-base"` and `"distilbert-base-multilingual-cased"`), the evaluation results were as follows:

- **Model: xlm-roberta-base**
  Evaluation Results: `{'eval_loss': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 3.287, 'eval_samples_per_second': 143.597, 'eval_steps_per_second': 9.127, 'epoch': 3.0}`

- **Model: distilbert-base-multilingual-cased**
  Evaluation Results: `{'eval_loss': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.8681, 'eval_samples_per_second': 252.67, 'eval_steps_per_second': 16.06, 'epoch': 3.0}`

**Note on Evaluation Results:** The evaluation metrics (precision, recall, F1) are all 0.0. This is unexpected for a successful training run and might indicate an issue in the `compute_metrics` function's logic or how the labels are being processed before being passed to `seqeval`. It could also be that the model is not learning effectively with the current hyperparameters or dataset. Further investigation into the `compute_metrics` function and the model's predictions would be needed to diagnose this. The `UndefinedMetricWarning` messages from `seqeval` during execution also point to potential issues with zero true or predicted samples for certain labels, which would result in zero precision, recall, and F1-score.

## 8. Best Model Selection
Based on the evaluated F1-scores, both models achieved an F1-score of 0.0. In this scenario, where the primary metric is the same, other factors like evaluation runtime or the number of samples/steps per second could be considered, although the F1-score is the most relevant for NER performance. Since both models have an F1 of 0.0, there isn't a clearly "better" model based on this metric alone. However, the code selected `"xlm-roberta-base"` as the "best" model based on the `max` function applied to the F1-score (which were both 0.0).

## 9. Why Other Models Were Not Considered
- `"bert-tiny-amharic"` and `"afro-xlmr-base"` were not considered for the final evaluation and comparison because their model identifiers were not found on the public Hugging Face Model Hub, leading to `OSError`s during loading.

## 10. Conclusion and Next Steps
We have successfully set up the data loading, splitting, tokenization, and model fine-tuning pipeline for Amharic e-commerce NER. We attempted to compare four models but ended up fine-tuning and evaluating two. The evaluation results currently show F1-scores of 0.0, which requires further investigation.

Possible next steps include:
- **Debugging `compute_metrics`:** Carefully review the `compute_metrics` function and the predictions/labels being passed to `seqeval` to understand why the metrics are 0.0.
- **Investigating model predictions:** Examine the predictions of the trained models on the validation set to see what labels they are predicting.
- **Data Quality Check:** Re-examine the `conll_labelled_data.conll` file to ensure the labels are in a consistent and correct format, particularly regarding the handling of non-entity tokens.
- **Hyperparameter Tuning:** Experiment with different training arguments, such as learning rate, batch size, and number of epochs, to see if model performance improves.
- **Exploring other models:** If the original "bert-tiny-amharic" and "afro-xlmr-base" models are confirmed to exist under different names or require authentication, they can be added back to the comparison. Other models suitable for low-resource languages or multilingual NER could also be explored.
- **Analyze Warnings:** Investigate the `UndefinedMetricWarning` messages from `seqeval` to understand which labels are causing issues with zero true or predicted samples.



# Project Report: Amharic E-commerce Named Entity Recognition

## 1. Project Goal
The main objective of this project was to build and evaluate different transformer-based models for Named Entity Recognition (NER) on Amharic e-commerce text data. The goal was to identify and classify entities within the text, such as product names, prices, locations, and contact information.

## 2. Data
The project utilized labeled data in CoNLL format, stored in a file named `amharic_labeled.conll.txt`. This data contained sentences with tokens and their corresponding labels, indicating the type of named entity they represent. The dataset was loaded and parsed to extract sentences and their labels.

## 3. Data Preparation
- The loaded CoNLL data was split into training and validation sets using `train_test_split` from `sklearn.model_selection` to evaluate model performance on unseen data.
- A critical step was tokenizing the text and aligning the labels for each token. The `tokenize_and_align_labels` function was defined to handle the tokenization using a pre-trained tokenizer and ensure that the labels were correctly aligned with the generated tokens, including handling subword tokens and special tokens.
- The data was converted into Hugging Face `Dataset` objects for compatibility with the `transformers` library's `Trainer`.

## 4. Models Evaluated
Initially, the plan was to compare four transformer models:
- `"xlm-roberta-base"`
- `"bert-tiny-amharic"`
- `"afro-xlmr-base"`
- `"distilbert-base-multilingual-cased"`

However, during the process, two models ("bert-tiny-amharic" and "afro-xlmr-base") were removed from the comparison list due to `OSError`s indicating that their identifiers were not found on the Hugging Face Model Hub under the specified names. This left `"xlm-roberta-base"` and `"distilbert-base-multilingual-cased"` for the fine-tuning and evaluation process.

## 5. Fine-tuning Process
- The `transformers.Trainer` API was used for fine-tuning the models.
- `TrainingArguments` were defined to configure the training process, including output directory, evaluation strategy (epoch-based), learning rate, batch sizes, number of epochs (3), weight decay, and logging.
- A `DataCollatorForTokenClassification` was used to prepare batches of data for training, handling padding and other necessary pre-processing steps.
- A custom `compute_metrics` function was defined using `seqeval` to calculate precision, recall, and F1-score, which are standard metrics for NER tasks. This function was crucial for evaluating the performance of the fine-tuned models.

## 6. Challenges Faced and Solutions
Throughout the project, several challenges were encountered, primarily related to data loading, tokenization, and evaluation setup. Debugging these issues was a key learning experience:

-   **Incorrect CoNLL Parsing:** Initially, the `load_conll` function incorrectly parsed the data, resulting in empty strings as labels for many tokens. This was because the function was designed for a strict tab-separated format, while the input file used a space to separate tokens and labels in some cases.
    -   **Solution:** Modified the `load_conll` function to split lines by the *last* whitespace character, correctly separating the token from its potential label.

-   **Labels Lost During Dataset Mapping:** After correcting the CoNLL parsing, it was discovered that the entity labels were being lost during the `dataset.map` operation with the `tokenize_and_align_labels` function. The 'labels' column in the tokenized dataset was being populated with only -100.
    -   **Solution:** Modified the `dataset.map` calls to ensure the 'labels' column was explicitly kept after the tokenization and alignment process, preventing the loss of the correctly generated label IDs.

-   **Evaluation Metrics Showing 0.0:** Due to the issues with label loading and mapping, the evaluation metrics (precision, recall, F1) initially showed 0.0. Debugging the `compute_metrics` function revealed that it was receiving empty lists for both true and predicted labels after filtering.
    -   **Solution:** Fixing the data loading and dataset mapping issues ensured that the `compute_metrics` function received the correct true labels, allowing for accurate calculation of the metrics. The issue with predicted labels being all 'O' was a consequence of the model not being able to learn with incorrect true labels during training.

-   **`NameError` and `AttributeError` during Debugging:** Encountered `NameError` (e.g., `training_args` not defined) and `AttributeError` (e.g., list object having no `size`, Dataset object having no `word_ids`) during the debugging process due to running cells out of order or incorrect handling of data types and object methods.
    -   **Solution:** Ensured cells were run in the correct sequence and corrected code to handle data structures and object methods appropriately (e.g., converting lists to tensors, accessing word IDs correctly).

-   **`RuntimeError: Expected all tensors to be on the same device`:** Occurred when making predictions because input tensors were on the CPU while the model was on the GPU.
    -   **Solution:** Moved the input tensors to the model's device using `.to(best_trained_model.device)`.

## 7. Evaluation Results
After resolving the data pipeline issues and successfully fine-tuning the models, the evaluation results demonstrated significant improvement.

-   **Model: xlm-roberta-base**
    Evaluation Results: `{'eval_loss': 0.00733127212151885, 'eval_precision': 0.9853313100657562, 'eval_recall': 0.9936240754909462, 'eval_f1': 0.9894603174603174, 'eval_runtime': 3.4263, 'eval_samples_per_second': 137.757, 'eval_steps_per_second': 8.756, 'epoch': 3.0}`
-   **Model: distilbert-base-multilingual-cased**
    Evaluation Results: `{'eval_loss': 0.012473872862756252, 'eval_precision': 0.989358218729535, 'eval_recall': 0.9940779733508801, 'eval_f1': 0.991712480512021, 'eval_runtime': 1.9155, 'eval_samples_per_second': 246.409, 'eval_steps_per_second': 15.662, 'epoch': 3.0}`

The models achieved high F1-scores (around 0.98-0.99), indicating good performance on the NER task for Amharic e-commerce data.

## 8. Best Model Selection
Based on the evaluated F1-scores, `"distilbert-base-multilingual-cased"` achieved a slightly higher F1-score (0.9917) compared to `"xlm-roberta-base"` (0.9895). Therefore, `"distilbert-base-multilingual-cased"` was selected as the best-performing model.

## 9. Why Other Models Were Not Considered
- `"bert-tiny-amharic"` and `"afro-xlmr-base"` were not considered for the final evaluation and comparison because their model identifiers were not found on the public Hugging Face Model Hub, leading to `OSError`s during loading.

## 10. Conclusion and Lessons Learned
We have successfully built and fine-tuned transformer models for Amharic e-commerce Named Entity Recognition, achieving promising results with an F1-score of over 0.99 for the best model.

The debugging process highlighted the critical importance of:
-   **Data Pipeline Integrity:** Ensuring the correct loading, parsing, and processing of data is fundamental. Errors early in the pipeline can cascade and prevent models from learning effectively.
-   **Careful Inspection of Intermediate Outputs:** Debugging by inspecting the output of functions and data structures at different stages of the pipeline was essential in identifying where the labels were being lost.
-   **Understanding Library Behavior:** Recognizing how functions like `dataset.map` and metrics calculations in `seqeval` interact with data formats and filtering was key to resolving the issues.

The achieved performance demonstrates the potential of fine-tuning multilingual models for NER in low-resource languages like Amharic, provided the data preparation is handled correctly.

Possible next steps include:
-   **Error Analysis:** Investigate specific examples where the model made incorrect predictions to understand common errors and areas for improvement.
-   **Hyperparameter Optimization:** Further tune hyperparameters for the best model to potentially achieve even higher performance.
-   **Exploring Other Models:** If possible, revisit or explore other models, including those specifically pre-trained on Amharic, if available.
-   **Deployment:** Prepare the best model for inference on new, unseen Amharic e-commerce text.