# In [None]:
# Task 4: Model Comparison and Selection
# This script implements Task 4 of the EthioMart Amharic E-commerce Data Extractor project.
# The goal is to fine-tune the `bert-base-multilingual-cased` model for Named Entity Recognition (NER)
# on Amharic Telegram data and compare its performance with other models (e.g., `xlm-roberta-base` from Task 3).
# The script includes data loading, tokenization, model training with memory-efficient configurations,
# evaluation, and model saving.

import os
import numpy as np
import gc
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
import evaluate

# Configuration
# Root folder of the project; check_environment() verifies that it exists.
# NOTE(review): this looks like a Colab Google Drive mount path — confirm.
GDRIVE_PROJECT_PATH = "/content/drive/MyDrive/Ethio_mart"
# Labeled data file that main() passes to create_dataset_from_conll().
LABELED_DATA_PATH = os.path.join(GDRIVE_PROJECT_PATH, "labeled_data_conll.txt")
# Used both as the TrainingArguments output_dir and as the destination for
# the final saved model and tokenizer.
OUTPUT_MODEL_DIR = os.path.join(GDRIVE_PROJECT_PATH, "models", "mbert-cased-ner-finetuned")
# Checkpoint name handed to AutoTokenizer and AutoModelForTokenClassification.
MODEL_CHECKPOINT = "bert-base-multilingual-cased"

# In [None]:
# Entity tag names; a tag's position in this list is its integer class id.
labels_list = ["O", "B-PRODUCT", "I-PRODUCT", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE"]
# Lookup tables in both directions, derived from the list order above.
id2label = dict(enumerate(labels_list))
label2id = {name: idx for idx, name in id2label.items()}

def check_environment():
    """Verify that the project folder and the labeled data file exist.

    Prints a three-step report (folder check, folder listing, data-file
    check) and stops at the first missing path.

    Raises:
        FileNotFoundError: If GDRIVE_PROJECT_PATH or LABELED_DATA_PATH does
            not exist.
    """
    print("--- 1. Checking if the main project folder exists ---")
    if not os.path.exists(GDRIVE_PROJECT_PATH):
        print(f"❌ FAILURE: The folder '{GDRIVE_PROJECT_PATH}' DOES NOT EXIST. Is your Drive mounted?")
        raise FileNotFoundError(f"Project folder {GDRIVE_PROJECT_PATH} not found.")
    print(f"✅ SUCCESS: The folder '{GDRIVE_PROJECT_PATH}' exists.")

    print("\n--- 2. Listing the contents of the project folder ---")
    # The folder is listed in-process rather than by interpolating the path
    # into a shell `ls` command: an unquoted path breaks on spaces or shell
    # metacharacters, and `ls` is not available on every platform.
    try:
        with os.scandir(GDRIVE_PROJECT_PATH) as scan:
            entries = sorted(scan, key=lambda entry: entry.name)
    except OSError as err:
        # The listing is informational only, so a failure is reported
        # without aborting the remaining checks.
        print(f"Could not list '{GDRIVE_PROJECT_PATH}': {err}")
    else:
        for entry in entries:
            # A trailing slash marks sub-directories.
            print(f"{entry.name}/" if entry.is_dir() else entry.name)

    print("\n--- 3. Checking if the specific file exists ---")
    if not os.path.exists(LABELED_DATA_PATH):
        print(f"❌ FAILURE: The file '{LABELED_DATA_PATH}' was NOT found at this exact path.")
        raise FileNotFoundError(f"Data file {LABELED_DATA_PATH} not found.")
    print(f"✅ SUCCESS: The file '{LABELED_DATA_PATH}' was found!")

def create_dataset_from_conll(file_path):
    """Parse a CoNLL-style file and build a Hugging Face Dataset from it.

    Every non-blank line is split on whitespace; the first column is taken
    as the token and the second as its tag name, which is converted to an
    integer id through ``label2id``. Any further columns are ignored. Blank
    (or whitespace-only) lines end the current sentence.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``Dataset`` with one row per sentence and two columns:
        ``"tokens"`` (list of token strings) and ``"ner_tags"`` (list of
        integer tag ids, parallel to ``"tokens"``).

    Raises:
        ValueError: If a non-blank line has fewer than two columns, or its
            tag is not a key of ``label2id``. The message names the file
            and the 1-based line number.
    """
    tokens_list, tags_list = [], []
    current_tokens, current_tags = [], []

    with open(file_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()

            if not parts:
                # Sentence boundary; consecutive blank lines must not
                # produce empty sentences.
                if current_tokens:
                    tokens_list.append(current_tokens)
                    tags_list.append(current_tags)
                    current_tokens, current_tags = [], []
                continue

            # Fail with the exact location instead of a bare
            # IndexError/KeyError, so a bad annotation is easy to find.
            if len(parts) < 2:
                raise ValueError(
                    f"{file_path}:{line_no}: expected 'TOKEN TAG', got {line.strip()!r}"
                )
            token, tag = parts[0], parts[1]
            if tag not in label2id:
                raise ValueError(
                    f"{file_path}:{line_no}: unknown tag {tag!r} "
                    f"(expected one of {list(label2id)})"
                )

            current_tokens.append(token)
            current_tags.append(label2id[tag])

    # The file may end without a trailing blank line.
    if current_tokens:
        tokens_list.append(current_tokens)
        tags_list.append(current_tags)

    return Dataset.from_dict({"tokens": tokens_list, "ner_tags": tags_list})

# In [None]:
def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize pre-split sentences and build a per-token label row for each.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of per-word label ids
            per sentence).
        tokenizer: Callable invoked with ``truncation=True`` and
            ``is_split_into_words=True``; its result must offer
            ``word_ids(batch_index=...)`` and accept item assignment.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. In each row,
        the first token of every word carries that word's label id; tokens
        with no word (``word_ids`` gives ``None``) and the remaining tokens
        of a word carry -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_idx):
            # Only the first token of a real word receives the word's label.
            starts_new_word = current_word is not None and current_word != last_word
            row.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

# Lazily loaded seqeval metric, shared by every compute_metrics() call so
# the metric is not re-loaded on each evaluation pass.
_seqeval_metric = None


def _get_seqeval_metric():
    """Return the seqeval metric, loading it on first use only."""
    global _seqeval_metric
    if _seqeval_metric is None:
        _seqeval_metric = evaluate.load("seqeval")
    return _seqeval_metric


def compute_metrics(p):
    """Compute overall precision, recall, F1 and accuracy with seqeval.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 to obtain one class id per token; presumably it is shaped
            (batch, sequence, num_labels) — TODO confirm against the caller.
            ``labels`` holds the reference ids, with -100 marking positions
            to ignore.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, taken from seqeval's ``overall_*`` results.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        # Drop positions labelled -100 (the value tokenize_and_align_labels
        # assigns to tokens that must not be scored), then map ids to names.
        kept = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        true_predictions.append([labels_list[pred] for pred, _ in kept])
        true_labels.append([labels_list[gold] for _, gold in kept])

    results = _get_seqeval_metric().compute(
        predictions=true_predictions, references=true_labels
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# In [None]:
def main():
    """Run the full fine-tuning pipeline for MODEL_CHECKPOINT.

    Steps, in order: check that the project paths exist, load the CoNLL
    data and split it 80/20 (seed 42), tokenize and align labels, load the
    token-classification model, train with per-epoch evaluation and
    checkpointing, evaluate the final model, and save the model and
    tokenizer to OUTPUT_MODEL_DIR.

    Raises:
        FileNotFoundError: Propagated from check_environment() when the
            project folder or data file is missing.
        Exception: Any error raised by ``trainer.train()`` is printed and
            re-raised unchanged.
    """
    # Imported here because this is the only place the file needs torch.
    import torch

    # Uncomment to install dependencies in a new environment (e.g., Colab)
    # os.system("pip install transformers datasets seqeval accelerate evaluate -U")

    # Check environment
    check_environment()

    # Print label mappings
    print("\n--- Label Mappings ---")
    print(f"Label to ID: {label2id}")
    print(f"ID to Label: {id2label}")

    # Load and split dataset
    print("\n--- Loading and Splitting Dataset ---")
    full_dataset = create_dataset_from_conll(LABELED_DATA_PATH)
    train_test_split = full_dataset.train_test_split(test_size=0.2, seed=42)
    final_dataset = DatasetDict({
        "train": train_test_split["train"],
        "test": train_test_split["test"]
    })
    print("\n--- Dataset Created and Split ---")
    print(final_dataset)
    print("\nExample from training set:")
    print(final_dataset["train"][0])

    # Initialize tokenizer
    print(f"\n--- Initializing tokenizer from: {MODEL_CHECKPOINT} ---")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

    # Tokenize dataset
    tokenized_datasets = final_dataset.map(
        lambda x: tokenize_and_align_labels(x, tokenizer), batched=True
    )
    print(f"\n--- Tokenization with '{MODEL_CHECKPOINT}' Complete ---")
    print("Original Tokens:", final_dataset["train"][0]["tokens"])
    print("New Tokens:", tokenizer.convert_ids_to_tokens(tokenized_datasets["train"][0]["input_ids"]))
    print("New Labels:", tokenized_datasets["train"][0]["labels"])

    # Load model
    print("\n--- Loading Model ---")
    model = AutoModelForTokenClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=len(labels_list),
        id2label=id2label,
        label2id=label2id
    )

    # Set up training arguments with memory optimizations
    print("\n--- Setting Up Training Arguments ---")
    # Mixed precision is only requested when a CUDA device is present, so
    # the script still runs (in full precision) on a CPU-only runtime
    # instead of failing while building the training arguments.
    use_fp16 = torch.cuda.is_available()
    if not use_fp16:
        print("No CUDA device detected: training in full precision (fp16 disabled).")
    args = TrainingArguments(
        output_dir=OUTPUT_MODEL_DIR,
        eval_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        # 2 samples x 8 accumulation steps per optimizer update.
        gradient_accumulation_steps=8,
        fp16=use_fp16,
        num_train_epochs=5,
        weight_decay=0.01,
        # Checkpoint every epoch so the best epoch (by F1) can be reloaded
        # when training ends.
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    # Initialize data collator and trainer
    data_collator = DataCollatorForTokenClassification(tokenizer)
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Proactive memory cleanup: the untokenized datasets are no longer used
    # past this point.
    print("\n--- Cleaning up memory before training ---")
    del full_dataset
    del final_dataset
    gc.collect()

    # Train model
    print(f"\n--- Starting Training for '{MODEL_CHECKPOINT}' ---")
    try:
        trainer.train()
        print("\n--- Training Complete ---")
    except Exception as e:
        # Report the failure in the output, then let it propagate.
        print(f"Training failed: {e}")
        raise

    # Evaluate model
    print("\n--- Evaluating Final Model ---")
    final_evaluation = trainer.evaluate()
    print(final_evaluation)

    # Save model and tokenizer
    print("\n--- Saving Final Model and Tokenizer ---")
    os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)
    trainer.save_model(OUTPUT_MODEL_DIR)
    tokenizer.save_pretrained(OUTPUT_MODEL_DIR)
    print(f"✅ Model saved successfully to {OUTPUT_MODEL_DIR}")

    # Notes:
    # - This script fine-tunes `bert-base-multilingual-cased` with memory-efficient settings
    #   (small batch size, gradient accumulation, mixed-precision training when a GPU is available).
    # - Compare the F1-score, precision, and recall with the `xlm-roberta-base` model from Task 3
    #   to select the best model for production.
    # - The small dataset (40 train, 10 test) may limit performance. Consider augmenting data for better results.
    # - Save the `final_evaluation` results for the final report.

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()