# Task3_FineTune_NER_Model.ipynb

## Import dependencies

In [None]:
# --- Mount Google Drive ---
# Mounts the user's Google Drive at /content/drive. The project paths
# configured further down in this notebook are all under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# --- Step 1: Install Necessary Libraries ---
# The leading `!` is IPython/Colab syntax: the line is executed by the
# notebook's shell, so this file is not runnable as a plain Python script.
!pip install transformers datasets seqeval accelerate evaluate

# IMPORTANT: After running this cell, if prompted, click "Restart runtime"
# and then "Run all cells" to ensure all libraries are correctly loaded.

# --- Step 2: Import Libraries ---
import os
import json
import numpy as np
import pandas as pd # Although not strictly used in this specific notebook, good practice for ML setup
import torch
from datasets import Dataset, Features, Value, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from seqeval.metrics import classification_report
import evaluate

# --- Configuration ---

In [None]:
# Root folder of the project inside the mounted Google Drive.
DRIVE_PROJECT_BASE_PATH = "/content/drive/MyDrive/colab_projects/EthioMart_NER"

# Location of the hand-labeled CoNLL file, relative to the project root.
CONLL_FILE_PATH = os.path.join(
    DRIVE_PROJECT_BASE_PATH,
    "data/labeled_data/labeled_data.conll",
)

# Tag inventory. The position of a tag in this list is its integer class id,
# so the order must match the tags used in the labeled file.
LABEL_NAMES = [
    "O",
    "B-PRODUCT", "I-PRODUCT",
    "B-LOC", "I-LOC",
    "B-PRICE", "I-PRICE",
]

# --- Step 3: Load and Parse the Labeled Dataset in CoNLL Format ---
def parse_conll_file(file_path):
    """Parse a CoNLL-formatted file into a list of sentence dictionaries.

    Each non-blank line holds one token and its tag, separated by a tab
    (preferred) or, as a fallback, by arbitrary whitespace. Any blank or
    whitespace-only line ends the current sentence. Lines that do not yield
    exactly one token and one tag are skipped.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file.

    Returns:
        A list of ``{"tokens": [...], "ner_tags": [...]}`` dictionaries, one
        per sentence, with both lists the same length. Returns an empty list
        (after printing an error message) when the file does not exist.
    """
    try:
        # Context manager guarantees the handle is closed even on read errors.
        with open(file_path, "r", encoding="utf-8") as conll_file:
            raw_lines = conll_file.read().split("\n")
    except FileNotFoundError:
        print(f"Error: CoNLL file not found at {file_path}. Please upload it or check the path.")
        return []

    data = []
    tokens = []
    ner_tags = []
    for line in raw_lines:
        if not line.strip():
            # Sentence boundary. Testing the stripped line (instead of
            # splitting the text on "\n\n") keeps sentences apart even when
            # the separator line contains stray spaces or tabs.
            if tokens:
                data.append({"tokens": tokens, "ner_tags": ner_tags})
                tokens, ner_tags = [], []
            continue

        # Tab first so tokens containing spaces survive; fall back to generic
        # whitespace so space-separated files are not silently dropped.
        parts = line.split("\t")
        if len(parts) != 2:
            parts = line.split()
        if len(parts) != 2:
            continue

        # Strip padding so a tag such as "B-LOC " still matches the label set.
        token, tag = parts[0].strip(), parts[1].strip()
        if token and tag:
            tokens.append(token)
            ner_tags.append(tag)

    # Flush the last sentence when the file does not end with a blank line.
    if tokens:
        data.append({"tokens": tokens, "ner_tags": ner_tags})
    return data

# Parse the labeled file; everything below needs at least one sentence.
print(f"Loading labeled data from {CONLL_FILE_PATH} for fine-tuning...")
conll_data = parse_conll_file(CONLL_FILE_PATH)
if not conll_data:
    print("No labeled data found. Fine-tuning will be skipped.")
    # Raise SystemExit explicitly instead of calling exit(): exit() is a
    # helper injected by the `site` module for interactive shells and is not
    # meant for programs. A non-zero status also marks the run as failed.
    raise SystemExit(1)

print(f"Loaded {len(conll_data)} sentences for fine-tuning.")

# Dataset schema: tokens stay strings, while each tag string is encoded by
# ClassLabel as its index in LABEL_NAMES.
features = Features(
    {
        "tokens": Sequence(Value("string")),
        "ner_tags": Sequence(ClassLabel(names=LABEL_NAMES)),
    }
)

# Wrap the parsed sentences in a Hugging Face Dataset using that schema.
dataset = Dataset.from_list(conll_data, features=features)

# --- Step 4: Split Data into Training and Validation Sets ---
# 80/20 split; the fixed seed keeps the partition the same across runs.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = train_test_split["train"], train_test_split["test"]

print(f"\nDataset split: {len(train_dataset)} training examples, {len(eval_dataset)} evaluation examples.")
print("Example from training dataset:")
print(train_dataset[0])

# --- Step 5: Choose a Pre-trained Model and Tokenizer ---
# Checkpoint selected in the earlier model comparison.
MODEL_NAME = "mbeukman/xlm-roberta-base-finetuned-ner-amharic"
MODEL_SHORT_NAME = "XLM-R-Amharic-NER"

print(f"\nLoading tokenizer and model from: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABEL_NAMES),
    id2label=dict(enumerate(LABEL_NAMES)),
    label2id={name: idx for idx, name in enumerate(LABEL_NAMES)},
    # The checkpoint's classification head may have a different number of
    # labels; this lets loading proceed with a head sized for LABEL_NAMES.
    ignore_mismatched_sizes=True,
)

# Read the mappings back from the loaded config and derive the inverse from it,
# so both dictionaries reflect what the model actually holds.
id2label = model.config.id2label
label2id = {name: idx for idx, name in id2label.items()}
print(f"Model's ID to Label mapping: {id2label}")
print(f"Model's Label to ID mapping: {label2id}")

# --- Step 6: Tokenization and Label Alignment Function ---
def tokenize_and_align_labels(examples, id2label_map, label2id_map):
    """Tokenize a batch of pre-split sentences and expand word tags to subwords.

    Labeling rules applied per subword token:
      * positions with no source word (``word_ids`` entry is ``None``) get -100;
      * the first subword of a word gets the word's own tag;
      * later subwords of the same word get the word's tag, except that a
        ``B-X`` tag becomes ``I-X`` when ``I-X`` exists in ``label2id_map``.

    Args:
        examples: Batch with ``"tokens"`` (lists of words) and ``"ner_tags"``
            (lists of integer tag ids, one per word).
        id2label_map: Mapping from integer tag id to tag string.
        label2id_map: Mapping from tag string to integer tag id.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one aligned id list per sentence.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    batch_labels = []
    for sentence_idx, word_tag_ids in enumerate(examples["ner_tags"]):
        sentence_labels = []
        last_word_pos = None
        for word_pos in encoded.word_ids(batch_index=sentence_idx):
            if word_pos is None:
                # Not backed by any input word: excluded via the -100 label.
                sentence_labels.append(-100)
                last_word_pos = word_pos
                continue

            tag = id2label_map[word_tag_ids[word_pos]]
            is_continuation = word_pos == last_word_pos
            if is_continuation and tag.startswith("B-"):
                # Only the first piece of a word may open an entity.
                inside_tag = "I-" + tag[2:]
                if inside_tag in label2id_map:
                    tag = inside_tag
            sentence_labels.append(label2id_map[tag])
            last_word_pos = word_pos
        batch_labels.append(sentence_labels)

    encoded["labels"] = batch_labels
    return encoded

print("\nTokenizing and aligning labels...")
# Both splits go through the same alignment function; the label mappings are
# handed over as keyword arguments via `fn_kwargs`.
label_maps = {"id2label_map": id2label, "label2id_map": label2id}
tokenized_train_dataset = train_dataset.map(
    tokenize_and_align_labels, batched=True, fn_kwargs=label_maps
)
tokenized_eval_dataset = eval_dataset.map(
    tokenize_and_align_labels, batched=True, fn_kwargs=label_maps
)

print("\nExample of tokenized and aligned input:")
print(tokenized_train_dataset[0])

# --- Step 7: Set Up Training Arguments ---
# Checkpoints and logs for this model go into model-specific folders under
# the project directory on Google Drive.
OUTPUT_DIR = os.path.join(DRIVE_PROJECT_BASE_PATH, f"{MODEL_SHORT_NAME}_ner_output")
LOGGING_DIR = os.path.join(DRIVE_PROJECT_BASE_PATH, f"{MODEL_SHORT_NAME}_ner_logs")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGGING_DIR, exist_ok=True)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Upper bound on epochs; the checkpoint with the best F1 is reloaded at
    # the end (see load_best_model_at_end / metric_for_best_model below).
    num_train_epochs=20,
    weight_decay=0.01,
    logging_dir=LOGGING_DIR,
    logging_steps=10,
    # Must match eval_strategy so every saved checkpoint has an F1 score.
    save_strategy="epoch",
    # A checkpoint is written every epoch into Google Drive. Without a limit
    # all 20 are kept, which can exhaust the Drive quota mid-training. With
    # load_best_model_at_end=True the best checkpoint is always retained in
    # addition to the most recent one(s).
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
    # Further parameter tuning considerations for optimizing results (beyond basic setup):
    # - learning_rate: Experiment with values like 1e-5, 3e-5.
    # - gradient_accumulation_steps: Increase if batch_size is small to simulate larger effective batch size.
    # - lr_scheduler_type: Use "cosine" or "linear" for more sophisticated learning rate decay.
    # - fp16: Set to True for faster training with mixed precision if GPU supports it.
    # - num_train_epochs: Can be adjusted based on validation loss/F1-score trends (e.g., using callbacks for early stopping).
    # - optimizer: Explore alternatives like AdamW with custom parameters.
)

# --- Step 8: Define Metrics for Evaluation ---
# seqeval scorer, loaded once and reused by every evaluation pass.
metric = evaluate.load("seqeval")


def compute_metrics(p):
    """Score model predictions against the reference labels with seqeval.

    Args:
        p: Pair ``(predictions, labels)``. The class is taken as the argmax
            over axis 2 of ``predictions``; ``labels`` holds integer tag ids,
            with -100 marking positions to leave out of the score.

    Returns:
        Dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by seqeval.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Convert ids to tag strings, dropping every position labeled -100.
    true_labels = [
        [LABEL_NAMES[gold] for gold in gold_row if gold != -100]
        for gold_row in gold_ids
    ]
    true_predictions = [
        [LABEL_NAMES[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(predicted_ids, gold_ids)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# --- Step 9: Initialize Trainer ---
import inspect

# Pads inputs and labels of each batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Newer transformers releases accept the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword, while older releases only know
# `tokenizer`. The install above is unpinned, so pick whichever keyword the
# installed Trainer actually accepts.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

# --- Step 10: Fine-Tune the Model ---
# Runs the full training loop configured in `training_args`.
print("\nStarting model fine-tuning...")
trainer.train()
print("Fine-tuning complete!")

# --- Step 11: Evaluate the Fine-tuned Model on Validation Set ---
# Scores the model currently held by the trainer on the evaluation split.
print("\nEvaluating fine-tuned model...")
eval_results = trainer.evaluate()
print("\nEvaluation Results:", eval_results, sep="\n")

# --- Step 12: Save the Fine-tuned Model and Tokenizer ---
# The final artifacts go into a "final_model" subfolder of OUTPUT_DIR.
FINAL_MODEL_PATH = os.path.join(OUTPUT_DIR, "final_model")
print(f"\nSaving fine-tuned model to {FINAL_MODEL_PATH}...")
# Writes the model currently held by the trainer. With
# load_best_model_at_end=True this is presumably the best-F1 checkpoint --
# confirm against the training log.
trainer.save_model(FINAL_MODEL_PATH)
# Store the tokenizer in the same folder as the model.
tokenizer.save_pretrained(FINAL_MODEL_PATH)
print("Model and tokenizer saved successfully.")

print(f"\nFine-tuning for {MODEL_SHORT_NAME} completed and model saved to: {FINAL_MODEL_PATH}")
