3: Fine-Tune NER Model

# Step 1: Environment Setup
!pip install -q transformers datasets seqeval accelerate

In [None]:
# Step 2: Upload & Parse the Dataset.
# from google.colab import files
# uploaded = files.upload()

In [None]:
#Parse the uploaded CoNLL file
def parse_conll_file(filepath):
    """Read a CoNLL-style file into parallel lists of tokens and tags.

    Each non-blank line is split on whitespace; the first field is taken as
    the token and the second as its tag. Lines with fewer than two fields are
    skipped. A blank (or whitespace-only) line ends the current sentence.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A dict with two keys: "tokens" (list of sentences, each a list of
        token strings) and "ner_tags" (list of tag lists, aligned with
        "tokens").
    """
    sentences = []
    tag_sequences = []
    words, word_tags = [], []

    with open(filepath, encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()

            if not fields:
                # Sentence boundary: store the sentence collected so far, if any.
                if words:
                    sentences.append(words)
                    tag_sequences.append(word_tags)
                    words, word_tags = [], []
                continue

            if len(fields) < 2:
                # A token without a tag column cannot be used; ignore the line.
                continue

            words.append(fields[0])
            word_tags.append(fields[1])

    # The file may end without a trailing blank line.
    if words:
        sentences.append(words)
        tag_sequences.append(word_tags)

    return {"tokens": sentences, "ner_tags": tag_sequences}

# Name of the uploaded CoNLL file; change it to match the file you uploaded
file_name = "ner_auto_labels.conll"
# Parse the file into {"tokens": [...], "ner_tags": [...]}
data_dict = parse_conll_file(file_name)


In [None]:
# Convert the parsed sentences to a Hugging Face Dataset and split it 80/20
from datasets import Dataset

# train_test_split returns a DatasetDict with the keys "train" and "test".
# A fixed seed keeps the split identical across kernel restarts, so that
# evaluation numbers from different runs are comparable.
dataset = Dataset.from_dict(data_dict).train_test_split(test_size=0.2, seed=42)

In [None]:
# Tag inventory of the dataset; the position in this list is the integer id of the tag
label_list = ["O", "B-PRODUCT", "I-PRODUCT", "B-PRICE", "I-PRICE", "B-LOC", "I-LOC"]


# Lookup tables in both directions: id -> tag and tag -> id
id_to_label = dict(enumerate(label_list))
label_to_id = {tag: idx for idx, tag in id_to_label.items()}


In [None]:
# Step 3: Tokenization and Label Alignment

# Import the tokenizer class
from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint; the tokenizer (and later the model) is loaded from it
model_checkpoint = "Davlan/xlm-roberta-base-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [None]:
# Load the tokenizer and the token-classification model from the checkpoint

from transformers import AutoTokenizer, AutoModelForTokenClassification

# Re-loads the tokenizer from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# id2label / label2id store the tag names in the model config, so the saved
# model reports entity tags (e.g. "B-PRICE") instead of generic "LABEL_n" names.
# ignore_mismatched_sizes=True lets loading continue when the size of the
# checkpoint's classification head differs from len(label_list).
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
    ignore_mismatched_sizes=True,
)



In [None]:
# Define the tokenization + alignment function

def tokenize_and_align_labels(examples, max_length=256):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Uses the module-level ``tokenizer`` and ``label_to_id`` mapping.

    Args:
        examples: A batch with "tokens" (list of word lists) and "ner_tags"
            (list of tag-string lists, aligned with "tokens").
        max_length: Length every sequence is padded or truncated to.
            Defaults to 256.

    Returns:
        The tokenizer output with an added "labels" entry: one list of
        ``max_length`` label ids per sentence. Only the first sub-token of
        each word carries the word's tag id; every other position (special
        tokens, padding, continuation sub-tokens) is set to -100, the value
        that compute_metrics filters out.

    Raises:
        KeyError: If a tag in "ner_tags" is missing from ``label_to_id``.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        is_split_into_words=True,
    )

    labels = []
    for i, word_tags in enumerate(examples["ner_tags"]):
        # word_ids maps each sub-token position to the index of its source
        # word, or None for positions that do not belong to any word.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        current_word = None

        for word_idx in word_ids:
            if word_idx is None or word_idx == current_word:
                # Not a word, or a continuation of the word already labelled.
                label_ids.append(-100)
            else:
                # First sub-token of a new word: it carries the word's tag.
                label_ids.append(label_to_id[word_tags[word_idx]])
                current_word = word_idx

        # Safety net: force the label row to exactly max_length entries.
        label_ids += [-100] * (max_length - len(label_ids))
        labels.append(label_ids[:max_length])

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Apply tokenization and label alignment to the dataset; batched=True passes
# batches of examples to the function instead of one example at a time
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


In [None]:
# Collect the distinct tags that occur in the training split
all_labels = {tag for sentence_tags in dataset["train"]["ner_tags"] for tag in sentence_tags}

# Sorted list of the unique tags, printed for a quick sanity check
labels = sorted(all_labels)
print(labels)


In [None]:
# Instantiate the model again from the checkpoint; this is the instance passed to the Trainer.
# id2label / label2id store the tag names in the model config, so the saved
# model reports entity tags instead of generic "LABEL_n" names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
    ignore_mismatched_sizes=True,
)


In [None]:
# !pip install --upgrade transformers

In [None]:
# Setup Training Arguments
import os
from transformers import TrainingArguments

# Turn Weights & Biases off through its environment switch
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # Checkpoint / logging cadence
    save_strategy="epoch",
    logging_steps=50,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Empty list: do not report to W&B or any other tracker
    report_to=[],
    # load_best_model_at_end and metric_for_best_model are intentionally not set (avoids a mismatch)
)



In [None]:
# Step 5: Define metrics function for evaluation

import numpy as np
# `datasets.load_metric` was removed in datasets 3.0, so score with seqeval
# directly (seqeval is installed in the setup cell).
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score


def compute_metrics(p):
    """Compute entity-level precision/recall/F1 and token accuracy.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds the logits
            (class scores on axis 2); ``labels`` holds the gold label ids,
            with -100 at positions that must be ignored.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    logits, gold_ids = p
    # For token classification, predictions are logits, so take argmax
    predicted_ids = np.argmax(logits, axis=2)

    # Drop ignored positions (-100) and map ids back to tag strings
    true_labels = [
        [id_to_label[gold] for gold in gold_row if gold != -100]
        for gold_row in gold_ids
    ]
    true_predictions = [
        [id_to_label[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(predicted_ids, gold_ids)
    ]

    # seqeval expects (y_true, y_pred); its default averaging is micro
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }


In [None]:
# Step 6: Initialize the Trainer and start training

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # train_test_split() names the held-out split "test"; there is no
    # "validation" key, so indexing it would raise a KeyError.
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
### Final step: run the fine-tuning
trainer.train()

In [None]:
# Run evaluation with the Trainer
# The held-out split is stored under the "test" key (the name train_test_split
# gives it); .get() returns None when that split is missing.
eval_dataset = tokenized_datasets.get("test")
if eval_dataset is not None:
    eval_results = trainer.evaluate(eval_dataset=eval_dataset)
    # Print the metrics explicitly: a value inside an if-block is not
    # displayed automatically by the notebook.
    print(eval_results)
else:
    print("No evaluation dataset found.")



In [None]:
# Directory that receives the fine-tuned model and the tokenizer files
output_dir = "./ner_model_amharic"

# Write the model (through the Trainer) and the tokenizer to output_dir
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Confirm where the artifacts were written
print(f"Model and tokenizer saved to {output_dir}")
