In [1]:
# STEP 1: Install Dependencies
!pip install -q datasets seqeval

In [2]:
!pip install transformers datasets --upgrade



In [3]:
# STEP 2: Import Libraries
import os
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback
import numpy as np
from sklearn.metrics import classification_report
import torch


In [4]:
# STEP 3: Define Labels
# Tag inventory; the position of each tag in this list is its integer class id.
labels = [
    'O',
    'B-Product', 'I-Product',
    'B-PRICE', 'I-PRICE',
    'B-LOC', 'I-LOC',
]
# id -> tag, derived from list position.
id2label = dict(enumerate(labels))
# tag -> id, the inverse of the mapping above.
label2id = {name: idx for idx, name in id2label.items()}

In [5]:
# STEP 4: Load CoNLL Data Manually
def read_conll_file(file_path):
    """Read a whitespace-separated CoNLL-style file into a `Dataset`.

    Each non-blank line contributes one token (its first column) and one tag
    (its last column). Blank lines separate sentences. Tags are converted to
    integer ids through the module-level `label2id` mapping.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A `Dataset` with one row per sentence and two columns:
        "tokens" (list of str) and "ner_tags" (list of int).

    Raises:
        KeyError: If a line's last column is not a key of `label2id`.
    """
    sentences = []
    tokens, tag_ids = [], []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            columns = line.split()
            if not columns:
                # Blank line: close the sentence collected so far, if any.
                if tokens:
                    sentences.append({"tokens": tokens, "ner_tags": tag_ids})
                    tokens, tag_ids = [], []
                continue
            tokens.append(columns[0])
            tag_ids.append(label2id[columns[-1]])

    # The file may end without a trailing blank line; without this flush the
    # final sentence would be silently dropped.
    if tokens:
        sentences.append({"tokens": tokens, "ner_tags": tag_ids})

    return Dataset.from_list(sentences)

# Parse the annotated file, then hold out 20% of the sentences for evaluation.
# A fixed seed keeps the train/test membership identical across kernel restarts,
# so metrics from different runs are comparable.
raw_dataset = read_conll_file("labeled_dataset.conll")
dataset = raw_dataset.train_test_split(test_size=0.2, seed=42)

In [6]:
# Load tokenizer directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Only the tokenizer is fetched here. A masked-LM model used to be loaded into
# `model` as well, but the following cell rebinds `model` to a
# token-classification model before anything reads it, so that load only cost
# download time and memory.
tokenizer = AutoTokenizer.from_pretrained("Davlan/afro-xlmr-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:

# STEP 5: Load Tokenizer & Model (Choose one: bert-tiny-amharic, XLM-R, or AfroXLMR)
# Alternatives noted by the author: "xlm-roberta-base" or "Davlan/afroxlmr-base"
model_checkpoint = "Davlan/afro-xlmr-base"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# The classification head is sized from the tag list, and both id<->tag maps are
# passed to the model alongside it.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# STEP 6: Tokenize & Align Labels
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Only the first sub-token of each word keeps the word's tag. Special tokens
    (those whose word id is None) and continuation sub-tokens of a word get
    -100, the value this notebook's metric code filters out. Previously
    continuation pieces copied the word's tag, so a word tagged `B-*` that
    was split into several pieces produced several consecutive `B-*` labels.

    Args:
        examples: A batch with "tokens" (list of list of str) and "ner_tags"
            (list of list of int), one inner list per sentence.

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list of ints per sentence, parallel to that sentence's sub-tokens.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    batch_labels = []
    for i, word_tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a later piece of a word already labelled.
                aligned_labels.append(-100)
            else:
                # First piece of a new word carries the word-level tag.
                aligned_labels.append(word_tags[word_idx])
            previous_word_idx = word_idx
        batch_labels.append(aligned_labels)

    tokenized_inputs["labels"] = batch_labels
    return tokenized_inputs

# Apply the alignment function to every split, passing whole batches per call.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [13]:
# STEP 7: Training Arguments
from transformers import TrainingArguments

args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./ner_model",
    logging_dir='./logs',
    # Evaluate and checkpoint once per epoch; both strategies are set to the
    # same value and the best checkpoint is reloaded when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (reduced by the author).
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

# Collator for token-classification batches, built from the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [14]:
# STEP 8: Metrics
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(p):
    """Score predictions with the seqeval metrics.

    Args:
        p: A pair `(predictions, labels)`. The predicted class is the argmax
            of `predictions` along axis 2; `labels` holds integer ids, where
            -100 marks positions to leave out of scoring.

    Returns:
        Dict with "accuracy", "f1", "precision" and "recall", computed on
        tag strings looked up through `id2label`.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions whose gold label is not the -100 placeholder.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([id2label[pred] for pred, _ in kept])
        true_labels.append([id2label[gold] for _, gold in kept])

    scorers = (
        ("accuracy", accuracy_score),
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}

In [1]:
# STEP 9: Train the Model
import inspect

# Recent transformers releases take the tokenizer through `processing_class`
# and deprecate the older `tokenizer` keyword; older releases only accept
# `tokenizer`. Use whichever keyword the installed Trainer actually declares.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Disable temporarily
    **{_tokenizer_kwarg: tokenizer},
)


trainer.train()

NameError: name 'Trainer' is not defined

In [None]:

# STEP 10: Save Final Model
# Model and tokenizer are both written to the same directory.
model_path = "./amharic_ner_model"
trainer.save_model(output_dir=model_path)
tokenizer.save_pretrained(save_directory=model_path)
