In [15]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
import evaluate

In [17]:
# 3. Load Labeled Dataset from Local CoNLL File
# Expected CoNLL format: token<TAB>label, with empty lines separating sentences

def read_conll(filepath):
    """Read a CoNLL-style file into parallel lists of tokens and tags.

    Every non-blank line is split on whitespace; the first column is used as
    the token and the last column as its tag, so files carrying extra middle
    columns are accepted as well as plain two-column files. A blank (or
    whitespace-only) line ends the current sentence.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A tuple ``(sentences, labels)`` where ``sentences[i]`` is the list of
        tokens of sentence ``i`` and ``labels[i]`` the matching list of tags.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    sentences, labels = [], []
    tokens, tags = [], []
    # 'utf-8-sig' decodes plain UTF-8 too, but drops a leading byte-order mark
    # so it cannot end up glued to the first token.
    with open(filepath, encoding='utf-8-sig') as f:
        for line_no, raw_line in enumerate(f, start=1):
            columns = raw_line.split()
            if not columns:
                # Sentence boundary: flush only if something was collected,
                # so consecutive blank lines do not create empty sentences.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            if len(columns) < 2:
                raise ValueError(
                    f"{filepath}, line {line_no}: expected a token and a label "
                    f"column, got {raw_line.rstrip()!r}"
                )
            tokens.append(columns[0])
            tags.append(columns[-1])
    # The file may not end with a blank line; keep the trailing sentence.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)
    return sentences, labels

sentences, tags = read_conll("../data1/amharic_ner_data_conll.txt")

# 4. Automatically extract unique labels from the dataset
# Sorting makes the label -> id assignment deterministic across runs.
# NOTE(review): the output recorded for this cell lists 114 labels that look
# like prices and product words rather than NER tags, and the Map progress bar
# shows a single example. Verify the column order and the blank-line sentence
# separators in the data file before trusting a trained model.
unique_tags = sorted(set(tag for doc in tags for tag in doc))
label_list = unique_tags
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}

# 5. Convert string tags to their corresponding IDs
tag_ids = [[label_to_id[tag] for tag in doc] for doc in tags]

# 6. Create Hugging Face Dataset object
data = {"tokens": sentences, "ner_tags": tag_ids}
dataset = Dataset.from_dict(data)

# 7. Load Pretrained Multilingual Model and Tokenizer
model_checkpoint = "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    # Store the dataset's own tag names in the model config so the saved model
    # reports these names instead of generic placeholders at inference time.
    id2label=id_to_label,
    label2id=label_to_id,
    ignore_mismatched_sizes=True  # Important to reinitialize classification head
)

# 8. Tokenize and Align Labels with Tokens
def tokenize_and_align_labels(example, label_all_tokens=False):
    """Tokenize one pre-split example and align its word-level tags to sub-tokens.

    Uses the module-level ``tokenizer``. Positions whose ``word_ids()`` entry
    is ``None`` get the label -100. The first sub-token of every word gets
    that word's tag.

    Args:
        example: Mapping with ``'tokens'`` (list of words) and ``'ner_tags'``
            (list of integer label ids, one per word).
        label_all_tokens: If ``False`` (default), continuation sub-tokens of a
            word are labelled -100, so each word contributes exactly one label
            and a ``B-`` tag is not repeated on every word piece. If ``True``,
            continuation sub-tokens repeat the word's tag.

    Returns:
        The tokenizer output with an added ``'labels'`` list that has one
        entry per sub-token.
    """
    tokenized = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
    word_labels = example['ner_tags']
    aligned_labels = []
    previous_word_idx = None
    for word_idx in tokenized.word_ids():
        if word_idx is None:
            aligned_labels.append(-100)  # Special token or padding
        elif word_idx != previous_word_idx or label_all_tokens:
            # First piece of a new word (or every piece when explicitly requested).
            aligned_labels.append(word_labels[word_idx])
        else:
            # Continuation piece of the same word: mask it out.
            aligned_labels.append(-100)
        previous_word_idx = word_idx
    tokenized['labels'] = aligned_labels
    return tokenized

# Apply the alignment function to each example individually (not in batches).
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

# 9. Training Configuration
# Hyperparameters are collected in one mapping so they can be read at a glance.
training_kwargs = dict(
    output_dir="../data1/amharic-ner-model",
    # evaluation_strategy="epoch",  # currently disabled
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)
args = TrainingArguments(**training_kwargs)

# 10. Load Metric
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: A pair ``(predictions, labels)``. The predictions are reduced with
            ``argmax`` over axis 2, and the labels are integer ids in which
            -100 marks positions to skip.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1`` and
        ``accuracy``, taken from the ``overall_*`` entries of the module-level
        ``metric`` result.
    """
    scores_per_label, gold_ids = p
    predicted_ids = np.argmax(scores_per_label, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept_pairs = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([id_to_label[predicted] for predicted, _ in kept_pairs])
        true_labels.append([id_to_label[gold] for _, gold in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

# 11. Initialize Trainer and Train
# Collator built from the tokenizer and handed to the Trainer below.
token_collator = DataCollatorForTokenClassification(tokenizer)

# NOTE(review): the same dataset object is passed as both train_dataset and
# eval_dataset, so any evaluation score is measured on the training data.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=token_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# 12. Save Final Model and Tokenizer
# Model and tokenizer are written to the same directory.
final_model_dir = "../data1/amharic-ner-model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# 13. Optional: Check label info
num_labels_found = len(label_list)
print(f"Labels: {label_list}")
print(f"Number of labels: {num_labels_found}")

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([114]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([114, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss


Labels: ['0', '1', '1,000', '1,100', '1,200', '1,250', '1,300', '1,400', '1,500', '1,600', '1,700', '1,800', '1,900', '1.8', '1.80', '10', '100', '101', '12', '13', '138', '15', '150', '16', '175', '18', '180', '2', '2,000', '2,100', '2,200', '2,300', '2,400', '2,500', '2,600', '2,700', '2,800', '2,900', '2.0', '200', '22', '220', '250', '3', '3,000', '3,200', '3,300', '3,400', '3,500', '3,600', '3,700', '30', '300', '304', '330', '350', '36', '360', '4', '4,000', '4,300', '4,500', '4,600', '4,700', '4,900', '400', '420', '450', '47', '49', '5', '5,200', '5,500', '50', '500', '54', '550', '6', '6,500', '6,800', '600', '650', '7', '7,200', '70', '700', '750', '780', '8', '8,800', '800', '850', '9', '900', 'All', 'Dimensions', 'Dry', 'GEESSAN', 'Grinder', 'Kel', 'Light', 'N/A', 'O', 'Rack', 'Relief', 'Saachi', 'Sticker', 'Training', 'basket', 'dust', 'pan', 'protection', 'stop', 'ዋጋ፦']
Number of labels: 114


In [13]:
# Show the label -> id mapping.
# NOTE(review): the output recorded for this cell (4 BIO-style labels,
# execution count 13) does not match the 114-entry label list printed by the
# training cell (execution count 17). It looks stale; confirm by re-running
# the notebook from a fresh kernel.
print(label_to_id)


{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-LOC': 3}
