In [1]:

import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from sklearn.metrics import classification_report
import numpy as np
import torch
import os

# Step 3: Load CoNLL formatted data
from datasets import load_dataset

def read_conll(file_path):
    with open(file_path, encoding="utf-8") as f:
        tokens, tags, sentences = [], [], []
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append({"tokens": tokens, "ner_tags": tags})
                    tokens, tags = [], []
                continue
            splits = line.split()
            if len(splits) != 2:
                continue 
            token, tag = splits
            tokens.append(token)
            tags.append(tag)
        return Dataset.from_list(sentences)

conll_path = "../data/labeled/amharic_ner_data.conll"  
ner_dataset = read_conll(conll_path)

# Step 4: Label mapping
unique_tags = set(tag for example in ner_dataset for tag in example['ner_tags'])
label2id = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}
id2label = {idx: tag for tag, idx in label2id.items()}

# Convert string labels to IDs
def encode_labels(example):
    example["labels"] = [label2id[tag] for tag in example["ner_tags"]]
    return example

er_dataset = ner_dataset.map(encode_labels)

# Step 5: Tokenization
model_checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(example["labels"][word_idx])
        else:
            label_ids.append(example["labels"][word_idx])  # Use same label for sub-tokens
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

tokenized_dataset = ner_dataset.map(tokenize_and_align_labels)

# Step 6: Load model
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id,    ignore_mismatched_sizes=True  )

args = TrainingArguments(
    output_dir="outputs/ner_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    weight_decay=0.01
)
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Step 8: Train
trainer.train()

# Step 9: Save the model
trainer.save_model("models/amharic-ner-model")

print("\n✅ Model training complete and saved!")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-ner-hrl and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([0]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([0, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`