In [None]:
# Amharic Telegram NER Fine-Tuning with LoRA and Hugging Face

# STEP 1: Install Required Libraries (for Google Colab or local GPU env)
!pip install transformers datasets accelerate peft seqeval

In [None]:
# STEP 2: Load Your CoNLL-Formatted Dataset
from datasets import load_dataset

In [None]:
# STEP 2: Manually Parse CoNLL-Formatted Dataset

def load_conll_file(file_path):
    """Parse a two-column CoNLL file into a list of sentence examples.

    The file is read as UTF-8. Each non-blank line is split on whitespace;
    lines that yield exactly two fields contribute a (token, tag) pair to the
    current sentence, and lines with any other number of fields are ignored.
    A blank (or whitespace-only) line ends the current sentence.

    Args:
        file_path: Path of the CoNLL-formatted text file.

    Returns:
        A list of dicts, one per sentence, each with the keys "tokens" and
        "ner_tags" holding parallel lists of strings. Sentences without any
        accepted line are not emitted.
    """
    sentences = []
    current_tokens, current_tags = [], []

    def flush():
        # Store the sentence collected so far (if any) and start a new one.
        nonlocal current_tokens, current_tags
        if current_tokens:
            sentences.append({"tokens": current_tokens, "ner_tags": current_tags})
            current_tokens, current_tags = [], []

    with open(file_path, encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:
                # Blank line: sentence boundary.
                flush()
                continue
            if len(fields) != 2:
                # Not a "token tag" pair; skipped.
                continue
            current_tokens.append(fields[0])
            current_tags.append(fields[1])

    # The file may end without a trailing blank line.
    flush()
    return sentences

In [None]:
from datasets import Dataset, DatasetDict

# STEP 3: Build the dataset and the label <-> id mappings
train_data = load_conll_file("/content/ner_labeled_conll.conll")

dataset = DatasetDict({
    "train": Dataset.from_list(train_data)
})

# Label vocabulary derived from the tags that actually occur in the data.
# Sorting makes the id assignment deterministic across runs. `label2id` and
# `id2label` are read by the tokenization, model-loading and metrics cells.
unique_tags = sorted({tag for example in train_data for tag in example["ner_tags"]})
label2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2label = {idx: tag for tag, idx in label2id.items()}

In [None]:
# Sanity check: show the parsed example at index 4 of the train split
# (its "tokens" and "ner_tags" lists).
print(dataset['train'][4])

In [None]:
# STEP 4: Tokenization + Label Alignment
from transformers import AutoTokenizer

# Tokenizer for the "xlm-roberta-base" checkpoint, the same checkpoint name the
# model cell passes to AutoModelForTokenClassification.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Align token labels with tokenizer's subword split
def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and build a label id for every position.

    Reads the module-level `tokenizer` and `label2id`.

    Args:
        example: Mapping with "tokens" (list of word strings) and "ner_tags"
            (list of tag strings, one per word).

    Returns:
        The tokenizer output with an added "labels" list aligned to the
        tokenizer's positions. The first sub-token of each word carries the
        word's label id; every other position (positions whose word id is
        None, and continuation sub-tokens of a word) carries -100.
    """
    tokenized = tokenizer(
        example["tokens"],
        truncation=True,
        padding='max_length',
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    previous_word_idx = None
    for word_idx in tokenized.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # No source word, or a continuation piece of the previous word.
            # Repeating the word's tag here would turn one "B-X" word into
            # several consecutive "B-X" tags, i.e. several entities, so these
            # positions get -100 (the sentinel compute_metrics filters out and
            # PyTorch's cross-entropy loss ignores by default).
            labels.append(-100)
        else:
            # First sub-token of a new word: it alone carries the word's tag.
            labels.append(label2id[example["ner_tags"][word_idx]])
        previous_word_idx = word_idx

    tokenized["labels"] = labels
    return tokenized

# Run the alignment function over the dataset. With batched=False the function
# is called with one example at a time, which is what it expects (it indexes
# example["tokens"] as a single list of words).
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

In [None]:
# STEP 5: Load Model + Apply PEFT (LoRA)
from transformers import AutoModelForTokenClassification
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for token classification: rank 8, alpha 32, dropout 0.1,
# no bias training, applied to the modules named "query" and "value".
lora_config = LoraConfig(
    target_modules=["query", "value"],
    task_type=TaskType.TOKEN_CLS,
    bias="none",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Base checkpoint with a token-classification head sized to the label set;
# the label mappings are stored on the model config.
base_model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

# Wrap the base model with the LoRA adapters described above.
peft_model = get_peft_model(base_model, lora_config)

In [None]:
# STEP 6: Trainer Setup
import os
# Turn off the Weights & Biases integration via its environment switch; this is
# set before the Trainer classes are imported below.
# NOTE(review): newer transformers releases appear to prefer the `report_to`
# argument of TrainingArguments over this variable — confirm for the pinned version.
os.environ["WANDB_DISABLED"] = "true"


from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

seqeval = evaluate.load("seqeval")
# compute_metrics indexes this list by label id, so the names are ordered by
# their id in `label2id` instead of relying on the dict's insertion order.
label_list = sorted(label2id, key=label2id.get)

def compute_metrics(p):
    """Convert model outputs to tag sequences and score them with seqeval.

    Reads the module-level `label_list` (label names indexed by id) and
    `seqeval` (the loaded metric).

    Args:
        p: A (predictions, labels) pair. `predictions` holds per-position
            scores over the labels (argmax is taken over the last axis);
            `labels` holds the gold label ids, with -100 marking positions
            to skip.

    Returns:
        Whatever `seqeval.compute` returns for the filtered predicted and
        reference tag sequences.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=-1)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions whose gold label is -100 are excluded from scoring.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    return seqeval.compute(predictions=true_predictions, references=true_labels)

# Run configuration for fine-tuning.
training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir="./amharic-ner-checkpoints",
    logging_dir="./logs",
    logging_steps=10,
    # Enabled phases.
    do_train=True,
    do_eval=True,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Trainer over the LoRA-wrapped model and the tokenized train split.
# NOTE(review): no `eval_dataset` is passed, so `compute_metrics` presumably
# never runs during `trainer.train()` despite do_eval=True — confirm whether a
# held-out split should be added.
trainer = Trainer(
    args=training_args,
    model=peft_model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    compute_metrics=compute_metrics,
)

In [None]:
# STEP 7: Train the Model
# Start fine-tuning with the `trainer` configured in the previous cell.
trainer.train()