In [20]:
!pip install transformers datasets seqeval



In [21]:
!pip install transformers datasets seqeval accelerate peft



In [22]:
!pip install transformers datasets seqeval accelerate peft shap lime



In [23]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments)
from transformers import DataCollatorForTokenClassification
from peft import LoraConfig, get_peft_model
from seqeval.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

# Parse CoNLL formatted data
def parse_conll(file_path):
    sentences, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as file:
        words, tags = [], []
        for line in file:
            if line.strip() == "":
                if words:
                    sentences.append(words)
                    labels.append(tags)
                    words, tags = [], []
            else:
                word, tag = line.strip().split()
                words.append(word)
                tags.append(tag)
        if words:
            sentences.append(words)
            labels.append(tags)
    return sentences, labels


In [25]:
# Load data
file_path = "/content/drive/MyDrive/Colab Notebooks/EthioMart/labeled_telegram_product_price_location.txt-"
sentences, labels = parse_conll(file_path)
dataset = Dataset.from_dict({"tokens": sentences, "ner_tags": labels})

In [26]:
# Create label mappings
label_list = sorted(list(set(tag for sublist in labels for tag in sublist)))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}


In [27]:
# Load model and tokenizer
base_model_name = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForTokenClassification.from_pretrained(
    base_model_name, num_labels=len(label_list), id2label=id_to_label, label2id=label_to_id
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Apply LoRA for PEFT (Parameter Efficient Fine-Tuning)
peft_config = LoraConfig(
    task_type="TOKEN_CLS",
    r=16,
    lora_alpha=32,
    target_modules=[
        "attention.self.query", "attention.self.key", "attention.self.value",
        "intermediate.dense", "output.dense"
    ],
    lora_dropout=0.1,
)
model = get_peft_model(model, peft_config)



In [29]:
# Tokenize and align labels with padding and truncation
def tokenize_and_align_labels(batch):
    tokenized = tokenizer(batch["tokens"], truncation=True, padding=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(batch["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        aligned_labels = []
        for idx in word_ids:
            if idx is None:
                aligned_labels.append(-100)  # Padding token
            else:
                aligned_labels.append(label_to_id[label[idx]])
        labels.append(aligned_labels)
    tokenized["labels"] = labels
    return tokenized

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/13000 [00:00<?, ? examples/s]

In [30]:
# Split dataset into train and validation
train_valid_split = tokenized_dataset.train_test_split(test_size=0.2)
tokenized_dataset = DatasetDict({
    "train": train_valid_split["train"],
    "validation": train_valid_split["test"]
})


In [31]:
# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)




In [32]:
 # Define data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Define custom metrics function
def compute_metrics(pred):
    predictions, labels = pred
    preds = np.argmax(predictions, axis=2)
    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    true_preds = [[id_to_label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]

    precision = precision_score(true_labels, true_preds, average='macro')
    recall = recall_score(true_labels, true_preds, average='macro')
    f1 = f1_score(true_labels, true_preds, average='macro')

    return {"precision": precision, "recall": recall, "f1": f1}


In [33]:
# Model comparison
model_names = ["rasyosef/bert-tiny-amharic", "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"]
results = []

for model_name in model_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name, num_labels=len(label_list), id2label=id_to_label, label2id=label_to_id
    )

    # Apply PEFT (LoRA) configuration
    model = get_peft_model(model, peft_config)

    # Define Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    predictions, labels, _ = trainer.predict(tokenized_dataset["validation"])
    preds = np.argmax(predictions, axis=2)
    true_preds = [[id_to_label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]

    # Calculate F1-score using seqeval classification report
    f1 = classification_report(
        [[id_to_label[l] for l in label if l != -100] for label in labels],
        true_preds,
        output_dict=True
    )["macro avg"]["f1-score"]

    results.append((model_name, f1))

# Print the results
print("Model Comparison Results:")
for model, score in results:
    print(f"{model}: F1-score = {score:.4f}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at rasyosef/bert-tiny-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: index out of range in self