<a href="https://colab.research.google.com/github/5237-mests/Amharic-E-commerce-Data-Extractor/blob/task-3/notebooks/finetuning_colabnb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine Tuning

### Helper functions

In [1]:
# Load and parse CoNLL format
def read_conll_file(filepath):
    sentences, labels = [], []
    with open(filepath, 'r', encoding='utf-8') as f:
        sentence, label = [], []
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.split()
                if len(parts) == 2:
                    token, tag = parts
                    sentence.append(token)
                    label.append(tag)
                else:
                    print(f"⚠️ Skipping malformed line {line_num}: {line}")
    return sentences, labels


In [2]:
# Tokenize text and align labels
def tokenize_and_align_labels(examples, tokenizer, label_to_id):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)
            elif word_idx != previous_word_idx:
                aligned_labels.append(label_to_id[label[word_idx]])
            else:
                aligned_labels.append(label_to_id[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [None]:
!pip install seqeval

In [3]:
# compute metrics function
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    true_predictions = []

    for pred, lab in zip(predictions, labels):
        temp_true_labels = []
        temp_predictions = []
        for p_i, l_i in zip(pred, lab):
            if l_i != -100:
                temp_true_labels.append(id_to_label[l_i])
                temp_predictions.append(id_to_label[p_i])
        true_labels.append(temp_true_labels)
        true_predictions.append(temp_predictions)

    print(classification_report(true_labels, true_predictions))  # Optional
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }


In [4]:
# Load model, tokenizer, label mappings
from transformers import AutoTokenizer, AutoModelForTokenClassification

def load_model_and_tokenizer(model_name, num_labels):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels, ignore_mismatched_sizes=True)
    return tokenizer, model

In [5]:
#  Hugging Face Trainer setup and run
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

def setup_trainer(model, tokenizer, tokenized_dataset, output_dir="./ner-model", epochs=5):
    args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=epochs,
        weight_decay=0.01,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    return trainer

### Load Datasets

In [6]:
# Load data
from datasets import Dataset
sentences, tags = read_conll_file("test_final.conll")
data = {"tokens": sentences, "ner_tags": tags}
dataset = Dataset.from_dict(data).train_test_split(test_size=0.2)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 999
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 250
    })
})

### Prepare labels

In [8]:
# Prepare labels
unique_tags = sorted(set(tag for seq in tags for tag in seq))
label_to_id = {tag: i for i, tag in enumerate(unique_tags)}
id_to_label = {i: tag for tag, i in label_to_id.items()}

In [9]:
print("unique tags: ")
unique_tags

unique tags: 


['B-LOC', 'B-PRICE', 'B-PRODUCT', 'I-LOC', 'I-PRICE', 'I-PRODUCT', 'O']

In [10]:
print("label to id: ")
label_to_id

label to id: 


{'B-LOC': 0,
 'B-PRICE': 1,
 'B-PRODUCT': 2,
 'I-LOC': 3,
 'I-PRICE': 4,
 'I-PRODUCT': 5,
 'O': 6}

In [11]:
print("id to label: ")
id_to_label

id to label: 


{0: 'B-LOC',
 1: 'B-PRICE',
 2: 'B-PRODUCT',
 3: 'I-LOC',
 4: 'I-PRICE',
 5: 'I-PRODUCT',
 6: 'O'}

## Load model, train model, and save trained model

write function to reuse for 3 d/t models

In [13]:
# train model, eval func
def model_trainer(model_name):
  # Load model
  tokenizer, model = load_model_and_tokenizer(model_name, num_labels=len(unique_tags))

  # Tokenize and align
  tokenized_dataset = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label_to_id), batched=True)

  # Setup trainer
  trainer = setup_trainer(model, tokenizer, tokenized_dataset)

  # Train
  trainer.train()

  # Evaluate models
  metrics = trainer.evaluate()
  print(metrics)

  # Save model
  # trainer.save_model(f"/content/drive/MyDrive/models/{model_name}-ner")
  # tokenizer.save_pretrained(f"/content/drive/MyDrive/models/{model_name}-ner")

  save_path = f"/content/drive/MyDrive/models4/{model_name.replace('/', '_')}-ner"
  trainer.save_model(save_path)
  tokenizer.save_pretrained(save_path)

  return metrics


### Train model "xlm-roberta-base"

In [None]:
  # mount google drive
  from google.colab import drive
  drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load model
model_name = "xlm-roberta-base"

In [None]:
# call model trainer
metrics = model_trainer(model_name)

In [None]:
# Load my trained model
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/models2/xlm-roberta-base-ner")


In [None]:
model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/models2/xlm-roberta-base-ner")

In [None]:
from transformers import pipeline

ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"  # Groups subwords into full entity spans
)


Device set to use cpu


Run Prediction on new text

In [None]:
text = "አዲስ የፀጉር መቀነቢያ ዋጋ 1200 ብር ኣድራሻ በአዲስ አበባ ይገኛል።"

results = ner_pipeline(text)

for entity in results:
    print(f"{entity['word']} → {entity['entity_group']} ({entity['score']:.2f})")


አዲስ የ → LABEL_0 (0.64)
ፀጉር መቀነቢያ → LABEL_3 (0.86)
ዋጋ 1200 ብር → LABEL_2 (0.89)
ኣድራሻ በአዲስ አበባ ይገኛል። → LABEL_4 (0.87)


### Train model "rasyosef/bert-tiny-amharic"

In [None]:
# model rasyosef/bert-tiny-amharic
model_name = "rasyosef/bert-tiny-amharic"

In [None]:
# call model trainer
metrics = model_trainer(model_name)

In [None]:
# Load my trained model
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/models3/rasyosef_bert-tiny-amharic-ner")

In [None]:
model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/models3/rasyosef_bert-tiny-amharic-ner")

In [None]:
from transformers import pipeline

ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"  # Groups subwords into full entity spans
)

Device set to use cpu


In [None]:
text = "አዲስ የፀጉር መቀነቢያ ዋጋ 1200 ብር ኣድራሻ በአዲስ አበባ ይገኛል።"

results = ner_pipeline(text)

for entity in results:
    print(f"{entity['word']} → {entity['entity_group']} ({entity['score']:.2f})")


አዲስ የፀ → LABEL_5 (0.30)
##ጉር → LABEL_2 (0.29)
መቀነቢያ ዋጋ 1200 ብር ኣድራሻ በአዲስ አበባ ይገኛል ። → LABEL_5 (0.35)


### Train model "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"

In [14]:
# modelmasakhane/afroxlmr-large-ner-masakhaner-1.0_2.0
model_name = "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"

In [None]:
# call model trainer
metrics = model_trainer(model_name)

Not successfully trained due to limited RAM in cola.