<a href="https://colab.research.google.com/github/5237-mests/Amharic-E-commerce-Data-Extractor/blob/task-3/notebooks/finetuning_colabnb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning XLM-RoBERTa for Named Entity Recognition

### Helper functions

In [None]:
# Load and parse CoNLL format
def read_conll_file(filepath):
    """Parse a two-column CoNLL file into parallel token and tag sequences.

    Every non-blank line must contain exactly two whitespace-separated
    fields, ``token tag``. Blank lines mark sentence boundaries. Lines with
    any other number of fields are reported on stdout and skipped.

    Args:
        filepath: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A ``(sentences, labels)`` tuple of equal-length lists, where
        ``sentences[i]`` is the list of tokens of sentence ``i`` and
        ``labels[i]`` is the list of tags aligned with those tokens.
    """
    sentences, labels = [], []
    sentence, label = [], []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # Blank line: close the current sentence (consecutive blank
                # lines are ignored because `sentence` is already empty).
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.split()
                if len(parts) == 2:
                    token, tag = parts
                    sentence.append(token)
                    label.append(tag)
                else:
                    print(f"⚠️ Skipping malformed line {line_num}: {line}")
    # Flush the last sentence: files that do not end with a blank line would
    # otherwise lose their final sentence.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels


In [None]:
# Tokenize text and align labels
def tokenize_and_align_labels(examples, tokenizer, label_to_id, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: Batch mapping with ``"tokens"`` (list of token lists) and
            ``"ner_tags"`` (list of tag-string lists aligned with the tokens).
        tokenizer: Callable tokenizer invoked with ``truncation=True`` and
            ``is_split_into_words=True``; its result must expose
            ``word_ids(batch_index=...)`` and support item assignment.
        label_to_id: Mapping from tag string to integer class id.
        label_all_tokens: When True (default, the historical behaviour of
            this function) every sub-token of a word receives the word's
            label. When False only the first sub-token of each word is
            labelled and the remaining sub-tokens get ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        integer ids per example, using ``-100`` for positions whose word id
        is ``None``.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # Position not mapped to any input word.
                aligned_labels.append(-100)
            elif label_all_tokens or word_idx != previous_word_idx:
                aligned_labels.append(label_to_id[label[word_idx]])
            else:
                # Continuation sub-token of the same word: mask it out.
                aligned_labels.append(-100)
            previous_word_idx = word_idx
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [None]:
# Load model and tokenizer
from transformers import AutoTokenizer, AutoModelForTokenClassification

def load_model_and_tokenizer(model_name, num_labels, id2label=None, label2id=None):
    """Load a tokenizer and a token-classification model from a checkpoint.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.
        id2label: Optional mapping from class id to tag string. When given it
            is forwarded to the model so the tag names are stored with it
            instead of generic placeholder names.
        label2id: Optional inverse mapping (tag string to class id).

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Only forward the label maps when supplied so the default call is
    # exactly the original ``from_pretrained(model_name, num_labels=...)``.
    model_kwargs = {"num_labels": num_labels}
    if id2label is not None:
        model_kwargs["id2label"] = id2label
    if label2id is not None:
        model_kwargs["label2id"] = label2id
    model = AutoModelForTokenClassification.from_pretrained(model_name, **model_kwargs)
    return tokenizer, model


In [None]:
# Hugging Face Trainer setup
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

def setup_trainer(model, tokenizer, tokenized_dataset, output_dir="./ner-model", epochs=5, report_to=None):
    """Build a ``Trainer`` for token classification.

    Args:
        model: Token-classification model to fine-tune.
        tokenizer: Tokenizer used for padding by the data collator and handed
            to the ``Trainer``.
        tokenized_dataset: Mapping with ``"train"`` and ``"test"`` splits
            that already contain model inputs and ``labels``.
        output_dir: Directory passed to ``TrainingArguments``.
        epochs: Number of training epochs.
        report_to: Optional value forwarded to ``TrainingArguments`` as
            ``report_to`` (e.g. ``"none"`` to disable experiment trackers and
            their interactive login prompt). ``None`` keeps the library
            default.

    Returns:
        The configured ``Trainer`` (training is not started here).
    """
    import inspect

    training_kwargs = dict(
        output_dir=output_dir,
        eval_strategy="epoch",
        # Log the training loss once per epoch; with the default step-based
        # interval short runs never reach a logging step and show "No log".
        logging_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=epochs,
        weight_decay=0.01,
    )
    if report_to is not None:
        training_kwargs["report_to"] = report_to
    args = TrainingArguments(**training_kwargs)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Newer transformers releases renamed Trainer's `tokenizer` argument to
    # `processing_class` (the old name triggers a deprecation warning). Pick
    # whichever keyword the installed Trainer accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=data_collator,
        **{tokenizer_kwarg: tokenizer},
    )

    return trainer

### Load Datasets

In [None]:
# Load data
from datasets import Dataset
# Parse the annotated CoNLL file into parallel token / tag sequences.
sentences, tags = read_conll_file("ner_data.conll")
data = {"tokens": sentences, "ner_tags": tags}
# Hold out 20% of the sentences for evaluation. The seed is fixed so that a
# fresh "Restart & Run All" reproduces the same train/test split.
dataset = Dataset.from_dict(data).train_test_split(test_size=0.2, seed=42)

In [None]:
# Display the split summary (features and row counts) as the cell output.
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 56
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 14
    })
})

### Prepare labels

In [None]:
# Build the tag vocabulary from every tag seen in the parsed data, sorted so
# that the tag <-> id assignment is deterministic.
unique_tags = sorted({tag for tag_seq in tags for tag in tag_seq})
label_to_id = dict(zip(unique_tags, range(len(unique_tags))))
id_to_label = dict(enumerate(unique_tags))

### Load model

In [None]:
# Load model
# Start from the `xlm-roberta-base` checkpoint; the classification head is
# sized to the number of distinct tags found in the data.
model_name = "xlm-roberta-base"
tokenizer, model = load_model_and_tokenizer(model_name, num_labels=len(unique_tags))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize both splits in batches and attach the aligned label ids; the
# tokenizer and label map are handed to the helper as keyword arguments.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer, "label_to_id": label_to_id},
)

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

In [None]:
# Train
# Build the Trainer with the default output directory and epoch count, then
# run fine-tuning. `trainer.train()` is the last expression of the cell, so
# its return value is shown as the cell output.
trainer = setup_trainer(model, tokenizer, tokenized_dataset)
trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmesfin-7[0m ([33mmesfin-7-ml[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,1.227272
2,No log,1.017032
3,No log,0.887153
4,No log,0.860156
5,No log,0.839303


TrainOutput(global_step=35, training_loss=1.0076818193708148, metrics={'train_runtime': 1470.5263, 'train_samples_per_second': 0.19, 'train_steps_per_second': 0.024, 'total_flos': 35815605680448.0, 'train_loss': 1.0076818193708148, 'epoch': 5.0})

In [None]:
# Evaluate the fine-tuned model on the evaluation split given to the Trainer.
eval_results = trainer.evaluate()
print("Evaluation Results:")
# Bare last expression: the metrics are displayed as the cell output.
eval_results

Evaluation Results:


{'eval_loss': 0.8393029570579529,
 'eval_runtime': 10.4105,
 'eval_samples_per_second': 1.345,
 'eval_steps_per_second': 0.192,
 'epoch': 5.0}

### Save models

In [None]:
# Save
# Write the trained model and the tokenizer files into the same directory so
# both can be found in one place later.
trainer.save_model("models/xlm-roberta-ner")
tokenizer.save_pretrained("models/xlm-roberta-ner")

('models/xlm-roberta-ner/tokenizer_config.json',
 'models/xlm-roberta-ner/special_tokens_map.json',
 'models/xlm-roberta-ner/sentencepiece.bpe.model',
 'models/xlm-roberta-ner/added_tokens.json',
 'models/xlm-roberta-ner/tokenizer.json')