In [1]:
!pip install transformers datasets accelerate seqeval -q

# ## 2. Import Libraries
import pandas as pd
from datasets import Dataset, Features, Sequence, Value, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import classification_report
import numpy as np # For compute_metrics


In [2]:
# # Task 3: Fine-Tune NER Model for Amharic
# ## 3. Define Labels and ID-to-Label Mapping
# These labels correspond to the CoNLL format and your entity types.
labels = ["O", "B-Product", "I-Product", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE"]
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# ## 4. Load the Labeled Dataset in CoNLL Format
# We'll use the simulated CoNLL data generated in the previous step.
# In a real scenario, you would load this from a file, e.g., `labeled_data.txt`.

# --- START OF SIMULATED CoNLL DATA ---
# This data is a placeholder. Replace it with the content of your
# `labeled_data.txt` from Task 2 for real training.
conll_data_string = """
አዲስ	B-LOC
አበባ	I-LOC
ላይ	O
ጥሩ	O
ጥራት	O
ያለው	O
የህፃናት	B-Product
አልጋ	I-Product
በ2500	B-PRICE
ብር	I-PRICE
ብቻ።	O

የሴቶች	B-Product
ቦርሳ	I-Product
በ1200	B-PRICE
ብር	I-PRICE
ቦሌ	B-LOC
አካባቢ	O
ይገኛል።	O

ሳሪስ	B-LOC
ለገበያ	O
የቀረበ	O
ዘመናዊ	B-Product
ቲቪ	I-Product
8000	B-PRICE
ብር።	I-PRICE

ለቤትዎ	O
ውበት	O
የሆኑ	O
መጋረጃዎች	B-Product
በ650	B-PRICE
ብር	I-PRICE
ከፒያሳ።	B-LOC

የወንዶች	B-Product
ሸሚዝ	I-Product
በ950	B-PRICE
ብር	I-PRICE
አዲስ	B-LOC
አበባ።	I-LOC

ጥሩ	O
ስልክ	B-Product
በ1000	B-PRICE
ብር	I-PRICE
ቦሌ	B-LOC
ላይ።	O

ላፕቶፕ	B-Product
በ15000	B-PRICE
ብር	I-PRICE
ሳሪስ	B-LOC
አካባቢ።	O

ለልጆች	O
መጫወቻዎች	B-Product
በ300	B-PRICE
ብር	I-PRICE
ከፒያሳ።	B-LOC

የተለያዩ	O
የቤት	B-Product
እቃዎች	I-Product
7000	B-PRICE
ብር	I-PRICE
አዲስ	B-LOC
አበባ።	I-LOC

ፋሽን	B-Product
ልብሶች	I-Product
800	B-PRICE
ብር	I-PRICE
ቦሌ።	B-LOC

የህፃናት	B-Product
መጫወቻዎች	I-Product
በ450	B-PRICE
ብር	I-PRICE
ሳሪስ።	B-LOC
"""
# --- END OF SIMULATED CoNLL DATA ---

def parse_conll(conll_string):
    sentences = []
    current_sentence = []
    for line in conll_string.strip().split('\n'):
        if line.strip() == "":
            if current_sentence:
                sentences.append(current_sentence)
            current_sentence = []
        else:
            parts = line.split('\t')
            if len(parts) == 2: # Ensure it's a token-label pair
                current_sentence.append(parts)
    if current_sentence: # Add the last sentence if not followed by a blank line
        sentences.append(current_sentence)
    return sentences

parsed_data = parse_conll(conll_data_string)

# Convert parsed data to a format suitable for Hugging Face `datasets`
processed_data = []
for sentence_data in parsed_data:
    tokens = [item[0] for item in sentence_data]
    # Convert labels to numerical IDs. Handle potential KeyError if label not found.
    ner_tags = []
    for item in sentence_data:
        label_str = item[1]
        if label_str in label2id:
            ner_tags.append(label2id[label_str])
        else:
            # Handle unknown labels if any, perhaps by assigning to O or logging
            ner_tags.append(label2id["O"]) # Default to 'O' for safety
            print(f"Warning: Unknown label '{label_str}' encountered. Assigned 'O'.")

    processed_data.append({"tokens": tokens, "ner_tags": ner_tags})

# Create a Hugging Face Dataset
features = Features({
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=labels))
})

dataset = Dataset.from_list(processed_data, features=features)

# ## 5. Tokenize the Data and Align Labels
# We use `XLM-Roberta-base` as the pre-trained model.
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length", # Pad to max length of model's input
        max_length=128 # Define a reasonable max length for tokens
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word_idx that is None. We set their label to -100
            # so they are ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to -100.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# ## 6. Split Dataset into Training and Validation Sets
# For demonstration, we'll split the small simulated dataset.
# In a real scenario, you'd have a much larger dataset for training.
# Ensure there's enough data for both train and test. If the dataset is tiny,
# train_test_split might result in empty sets, so add a check.
if len(tokenized_dataset) > 1:
    train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = train_test_split["train"]
    eval_dataset = train_test_split["test"]
else:
    train_dataset = tokenized_dataset
    eval_dataset = tokenized_dataset # Use the same for eval if only one sample, for demonstration
    print("Warning: Dataset is too small for meaningful split. Using entire dataset for both train and eval.")

print(f"Training examples: {len(train_dataset)}")
print(f"Evaluation examples: {len(eval_dataset)}")


# ## 7. Set Up Training Arguments
# Define the output directory, learning rate, batch size, etc.
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-ner",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False, # Set to True if you want to push to Hugging Face Hub
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# ## 8. Initialize Model and Trainer
# Load the pre-trained model with the correct number of labels.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# Define a data collator to batch your inputs.
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define compute_metrics function for evaluation
from seqeval.metrics import f1_score, precision_score, recall_score

def compute_metrics(p):
    predictions, labels_ids = p # Renamed 'labels' to 'labels_ids' to avoid confusion with the global 'labels' list
    predictions = np.argmax(predictions, axis=2)

    true_labels_filtered = []
    true_predictions_filtered = []

    for sentence_labels, sentence_predictions in zip(labels_ids, predictions):
        temp_true_labels = []
        temp_true_predictions = []
        for label_id, pred_id in zip(sentence_labels, sentence_predictions):
            if label_id != -100: # Only consider tokens that are not special tokens
                temp_true_labels.append(label_id)
                temp_true_predictions.append(pred_id)
        true_labels_filtered.append(temp_true_labels)
        true_predictions_filtered.append(temp_true_predictions)

    # Convert numeric IDs back to original labels for seqeval
    true_labels_decoded = [[id2label[l_id] for l_id in sublist] for sublist in true_labels_filtered]
    true_predictions_decoded = [[id2label[p_id] for p_id in sublist] for sublist in true_predictions_filtered]

    return {
        "f1": f1_score(true_labels_decoded, true_predictions_decoded),
        "precision": precision_score(true_labels_decoded, true_predictions_decoded),
        "recall": recall_score(true_labels_decoded, true_predictions_decoded),
    }

# Initialize the Trainer
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# ## 9. Fine-Tune the Model
print("Starting model training...")
trainer.train()
print("Model training complete.")

# ## 10. Evaluate the Fine-Tuned Model
print("\n*** Model Evaluation Results ***")
results = trainer.evaluate()
print(results)

# A more detailed classification report
predictions, labels_ids, _ = trainer.predict(eval_dataset) # Renamed for clarity
predictions = np.argmax(predictions, axis=2)

true_labels_filtered_report = []
true_predictions_filtered_report = []

for sentence_labels, sentence_predictions in zip(labels_ids, predictions):
    temp_true_labels = []
    temp_true_predictions = []
    for label_id, pred_id in zip(sentence_labels, sentence_predictions):
        if label_id != -100:
            temp_true_labels.append(label_id)
            temp_true_predictions.append(pred_id)
    true_labels_filtered_report.append(temp_true_labels)
    true_predictions_filtered_report.append(temp_true_predictions)

true_labels_decoded = [[id2label[l_id] for l_id in sublist] for sublist in true_labels_filtered_report]
true_predictions_decoded = [[id2label[p_id] for p_id in sublist] for sublist in true_predictions_filtered_report]


# Handle cases where true_labels_decoded or true_predictions_decoded might be empty
if any(true_labels_decoded) and any(true_predictions_decoded):
    print("\nClassification Report:")
    print(classification_report(true_labels_decoded, true_predictions_decoded, zero_division=0))
else:
    print("\nNo entities found in evaluation set for classification report.")

# ## 11. Save the Model and Tokenizer
output_dir = "./finetuned_ner_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Training examples: 8
Evaluation examples: 3


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting model training...


[34m[1mwandb[0m: Currently logged in as: [33mdmuay2015[0m ([33mdmuay2015-debre-markos-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,No log,1.999116,0.133333,0.095238,0.222222
2,No log,1.978674,0.133333,0.095238,0.222222
3,No log,1.968409,0.133333,0.095238,0.222222


Model training complete.

*** Model Evaluation Results ***


{'eval_loss': 1.9991164207458496, 'eval_f1': 0.13333333333333333, 'eval_precision': 0.09523809523809523, 'eval_recall': 0.2222222222222222, 'eval_runtime': 1.285, 'eval_samples_per_second': 2.335, 'eval_steps_per_second': 0.778, 'epoch': 3.0}

Classification Report:
              precision    recall  f1-score   support

         LOC       0.00      0.00      0.00         3
       PRICE       0.00      0.00      0.00         3
     Product       0.10      0.67      0.17         3

   micro avg       0.10      0.22      0.13         9
   macro avg       0.03      0.22      0.06         9
weighted avg       0.03      0.22      0.06         9

Model and tokenizer saved to ./finetuned_ner_model


In [3]:
# ## 1. Mount Google Drive
from google.colab import drive
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

google_drive_save_path = "/content/drive/MyDrive/my_ner_models/amharic_ner_xlmroberta"

# Create the directory if it doesn't exist
import os
os.makedirs(google_drive_save_path, exist_ok=True)
print(f"Ensured directory exists: {google_drive_save_path}")

# ## 3. Save the Model and Tokenizer
print(f"Saving model and tokenizer to Google Drive at: {google_drive_save_path}")
try:
    model.save_pretrained(google_drive_save_path)
    tokenizer.save_pretrained(google_drive_save_path)
    print("Model and tokenizer successfully saved to Google Drive!")
except Exception as e:
    print(f"Error saving model to Google Drive: {e}")
    print("Please ensure your Drive is mounted correctly and the path is valid.")


Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.
Ensured directory exists: /content/drive/MyDrive/my_ner_models/amharic_ner_xlmroberta
Saving model and tokenizer to Google Drive at: /content/drive/MyDrive/my_ner_models/amharic_ner_xlmroberta
Model and tokenizer successfully saved to Google Drive!
