In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Install necessary libraries
!pip install transformers datasets torch pandas seqeval

In [None]:
import os
file_path = '/content/drive/MyDrive/Amharic-Ecommerce-Extractor/data/conll_labeled_data.conll'
print("CoNLL file exists:", os.path.exists(file_path))

CoNLL file exists: True


In [None]:
import pandas as pd
import logging
from sklearn.model_selection import train_test_split

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Function to load CoNLL file
def load_conll(file_path):
    sentences = []
    current_sentence = []
    current_labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # Non-empty line
                try:
                    token, label = line.split('\t')
                    current_sentence.append(token)
                    current_labels.append(label)
                except ValueError:
                    logger.warning(f"Skipping malformed line: {line}")
            else:  # Empty line indicates new sentence
                if current_sentence:  # Save non-empty sentences
                    sentences.append({
                        'tokens': current_sentence,
                        'labels': current_labels
                    })
                    current_sentence = []
                    current_labels = []

    # Save the last sentence if it exists
    if current_sentence:
        sentences.append({
            'tokens': current_sentence,
            'labels': current_labels
        })

    return sentences

# Load CoNLL file
conll_file = '/content/drive/MyDrive/Amharic-Ecommerce-Extractor/data/conll_labeled_data.conll'
dataset = load_conll(conll_file)
logger.info(f"Loaded {len(dataset)} sentences from CoNLL file")

# Split into train and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)
logger.info(f"Training set: {len(train_data)} sentences")
logger.info(f"Validation set: {len(val_data)} sentences")

# Example: Print first sentence
print("Example sentence:")
print("Tokens:", train_data[0]['tokens'])
print("Labels:", train_data[0]['labels'])

Example sentence:
Tokens: ['Imitation', 'Volcano', 'Humidifier', 'with', 'LED', 'Light', 'በኤሌክትሪክየሚሰራ', 'ለቤት', 'መዓዛን', 'የሚሰጥ', 'ዋጋ', '1400', 'ውስን', 'ፍሬ', 'ያለን', 'አድራሻ', 'መገናኛመሰረትደፋርሞልሁለተኛፎቅ', 'ቢሮ', 'ቁ', 'S05S06', '0902660722', '0928460606', 'በTelegram', 'ለማዘዝ', 'ይጠቀሙ', 'zemencallcenter', 'zemenexpressadmin', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን', 'httpstelegrammezemenexpress', 'EN', 'imitation', 'EN', 'volcano', 'EN', 'humidifier', 'EN', 'led', 'EN', 'light', 'EN', 'በኤሌክትሪክየሚሰራ', 'EN', 'ለቤት', 'EN', 'መልካም', 'EN', 'መዓዛን', 'EN', 'የሚሰጥ', 'EN', 'ዋጋ', 'EN', '1400', 'EN', 'ብር', 'EN', 'ውስን', 'EN', 'ፍሬ', 'EN', 'ነው', 'EN', 'ያለን', 'EN', 'አድራሻ', 'EN', 'መገናኛመሰረትደፋርሞልሁለተኛፎቅ', 'EN', 'ቢሮ', 'EN', 'ቁ', 'EN', 's05s06', 'EN', '0902660722', 'EN', '0928460606', 'EN', 'በtelegram', 'EN', 'ለማዘዝ', 'EN', 'ይጠቀሙ', 'EN', 'zemencallcenter', 'EN', 'zemenexpressadmin', 'EN', 'ለተጨማሪ', 'EN', 'ማብራሪያ', 'EN', 'የቴሌግራም', 'EN', 'ገፃችን', 'EN', 'httpstelegrammezemenexpress']
Labels: ['B-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUC

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import logging

# Set up logging (if not already set)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define label set
label_list = [
    'O',
    'B-PRODUCT', 'I-PRODUCT',
    'B-PRICE', 'I-PRICE',
    'B-LOC', 'I-LOC'
]
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for idx, label in enumerate(label_list)}

# Load model and tokenizer
model_name = "Davlan/afro-xlmr-base"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )
    logger.info(f"Loaded model and tokenizer: {model_name}")
except Exception as e:
    logger.error(f"Failed to load model/tokenizer: {e}")
    raise

# Verify tokenizer
example_text = "GROOMING SET ሶስት ዋጋ 2300 ብር መገናኛ"
tokens = tokenizer.tokenize(example_text)
print("Example tokens:", tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Example tokens: ['▁GR', 'OOM', 'ING', '▁SET', '▁ሶስት', '▁ዋጋ', '▁2', '300', '▁ብር', '▁መ', 'ገና', 'ኛ']


In [None]:
from datasets import Dataset
import logging
import numpy as np

# Set up logging (if not already set)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Function to align labels with tokenized inputs
def tokenize_and_align_labels(examples, tokenizer, label2id, max_length=128):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        is_split_into_words=True,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_offsets_mapping=False
    )

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens (CLS, SEP, PAD)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])  # First subword
            else:
                # Convert B- to I- for subword tokens
                if label[word_idx].startswith('B-'):
                    label_ids.append(label2id['I-' + label[word_idx][2:]])
                else:
                    label_ids.append(label2id[label[word_idx]])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Convert data to Hugging Face Dataset
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

# Tokenize and align labels
tokenized_train = train_dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer, label2id),
    batched=True,
    remove_columns=['tokens', 'labels']
)
tokenized_val = val_dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer, label2id),
    batched=True,
    remove_columns=['tokens', 'labels']
)

logger.info(f"Tokenized training dataset: {len(tokenized_train)} examples")
logger.info(f"Tokenized validation dataset: {len(tokenized_val)} examples")

# Verify tokenized example
example = tokenized_train[0]
print("Example tokenized input IDs:", example['input_ids'][:10], "...")
print("Example labels:", example['labels'][:10], "...")

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Example tokenized input IDs: [0, 113322, 22062, 5976, 38938, 19210, 532, 85789, 678, 8908] ...
Example labels: [-100, 1, 2, 2, 2, 2, 2, 2, 2, 2] ...


In [None]:
from transformers import TrainingArguments
import logging

# Set up logging (if not already set)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define training arguments
output_dir = '/content/drive/My Drive/Amharic-Ecommerce-Extractor/models/ner_model'
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,              # Number of epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    learning_rate=2e-5,              # Learning rate
    warmup_steps=10,                 # Warmup steps
    weight_decay=0.01,               # Weight decay for regularization
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    save_strategy="epoch",           # Save model at the end of each epoch
    load_best_model_at_end=True,     # Load the best model based on evaluation
    metric_for_best_model="f1",      # Use F1 score to select best model
    logging_dir='./logs',            # Directory for logs
    logging_steps=10,                # Log every 10 steps
    seed=42                          # Random seed for reproducibility
)

logger.info("Training arguments configured")
print("Training arguments:", training_args)



Training arguments: TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalS

In [None]:
from transformers import Trainer, DataCollatorForTokenClassification
from seqeval.metrics import precision_score, recall_score, f1_score
import logging
import numpy as np

# Set up logging (if not already set)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define custom metric computation
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Convert predictions and labels to label names, ignoring -100
    true_predictions = [
        [label_list[pred] for (pred, lbl) in zip(prediction, label) if lbl != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[lbl] for (pred, lbl) in zip(prediction, label) if lbl != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Compute metrics
    results = {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }
    return results

# Initialize data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start training
logger.info("Starting model fine-tuning")
trainer.train()

logger.info("Fine-tuning completed")

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabelzeleke5173[0m ([33mabelzeleke5173-cbe[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.861764,0.004124,0.052632,0.007648
2,No log,1.621838,0.0,0.0,0.0
3,1.782300,1.136139,0.0,0.0,0.0
4,1.782300,0.816407,0.0,0.0,0.0
5,0.933000,0.799226,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import logging
import numpy as np
from seqeval.metrics import classification_report

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Evaluate the model
logger.info("Evaluating model on validation set")
eval_results = trainer.evaluate()

# Print evaluation metrics
print("Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")

# Get predictions for a few validation examples
def get_predictions(dataset, trainer, tokenizer, label_list, num_examples=3):
    predictions, labels, _ = trainer.predict(dataset)
    predictions = np.argmax(predictions, axis=2)

    results = []
    for i in range(min(num_examples, len(dataset))):
        tokens = tokenizer.convert_ids_to_tokens(dataset[i]['input_ids'], skip_special_tokens=True)
        pred_labels = [label_list[pred] for pred, lbl in zip(predictions[i], dataset[i]['labels']) if lbl != -100]
        true_labels = [label_list[lbl] for lbl in dataset[i]['labels'] if lbl != -100]
        results.append({
            'tokens': tokens,
            'predicted': pred_labels,
            'true': true_labels
        })
    return results

# Print sample predictions
logger.info("Getting sample predictions")
sample_predictions = get_predictions(tokenized_val, trainer, tokenizer, label_list)
for i, pred in enumerate(sample_predictions):
    print(f"\nSample {i+1}:")
    for token, pred_label, true_label in zip(pred['tokens'], pred['predicted'], pred['true']):
        print(f"{token}\tPredicted: {pred_label}\tTrue: {true_label}")

Evaluation Results:
eval_loss: 1.8618
eval_precision: 0.0041
eval_recall: 0.0526
eval_f1: 0.0076
eval_runtime: 0.1082
eval_samples_per_second: 73.9200
eval_steps_per_second: 9.2400
epoch: 5.0000

Sample 1:
▁5	Predicted: I-LOC	True: O
in	Predicted: I-PRODUCT	True: O
1	Predicted: I-PRODUCT	True: O
▁Trou	Predicted: I-PRICE	True: B-PRODUCT
ser	Predicted: I-PRODUCT	True: I-PRODUCT
▁Hang	Predicted: B-PRICE	True: I-PRODUCT
er	Predicted: I-PRODUCT	True: I-PRODUCT
▁የ	Predicted: O	True: O
ሱ	Predicted: B-PRODUCT	True: O
ሪ	Predicted: B-PRICE	True: O
▁ማስ	Predicted: O	True: O
ቀ	Predicted: O	True: O
መ	Predicted: O	True: O
ጫ	Predicted: I-PRODUCT	True: O
▁ዋጋ	Predicted: I-PRODUCT	True: O
▁650	Predicted: I-PRODUCT	True: O
▁	Predicted: I-PRODUCT	True: O
ውስ	Predicted: B-PRICE	True: O
ን	Predicted: I-PRODUCT	True: O
▁ፍ	Predicted: O	True: O
ሬ	Predicted: B-PRICE	True: O
▁ያለ	Predicted: I-LOC	True: O
ን	Predicted: I-PRODUCT	True: O
▁አድራሻ	Predicted: B-PRODUCT	True: O
▁መ	Predicted: I-PRODUCT	True: O
ገና	Predicted: O

In [13]:
import logging
import os

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Save the model and tokenizer
save_dir = '/content/drive/My Drive/Amharic-Ecommerce-Extractor/models/ner_model/final'
os.makedirs(save_dir, exist_ok=True)

try:
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    logger.info(f"Model and tokenizer saved to: {save_dir}")
except Exception as e:
    logger.error(f"Failed to save model/tokenizer: {e}")
    raise

# Verify saved files
print("Saved files:", os.listdir(save_dir))

Saved files: ['config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'sentencepiece.bpe.model', 'tokenizer.json']


In [15]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load saved model and tokenizer
save_dir = '/content/drive/My Drive/Amharic-Ecommerce-Extractor/models/ner_model/final'
try:
    tokenizer = AutoTokenizer.from_pretrained(save_dir)
    model = AutoModelForTokenClassification.from_pretrained(save_dir)
    logger.info(f"Loaded model and tokenizer from: {save_dir}")
except Exception as e:
    logger.error(f"Failed to load model/tokenizer: {e}")
    raise

# Create NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Sample test messages (based on your dataset)
test_messages = [
    "GROOMING SET ሶስት በአንድ የፀጉር ማሽን ዋጋ 2300 ብር መገናኛ",
    "1L Water Bottle ዋጋ 800 ብር በአዲስ አበባ",
    "ሼቨር በመስከረም አካባቢ ዋጋ 1500 ብር"
]

# Run inference and print results
logger.info("Running inference on test messages")
for i, message in enumerate(test_messages):
    results = ner_pipeline(message)
    print(f"\nTest Message {i+1}: {message}")
    print("Predicted Entities:")
    for entity in results:
        print(f" - {entity['word']} ({entity['entity_group']}): {entity['score']:.4f}")

Device set to use cuda:0



Test Message 1: GROOMING SET ሶስት በአንድ የፀጉር ማሽን ዋጋ 2300 ብር መገናኛ
Predicted Entities:
 - GR (LOC): 0.1769
 - ING SET ሶስት (PRODUCT): 0.1577
 - በአንድ (PRICE): 0.1664
 - የ (PRODUCT): 0.1592
 - ር (PRICE): 0.1734
 - ማሽን (PRODUCT): 0.1688
 - ዋጋ (PRODUCT): 0.1616
 - ብር (PRICE): 0.1658
 - መ (PRODUCT): 0.1647
 - ኛ (PRODUCT): 0.1575

Test Message 2: 1L Water Bottle ዋጋ 800 ብር በአዲስ አበባ
Predicted Entities:
 - 1 (LOC): 0.1531
 - L (PRICE): 0.1577
 - Water Bo (PRODUCT): 0.1572
 - ttle (PRICE): 0.1533
 - ብር (PRICE): 0.1720
 - በአዲስ (PRICE): 0.1638

Test Message 3: ሼቨር በመስከረም አካባቢ ዋጋ 1500 ብር
Predicted Entities:
 -  (PRODUCT): 0.1813
 - ሼ (PRICE): 0.1796
 - ቨር (PRICE): 0.1680
 - መስከረም (PRICE): 0.1728
 - አካባቢ (PRICE): 0.1651
 - ብር (PRICE): 0.1715
