In [5]:
# Step 1: Set Up Environment in Google Colab
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive', force_remount=True)

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [7]:
!pip install transformers datasets seqeval pandas numpy torch

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==

In [14]:
# Step 3: Load and Prepare the Dataset
import pandas as pd
from datasets import Dataset
from collections import Counter

def read_conll_file(file_path):
    """Reads CONLL file with space-separated word label format"""
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    sentences = []
    current_sentence = []

    for line in lines:
        line = line.strip()
        if not line:  # Sentence boundary
            if current_sentence:
                sentences.append(current_sentence)
                current_sentence = []
        else:
            parts = line.split()  # Split by whitespace
            if len(parts) >= 2:  # word label format
                word = ' '.join(parts[:-1])  # Handle cases with spaces in words
                label = parts[-1]
                current_sentence.append({'word': word, 'ner_tag': label})
            else:
                print(f"Skipping malformed line: {line}")

    if current_sentence:  # Add last sentence
        sentences.append(current_sentence)

    return sentences

# Load data
conll_path = "/content/drive/MyDrive/labeled_conll.txt"
sentences = read_conll_file(conll_path)

# Convert to pandas DataFrame
data = []
for sent in sentences:
    words = [token['word'] for token in sent]
    labels = [token['ner_tag'] for token in sent]
    data.append({'words': words, 'labels': labels})  # Changed column names

df = pd.DataFrame(data)

# Verify DataFrame
print("DataFrame columns:", df.columns)
print("\nFirst row:")
print(df.iloc[0])

# Get unique tags
all_tags = [tag for sublist in df['labels'] for tag in sublist]
tag_counts = Counter(all_tags)
print("\nTag distribution:")
print(tag_counts.most_common())

unique_tags = sorted(tag_counts.keys())
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

print("\nUnique tags:", unique_tags)

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)
print("\nDataset sample:", dataset[0])

DataFrame columns: Index(['words', 'labels'], dtype='object')

First row:
words     [3pcs, Bottle, Stopper, በማንኛውም, ጠርሙስ, ጫፍ, የሚገጠ...
labels    [B-Product, I-Product, I-Product, O, O, O, O, ...
Name: 0, dtype: object

Tag distribution:
[('O', 47234), ('I-Product', 174), ('I-LOC', 90), ('I-PRICE', 48), ('B-LOC', 30), ('B-Product', 28), ('B-PRICE', 28)]

Unique tags: ['B-LOC', 'B-PRICE', 'B-Product', 'I-LOC', 'I-PRICE', 'I-Product', 'O']

Dataset sample: {'words': ['3pcs', 'Bottle', 'Stopper', 'በማንኛውም', 'ጠርሙስ', 'ጫፍ', 'የሚገጠም', 'ለዘይት', 'እና', 'መሰል', 'ነገሮች', 'መቀነሻ', 'የሚሆን', 'በአግባቡ', 'እየመጠንን', 'ለመጠቀም', 'ተመራጭ', 'ዋጋ፦', '3ፍሬ', '400', 'ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለው', 'አድራሻ', 'መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ', 'ቢሮ', 'ቁ', 'S05S06', '0902660722', '0928460606', 'በTelegram', 'ለማዘዝ', 'ይጠቀሙ', 'zemencallcenter', 'zemenexpressadmin', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን', 'httpstelegrammezemenexpress'], 'labels': ['B-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [15]:
# Step 4: Prepare Tokenization and Label Alignment
from transformers import AutoTokenizer

# Load tokenizer (using XLM-Roberta as it handles Amharic well)
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["words"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128
    )

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:  # Special tokens
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # New word
                label_ids.append(tag2id[label[word_idx]])
            else:  # Same word (subword token)
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply to dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/889 [00:00<?, ? examples/s]

In [16]:
# Step 5: Split Dataset
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

In [17]:
# Step 6: Model Setup
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
#Step 7: Training Configuration
from transformers import TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score

training_args = TrainingArguments(
    output_dir="./amharic-ner-model",
    eval_strategy="epoch",  # Changed from evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir='./logs',
    report_to="none"  # Disables wandb/mlflow logging if not needed
)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

In [20]:
# Step 8: Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.05556,0.0,0.0,0.0
2,No log,0.043757,0.0,0.0,0.0
3,No log,0.041599,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=135, training_loss=0.1588961848506221, metrics={'train_runtime': 181.4762, 'train_samples_per_second': 11.754, 'train_steps_per_second': 0.744, 'total_flos': 139342794193152.0, 'train_loss': 0.1588961848506221, 'epoch': 3.0})

In [21]:
# Step 9: Save the Model
model.save_pretrained("./amharic-ner-model-final")
tokenizer.save_pretrained("./amharic-ner-model-final")

# To save to Google Drive
model.save_pretrained("/content/drive/MyDrive/amharic-ner-model")
tokenizer.save_pretrained("/content/drive/MyDrive/amharic-ner-model")

('/content/drive/MyDrive/amharic-ner-model/tokenizer_config.json',
 '/content/drive/MyDrive/amharic-ner-model/special_tokens_map.json',
 '/content/drive/MyDrive/amharic-ner-model/sentencepiece.bpe.model',
 '/content/drive/MyDrive/amharic-ner-model/added_tokens.json',
 '/content/drive/MyDrive/amharic-ner-model/tokenizer.json')

In [22]:
from transformers import pipeline

ner_pipeline = pipeline(
    "ner",
    model="/content/drive/MyDrive/amharic-ner-model",
    tokenizer="/content/drive/MyDrive/amharic-ner-model"
)
print(ner_pipeline("የ100 ብር ዋጋ ያለው ስልክ"))  # Test with Amharic text

Device set to use cuda:0


[]


In [23]:
from transformers import pipeline

ner_pipeline = pipeline(
    "ner",
    model="./amharic-ner-model-final",
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

sample_text = "ልጫ በ100 ብር ዋጋ ያለው ስልክ በአዲስ አበባ ይገኛል"
results = ner_pipeline(sample_text)
print(results)

Device set to use cuda:0


[]


In [24]:
print(trainer.state.log_history)  # Should show increasing F1 scores

[{'eval_loss': 0.05556049942970276, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.2774, 'eval_samples_per_second': 139.348, 'eval_steps_per_second': 9.394, 'epoch': 1.0, 'step': 45}, {'eval_loss': 0.04375699907541275, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.2682, 'eval_samples_per_second': 140.361, 'eval_steps_per_second': 9.463, 'epoch': 2.0, 'step': 90}, {'eval_loss': 0.041599005460739136, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.2371, 'eval_samples_per_second': 143.885, 'eval_steps_per_second': 9.7, 'epoch': 3.0, 'step': 135}, {'train_runtime': 181.4762, 'train_samples_per_second': 11.754, 'train_steps_per_second': 0.744, 'total_flos': 139342794193152.0, 'train_loss': 0.1588961848506221, 'epoch': 3.0, 'step': 135}]


In [25]:
print("Sample text contains 'ብር':", "ብር" in sample_text)  # Should be True for PRICE
print("Contains product words:", any(w in sample_text for w in ["ስልክ", "ልጫ"]))

Sample text contains 'ብር': True
Contains product words: True


In [26]:
from transformers import AutoModelForTokenClassification

try:
    test_model = AutoModelForTokenClassification.from_pretrained("./amharic-ner-model-final")
    print("Model loaded successfully with", test_model.num_labels, "labels")
except Exception as e:
    print("Model loading failed:", str(e))

Model loaded successfully with 7 labels


In [27]:
import json
with open("./amharic-ner-model-final/config.json") as f:
    config = json.load(f)
print("Label mappings:", config["id2label"])

Label mappings: {'0': 'B-LOC', '1': 'B-PRICE', '2': 'B-Product', '3': 'I-LOC', '4': 'I-PRICE', '5': 'I-Product', '6': 'O'}


In [29]:
# Update your tag2id and id2tag to match EXACTLY:
tag2id = {'B-LOC': 0, 'B-PRICE': 1, 'B-PRODUCT': 2,  # Uppercase PRODUCT
          'I-LOC': 3, 'I-PRICE': 4, 'I-PRODUCT': 5, 'O': 6}

In [30]:
print("First sample labels:", train_dataset[0]['labels'])
# Should match: ['B-PRODUCT', 'I-PRODUCT', ...] not ['B-Product', ...]

First sample labels: [-100, 6, -100, 6, 6, -100, 6, -100, -100, 6, 6, 6, -100, 6, 6, -100, 6, 6, 6, -100, 6, -100, 6, 6, 6, -100, -100, 6, -100, 6, 6, 6, 6, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 6, 6, 6, -100, -100, -100, 6, -100, -100, -100, 6, -100, -100, -100, 6, -100, -100, 6, -100, -100, 6, -100, 6, -100, -100, -100, 6, -100, -100, -100, 6, -100, -100, 6, 6, -100, -100, -100, -100, 6, -100, -100, 6, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [31]:
import json
with open("./amharic-ner-model-final/config.json") as f:
    config = json.load(f)
print("Label mappings:", config["id2label"])

Label mappings: {'0': 'B-LOC', '1': 'B-PRICE', '2': 'B-Product', '3': 'I-LOC', '4': 'I-PRICE', '5': 'I-Product', '6': 'O'}


In [32]:
simple_test = "ዋጋ፦ 500 ብር"  # Should detect PRICE
print(ner_pipeline(simple_test))

[]


In [33]:
print("Original labels before tokenization:")
print(df.iloc[0]['labels'])  # Should show ['B-PRODUCT', 'I-PRODUCT', ...]

Original labels before tokenization:
['B-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRICE', 'I-PRICE', 'I-PRICE', 'I-PRICE', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [36]:
# Update your tag2id mapping to match EXACTLY what's in the data
tag2id = {
    'B-LOC': 0,
    'B-PRICE': 1,
    'B-Product': 2,  # Changed to match your data
    'I-LOC': 3,
    'I-PRICE': 4,
    'I-Product': 5,  # Changed to match your data
    'O': 6
}

# Update id2tag accordingly
id2tag = {v: k for k, v in tag2id.items()}

In [37]:
print(tag2id['B-Product'])  # Should return 2
print(id2tag[2])            # Should return 'B-Product'

2
B-Product


In [39]:
def tokenize_and_align_labels(examples, label2id):
    tokenized_inputs = tokenizer(
        examples["words"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128
    )

    labels = []
    for i, label_seq in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label_seq[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [40]:
from functools import partial

# Create a partially applied function
align_fn_with_labels = partial(
    tokenize_and_align_labels,
    label2id=tag2id
)

# Apply to dataset
tokenized_dataset = dataset.map(
    align_fn_with_labels,
    batched=True,
    batch_size=32
)

Map:   0%|          | 0/889 [00:00<?, ? examples/s]

In [41]:
print("Verifying first sample:")
sample = tokenized_dataset[0]
for token, label in zip(
    tokenizer.convert_ids_to_tokens(sample["input_ids"])[:20],
    sample["labels"][:20]
):
    print(f"{token:20} | {label:5} | {id2tag.get(label, 'IGNORE')}")

Verifying first sample:
<s>                  |  -100 | IGNORE
▁3                   |     2 | B-Product
pc                   |  -100 | IGNORE
s                    |  -100 | IGNORE
▁Bo                  |     5 | I-Product
ttle                 |  -100 | IGNORE
▁Stop                |     5 | I-Product
per                  |  -100 | IGNORE
▁በ                   |     6 | O
ማንኛውም                |  -100 | IGNORE
▁                    |     6 | O
ጠር                   |  -100 | IGNORE
ሙስ                   |  -100 | IGNORE
▁                    |     6 | O
ጫ                    |  -100 | IGNORE
ፍ                    |  -100 | IGNORE
▁የሚ                  |     6 | O
ገ                    |  -100 | IGNORE
ጠ                    |  -100 | IGNORE
ም                    |  -100 | IGNORE


In [43]:
simple_test = "ዋጋ፦ 500 ብር"  # Should detect PRICE
print(ner_pipeline(simple_test))

[]


In [44]:
from collections import Counter

all_labels = [tag for sublist in df['labels'] for tag in sublist]
label_counts = Counter(all_labels)

print("Label distribution:")
for tag, count in label_counts.items():
    print(f"{tag}: {count} samples ({count/len(all_labels):.1%})")

Label distribution:
B-Product: 28 samples (0.1%)
I-Product: 174 samples (0.4%)
O: 47234 samples (99.2%)
B-PRICE: 28 samples (0.1%)
I-PRICE: 48 samples (0.1%)
B-LOC: 30 samples (0.1%)
I-LOC: 90 samples (0.2%)


In [45]:
print(f"Train samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# Check first/last samples in each split
print("\nFirst train sample words:", train_dataset[0]['words'][:5])
print("First val sample words:", val_dataset[0]['words'][:5])

Train samples: 711
Validation samples: 178

First train sample words: ['Saachi', 'Electric', 'Kettle', 'Borosilicate', 'Glass']
First val sample words: ['Columbia', 'Trailstorm', 'size', '4041424344', 'Price']


In [47]:
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_fn(batch):
    # Separate inputs and labels
    inputs = {
        'input_ids': [torch.tensor(x['input_ids']) for x in batch],
        'attention_mask': [torch.tensor(x['attention_mask']) for x in batch]
    }
    labels = [torch.tensor(x['labels']) for x in batch]

    # Pad sequences
    inputs['input_ids'] = pad_sequence(inputs['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id)
    inputs['attention_mask'] = pad_sequence(inputs['attention_mask'], batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels
    }

# Create DataLoader with custom collate
dataloader = DataLoader(
    train_dataset,
    batch_size=2,
    collate_fn=collate_fn
)

In [48]:
batch = next(iter(dataloader))
print("Padded batch shapes:")
print(f"input_ids: {batch['input_ids'].shape}")
print(f"labels: {batch['labels'].shape}")

# Show padding
print("\nFirst sample (decoded):")
print(tokenizer.decode(batch['input_ids'][0]))
print("Labels:", [id2tag.get(l.item(), 'IGNORE') for l in batch['labels'][0]])

Padded batch shapes:
input_ids: torch.Size([2, 128])
labels: torch.Size([2, 128])

First sample (decoded):
<s> Saachi Electric Kettle Borosilicate Glass Body Overheat protection Automatic switch off 2200w ዋጋ፦ 2700 ብር ውስን ፍሬ ነው ያለው አድራሻ መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ ቢሮ ቁ S05S06 0902660722 0928460606 በTelegram ለማዘዝ ይጠቀሙ zemencallcenter zemenexpressadmin ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን httpstelegrammezemenexpress</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Labels: ['IGNORE', 'O', 'IGNORE', 'O', 'O', 'IGNORE', 'O', 'IGNORE', 'IGNORE', 'O', 'O', 'O', 'IGNORE', 'O', 'O', 'IGNORE', 'O', 'O', 'O', 'IGNORE', 'O', 'IGNORE', 'O', 'O', 'O', 'IGNORE', 'IGNORE', 'O', 'IGNORE', 'O', 'O', 'O', 'O', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'IGNORE', 'O', 'O', 'O', 'IGNORE', 'IGNORE', 'IGNORE', 'O', '

In [49]:
print("Original labels before tokenization:")
print(df.iloc[0]['labels'][:20])  # Should show entity tags like 'B-PRODUCT'

Original labels before tokenization:
['B-Product', 'I-Product', 'I-Product', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRICE', 'I-PRICE', 'I-PRICE']


In [50]:
sample = train_dataset[0]
tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'])
word_ids = tokenizer(sample['words'], is_split_into_words=True).word_ids()

print("\nToken-Label Alignment:")
for i, (token, word_id) in enumerate(zip(tokens, word_ids)):
    label = sample['labels'][i]
    print(f"{i:3d} | {token:20} | {str(word_id):5} | {label:5} | {id2tag.get(label, 'IGNORE')}")


Token-Label Alignment:
  0 | <s>                  | None  |  -100 | IGNORE
  1 | ▁Saa                 | 0     |     6 | O
  2 | chi                  | 0     |  -100 | IGNORE
  3 | ▁Electric            | 1     |     6 | O
  4 | ▁Kett                | 2     |     6 | O
  5 | le                   | 2     |  -100 | IGNORE
  6 | ▁Boro                | 3     |     6 | O
  7 | sili                 | 3     |  -100 | IGNORE
  8 | cate                 | 3     |  -100 | IGNORE
  9 | ▁Glass               | 4     |     6 | O
 10 | ▁Body                | 5     |     6 | O
 11 | ▁Over                | 6     |     6 | O
 12 | heat                 | 6     |  -100 | IGNORE
 13 | ▁protection          | 7     |     6 | O
 14 | ▁Automat             | 8     |     6 | O
 15 | ic                   | 8     |  -100 | IGNORE
 16 | ▁switch              | 9     |     6 | O
 17 | ▁off                 | 10    |     6 | O
 18 | ▁2200                | 11    |     6 | O
 19 | w                    | 11    |  -100 | IGN

In [56]:
!grep -B2 -A2 "B-PRODUCT" "/content/drive/MyDrive/labeled_conll.txt"

In [60]:
!grep -B2 -A2 "B-Product" "/content/drive/MyDrive/labeled_conll.txt"

3pcs B-Product
Bottle I-Product
Stopper I-Product
--

1 O
pairs B-Product
Sneaker I-Product
Crease I-Product
--
httpstelegrammezemenexpress O

Imitation B-Product
Volcano I-Product
Humidifier I-Product
--
httpstelegrammezemenexpress O

Baby B-Product
Carrier I-Product
በፈለጉት O
--
httpstelegrammezemenexpress O

Smart B-Product
Usb I-Product
Ultrasonic I-Product
--
httpstelegrammezemenexpress O

Baby B-Product
Head I-Product
Helmet I-Product
--
httpstelegrammezemenexpress O

Baby B-Product
knee I-Product
socks I-Product
--
httpstelegrammezemenexpress O

5in1 B-Product
Trouser I-Product
Hanger I-Product
--
httpstelegrammezemenexpress O

WaterProof B-Product
Shower I-Product
Cap I-Product
--
httpstelegrammezemenexpress O

Shock B-Product
and I-Product
Noise I-Product
--
httpstelegrammezemenexpress O

Magic B-Product
Silicone I-Product
Dish I-Product
--
httpstelegrammezemenexpress O

13pc B-Product
Portable I-Product
Health I-Product


In [61]:
# Check first 10 entity labels
entity_labels = [t['ner_tag'] for sent in sentences for t in sent if t['ner_tag'] != 'O']
print("Unique entity labels:", set(entity_labels))

Unique entity labels: {'B-Product', 'I-PRICE', 'I-LOC', 'B-LOC', 'I-Product', 'B-PRICE'}


In [63]:
tag2id = {
    'B-LOC': 0,
    'B-PRICE': 1,
    'B-Product': 2,  # Original case
    'I-Product': 5,
    # ...
}

In [64]:
def read_conll_file(file_path):
    """Reads CONLL file and standardizes labels to UPPERCASE"""
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f.readlines()]

    sentences = []
    current_sentence = []  # Initialize for each new sentence

    for line in lines:
        if not line:  # Sentence boundary
            if current_sentence:
                sentences.append(current_sentence)
                current_sentence = []
        else:
            parts = line.split('\t') if '\t' in line else line.split()
            if len(parts) >= 2:
                current_sentence.append({
                    'word': parts[0],
                    'ner_tag': parts[-1].upper()  # Force uppercase
                })

    if current_sentence:  # Add last sentence if exists
        sentences.append(current_sentence)

    return sentences

In [65]:
sentences = read_conll_file("/content/drive/MyDrive/labeled_conll.txt")
print(f"Loaded {len(sentences)} sentences")
print("First sentence labels:", [t['ner_tag'] for t in sentences[0]][:5])

Loaded 889 sentences
First sentence labels: ['B-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'O', 'O']


In [66]:
all_tags = set(t['ner_tag'] for sent in sentences for t in sent)
print("Unique tags:", all_tags)  # Should show UPPERCASE tags

Unique tags: {'I-PRICE', 'I-LOC', 'B-PRODUCT', 'B-LOC', 'I-PRODUCT', 'O', 'B-PRICE'}


In [67]:
tag2id = {
    'B-PRODUCT': 0,
    'I-PRODUCT': 1,
    'B-PRICE': 2,
    'I-PRICE': 3,
    'B-LOC': 4,
    'I-LOC': 5,
    'O': 6
}
id2tag = {v: k for k, v in tag2id.items()}

print("Verification:")
print("B-PRODUCT →", tag2id['B-PRODUCT'])
print("2 →", id2tag[2])

Verification:
B-PRODUCT → 0
2 → B-PRICE


In [68]:
from collections import Counter

label_counts = Counter(t['ner_tag'] for sent in sentences for t in sent)
print("\nLabel counts:")
for tag, count in label_counts.most_common():
    print(f"{tag}: {count} samples")


Label counts:
O: 47234 samples
I-PRODUCT: 174 samples
I-LOC: 90 samples
I-PRICE: 48 samples
B-LOC: 30 samples
B-PRODUCT: 28 samples
B-PRICE: 28 samples


In [70]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Convert to numpy arrays
all_labels = np.array([t['ner_tag'] for sent in sentences for t in sent])
classes = np.array(list(tag2id.keys()))  # Must match label order in tag2id

# Calculate weights
class_weights = compute_class_weight(
    'balanced',
    classes=classes,
    y=all_labels
)

# Verify weights
print("Class weights:")
for cls, weight in zip(classes, class_weights):
    print(f"{cls}: {weight:.2f}")

# Convert to tensor
weights = torch.tensor(class_weights, dtype=torch.float32).to('cuda')

Class weights:
B-PRODUCT: 243.02
I-PRODUCT: 39.11
B-PRICE: 243.02
I-PRICE: 141.76
B-LOC: 226.82
I-LOC: 75.61
O: 0.14


In [71]:
# Duplicate sentences containing rare entities
augmented_sentences = []
for sent in sentences:
    if any(t['ner_tag'] in ['B-PRODUCT', 'B-PRICE'] for t in sent):
        augmented_sentences.extend([sent] * 3)  # 3x copies
    augmented_sentences.append(sent)

In [77]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./amharic-ner-model",
    # Training parameters
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.01,

    # Evaluation & saving
    eval_strategy="steps",        # Changed from evaluation_strategy
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,

    # Model selection
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # Logging
    logging_steps=50,
    report_to="none",

    # Optimization
    optim="adamw_torch",
    gradient_accumulation_steps=1
)

In [78]:
new_counts = Counter(t['ner_tag'] for sent in augmented_sentences for t in sent)
print("Augmented counts:", new_counts.most_common())

Augmented counts: [('O', 50090), ('I-PRODUCT', 696), ('I-LOC', 342), ('I-PRICE', 192), ('B-LOC', 114), ('B-PRODUCT', 112), ('B-PRICE', 112)]


In [81]:
from transformers import Trainer
import torch

# 1. First calculate class weights (unchanged)
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

all_labels = np.array([t['ner_tag'] for sent in sentences for t in sent])
classes = np.array(list(tag2id.keys()))

class_weights = compute_class_weight(
    'balanced',
    classes=classes,
    y=all_labels
)
weights = torch.tensor(class_weights, dtype=torch.float32).to('cuda' if torch.cuda.is_available() else 'cpu')

# 2. Define Data Collator (new requirement)
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

# 3. Updated Weighted Trainer Class
class WeightedTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        # Remove tokenizer from kwargs if present
        kwargs.pop('tokenizer', None)
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(
            logits.view(-1, model.config.num_labels),
            labels.view(-1)
        )

        return (loss, outputs) if return_outputs else loss

# 4. Create trainer with new syntax
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=val_dataset if 'val_dataset' in locals() else None,
    compute_metrics=compute_metrics,
    data_collator=data_collator  # Replaces tokenizer parameter
)

In [83]:
print("Sample dataset item:", tokenized_dataset[0])
# Should show properly formatted dict with:
# {'input_ids': [...], 'attention_mask': [...], 'labels': [...]}

Sample dataset item: {'words': ['3pcs', 'Bottle', 'Stopper', 'በማንኛውም', 'ጠርሙስ', 'ጫፍ', 'የሚገጠም', 'ለዘይት', 'እና', 'መሰል', 'ነገሮች', 'መቀነሻ', 'የሚሆን', 'በአግባቡ', 'እየመጠንን', 'ለመጠቀም', 'ተመራጭ', 'ዋጋ፦', '3ፍሬ', '400', 'ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለው', 'አድራሻ', 'መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ', 'ቢሮ', 'ቁ', 'S05S06', '0902660722', '0928460606', 'በTelegram', 'ለማዘዝ', 'ይጠቀሙ', 'zemencallcenter', 'zemenexpressadmin', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን', 'httpstelegrammezemenexpress'], 'labels': [-100, 2, -100, -100, 5, -100, 5, -100, 6, -100, 6, -100, -100, 6, -100, -100, 6, -100, -100, -100, 6, -100, -100, 6, 6, -100, 6, 6, -100, -100, 6, 6, 6, -100, -100, -100, 6, -100, 6, -100, -100, 1, -100, 4, -100, -100, 4, 4, 6, -100, -100, 6, -100, 6, 6, 6, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 3, 3, 3, -100, -100, -100, 6, -100, -100, -100, 6, -100, -100, -100, 6, -100, -100, 6, -100, -100, 6, -100, 6, -100, -100, -100, 6, -100, -100, -100, 6, -100, -100, 6, 6, -100, -100,

In [86]:
print(trainer.state.log_history)

[]


In [87]:
from transformers import pipeline

# Load your saved model (adjust path as needed)
model_path = "./amharic-ner-model-final"  # or "/content/drive/MyDrive/amharic-ner-model"
ner_pipeline = pipeline(
    "ner",
    model=model_path,
    tokenizer=model_path,
    aggregation_strategy="simple",  # Groups subwords into words
    device=0 if torch.cuda.is_available() else -1  # Use GPU if available
)

Device set to use cuda:0


In [88]:
from transformers import pipeline

# Load your saved model (adjust path as needed)
model_path = "/content/drive/MyDrive/amharic-ner-model"  # or "/content/drive/MyDrive/amharic-ner-model"
ner_pipeline = pipeline(
    "ner",
    model=model_path,
    tokenizer=model_path,
    aggregation_strategy="simple",  # Groups subwords into words
    device=0 if torch.cuda.is_available() else -1  # Use GPU if available
)

Device set to use cuda:0
