In [None]:
#!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=5fa79b274454732e7149e476c536f616c27e23fc6feca6337d345aa1ee77ea8f
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
# !pip install transformers datasets seqeval

In [None]:

from datasets import Dataset, DatasetDict
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from seqeval.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [None]:

def read_conll(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    tokens, labels = [], []
    current_tokens, current_labels = [], []

    for line in lines:
        line = line.strip()
        if line:
            parts = line.split()
            if len(parts) >= 2:  
                token, label = parts[0], parts[-1]  
                current_tokens.append(token)
                current_labels.append(label)
            else:
                print(f"Skipping malformed line: {line}")
        else:
            if current_tokens:
                tokens.append(current_tokens)
                labels.append(current_labels)
                current_tokens, current_labels = [], []

    if current_tokens:
        tokens.append(current_tokens)
        labels.append(current_labels)

    return {"tokens": tokens, "ner_tags": labels}


In [None]:

# Load and prepare data once
print("Loading and preparing dataset...")
data = read_conll("data.txt") # Same data as Data/labeled_telegram_product_price_location
df = pd.DataFrame(data)
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df)
})

# Define models to evaluate
MODELS = [
    "distilbert-base-multilingual-cased",
    "bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-base"
]

# Get label mappings
label_list = sorted(list(set(tag for tags in data["ner_tags"] for tag in tags)))
label2id = {tag: i for i, tag in enumerate(label_list)}
id2label = {i: tag for tag, i in label2id.items()}

# Test text for prediction
test_text = "3pcs silicon brush ዋጋ-550ብር መገናኛ #ዛም_ሞል"


Loading and preparing dataset...


In [None]:

for model_name in MODELS:
    print(f"\n{'='*50}")
    print(f"Starting processing for model: {model_name}")
    print(f"{'='*50}\n")

    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(
                examples["tokens"],
                truncation=True,
                is_split_into_words=True,
                padding="max_length",
                max_length=128
            )

            labels = []
            for i, tags in enumerate(examples["ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        label_ids.append(label2id[tags[word_idx]])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        print("Tokenizing dataset...")
        tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

        # Model initialization
        print("Initializing model...")
        model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(label2id),
            id2label=id2label,
            label2id=label2id
        )

        # Training
        training_args = TrainingArguments(
            output_dir=f"./ner_results_{model_name.replace('/', '-')}",
            eval_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,  
            weight_decay=0.01,
            save_strategy="epoch",
            load_best_model_at_end=True,
            logging_dir=f"./logs_{model_name.replace('/', '-')}",
            report_to="none"
        )

        def compute_metrics(p):
            predictions, labels = p
            predictions = np.argmax(predictions, axis=2)

            true_predictions = [
                [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]
            true_labels = [
                [label_list[l] for l in label if l != -100]
                for label in labels
            ]

            results = classification_report(true_labels, true_predictions, output_dict=True)
            accuracy = accuracy_score(true_labels, true_predictions)

            return {
                "precision": results["macro avg"]["precision"],
                "recall": results["macro avg"]["recall"],
                "f1": results["macro avg"]["f1-score"],
                "accuracy": accuracy,
            }

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            compute_metrics=compute_metrics,
        )

        print(f"Training {model_name}...")
        trainer.train()

        # Evaluation
        print("Running evaluation...")
        predictions = trainer.predict(tokenized_datasets["validation"])
        preds = np.argmax(predictions.predictions, axis=2)

        # Confusion Matrix
        true_labels = [label for batch in predictions.label_ids for label in batch if label != -100]
        flat_preds = [pred for batch, labels in zip(preds, predictions.label_ids)
                     for pred, label in zip(batch, labels) if label != -100]

        cm = confusion_matrix(true_labels, flat_preds, normalize="true")

        blue_colors = plt.cm.Blues(np.linspace(0, 1, 256))
        white_blue_colors = blue_colors[64:]
        cmap = matplotlib.colors.ListedColormap(white_blue_colors)

        fig, ax = plt.subplots(figsize=(12, 10))
        disp = ConfusionMatrixDisplay(
            confusion_matrix=cm,
            display_labels=label_list
        )
        disp.plot(
            cmap=cmap,
            ax=ax,
            values_format=".4f",
            colorbar=False,
        )
        plt.title(f"NER Confusion Matrix - {model_name}", pad=20, fontsize=14, weight='bold')
        plt.tight_layout()
        cbar = plt.colorbar(disp.im_, ax=ax, fraction=0.046, pad=0.04)
        cbar.set_label('Accuracy', rotation=270, labelpad=15)
        plt.savefig(f"confusion_matrix_{model_name.replace('/', '-')}.png")
        plt.close()

        # Save model
        model_dir = f"./{model_name.replace('/', '-')}_ner_model"
        print(f"Saving model to {model_dir}...")
        model.save_pretrained(model_dir)
        tokenizer.save_pretrained(model_dir)

        # Inference
        ner_pipeline = pipeline(
            task="ner",
            model=model_dir,
            tokenizer=model_dir,
            aggregation_strategy="simple",
            device = 0
        )

        results = ner_pipeline(test_text)

        print(f"\nResults for {model_name}:")
        print(f"Input Text: '{test_text}'\n")
        print("Detected Entities:")
        for entity in results:
            print(f"  {entity['word']} -> {entity['entity_group']} (confidence: {entity['score']:.2%})")

        # Save predictions
        with open(f"predictions_{model_name.replace('/', '-')}.txt", "w") as f:
            f.write(f"Model: {model_name}\n")
            f.write(f"Input: {test_text}\n\n")
            f.write("Entities:\n")
            for entity in results:
                f.write(f"{entity['word']}\t{entity['entity_group']}\t{entity['score']:.4f}\n")

    except Exception as e:
        print(f"Error processing {model_name}: {str(e)}")
        continue

print("\nAll model evaluations completed!")


Starting processing for model: distilbert-base-multilingual-cased

Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/2533 [00:00<?, ? examples/s]

Map:   0%|          | 0/633 [00:00<?, ? examples/s]

Initializing model...


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training distilbert-base-multilingual-cased...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.100895,0.838435,0.571155,0.616537,0.968719
2,No log,0.071502,0.88224,0.768091,0.814231,0.980279
3,No log,0.060282,0.889036,0.839212,0.862328,0.984375


Running evaluation...


Saving model to ./distilbert-base-multilingual-cased_ner_model...
Error processing distilbert-base-multilingual-cased: name 'torch' is not defined

Starting processing for model: bert-base-multilingual-cased

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/2533 [00:00<?, ? examples/s]

Map:   0%|          | 0/633 [00:00<?, ? examples/s]

Initializing model...


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training bert-base-multilingual-cased...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.090269,0.795409,0.650425,0.7033,0.973482
2,No log,0.062407,0.89301,0.826114,0.856998,0.983434
3,No log,0.056673,0.874972,0.849828,0.861877,0.984739


Running evaluation...


Saving model to ./bert-base-multilingual-cased_ner_model...
Error processing bert-base-multilingual-cased: name 'torch' is not defined

Starting processing for model: FacebookAI/xlm-roberta-base

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/2533 [00:00<?, ? examples/s]

Map:   0%|          | 0/633 [00:00<?, ? examples/s]

Initializing model...


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training FacebookAI/xlm-roberta-base...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.036817,0.9395,0.945346,0.942306,0.990724
2,No log,0.028308,0.959829,0.961472,0.960614,0.993408
3,No log,0.02558,0.964662,0.966745,0.965669,0.994733


Running evaluation...


Saving model to ./FacebookAI-xlm-roberta-base_ner_model...
Error processing FacebookAI/xlm-roberta-base: name 'torch' is not defined

All model evaluations completed!


In [8]:
!zip -r xlm_roberta.zip FacebookAI-xlm-roberta-base_ner_model

  adding: FacebookAI-xlm-roberta-base_ner_model/ (stored 0%)
  adding: FacebookAI-xlm-roberta-base_ner_model/model.safetensors (deflated 29%)
  adding: FacebookAI-xlm-roberta-base_ner_model/special_tokens_map.json (deflated 52%)
  adding: FacebookAI-xlm-roberta-base_ner_model/sentencepiece.bpe.model (deflated 49%)
  adding: FacebookAI-xlm-roberta-base_ner_model/config.json (deflated 51%)
  adding: FacebookAI-xlm-roberta-base_ner_model/tokenizer.json (deflated 76%)
  adding: FacebookAI-xlm-roberta-base_ner_model/tokenizer_config.json (deflated 76%)


In [9]:
!zip -r bert-base.zip bert-base-multilingual-cased_ner_model

  adding: bert-base-multilingual-cased_ner_model/ (stored 0%)
  adding: bert-base-multilingual-cased_ner_model/model.safetensors (deflated 7%)
  adding: bert-base-multilingual-cased_ner_model/special_tokens_map.json (deflated 42%)
  adding: bert-base-multilingual-cased_ner_model/config.json (deflated 54%)
  adding: bert-base-multilingual-cased_ner_model/tokenizer.json (deflated 67%)
  adding: bert-base-multilingual-cased_ner_model/tokenizer_config.json (deflated 75%)
  adding: bert-base-multilingual-cased_ner_model/vocab.txt (deflated 45%)


In [10]:
!zip -r distilbert-base-base.zip distilbert-base-multilingual-cased_ner_model

  adding: distilbert-base-multilingual-cased_ner_model/ (stored 0%)
  adding: distilbert-base-multilingual-cased_ner_model/model.safetensors (deflated 7%)
  adding: distilbert-base-multilingual-cased_ner_model/special_tokens_map.json (deflated 42%)
  adding: distilbert-base-multilingual-cased_ner_model/config.json (deflated 47%)
  adding: distilbert-base-multilingual-cased_ner_model/tokenizer.json (deflated 67%)
  adding: distilbert-base-multilingual-cased_ner_model/tokenizer_config.json (deflated 75%)
  adding: distilbert-base-multilingual-cased_ner_model/vocab.txt (deflated 45%)


In [None]:
!cp xlm_roberta.zip bert-base.zip distilbert-base-base.zip drive/MyDrive
# distilbert-base-base.zip : https://drive.google.com/file/d/1Q-WFBMI3dwS7xJvsrmKriamLZduHhS-J/view?usp=sharing
# xlm_roberta.zip : https://drive.google.com/file/d/1ewXoghkbPg8ktxwniMeQlcMF-dQfkPmV/view?usp=sharing
# bert-base.zip : https://drive.google.com/file/d/1VKnWCCfhp8Q5kOdsA0IxMAwRAuXXh0W5/view?usp=sharing

# Here are the attached drive links for the folder for further inspection