In [2]:
!pip install transformers datasets seqeval --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
import pandas as pd
import time
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    TrainingArguments, Trainer
)
from seqeval.metrics import classification_report


In [4]:
def load_conll_to_dataset(filepath):
    tokens_list, labels_list = [], []
    tokens, labels = [], []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == "":
                if tokens:
                    tokens_list.append(tokens)
                    labels_list.append(labels)
                    tokens, labels = [], []
            else:
                token, label = line.split('\t')
                tokens.append(token)
                labels.append(label)
    df = pd.DataFrame({'tokens': tokens_list, 'ner_tags': labels_list})
    return Dataset.from_pandas(df)

dataset = load_conll_to_dataset('/content/labeled_subset.conll')


In [5]:
label_list = sorted(list(set(label for doc in dataset['ner_tags'] for label in doc)))
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


In [6]:
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']


In [7]:
def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length"
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(label2id[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [8]:
model_names = {
    "XLM-Roberta": "xlm-roberta-base",
    "DistilBERT": "distilbert-base-multilingual-cased",
    "mBERT": "bert-base-multilingual-cased"
}


In [11]:
results = []

for model_alias, model_checkpoint in model_names.items():
    print(f"\n=== Training and Evaluating: {model_alias} ===")

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    train_tokenized = train_dataset.map(
        lambda x: tokenize_and_align_labels(x, tokenizer), batched=True, num_proc=2
    )
    val_tokenized = val_dataset.map(
        lambda x: tokenize_and_align_labels(x, tokenizer), batched=True, num_proc=2
    )

    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )
    training_args = TrainingArguments(
    output_dir=f"./results/{model_alias}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir=f"./logs/{model_alias}"
    # Remove unsupported params like evaluation_strategy, fp16, etc.
       )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        tokenizer=tokenizer
        )

    start_time = time.time()
    trainer.train()
    train_time = time.time() - start_time

    # Predictions
    preds, labels, _ = trainer.predict(val_tokenized)
    pred_labels = [
        [id2label[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds.argmax(-1), labels)
    ]
    true_labels = [
        [id2label[l] for l in label if l != -100]
        for label in labels
    ]
    try:
        report = classification_report(true_labels, pred_labels, output_dict=True)
        f1 = round(report['overall']['f1-score'], 4)
        precision = round(report['overall']['precision'], 4)
        recall = round(report['overall']['recall'], 4)
    except KeyError:
        print(f"⚠️  Failed to compute 'overall' scores for {model_alias}. Skipping...")
        f1 = precision = recall = 0.0

    results.append({
        "Model": model_alias,
        "F1-score": f1,
        "Precision": precision,
        "Recall": recall,
        "Train Time (s)": round(train_time, 2),
        "Model Size (MB)": round(model.num_parameters() * 4 / 1024**2, 2)
    })

    trainer.save_model(f"./final_models/{model_alias}")
    tokenizer.save_pretrained(f"./final_models/{model_alias}")



=== Training and Evaluating: XLM-Roberta ===


Map (num_proc=2):   0%|          | 0/40 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


⚠️  Failed to compute 'overall' scores for XLM-Roberta. Skipping...

=== Training and Evaluating: DistilBERT ===


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map (num_proc=2):   0%|          | 0/40 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


⚠️  Failed to compute 'overall' scores for DistilBERT. Skipping...

=== Training and Evaluating: mBERT ===


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map (num_proc=2):   0%|          | 0/40 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


⚠️  Failed to compute 'overall' scores for mBERT. Skipping...


In [12]:
results_df = pd.DataFrame(results)
print("\n=== Model Comparison Results ===")
print(results_df.sort_values(by="F1-score", ascending=False))



=== Model Comparison Results ===
         Model  F1-score  Precision  Recall  Train Time (s)  Model Size (MB)
0  XLM-Roberta       0.0        0.0     0.0           43.46          1058.41
1   DistilBERT       0.0        0.0     0.0           89.26           513.98
2        mBERT       0.0        0.0     0.0           46.24           676.22


In [13]:
print(f"\n🔍 DEBUG: {model_alias}")
print(f"Sample true_labels[0]: {true_labels[0]}")
print(f"Sample pred_labels[0]: {pred_labels[0]}")
print(f"Number of samples: {len(true_labels)}")
print(f"Number of predictions: {len(pred_labels)}")



🔍 DEBUG: mBERT
Sample true_labels[0]: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sample pred_labels[0]: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Number of samples: 10
Number of predictions: 10
