In [25]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
!pip install transformers datasets evaluate seqeval onnx onnxruntime optimum

Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting optimum
  Downloading optimum-1.24.0-py3-none-any.whl.metadata (21 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11->optimum)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11->optimum)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11->optimum)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 

In [34]:
import os
import numpy as np
import torch
import evaluate
import itertools
import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments
)

# For manual ONNX export
from optimum.exporters.tasks import TasksManager
from optimum.exporters.onnx import export as onnx_export
from pathlib import Path
import shutil

def train_and_evaluate_ner(
    model_checkpoint: str,
    train_file: str,
    val_file: str,
    entity_labels: list,
    num_epochs: int,
    learning_rate: float,
    batch_size: int,
    output_dir: str,
    return_trainer: bool = False
):
    """
    Train a token classification model on a dataset and return final metrics,
    along with the Trainer (if return_trainer=True).
    """

    # Prepare labels
    all_labels = ["O"] + entity_labels
    label2id = {label: i for i, label in enumerate(all_labels)}
    id2label = {i: label for label, i in label2id.items()}

    # Load raw dataset
    data_files = {"train": train_file, "validation": val_file}
    raw_dataset = load_dataset("json", data_files=data_files)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    # Alignment function
    def align_spans_to_tokens(examples):
        texts = examples["text"]
        batch_tokenized = tokenizer(
            texts,
            truncation=True,
            return_offsets_mapping=True
        )

        new_labels = []
        for i, offsets in enumerate(batch_tokenized["offset_mapping"]):
            spans = examples["spans"][i]
            span_entities = [(s["start"], s["end"], s["label"]) for s in spans]

            # Initialize all tokens to -100
            label_ids = [-100] * len(offsets)

            for span_start, span_end, span_label in span_entities:
                for idx, (token_start, token_end) in enumerate(offsets):
                    # If offset_mapping is (0, 0), it's likely a special token
                    if token_start == 0 and token_end == 0:
                        continue
                    if token_end > span_start and token_start < span_end:
                        label_ids[idx] = label2id[span_label]

            # Convert real tokens still -100 to "O"
            for idx, (token_start, token_end) in enumerate(offsets):
                if (token_start != 0 or token_end != 0) and label_ids[idx] == -100:
                    label_ids[idx] = label2id["O"]

            new_labels.append(label_ids)

        batch_tokenized["labels"] = new_labels
        batch_tokenized.pop("offset_mapping")
        return batch_tokenized

    # Encode dataset
    encoded_dataset = raw_dataset.map(
        align_spans_to_tokens,
        batched=True,
        remove_columns=raw_dataset["train"].column_names
    )

    train_dataset = encoded_dataset["train"]
    eval_dataset = encoded_dataset["validation"]

    # Load model
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(all_labels),
        id2label=id2label,
        label2id=label2id
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Metric
    metric = evaluate.load("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        true_labels = []
        true_predictions = []
        for label_ids, pred_ids in zip(labels, predictions):
            filtered_labels = []
            filtered_preds = []
            for l, p_ in zip(label_ids, pred_ids):
                if l == -100:
                    continue
                filtered_labels.append(id2label[l])
                filtered_preds.append(id2label[p_])
            true_labels.append(filtered_labels)
            true_predictions.append(filtered_preds)

        results = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"]
        }

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",        # We'll save model each epoch, but only keep the best
        save_total_limit=1,          # Keep only the best checkpoint
        logging_strategy="steps",
        logging_steps=10,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        # disable wandb if desired
        # report_to="none",
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train
    trainer.train()

    # -------------------------------
    # SAVE the final (best) model +
    # SAVE the tokenizer
    # -------------------------------
    trainer.save_model(output_dir)         # saves the best model
    tokenizer.save_pretrained(output_dir)  # ensure tokenizer is saved

    # Remove any leftover "checkpoint-XYZ" subfolders if they exist
    for subdir in os.listdir(output_dir):
        if subdir.startswith("checkpoint-") and os.path.isdir(os.path.join(output_dir, subdir)):
            shutil.rmtree(os.path.join(output_dir, subdir))

    # Evaluate final model
    metrics = trainer.evaluate()

    if return_trainer:
        return metrics, trainer
    else:
        return metrics




In [37]:
###########################
#    Hyperparameters      #
###########################
# model, learning rate, batch size, epochs
model_combinations = [
    ("prajjwal1/bert-tiny", 0.0001, 16, 35),
    ("prajjwal1/bert-tiny", 0.0001, 32, 35),
    ("prajjwal1/bert-tiny", 0.00005, 32, 35),
    ("microsoft/MiniLM-L12-H384-uncased", 0.0001, 16, 10),
    ("microsoft/MiniLM-L12-H384-uncased", 0.0001, 24, 10),
    ("google/mobilebert-uncased", 0.0001, 16, 8),
    ("google/mobilebert-uncased", 0.0001, 24, 8),
    ("google/mobilebert-uncased", 0.00005, 16, 8),
    ("google/mobilebert-uncased", 0.00005, 24, 8),
    ("distilbert-base-uncased", 0.0001, 16, 8),
    ("distilbert-base-uncased", 0.0001, 24, 8)
]

entity_labels = ["HOSP", "PATIENT", "STAFF", "AGE", "BIRTHDATE", "DATE",
                 "PHONE", "ID", "EMAIL", "LOC", "ORG"]
train_file = "train.json"
val_file = "validation.json"

# Make sure you set the path to a folder in your Google Drive
base_output_dir = "/content/drive/MyDrive/my_ner_experiments"

results_list = []
trainers_dict = {}

# Create the base directory if it doesn't exist
os.makedirs(base_output_dir, exist_ok=True)

##########################################################
#  Loop over each combination of checkpoint, LR, BS      #
##########################################################
for checkpoint, lr, bs, num_epochs in model_combinations:
    # Build a unique subfolder for this run
    folder_name = f"{checkpoint.replace('/', '_')}_lr{lr}_bs{bs}"
    out_dir = os.path.join(base_output_dir, folder_name)
    os.makedirs(out_dir, exist_ok=True)

    print(f"Running experiment: {checkpoint}, lr={lr}, bs={bs}")
    print(f"Saving outputs to: {out_dir}")

    # Train and get metrics + trainer
    metrics, trainer = train_and_evaluate_ner(
        model_checkpoint=checkpoint,
        train_file=train_file,
        val_file=val_file,
        entity_labels=entity_labels,
        num_epochs=num_epochs,
        learning_rate=lr,
        batch_size=bs,
        output_dir=out_dir,
        return_trainer=True
    )

    # Record final metrics
    row = {
        "model_checkpoint": checkpoint,
        "learning_rate": lr,
        "batch_size": bs,
        "precision": metrics["eval_precision"],
        "recall": metrics["eval_recall"],
        "f1": metrics["eval_f1"],
        "accuracy": metrics["eval_accuracy"],
        "output_dir": out_dir,
    }
    results_list.append(row)

    # Store the trainer for analysis
    trainers_dict[(checkpoint, lr, bs)] = trainer

    ############################################
    #   PLOT TRAINING vs VALIDATION CURVES     #
    ############################################
    log_history = trainer.state.log_history

    # 1) Training vs Validation Loss
    train_loss_vals = []
    train_steps = []
    eval_loss_vals = []
    eval_steps = []

    for entry in log_history:
        if "loss" in entry and "step" in entry:
            train_loss_vals.append(entry["loss"])
            train_steps.append(entry["step"])
        if "eval_loss" in entry and "step" in entry:
            eval_loss_vals.append(entry["eval_loss"])
            eval_steps.append(entry["step"])

    plt.figure(figsize=(8, 6))
    plt.plot(train_steps, train_loss_vals, label="Training Loss")
    plt.plot(eval_steps, eval_loss_vals, label="Validation Loss", marker='o')
    plt.xlabel("Global Step")
    plt.ylabel("Loss")
    plt.title("Training and Validation Loss")
    plt.legend()

    loss_plot_path = os.path.join(out_dir, "loss_plot.png")
    plt.savefig(loss_plot_path)
    plt.close()

    # 2) F1 Score Over Time
    eval_f1_vals = []
    eval_f1_steps = []

    for entry in log_history:
        # 'eval_f1' typically appears in the logs as "eval_f1"
        if "eval_f1" in entry and "step" in entry:
            eval_f1_vals.append(entry["eval_f1"])
            eval_f1_steps.append(entry["step"])

    if eval_f1_vals:
        plt.figure(figsize=(8, 6))
        plt.plot(eval_f1_steps, eval_f1_vals, label="Validation F1", marker='o', color='green')
        plt.xlabel("Global Step")
        plt.ylabel("F1 Score")
        plt.title("Validation F1 Over Time")
        plt.legend()

        f1_plot_path = os.path.join(out_dir, "f1_plot.png")
        plt.savefig(f1_plot_path)
        plt.close()

    ###################################################
    #   CONVERT THE MODEL TO ONNX AND SAVE model.onnx #
    ###################################################
    print(f"Converting {checkpoint} to ONNX...")

    # 1) Reload your final (best) model from the local directory
    model_for_export = AutoModelForTokenClassification.from_pretrained(out_dir)

    # 2) Retrieve an ONNX export config
    onnx_config_constructor = TasksManager.get_exporter_config_constructor(
        model=model_for_export,
        task="token-classification",  # your task
        exporter="onnx"
    )
    onnx_config = onnx_config_constructor(model_for_export.config)

    # 3) Export to ONNX
    onnx_path = Path(os.path.join(out_dir, "model.onnx"))
    model_for_export.cpu()  # easier to export on CPU

    onnx_export(
        model=model_for_export,
        config=onnx_config,
        opset=14,          # adjust if needed
        output=onnx_path,
        device="cpu"
    )

    print(f"ONNX model saved at: {onnx_path}")
    print("-----------------------------------------------------------")


Running experiment: prajjwal1/bert-tiny, lr=0.0001, bs=16
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/prajjwal1_bert-tiny_lr0.0001_bs16


Some weights of BertForTokenClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.3462,1.142865,0.0,0.0,0.0,0.751695
2,0.8818,0.701341,0.465046,0.352535,0.401048,0.862429
3,0.6102,0.459887,0.661504,0.68894,0.674944,0.931921
4,0.4244,0.335806,0.719828,0.769585,0.743875,0.946893
5,0.3351,0.260566,0.77342,0.817972,0.795073,0.956497
6,0.2873,0.211541,0.807522,0.841014,0.823928,0.961582
7,0.2251,0.17818,0.808696,0.857143,0.832215,0.964689
8,0.2215,0.160739,0.830022,0.866359,0.847802,0.965537
9,0.1749,0.144376,0.858407,0.894009,0.875847,0.972034
10,0.1399,0.133873,0.856512,0.894009,0.874859,0.971751


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.


Converting prajjwal1/bert-tiny to ONNX...
ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/prajjwal1_bert-tiny_lr0.0001_bs16/model.onnx
-----------------------------------------------------------
Running experiment: prajjwal1/bert-tiny, lr=0.0001, bs=32
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/prajjwal1_bert-tiny_lr0.0001_bs32


Some weights of BertForTokenClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.6728,1.359963,0.0,0.0,0.0,0.751695
2,1.3565,1.169928,0.0,0.0,0.0,0.751695
3,1.1886,0.919774,0.133333,0.009217,0.017241,0.762994
4,0.8366,0.691117,0.381944,0.253456,0.304709,0.864124
5,0.7732,0.532429,0.513369,0.442396,0.475248,0.90226
6,0.576,0.430309,0.646778,0.624424,0.635404,0.927401
7,0.4717,0.348349,0.695067,0.714286,0.704545,0.94548
8,0.4173,0.302097,0.736842,0.774194,0.755056,0.951412
9,0.3329,0.254968,0.787746,0.829493,0.808081,0.960169
10,0.3002,0.226206,0.799127,0.843318,0.820628,0.962429


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.


Converting prajjwal1/bert-tiny to ONNX...
ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/prajjwal1_bert-tiny_lr0.0001_bs32/model.onnx
-----------------------------------------------------------
Running experiment: prajjwal1/bert-tiny, lr=5e-05, bs=32
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/prajjwal1_bert-tiny_lr5e-05_bs32


Some weights of BertForTokenClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.9687,1.661511,0.0,0.0,0.0,0.751695
2,1.5771,1.351493,0.0,0.0,0.0,0.751695
3,1.3885,1.234053,0.0,0.0,0.0,0.751695
4,1.2183,1.140849,0.0,0.0,0.0,0.751695
5,1.2396,0.999575,0.0,0.0,0.0,0.751977
6,1.011,0.849641,0.168317,0.039171,0.063551,0.79661
7,0.8883,0.72768,0.383212,0.241935,0.29661,0.861582
8,0.8084,0.654618,0.39403,0.304147,0.343303,0.879661
9,0.6886,0.577175,0.516484,0.43318,0.471178,0.899435
10,0.6552,0.517572,0.558603,0.516129,0.536527,0.911864


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.


Converting prajjwal1/bert-tiny to ONNX...
ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/prajjwal1_bert-tiny_lr5e-05_bs32/model.onnx
-----------------------------------------------------------
Running experiment: microsoft/MiniLM-L12-H384-uncased, lr=0.0001, bs=16
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/microsoft_MiniLM-L12-H384-uncased_lr0.0001_bs16


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/735 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4279,0.244388,0.922374,0.930876,0.926606,0.981073
2,0.1888,0.1294,0.967059,0.947005,0.956927,0.990113
3,0.1388,0.095552,0.979215,0.976959,0.978085,0.993503
4,0.0786,0.088208,0.970183,0.974654,0.972414,0.99209
5,0.0645,0.073156,0.979215,0.976959,0.978085,0.992938
6,0.0529,0.071492,0.976959,0.976959,0.976959,0.992938
7,0.053,0.05835,0.974654,0.974654,0.974654,0.993503
8,0.049,0.062267,0.974713,0.976959,0.975834,0.992938
9,0.0453,0.063395,0.976959,0.976959,0.976959,0.992938
10,0.0404,0.064127,0.979215,0.976959,0.978085,0.992655


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.


Converting microsoft/MiniLM-L12-H384-uncased to ONNX...
ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/microsoft_MiniLM-L12-H384-uncased_lr0.0001_bs16/model.onnx
-----------------------------------------------------------
Running experiment: microsoft/MiniLM-L12-H384-uncased, lr=0.0001, bs=24
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/microsoft_MiniLM-L12-H384-uncased_lr0.0001_bs24


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5224,0.345526,0.821429,0.794931,0.807963,0.965537
2,0.2452,0.179055,0.935484,0.935484,0.935484,0.981356
3,0.1594,0.117268,0.972477,0.976959,0.974713,0.99322
4,0.1043,0.097081,0.972477,0.976959,0.974713,0.992655
5,0.0927,0.084245,0.974771,0.979263,0.977011,0.99322
6,0.0753,0.078112,0.974654,0.974654,0.974654,0.992373
7,0.0628,0.074442,0.974596,0.97235,0.973472,0.992938
8,0.0657,0.06864,0.972665,0.983871,0.978236,0.994068
9,0.0509,0.066958,0.974828,0.981567,0.978186,0.993785
10,0.0532,0.065456,0.976959,0.976959,0.976959,0.992938


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.


Converting microsoft/MiniLM-L12-H384-uncased to ONNX...
ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/microsoft_MiniLM-L12-H384-uncased_lr0.0001_bs24/model.onnx
-----------------------------------------------------------
Running experiment: google/mobilebert-uncased, lr=0.0001, bs=16
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/google_mobilebert-uncased_lr0.0001_bs16


config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/735 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/147M [00:00<?, ?B/s]

Some weights of MobileBertForTokenClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1529,0.052046,0.940909,0.953917,0.947368,0.986723
2,0.0724,0.043793,0.947964,0.965438,0.956621,0.991243
3,0.0224,0.035436,0.959184,0.974654,0.966857,0.991525
4,0.0088,0.04015,0.974713,0.976959,0.975834,0.99322
5,0.007,0.035157,0.965675,0.97235,0.969001,0.99322
6,0.0029,0.046807,0.974713,0.976959,0.975834,0.992938
7,0.002,0.044875,0.974713,0.976959,0.975834,0.993785
8,0.0032,0.043962,0.974713,0.976959,0.975834,0.993785






Converting google/mobilebert-uncased to ONNX...


Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.
  torch.tensor(1000),
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/google_mobilebert-uncased_lr0.0001_bs16/model.onnx
-----------------------------------------------------------
Running experiment: google/mobilebert-uncased, lr=0.0001, bs=24
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/google_mobilebert-uncased_lr0.0001_bs24


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of MobileBertForTokenClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1827,0.073426,0.850549,0.891705,0.870641,0.976554
2,0.0869,0.042833,0.954338,0.963134,0.958716,0.990113
3,0.0302,0.03662,0.972414,0.974654,0.973533,0.99322
4,0.0074,0.048143,0.956522,0.963134,0.959816,0.991243
5,0.0081,0.037275,0.972286,0.970046,0.971165,0.993785
6,0.0135,0.04291,0.974596,0.97235,0.973472,0.99209
7,0.0048,0.043234,0.972286,0.970046,0.971165,0.992373
8,0.0052,0.042063,0.970046,0.970046,0.970046,0.992655






Converting google/mobilebert-uncased to ONNX...


Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.
  torch.tensor(1000),
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/google_mobilebert-uncased_lr0.0001_bs24/model.onnx
-----------------------------------------------------------
Running experiment: google/mobilebert-uncased, lr=5e-05, bs=16
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/google_mobilebert-uncased_lr5e-05_bs16


Some weights of MobileBertForTokenClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2596,0.107153,0.811364,0.822581,0.816934,0.967232
2,0.0967,0.055352,0.930337,0.953917,0.94198,0.988983
3,0.0397,0.048697,0.976905,0.974654,0.975779,0.99209
4,0.014,0.046486,0.979167,0.974654,0.976905,0.992938
5,0.01,0.042998,0.974596,0.97235,0.973472,0.99322
6,0.0062,0.049858,0.974596,0.97235,0.973472,0.992373
7,0.005,0.047025,0.976852,0.97235,0.974596,0.992655
8,0.0107,0.048436,0.974596,0.97235,0.973472,0.992373






Converting google/mobilebert-uncased to ONNX...


Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.
  torch.tensor(1000),
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/google_mobilebert-uncased_lr5e-05_bs16/model.onnx
-----------------------------------------------------------
Running experiment: google/mobilebert-uncased, lr=5e-05, bs=24
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/google_mobilebert-uncased_lr5e-05_bs24


Some weights of MobileBertForTokenClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4269,0.220245,0.688525,0.677419,0.682927,0.943785
2,0.146,0.061411,0.925,0.937788,0.93135,0.986441
3,0.052,4.345567,0.945578,0.960829,0.953143,0.989831
4,0.0199,0.041879,0.963218,0.965438,0.964327,0.991525
5,0.0177,0.040509,0.963218,0.965438,0.964327,0.991808
6,0.0145,0.044866,0.967816,0.970046,0.96893,0.991808
7,1.3299,0.041017,0.976852,0.97235,0.974596,0.99209
8,0.016,0.040106,0.972286,0.970046,0.971165,0.99209






Converting google/mobilebert-uncased to ONNX...


Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.
  torch.tensor(1000),
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/google_mobilebert-uncased_lr5e-05_bs24/model.onnx
-----------------------------------------------------------
Running experiment: distilbert-base-uncased, lr=0.0001, bs=16
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/distilbert-base-uncased_lr0.0001_bs16


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/735 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.131,0.053885,0.94508,0.951613,0.948335,0.986723
2,0.0852,0.035875,0.961187,0.970046,0.965596,0.991525
3,0.0312,0.031082,0.959276,0.976959,0.968037,0.992938
4,0.0172,0.03752,0.97235,0.97235,0.97235,0.992655
5,0.0085,0.028297,0.965909,0.979263,0.97254,0.994068
6,0.0031,0.034042,0.970387,0.981567,0.975945,0.993785
7,0.0031,0.03443,0.970252,0.976959,0.973594,0.99322
8,0.0034,0.034134,0.968109,0.979263,0.973654,0.993785






Converting distilbert-base-uncased to ONNX...


Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.


ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/distilbert-base-uncased_lr0.0001_bs16/model.onnx
-----------------------------------------------------------
Running experiment: distilbert-base-uncased, lr=0.0001, bs=24
Saving outputs to: /content/drive/MyDrive/my_ner_experiments/distilbert-base-uncased_lr0.0001_bs24


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1721,0.06881,0.93287,0.928571,0.930716,0.984746
2,0.0931,0.04684,0.963218,0.965438,0.964327,0.990395
3,0.0343,0.035299,0.965753,0.974654,0.970183,0.989831
4,0.0186,0.036277,0.967963,0.974654,0.971297,0.991808
5,0.0109,0.041274,0.974654,0.974654,0.974654,0.99209
6,0.0089,0.04129,0.974654,0.974654,0.974654,0.99209
7,0.0048,0.04239,0.970183,0.974654,0.972414,0.992655
8,0.0057,0.0419,0.970183,0.974654,0.972414,0.992655




Passing the argument `library_name` to `get_supported_tasks_for_model_type` is required, but got library_name=None. Defaulting to `transformers`. An error will be raised in a future version of Optimum if `library_name` is not provided.


Converting distilbert-base-uncased to ONNX...
ONNX model saved at: /content/drive/MyDrive/my_ner_experiments/distilbert-base-uncased_lr0.0001_bs24/model.onnx
-----------------------------------------------------------

=== All Runs Results ===
                     model_checkpoint  learning_rate  batch_size  precision  \
4   microsoft/MiniLM-L12-H384-uncased        0.00010          24   0.972665   
3   microsoft/MiniLM-L12-H384-uncased        0.00010          16   0.979215   
7           google/mobilebert-uncased        0.00005          16   0.979167   
9             distilbert-base-uncased        0.00010          16   0.970387   
5           google/mobilebert-uncased        0.00010          16   0.974713   
10            distilbert-base-uncased        0.00010          24   0.974654   
8           google/mobilebert-uncased        0.00005          24   0.976852   
6           google/mobilebert-uncased        0.00010          24   0.972414   
0                 prajjwal1/bert-tiny        

  .apply(lambda group: group.nlargest(1, "f1"))


In [39]:

###################################################
#  Create a DataFrame of results and display it   #
###################################################
df_results = pd.DataFrame(results_list)
df_results = df_results.sort_values("f1", ascending=False)

print("\n=== All Runs Results ===")
print(df_results)

# Best run per model
best_per_model = (
    df_results
    .groupby("model_checkpoint", group_keys=False)
    .apply(lambda group: group.nlargest(1, "f1"))
    .reset_index(drop=True)
)

print("\n=== Best Run from Each Model ===")
print(best_per_model)

# Among these best-of-each-model, pick overall best
best_overall = best_per_model.nlargest(1, "f1").iloc[0]
print("\n=== Best Overall Across All Models ===")
print(best_overall)

best_model_path = best_overall["output_dir"]
print("Best model is at:", best_model_path)

# Also, save the final results to a CSV in the base folder
results_csv_path = os.path.join(base_output_dir, "all_experiments_results.csv")
df_results.to_csv(results_csv_path, index=False)
print(f"All experiment results saved to: {results_csv_path}")


=== All Runs Results ===
                     model_checkpoint  learning_rate  batch_size  precision  \
4   microsoft/MiniLM-L12-H384-uncased        0.00010          24   0.972665   
3   microsoft/MiniLM-L12-H384-uncased        0.00010          16   0.979215   
7           google/mobilebert-uncased        0.00005          16   0.979167   
9             distilbert-base-uncased        0.00010          16   0.970387   
5           google/mobilebert-uncased        0.00010          16   0.974713   
10            distilbert-base-uncased        0.00010          24   0.974654   
8           google/mobilebert-uncased        0.00005          24   0.976852   
6           google/mobilebert-uncased        0.00010          24   0.972414   
0                 prajjwal1/bert-tiny        0.00010          16   0.909707   
1                 prajjwal1/bert-tiny        0.00010          32   0.893333   
2                 prajjwal1/bert-tiny        0.00005          32   0.812775   

      recall        f1  a

  .apply(lambda group: group.nlargest(1, "f1"))


In [44]:
import os

print("\n=== ONNX Model Sizes ===")
for idx, row in df_results.iterrows():
    model_path = row["output_dir"]
    onnx_file = os.path.join(model_path, "model.onnx")

    if os.path.exists(onnx_file):
        # Calculate file size in MB
        size_bytes = os.path.getsize(onnx_file)
        size_mb = size_bytes / (1024 * 1024)

        print(
            f"Model: {row['model_checkpoint']}  |  "
            f"LR={row['learning_rate']}  |  "
            f"BS={row['batch_size']}  |  "
            f"ONNX Size: {size_mb:.2f} MB"
        )
    else:
        print(f"No ONNX file found for {row['model_checkpoint']} (output_dir={model_path}).")



=== ONNX Model Sizes ===
Model: microsoft/MiniLM-L12-H384-uncased  |  LR=0.0001  |  BS=24  |  ONNX Size: 127.00 MB
Model: microsoft/MiniLM-L12-H384-uncased  |  LR=0.0001  |  BS=16  |  ONNX Size: 127.00 MB
Model: google/mobilebert-uncased  |  LR=5e-05  |  BS=16  |  ONNX Size: 94.48 MB
Model: distilbert-base-uncased  |  LR=0.0001  |  BS=16  |  ONNX Size: 253.32 MB
Model: google/mobilebert-uncased  |  LR=0.0001  |  BS=16  |  ONNX Size: 94.48 MB
Model: distilbert-base-uncased  |  LR=0.0001  |  BS=24  |  ONNX Size: 253.32 MB
Model: google/mobilebert-uncased  |  LR=5e-05  |  BS=24  |  ONNX Size: 94.48 MB
Model: google/mobilebert-uncased  |  LR=0.0001  |  BS=24  |  ONNX Size: 94.48 MB
Model: prajjwal1/bert-tiny  |  LR=0.0001  |  BS=16  |  ONNX Size: 16.73 MB
Model: prajjwal1/bert-tiny  |  LR=0.0001  |  BS=32  |  ONNX Size: 16.73 MB
Model: prajjwal1/bert-tiny  |  LR=5e-05  |  BS=32  |  ONNX Size: 16.73 MB
