In [1]:
import time
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# -----------------------
# Benchmark configuration
# -----------------------
TASK = "stsb"  # GLUE task to load (STS-B)
BATCH_SIZE = 32  # rows per prebuilt batch
MAX_LENGTH = 128  # every sequence is padded/truncated to this many tokens
WARMUP_STEPS = 10  # untimed batches run before the timed pass

# CPU threading knobs. Pick a value (e.g. 1, 4 or 8) and hold it constant
# across runs so the timings stay comparable.
NUM_THREADS = 4
NUM_INTEROP_THREADS = 1

# Display name -> checkpoint id handed to `from_pretrained`.
MODELS = dict(
    [
        ("BERT-base", "bert-base-uncased"),
        ("DistilBERT", "distilbert-base-uncased"),
    ]
)

In [3]:
def count_params_m(model):
    """Return the total number of parameter elements in ``model``, in millions."""
    total_elements = 0
    for param in model.parameters():
        total_elements += param.numel()
    return total_elements / 1e6

In [4]:
def make_batches(tok_ds, batch_size):
    """Slice a tokenized dataset into a list of ready-made tensor batches.

    All batches are materialized up front so that no batching work happens
    inside the timed loop.

    Args:
        tok_ds: sliceable dataset; ``tok_ds[a:b]`` must return a mapping with
            ``"input_ids"`` and ``"attention_mask"`` (and optionally
            ``"token_type_ids"``) as nested lists.
        batch_size: number of rows per batch; the last batch may be smaller.

    Returns:
        A list of dicts mapping each present key to a ``torch.long`` tensor.
    """
    def _as_long(values):
        return torch.tensor(values, dtype=torch.long)

    prebuilt = []
    for start in range(0, len(tok_ds), batch_size):
        rows = tok_ds[start:start + batch_size]
        tensors = {key: _as_long(rows[key]) for key in ("input_ids", "attention_mask")}
        # token_type_ids are only produced by some tokenizers (BERT yes, DistilBERT no).
        if "token_type_ids" in rows:
            tensors["token_type_ids"] = _as_long(rows["token_type_ids"])
        prebuilt.append(tensors)
    return prebuilt

In [5]:
def benchmark_full_pass(model, batches):
    """Time one complete forward sweep over ``batches``.

    The model is switched to eval mode, run untimed on the first
    ``WARMUP_STEPS`` batches, and then timed over every batch. Both phases
    execute under ``torch.inference_mode()``.

    Args:
        model: callable module invoked as ``model(**batch)``.
        batches: sequence of keyword-argument dicts, one per forward call.

    Returns:
        Elapsed wall-clock seconds of the timed sweep, measured with
        ``time.perf_counter``.
    """
    model.eval()

    # Untimed warmup passes.
    warmup_batches = batches[:WARMUP_STEPS]
    with torch.inference_mode():
        for batch in warmup_batches:
            model(**batch)

    # Timed sweep: only the forward calls sit between the two clock reads.
    t0 = time.perf_counter()
    with torch.inference_mode():
        for batch in batches:
            model(**batch)
    elapsed = time.perf_counter() - t0
    return elapsed

In [6]:
def _configure_threads():
    """Apply the configured intra-op / inter-op thread counts to PyTorch."""
    if NUM_THREADS is not None:
        torch.set_num_threads(NUM_THREADS)
    if NUM_INTEROP_THREADS is not None:
        try:
            torch.set_num_interop_threads(NUM_INTEROP_THREADS)
        except RuntimeError as err:
            # PyTorch only accepts this call once, and only before any inter-op
            # parallel work has started. Re-running this cell in a live kernel
            # would otherwise abort the whole benchmark, so report and continue.
            print(
                f"Warning: could not set inter-op threads to {NUM_INTEROP_THREADS} "
                f"(currently {torch.get_num_interop_threads()}): {err}"
            )


def _load_sentence_pairs():
    """Load the GLUE validation split for TASK, keeping only text_a / text_b columns."""
    ds = load_dataset("glue", TASK, split="validation")

    def to_text(example):
        return {"text_a": example["sentence1"], "text_b": example["sentence2"]}

    kept = ("text_a", "text_b")
    return ds.map(to_text, remove_columns=[c for c in ds.column_names if c not in kept])


def _benchmark_model(model_id, pairs_ds):
    """Tokenize the sentence pairs for one checkpoint and time a full CPU pass.

    Returns a ``(params_in_millions, seconds)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    def tok(batch):
        return tokenizer(
            batch["text_a"],
            batch["text_b"],
            truncation=True,
            max_length=MAX_LENGTH,
            padding="max_length",   # IMPORTANT: fixed shapes
        )

    tok_ds = pairs_ds.map(tok, batched=True, remove_columns=["text_a", "text_b"])

    # Prebuild batches once (IMPORTANT: avoid DataLoader overhead in timing)
    batches = make_batches(tok_ds, BATCH_SIZE)

    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)
    model.to("cpu")

    params_m = count_params_m(model)
    secs = benchmark_full_pass(model, batches)
    return params_m, secs


def _print_report(results):
    """Print the results table and, when both models were run, the speedup line."""
    print(f"\nTable (CPU, batch_size={BATCH_SIZE}, fixed_len={MAX_LENGTH}, full pass of GLUE STS-B validation)\n")
    print(f"{'Model':<12} {'# param (M)':>12} {'Inf. time (s)':>15}")
    for name, pm, s in results:
        print(f"{name:<12} {pm:>12.1f} {s:>15.2f}")

    # The speedup line only makes sense when both reference entries exist in
    # MODELS; skip it instead of raising StopIteration if the config was edited.
    timings = {name: s for name, _, s in results}
    bert_time = timings.get("BERT-base")
    dist_time = timings.get("DistilBERT")
    if bert_time is None or not dist_time:
        return
    print(f"\nSpeedup (BERT / DistilBERT): {bert_time / dist_time:.2f}x faster")


def main():
    """Benchmark every checkpoint in MODELS on the GLUE validation split and print a report.

    Steps: apply the thread settings, load and reshape the dataset, then for
    each model tokenize to fixed-length inputs, prebuild batches, time a full
    forward pass on CPU, and finally print a table plus the BERT/DistilBERT
    speedup when both are present.
    """
    # Threading control (big deal on CPU benchmarks)
    _configure_threads()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    pairs_ds = _load_sentence_pairs()

    results = []
    for name, model_id in MODELS.items():
        params_m, secs = _benchmark_model(model_id, pairs_ds)
        results.append((name, params_m, secs))

    _print_report(results)

# Entry point: run the benchmark only when this code executes as the main module.
if __name__ == "__main__":
    main()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Table (CPU, batch_size=32, fixed_len=128, full pass of GLUE STS-B validation)

Model         # param (M)   Inf. time (s)
BERT-base           109.5           84.26
DistilBERT           67.0           41.38

Speedup (BERT / DistilBERT): 2.04x faster
