In [9]:
import os
import math
import gzip
from typing import Dict, Any
import time

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, GPT2LMHeadModel
from datasets import load_dataset
import pandas as pd

In [None]:
# 0. DEVICE
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Evaluating on device:", device)

# 1. TOKENIZER / MODEL NAMES
tokenizer_names = [
    "gpt2",                 # 2. Byte-level BPE
    "google/byt5-small",    # 3. Character-level
    "unigram",              # 4. Your custom unigram
]

# 2. LOAD RAW VALIDATION DATA ONCE
raw_eval = load_dataset("wikitext", "wikitext-2-raw-v1", split="test[:20%]")
raw_eval = raw_eval.filter(lambda x: len(x["text"].strip()) > 0)
eval_texts = list(raw_eval["text"])
total_chars = sum(len(t) for t in eval_texts)
print(f"#test examples: {len(eval_texts)}, total chars: {total_chars}")

Evaluating on device: cpu
#test examples: 10000, total chars: 7059156


In [43]:
# 3. EVALUATION FUNCTION FOR ONE MODEL
def evaluate_model(name: str) -> Dict[str, Any]:
    print(f"\n Evaluating tokenizer: {name}")

    # 3.1 Load tokenizer & model from disk
    model_dir = os.path.join("model", name)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = GPT2LMHeadModel.from_pretrained(model_dir).to(device)
    model.eval()

    vocab_size = len(tokenizer)
    n_params = sum(p.numel() for p in model.parameters())

    # 3.2 TRUE CHARACTER COUNT FROM TOKENIZATION
    print("Computing true character usage...")
    free_enc = tokenizer(
        eval_texts,
        padding=False,
        truncation=True,
        max_length=512,
    )

    chars_from_tokens = 0
    total_tokens_in_free_enc = 0

    for ids in free_enc["input_ids"]:
        decoded = tokenizer.decode(ids, skip_special_tokens=True)
        chars_from_tokens += len(decoded)
        total_tokens_in_free_enc += len(ids)

    tokens_per_char = total_tokens_in_free_enc / chars_from_tokens
    
    # 3.4 PREPARE PADDED DATASET FOR LOSS / PERPLEXITY
    def encode(batch):
        texts = list(batch["text"])
        return tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=64,
        )

    eval_ds = raw_eval.map(encode, batched=True)
    eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])
    eval_loader = DataLoader(eval_ds, batch_size=8)
    
    # 3.5 COMPUTE TOTAL NLL, LOSS, PERPLEXITY
    total_nll = 0.0            # sum of nats over all non-pad tokens
    total_nonpad_tokens = 0
    total_forward_tokens = 0        # tokens processed (non-pad)

    with torch.no_grad():
        for batch in eval_loader:
            inputs = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            labels = inputs.clone()
            labels[attention_mask == 0] = -100

            nonpad = (labels != -100).sum().item()
            if nonpad == 0:
                continue

            outputs = model(
                input_ids=inputs,
                attention_mask=attention_mask,
                labels=labels,
            )

            # loss accounting
            loss = outputs.loss.item()
            batch_nll = loss * nonpad
            total_nll += batch_nll
            total_nonpad_tokens += nonpad

        avg_loss = total_nll / total_nonpad_tokens
        ppl = math.exp(avg_loss)
        
        # 3.6 BITS-PER-TOKEN & BITS-PER-CHAR
        total_bits = total_nll / math.log(2)
        bits_per_char = total_bits / chars_from_tokens
        
        # 3.7 COMPRESSION RATIO ON GENERATED TEXT
        gen_texts = []
        for _ in range(5):
            prompt = "The meaning of life is"
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            with torch.no_grad():
                out = model.generate(
                    **inputs,
                    max_length=80,
                    do_sample=True,
                    temperature=0.9,
                    top_p=0.9,
                    pad_token_id=tokenizer.eos_token_id,
                )
            txt = tokenizer.decode(out[0], skip_special_tokens=True)
            gen_texts.append(txt)

        joined_gen = "\n".join(gen_texts)
        raw_bytes = joined_gen.encode("utf-8")
        if len(raw_bytes) > 0:
            comp_bytes = gzip.compress(raw_bytes)
            compression_ratio = len(comp_bytes) / len(raw_bytes)
        else:
            compression_ratio = float("nan")

        # 3.9 PACK RESULTS
        return {
            "tokenizer": name,
            "vocab_size": vocab_size,
            "params_millions": n_params / 1e6,
            "tokens_per_char": tokens_per_char,
            "val_loss": avg_loss,
            "perplexity": ppl,
            "bits_per_char": bits_per_char,
            }
# 4. RUN EVALUATION FOR ALL TOKENIZERS & COLLECT RESULTS
all_results = [evaluate_model(name) for name in tokenizer_names]

results_df = pd.DataFrame(all_results).round(2)

print("\n=== Summary table ===")
print(results_df)


 Evaluating tokenizer: gpt2
Computing true character usage...


Map: 100%|██████████| 10000/10000 [00:01<00:00, 7596.00 examples/s]



 Evaluating tokenizer: google/byt5-small
Computing true character usage...

 Evaluating tokenizer: unigram
Computing true character usage...


Map: 100%|██████████| 10000/10000 [00:01<00:00, 7435.04 examples/s]



=== Summary table ===
           tokenizer  vocab_size  params_millions  tokens_per_char  val_loss  \
0               gpt2       50258            81.91             0.24      8.77   
1  google/byt5-small         384            43.61             1.00      2.37   
2            unigram       32000            67.89             0.26      6.35   

   perplexity  bits_per_char  
0     6407.41           1.08  
1       10.69           0.55  
2      572.41           0.75  


## 95\% conf interval

In [None]:
from scipy import stats

print("\n GENERATION TESTS")

prompt = "Hi my name is Jens, and I am from"
n_runs = 50
df = n_runs - 1
t_95 = stats.t.ppf(1-0.05/2, df)

for name in tokenizer_names:
    print(f"\n Generating with: {name}")
    
    model_dir = f"model/{name}"
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = GPT2LMHeadModel.from_pretrained(model_dir).to(device)
    model.eval()

    # GENERATION TIME 
    tokens_per_sec_runs = []
    gen_times = []
    new_tokens_counts = []

    sample_text = None  # only store the first output

    for run in range(n_runs):
        with torch.no_grad():
            if device.type == "cuda":
                torch.cuda.synchronize()
            t_gen0 = time.time()

            generated = model.generate(
                **inputs,
                max_new_tokens=50,   # fixed for fairness
                do_sample=True,
                temperature=0.5,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,
            )

            if device.type == "cuda":
                torch.cuda.synchronize()
            t_gen1 = time.time()

        generation_time = t_gen1 - t_gen0

        # count new tokens only
        prompt_len = inputs["input_ids"].shape[1]
        total_len = generated.shape[1]
        n_gen_tokens = max(total_len - prompt_len, 0)

        tokens_per_sec = n_gen_tokens / generation_time

        tokens_per_sec_runs.append(tokens_per_sec)
        gen_times.append(generation_time)
        new_tokens_counts.append(n_gen_tokens)

        if run == 0:  # save only the first generation
            sample_text = tokenizer.decode(generated[0], skip_special_tokens=True)

    # MEAN + 95% CONFIDENCE INTERVAL
    mean_tps = sum(tokens_per_sec_runs) / n_runs

    var = sum((x - mean_tps) ** 2 for x in tokens_per_sec_runs) / (n_runs - 1)
    std_tps = math.sqrt(var)
    se_tps = std_tps / math.sqrt(n_runs)
    ci_low = mean_tps - t_95 * se_tps
    ci_high = mean_tps + t_95 * se_tps

    mean_gen_time = sum(gen_times) / n_runs
    mean_new_tokens = sum(new_tokens_counts) / n_runs

    # OUTPUT
    print(sample_text)  # ONE example per model

    print(f"[{name}]")
    print(f"  mean_tokens_per_sec:         {mean_tps:.2f}")
    print(f"  tokens_per_sec_std:          {std_tps:.2f}")
    print(f"  tokens_per_sec_95%CI:        [{ci_low:.2f}, {ci_high:.2f}]")


=== QUICK GENERATION TESTS ===

--- Generating with: gpt2 ---
Hi my name is Jens, and I am from the village of the British Roman Empire . The first first was a common part of the first half of the Roman Church . The Church 's Church , the Church , the king 's name is the first part of the king 's Church , and
[gpt2]
  mean_tokens_per_sec:         72.13
  tokens_per_sec_std:          2.50
  tokens_per_sec_95%CI:        [71.42, 72.84]

--- Generating with: google/byt5-small ---
Hi my name is Jens, and I am from the second sincle of the Brit
[google/byt5-small]
  mean_tokens_per_sec:         104.61
  tokens_per_sec_std:          5.36
  tokens_per_sec_95%CI:        [103.08, 106.13]

--- Generating with: unigram ---
Hi m y name is Jens , and I am from a b r ' s on , as a c y , a ( ( : ) ) . A r a ) on s on , as a as a d e on ( ( ( ( F / / / / / / g / / / g
[unigram]
  mean_tokens_per_sec:         83.45
  tokens_per_sec_std:          3.01
  tokens_per_sec_95%CI:        [82.59, 84.31]


In [None]:
print("\n PROMPT PROCESSING BENCHMARK (SIMPLIFIED)")

long_prompt = (
    """This benchmark prompt is intentionally extended to approximately one thousand characters to evaluate how different tokenization strategies and model architectures process long, information-dense inputs under realistic conditions. It includes diverse elements such as technical terminology from machine learning, like stochastic gradient descent, entropy regularization, eigenvector drift, nonlinear optimization, and transformer attention patterns. It also integrates multilingual fragments such as hej, guten Tag, bonjour, hola, こんにちは, 你好, γειά σου, and привет to stress Unicode handling. Numerical values including 3.14159, 0.00027, and 42, alongside symbolic expressions like α→β→γ and ∫x² dx, test symbolic segmentation. Short code pieces such as for(i=0;i<10;i++){sum+=i;} and JSON-like fragments {"key":42,"msg":"hello"} further diversify the prompt, ensuring broad tokenizer coverage across subword granularity and vocabulary structure."""
)

max_len_prompt = 1000
n_runs = 50
df = n_runs-1
t_95 = stats.t.ppf(1-0.05/2, df)

def mean_std_ci(values):
    vals = [v for v in values if not math.isnan(v)]
    m = sum(vals) / len(vals)
    if len(vals) > 1:
        var = sum((x - m)**2 for x in vals) / (len(vals) - 1)
        s = math.sqrt(var)
        se = s / math.sqrt(len(vals))
        ci_low = m - t_95 * se
        ci_high = m + t_95 * se
    else:
        s = float("nan")
        ci_low = ci_high = float("nan")
    return m, s, ci_low, ci_high

for name in tokenizer_names:
    print(f"\n Prompt processing for: {name}")

    model_dir = f"model/{name}"
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = GPT2LMHeadModel.from_pretrained(model_dir).to(device)
    model.eval()

    total_times = []
    n_tokens = None

    for run in range(n_runs):

        # START total time
        t0 = time.time()

        # Tokenization
        encoded = tokenizer(
            long_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=max_len_prompt,
            add_special_tokens=False,
        )

        if n_tokens is None:
            n_tokens = encoded["input_ids"].shape[1]

        encoded = {k: v.to(device) for k, v in encoded.items()}

        # Forward pass (prompt only)
        with torch.no_grad():
            if device.type == "cuda":
                torch.cuda.synchronize()
            t_fwd0 = time.time()
            _ = model(**encoded)
            if device.type == "cuda":
                torch.cuda.synchronize()
            t_fwd1 = time.time()

        # END total time
        total_time = t_fwd1 - t0
        total_times.append(total_time)

    # STATISTICS
    mean_t, std_t, ci_low, ci_high = mean_std_ci(total_times)

    # REPORT
    print(f"  total_time_mean:        {mean_t:.6f} sec")
    print(f"  total_time_std:         {std_t:.6f} sec")
    print(f"  total_time_95%CI:       [{ci_low:.6f}, {ci_high:.6f}]")


=== PROMPT PROCESSING BENCHMARK (SIMPLIFIED) ===

 Prompt processing for: gpt2
  total_time_mean:        0.074460 sec
  total_time_std:         0.011028 sec
  total_time_95%CI:       [0.071326, 0.077595]

 Prompt processing for: google/byt5-small
  total_time_mean:        0.167579 sec
  total_time_std:         0.045300 sec
  total_time_95%CI:       [0.154705, 0.180453]

 Prompt processing for: unigram
  total_time_mean:        0.081075 sec
  total_time_std:         0.057978 sec
  total_time_95%CI:       [0.064598, 0.097552]
