# Model Scaling Test

Compare adaptive transformer performance across different model sizes such as `distilgpt2` and `gpt2` (the list can be extended to `gpt2-medium`).
This notebook benchmarks single-pass inference time, loss on a fixed prompt, active attention-head count, and parameter count across sizes.

In [None]:
!pip install transformers datasets torch matplotlib

In [None]:
import time

import matplotlib.pyplot as plt
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from models.loaders.loader import load_adaptive_model, load_baseline_model
from utils.training import compute_loss

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoints to benchmark; "gpt2-medium" can be appended to this list.
model_names = [
    "distilgpt2",
    "gpt2",
]

# Fixed prompt that is fed to every model.
prompt = "The adaptive transformer architecture is"

# Filled in by the benchmark loop: one entry per checkpoint name.
results = dict()

## Benchmark Loop

In [None]:
# For every checkpoint: load the baseline and adaptive models, time a single
# forward pass of the adaptive model on the fixed prompt, and record the loss,
# the number of gates above 0.1, and the parameter count in `results`.
for model_name in model_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    baseline = load_baseline_model(model_name, device)
    adaptive = load_adaptive_model(model_name, baseline, device)
    # Forward-only benchmark: put the model in inference mode so train-only
    # layers such as dropout do not perturb the loss or the timing.
    # (assumes `adaptive` is a torch.nn.Module — it already exposes .parameters())
    adaptive.eval()

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # The prompt tokens double as the targets passed to compute_loss.
    labels = inputs["input_ids"]

    # No gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        # CUDA kernels are launched asynchronously; synchronize on both sides
        # of the timed region so the measurement covers the actual compute.
        if device.type == "cuda":
            torch.cuda.synchronize(device)
        start = time.perf_counter()  # monotonic, high-resolution clock
        logits = adaptive(**inputs)
        if device.type == "cuda":
            torch.cuda.synchronize(device)
        duration = time.perf_counter() - start

        loss = compute_loss(logits, labels)

    # A head counts as active when its gate value exceeds 0.1.
    active_heads = sum(float(g > 0.1) for block in adaptive.blocks for g in block['attn'].gate)
    total_params = sum(p.numel() for p in adaptive.parameters())

    results[model_name] = {
        "loss": loss.item(),
        "inference_time": duration,
        "active_heads": active_heads,
        "params": total_params
    }

## Results Summary

In [None]:
# One row per model, one column per metric. Building the frame row-wise
# (orient="index") lets every metric column keep its own dtype, so the integer
# parameter count is not upcast to float the way it is when a
# column-per-model frame is transposed.
df = pd.DataFrame.from_dict(results, orient="index")
df

## Visual Comparison

In [None]:
# Draw one bar-chart panel per metric column, arranged on a 2x2 grid.
bar_options = dict(
    subplots=True,
    layout=(2, 2),
    figsize=(10, 6),
    legend=False,
    title="Model Scaling Benchmark",
)
df.plot(kind="bar", **bar_options)

# Tidy the panel spacing, then render the figure.
plt.tight_layout()
plt.show()