In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import math
import statistics


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/finchmf/coding/control_vectors/experiment_pipeline/env/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/finchmf/coding/control_vectors/experime

In [2]:
def compute_entropy(probs: torch.Tensor):
    """
    Shannon entropy (base 2, i.e. in bits) of `probs`, summed over every element.

    Returns a plain Python number obtained via `.item()`, not a tensor.
    """
    # The 1e-12 offset keeps log2 finite for entries that are exactly zero.
    log_terms = torch.log2(probs + 1e-12)
    weighted_total = (probs * log_terms).sum()
    return -weighted_total.item()

def compute_step_metrics(logits: torch.Tensor, k: int = 5):
    """
    Turn one step's raw logits into uncertainty metrics.

    The logits are softmaxed over the last dimension, then three values are
    reported in a dict:
      - "entropy":          entropy of the full distribution
      - "top1_prob":        the largest probability
      - f"top{k}_entropy":  entropy of the k largest probabilities after
                            rescaling them to sum to 1
    """
    distribution = logits.softmax(dim=-1)

    # Keep only the k most likely entries and renormalize them so they form
    # a distribution of their own.
    head_mass = distribution.topk(k).values
    head_distribution = head_mass / head_mass.sum()

    return {
        "entropy": compute_entropy(distribution),
        "top1_prob": distribution.max().item(),
        f"top{k}_entropy": compute_entropy(head_distribution),
    }

def generate_and_analyze(model, tokenizer, prompt: str, max_new_tokens: int = 20, top_k: int = 5):
    """
    Manual one-step-at-a-time generation so we can collect, for every new token:
      - logits -> full entropy, top-1 probability and top-k entropy
      - attentions -> per-layer, per-head entropy of the last query position
      - hidden_states -> L2 norm of the final-layer vector at the last position

    Args:
        model: causal LM that is called as
            `model(ids, output_attentions=True, output_hidden_states=True)` and
            whose output exposes `.logits`, `.attentions` and `.hidden_states`.
        tokenizer: callable returning an object with `.input_ids` when invoked
            with `return_tensors="pt"`, and providing `.decode`.
        prompt: text the generation starts from (a single sequence; the code
            indexes batch element 0 throughout).
        max_new_tokens: number of sampling steps; generation does not stop
            early, so exactly this many metric dicts are produced.
        top_k: `k` forwarded to `compute_step_metrics`.

    Returns:
        (generated_text, metrics_per_step): the decoded prompt + continuation,
        and one dict per step with the keys from `compute_step_metrics` plus
        "head_entropies" (list[layer] of list[head]) and "hidden_norm".
    """
    device = next(model.parameters()).device
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    metrics_per_step = []
    generated = input_ids

    # Pure analysis: only Python scalars/lists are kept, nothing is
    # back-propagated. Disabling autograd avoids building (and holding) a graph
    # over the full, ever-growing sequence on every step.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # The whole sequence is re-encoded each step (no KV cache).
            outputs = model(
                generated,
                output_attentions=True,
                output_hidden_states=True
            )
            # take logits for the last token
            logits = outputs.logits[:, -1, :]  # [1, vocab_size]

            # 1) basic & top-k entropy
            step_m = compute_step_metrics(logits.squeeze(0), k=top_k)

            # 2) head-level attention entropies
            # outputs.attentions is a tuple[layer] of shape [1, num_heads, seq_len, seq_len]
            head_entropies = []
            for layer_att in outputs.attentions:
                # attention of last query pos to all keys: shape [num_heads, seq_len]
                attn = layer_att[0, :, -1, :]
                # entropy per head (bits); epsilon keeps log2 finite at zero weight
                ent = (-attn * torch.log2(attn + 1e-12)).sum(dim=-1)  # [num_heads]
                head_entropies.append(ent.tolist())
            step_m["head_entropies"] = head_entropies

            # 3) hidden-state norm of new token at final layer
            # outputs.hidden_states is tuple[layer] of [1, seq_len, hidden_size]
            final_layer_vec = outputs.hidden_states[-1][0, -1, :]  # [hidden_size]
            step_m["hidden_norm"] = torch.norm(final_layer_vec).item()

            metrics_per_step.append(step_m)

            # sample next token from the full, untempered distribution
            # (you can switch to top-k or temperature sampling easily)
            next_token = torch.multinomial(torch.softmax(logits, dim=-1), num_samples=1)
            generated = torch.cat([generated, next_token], dim=1)

    gen_text = tokenizer.decode(generated[0], skip_special_tokens=True)
    return gen_text, metrics_per_step



In [3]:
def summarize_metrics(metrics_list, top_k: int = 5):
    """
    Aggregate per-step metric dicts into mean / sample-stdev summaries.

    Args:
        metrics_list: list of dicts, each holding "entropy", "top1_prob",
            f"top{top_k}_entropy", "hidden_norm" and "head_entropies"
            (a list of per-layer lists of per-head values).
        top_k: the `k` used when the metrics were produced; selects the
            f"top{k}_entropy" key to read and names the matching output keys.

    Returns:
        dict with "steps" plus a mean_* / stdev_* pair for each metric. Head
        entropies are pooled over all steps, layers and heads before
        aggregating. A stdev over fewer than two values is reported as 0.0;
        a mean over no values is reported as NaN (so an empty `metrics_list`
        yields a summary instead of raising StatisticsError).

    Raises:
        KeyError: if a step dict lacks one of the expected keys.
    """
    def _mean(values):
        # statistics.mean raises StatisticsError on empty data.
        return statistics.mean(values) if values else float("nan")

    def _stdev(values):
        # Sample stdev needs at least two data points.
        return statistics.stdev(values) if len(values) > 1 else 0.0

    topk_key = f"top{top_k}_entropy"
    entropies = [m["entropy"] for m in metrics_list]
    top1_probs = [m["top1_prob"] for m in metrics_list]
    topk_entropies = [m[topk_key] for m in metrics_list]
    hidden_norms = [m["hidden_norm"] for m in metrics_list]

    # flatten all head entropies across layers & steps
    head_vals = [h for m in metrics_list for layer in m["head_entropies"] for h in layer]

    summary = {"steps": len(metrics_list)}
    for label, values in (
        ("entropy", entropies),
        ("top1_prob", top1_probs),
        (topk_key, topk_entropies),
        ("hidden_norm", hidden_norms),
        ("head_entropy", head_vals),
    ):
        summary[f"mean_{label}"] = _mean(values)
        summary[f"stdev_{label}"] = _stdev(values)
    return summary

In [4]:
# Run the same prompt through each model and print summary + per-step metrics.
SEED = 0    # fixed so the multinomial sampling in generate_and_analyze is repeatable
TOP_K = 5   # single source of truth for the top-k metric key used below

# Loop-invariant settings.
device = "cuda" if torch.cuda.is_available() else "cpu"
prompt = "In a distant future, machines and humans"
topk_key = f"top{TOP_K}_entropy"

for model_name in ["gpt2", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"]:
    print(f"\n=== {model_name} ===")
    # Re-seed per model so each model's run is reproducible on its own,
    # regardless of the order or number of models in the list.
    torch.manual_seed(SEED)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    text, metrics = generate_and_analyze(model, tokenizer, prompt, max_new_tokens=30, top_k=TOP_K)

    print("Generated:", text)
    summary = summarize_metrics(metrics, top_k=TOP_K)
    print("Summary:", summary)
    # Optional per-step detail:
    for i, m in enumerate(metrics, 1):
        print(f" Step {i:02d}: entropy={m['entropy']:.2f}, top1={m['top1_prob']:.3f}, "
                f"{topk_key}={m[topk_key]:.2f}, hidden_norm={m['hidden_norm']:.2f}")


=== gpt2 ===




Generated: In a distant future, machines and humans are going to achieve far greater well paid jobs and greater advancement.

That might sound pleasant, but it is absolutely not. I am an unemployed
Summary: {'steps': 30, 'mean_entropy': 5.455578627189, 'stdev_entropy': 2.4683623503019554, 'mean_top1_prob': 0.32255162329723436, 'stdev_top1_prob': 0.2675351130429084, 'mean_top5_entropy': 1.7119814194117984, 'stdev_top5_entropy': 0.6229998147128664, 'mean_hidden_norm': 260.7959732055664, 'stdev_hidden_norm': 53.4388777045231, 'mean_head_entropy': 2.1144040869251075, 'stdev_head_entropy': 1.1813163152192292}
 Step 01: entropy=6.04, top1=0.143, top5_entropy=2.26, hidden_norm=250.78
 Step 02: entropy=9.33, top1=0.031, top5_entropy=2.29, hidden_norm=271.39
 Step 03: entropy=1.04, top1=0.905, top5_entropy=0.31, hidden_norm=145.79
 Step 04: entropy=7.15, top1=0.201, top5_entropy=1.72, hidden_norm=278.85
 Step 05: entropy=7.92, top1=0.108, top5_entropy=2.04, hidden_norm=232.24
 Step 06: entropy=



Generated: In a distant future, machines and humans have all become intelligent. Can you summarize what is happening in the story and provide an insight into the alternate future presented in it? 
Summary: {'steps': 30, 'mean_entropy': 2.592005740243864, 'stdev_entropy': 1.8651399013238552, 'mean_top1_prob': 0.5707355871796608, 'stdev_top1_prob': 0.31219541998329237, 'mean_top5_entropy': 1.2850046824198216, 'stdev_top5_entropy': 0.8164455741263311, 'mean_hidden_norm': 85.30977300008138, 'stdev_hidden_norm': 1.5995957996183812, 'mean_head_entropy': 1.73753543563386, 'stdev_head_entropy': 1.163007158779073}
 Step 01: entropy=4.06, top1=0.302, top5_entropy=1.99, hidden_norm=85.64
 Step 02: entropy=5.08, top1=0.157, top5_entropy=2.25, hidden_norm=86.45
 Step 03: entropy=1.00, top1=0.891, top5_entropy=0.48, hidden_norm=86.51
 Step 04: entropy=5.19, top1=0.227, top5_entropy=1.90, hidden_norm=87.23
 Step 05: entropy=0.01, top1=0.999, top5_entropy=0.01, hidden_norm=84.05
 Step 06: entropy=3.45