In [None]:
#Colab setup
!pip -q install -U transformers accelerate bitsandbytes datasets sentencepiece

import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Pick the compute device: the GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device:", device)

In [None]:
# Load TruthfulQA (the "generation" configuration) via `datasets.load_dataset`.

ds = load_dataset(path="truthful_qa", name="generation")

# Last expression of the cell: show the loaded dataset object.
ds

In [None]:
# Inspect the first record of the "validation" split.
ds["validation"][0]

In [None]:
# Peek at the "question" field of the first five validation records.
first_rows = ds["validation"][:5]
questions = first_rows["question"]
print(questions)

In [None]:
# !pip -q install -U huggingface_hub
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# Load the TinyLlama chat model and its tokenizer.

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Weights are loaded in float16; no quantization config is passed here.
# "eager" attention is requested explicitly -- presumably so that the later
# forward pass with output_attentions=True can return attention weights.
load_kwargs = dict(
    device_map="auto",
    torch_dtype=torch.float16,
    attn_implementation="eager",
)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
model.eval()

In [None]:
# Prompt formatting
#
# A chat message's "content" has to be a single string. `questions` holds
# several question strings, so one question is selected for the prompt rather
# than handing the whole list to the chat template.
question = questions if isinstance(questions, str) else questions[0]

messages = [
    {"role": "system", "content": "Answer briefly and directly."},
    {"role": "user", "content": question},
]

In [None]:
# Display the chat messages that are passed to the chat template below.
messages

In [None]:
# Tokenize the chat prompt, sample a continuation, and report token counts.

# Generation below samples (do_sample=True); seed the RNG so that a fresh
# "Restart & Run All" reproduces the same text.
torch.manual_seed(0)

try:
    # return_dict=True pins the return type to a mapping with input_ids and
    # attention_mask instead of relying on the library's default return type.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
except Exception as e:
    print("apply_chat_template failed; falling back to plain prompt.\n", e)
    # Build the fallback from the same user message that the template received.
    user_text = messages[-1]["content"]
    plain_prompt = f"System: Answer briefly and directly.\nUser: {user_text}\nAssistant:"
    encoded = tokenizer(plain_prompt, return_tensors="pt")

prompt_ids = encoded["input_ids"].to(model.device)
attention_mask = encoded.get("attention_mask")
if attention_mask is None:
    # Single unpadded sequence: every position is a real token.
    attention_mask = torch.ones_like(prompt_ids)
attention_mask = attention_mask.to(model.device)
prompt_len = prompt_ids.shape[1]

with torch.no_grad():
    gen_ids = model.generate(
        prompt_ids,
        # pad_token_id is set to eos below, so pass the mask explicitly
        # rather than letting generate() infer it from the token ids.
        attention_mask=attention_mask,
        max_new_tokens=128,
        temperature=1.0,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# gen_ids holds prompt + generated tokens; decode the whole sequence.
full_text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
print("\n--- Generated (full decoded) ---\n", full_text)

gen_len = gen_ids.shape[1] - prompt_len
print("\nPrompt tokens:", prompt_len, "Generated tokens:", gen_len)

In [None]:
# Run the model once over the full sequence (gen_ids) and collect attention weights.

forward_kwargs = dict(
    input_ids=gen_ids,
    output_attentions=True,
    use_cache=False,   # compute full attention tensors
    return_dict=True,
)
with torch.no_grad():
    outputs = model(**forward_kwargs)

# One attention tensor per layer; the sizes below are read from the first layer.
attentions = outputs.attentions
num_layers = len(attentions)
first_layer_attn = attentions[0]
num_heads = first_layer_attn.shape[1]
seq_len = first_layer_attn.shape[-1]
print(f"\nLayers: {num_layers}, Heads: {num_heads}, SeqLen: {seq_len}")

In [None]:
# Build Laplacian and compute top-k eigenvalues on generated-token subgraph

def laplacian_eigs_from_attention(A: torch.Tensor, k=20, symmetrize=True):
    """
    Largest eigenvalues of the unnormalized graph Laplacian L = D - A.

    Args:
        A: [n, n] attention matrix used as a weighted adjacency matrix.
            Negative entries are clamped to zero. Dtypes other than
            float32/float64 (e.g. float16) are promoted to float32 because
            the eigen-solver does not accept low-precision input.
        k: Number of eigenvalues to return. Capped at n; k <= 0 yields an
            empty array.
        symmetrize: If True, A is replaced by 0.5 * (A + A.T) first. If False,
            A should already be symmetric: `eigvalsh` only reads one triangle
            of its input.

    Returns:
        1-D numpy array with the min(k, n) largest eigenvalues, in ascending
        order.
    """
    if A.dtype not in (torch.float32, torch.float64):
        A = A.to(torch.float32)

    if symmetrize:
        A = 0.5 * (A + A.T)

    A = torch.clamp(A, min=0.0)

    degree = A.sum(dim=1)
    laplacian = torch.diag(degree) - A

    # eigvalsh returns real eigenvalues already sorted in ascending order.
    eigs = torch.linalg.eigvalsh(laplacian)

    n = eigs.numel()
    k_eff = max(0, min(k, n))
    # Slice from an explicit start index: eigs[-0:] would return everything.
    return eigs[n - k_eff:].detach().cpu().numpy()

In [None]:
def aggregate_heads(attn_layer: torch.Tensor, mode="mean"):
    """
    Collapse the head axis of one layer's attention, using the first batch item.

    attn_layer: [batch, heads, seq, seq]
    mode: "mean" averages over heads, "max" takes the element-wise maximum.
    Returns a [seq, seq] tensor; any other mode raises ValueError.
    """
    per_head = attn_layer[0]  # [heads, seq, seq]

    if mode not in ("mean", "max"):
        raise ValueError("mode must be 'mean' or 'max'")

    if mode == "mean":
        reduced = per_head.mean(dim=0)
    else:
        reduced = per_head.max(dim=0).values
    return reduced

In [None]:
# Per-layer Laplacian eigenvalues of the head-averaged attention restricted
# to the generated tokens (positions prompt_len .. end of gen_ids).
gen_start, gen_end = prompt_len, gen_ids.shape[1]
gen_idx = slice(gen_start, gen_end)

results = {}
for layer_idx in range(num_layers):
    layer_mean = aggregate_heads(attentions[layer_idx], mode="mean")   # [seq, seq]
    gen_block = layer_mean[gen_idx, gen_idx]                           # [gen_len, gen_len]
    gen_block = gen_block.to(torch.float32).cpu()

    # Layers are only recorded when at least 5 generated tokens are available.
    if gen_block.shape[0] >= 5:
        results[layer_idx] = laplacian_eigs_from_attention(gen_block, k=20, symmetrize=True)

In [None]:
print("\n--- Laplacian eigenvalues on GENERATED-token graph (per layer, head-mean) ---")
# Only the first five stored layers are printed, rounded to 6 decimals.
for layer_idx, layer_eigs in list(results.items())[:5]:
    print(f"Layer {layer_idx}: {np.round(layer_eigs, 6)}")

In [None]:
# Number of layers that have an entry in `results`.
len(list(results.keys()))

In [None]:
# Eigenvalues for individual attention heads
def laplacian_topk_eigs(A: torch.Tensor, k: int, symmetrize=True):
    """
    Largest k eigenvalues of the unnormalized graph Laplacian L = D - A.

    Args:
        A: [n, n] adjacency (attention) matrix. It is cast to float32, and
            negative entries are clamped to zero.
        k: Number of eigenvalues to return. Capped at n; k <= 0 yields an
            empty tensor.
        symmetrize: If True, A is replaced by 0.5 * (A + A.T) first. If False,
            A should already be symmetric: `eigvalsh` only reads one triangle
            of its input.

    Returns:
        1-D float32 tensor (on A's device) with the min(k, n) largest
        eigenvalues, in ascending order.
    """
    # Cast before any arithmetic so half-precision inputs are averaged in float32.
    A = A.to(torch.float32)

    if symmetrize:
        A = 0.5 * (A + A.T)

    A = torch.clamp(A, min=0.0)

    degree = A.sum(dim=1)
    laplacian = torch.diag(degree) - A

    # eigvalsh returns real eigenvalues already sorted in ascending order.
    eigs = torch.linalg.eigvalsh(laplacian)

    n = eigs.numel()
    k_eff = max(0, min(k, n))
    # Slice from an explicit start index: eigs[-0:] would return everything.
    return eigs[n - k_eff:]

In [None]:
# Index range and count of the generated tokens (everything after the prompt).
total_len = gen_ids.shape[1]
gen_idx = slice(prompt_len, total_len)
gen_len = total_len - prompt_len
print("gen_len:", gen_len)

# L = number of layers, H = number of heads (read from the first layer).
L, H = len(attentions), attentions[0].shape[1]
print("layers:", L, "heads:", H)

In [None]:
# Store the top-k Laplacian eigenvalues for every (layer, head) pair.
k = 20

# Pre-fill with NaN: when fewer than k generated tokens exist, the leading
# (left) slots of each row stay NaN and the eigenvalues fill the tail.
eigs_per_head_top20 = torch.full((L, H, k), float("nan"), dtype=torch.float32)

for l in range(L):
    # Keep batch item 0 and the generated-token block only -> [H, gen, gen]
    att_l = attentions[l][0, :, gen_idx, gen_idx].detach()

    for h in range(H):
        A = att_l[h]  # [gen, gen]
        # Pass the same k that sizes the storage tensor, so the two cannot drift apart.
        eigs_top = laplacian_topk_eigs(A, k=k, symmetrize=True)

        # The eigenvalues live on the attention tensor's device; the storage
        # tensor is on the CPU, so move them over explicitly before writing.
        eigs_top = eigs_top.to(device=eigs_per_head_top20.device,
                               dtype=eigs_per_head_top20.dtype)

        n_eigs = eigs_top.numel()
        eigs_per_head_top20[l, h, k - n_eigs:] = eigs_top

In [None]:
# Shape of the per-head eigenvalue tensor, which was allocated as (L, H, k).
eigs_per_head_top20.shape

In [None]:
# Stored eigenvalues for layer 0, head 0.
eigs_per_head_top20[0,0]