In [38]:
from transformer_lens.hook_points import HookPoint
from transformer_lens import (
    ActivationCache,
    FactoredMatrix,
    HookedTransformer,
    HookedTransformerConfig,
    utils,
)

import circuitsvis as cv

import einops
import torch as t

import numpy as np

# Choose the compute device: prefer Apple MPS, then CUDA, and fall back to the CPU.
# NOTE(review): `device` is not passed to `from_pretrained` below, so the library decides
# where the model is placed - confirm the two agree before relying on `device`.
if t.backends.mps.is_available():
    device = t.device('mps')
elif t.cuda.is_available():
    device = t.device('cuda')
else:
    device = t.device('cpu')

# Set up GPT-2 small.

gpt2small: HookedTransformer = HookedTransformer.from_pretrained("gpt2-small")

# Show the layer count and the head count stored in the model config, one per line.
print(gpt2small.cfg.n_layers, gpt2small.cfg.n_heads, sep="\n")


Loaded pretrained model gpt2-small into HookedTransformer
12
12


In [39]:
# Convert text to tokens, and token ids back to text.

# One string -> list of string tokens.
print(gpt2small.to_str_tokens("gpt2"))
# A list of strings -> one list of string tokens per input string.
print(gpt2small.to_str_tokens(["gpt2"] * 2))
# One string -> tensor of integer token ids.
print(gpt2small.to_tokens("gpt2"))
# Token ids -> decoded text.
print(gpt2small.to_string([50256, 70, 457, 17]))

['<|endoftext|>', 'g', 'pt', '2']
[['<|endoftext|>', 'g', 'pt', '2'], ['<|endoftext|>', 'g', 'pt', '2']]
tensor([[50256,    70,   457,    17]])
<|endoftext|>gpt2


In [40]:
# Predict the next tokens - how many does the model get right?
# The text below is fed to the model verbatim, so it must not be edited (typos included).

model_description_text = '''## Loading Models

HookedTransformer comes loaded with >40 open source GPT-style models. You can load any of them in with `HookedTransformer.from_pretrained(MODEL_NAME)`. Each model is loaded into the consistent HookedTransformer architecture, designed to be clean, consistent and interpretability-friendly.

For this demo notebook we'll look at GPT-2 Small, an 80M parameter model. To try the model the model out, let's find the loss on this paragraph!'''

# A single forward pass returns both the logits and the loss, so the model is not run
# twice on the same text.
logits, loss = gpt2small(model_description_text, return_type="both")
print("Model loss:", loss)

# The logits at position i are the prediction for token i + 1. Drop the last prediction
# (nothing follows it) and the first true token (nothing precedes it) so the two align.
prediction = logits.argmax(dim=-1).squeeze()[:-1]
true_tokens = gpt2small.to_tokens(model_description_text).squeeze()[1:]
num_correct = (prediction == true_tokens).sum()

print(f"Model accuracy: {num_correct}/{len(true_tokens)}")
print(f"Correct tokens: {gpt2small.to_str_tokens(prediction[prediction == true_tokens])}")




Model loss: tensor(4.3443, grad_fn=<DivBackward0>)
Model accuracy: 33/111
Correct tokens: ['\n', '\n', 'former', ' with', ' models', '.', ' can', ' of', 'ooked', 'Trans', 'former', '_', 'NAME', '`.', ' model', ' the', 'Trans', 'former', ' to', ' be', ' and', '-', '.', '\n', '\n', ' at', 'PT', '-', ',', ' model', ',', "'s", ' the']


In [41]:
# Access the query, key and value weight matrices of block 0's attention layer.

for weight_name in ("W_Q", "W_K", "W_V"):
    print(getattr(gpt2small.blocks[0].attn, weight_name))



Parameter containing:
tensor([[[-0.1026, -0.0609, -0.0249,  ...,  0.0203,  0.0658,  0.0478],
         [ 0.0191,  0.0242,  0.0404,  ..., -0.0670,  0.0618, -0.0190],
         [ 0.0038,  0.0081,  0.0532,  ..., -0.0380,  0.0209,  0.0367],
         ...,
         [-0.0461, -0.0057,  0.0348,  ...,  0.0152, -0.0104,  0.0085],
         [ 0.0309,  0.0370,  0.0160,  ..., -0.0175, -0.0892, -0.0513],
         [-0.0747, -0.0391, -0.0486,  ..., -0.0675, -0.0310, -0.0416]],

        [[-0.0146,  0.0116, -0.0336,  ..., -0.0270, -0.0309, -0.0234],
         [-0.0148,  0.0689,  0.0947,  ...,  0.0179,  0.0131, -0.0445],
         [ 0.0237,  0.0168, -0.0566,  ..., -0.0101,  0.0308,  0.0022],
         ...,
         [ 0.0526,  0.0214, -0.0353,  ..., -0.0663, -0.0429,  0.1076],
         [-0.0119,  0.0048, -0.1068,  ...,  0.0063, -0.0678, -0.0890],
         [-0.1548,  0.0178, -0.0272,  ..., -0.0034,  0.0690, -0.0079]],

        [[ 0.0057,  0.0382,  0.0539,  ...,  0.0343,  0.1000, -0.0263],
         [ 0.0242,  0.0

In [42]:
# Get access to the activations inside a model.

gpt2_text = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
gpt2_tokens = gpt2small.to_tokens(gpt2_text)

# Run the model once, keeping both the logits and the cache of intermediate activations.
gpt2_logits, gpt2_cache = gpt2small.run_with_cache(
    gpt2_tokens,
    remove_batch_dim=True,
)

# Show the logits together with the cache summary.
print(gpt2_logits, gpt2_cache)

# Show the attention pattern cached for layer 0.
print(gpt2_cache["pattern", 0])

tensor([[[ 7.5261, 11.1214,  7.8919,  ..., -3.1299, -3.3873,  8.5934],
         [ 4.4261,  5.2600,  2.1652,  ...,  0.4973, -2.2385,  4.2680],
         [ 8.7212,  7.1920,  3.2337,  ...,  2.7380,  0.2224,  7.2776],
         ...,
         [ 4.8818,  6.6848,  2.6623,  ...,  4.4694, -1.5021,  5.2525],
         [ 7.4928,  8.2157,  3.5746,  ...,  1.5305, -0.2660,  7.6360],
         [ 4.6322,  5.0510,  5.8710,  ..., -1.5175, -6.7258, 13.3053]]],
       grad_fn=<ViewBackward0>) ActivationCache with keys ['hook_embed', 'hook_pos_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.hook_resid_mid', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'bloc

In [43]:
# Compare a hand-computed attention pattern with the one the model stored in the cache.

layer0_pattern_from_cache = gpt2_cache["pattern", 0]

q, k = gpt2_cache["q", 0], gpt2_cache["k", 0]
seq, nhead, headsize = q.shape
# Dot product of every query with every key, per head.
layer0_attn_scores = einops.einsum(q, k, "seqQ n h, seqK n h -> n seqQ seqK")
# Causal mask: True strictly above the diagonal, i.e. where a query would look at a later key.
# It is created directly on the device that holds the scores; the notebook-level `device`
# guess can differ from where the model actually put its activations, and a mismatch
# makes `masked_fill_` fail.
mask = t.triu(t.ones((seq, seq), dtype=bool, device=layer0_attn_scores.device), diagonal=1)
# A large negative score becomes (numerically) zero attention after the softmax.
layer0_attn_scores.masked_fill_(mask, -1e9)
layer0_pattern_from_q_and_k = (layer0_attn_scores / headsize**0.5).softmax(-1)

t.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_q_and_k)
print("Tests passed!")

Tests passed!


In [45]:
# Visualization of the layer-0 attention patterns.

print(type(gpt2_cache))

# Gather the two inputs of the plot: the cached pattern and the matching string tokens.
attention_pattern = gpt2_cache["pattern", 0]
gpt2_str_tokens = gpt2small.to_str_tokens(gpt2_text)
print(attention_pattern.shape)

print("Layer 0 Head Attention Patterns:")
display(
    cv.attention.attention_patterns(
        attention=attention_pattern,
        tokens=gpt2_str_tokens,
    )
)

<class 'transformer_lens.ActivationCache.ActivationCache'>
torch.Size([12, 33, 33])
Layer 0 Head Attention Patterns:


Funksjoner for å detektere bestemte attention heads: 

In [46]:


def _heads_scoring_above(cache: ActivationCache, score_fn, threshold: float) -> list[str]:
    '''
    Shared scan behind the three detectors: labels every head whose score exceeds `threshold`.

    cache: activations of one forward pass. cache["pattern", layer] is indexed by head first,
        so the cache is assumed to have no batch dimension (e.g. `remove_batch_dim=True`).
    score_fn: callable mapping one head's (query position, key position) attention pattern
        to a scalar score.
    threshold: a head is reported only when its score is strictly greater than this value.

    Returns "layer.head" strings ordered by layer, then by head.
    '''
    # Read the dimensions from the model attached to the cache instead of a notebook global,
    # so the scan always matches the model that produced these activations.
    cfg = cache.model.cfg
    return [
        f"{layer}.{head}"
        for layer in range(cfg.n_layers)
        for head in range(cfg.n_heads)
        if score_fn(cache["pattern", layer][head]) > threshold
    ]

def current_attn_detector(cache: ActivationCache, threshold: float = 0.4) -> list[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be current-token heads

    A head qualifies when the mean of its pattern's main diagonal (each position attending
    to itself) is greater than `threshold`.
    '''
    return _heads_scoring_above(cache, lambda pattern: pattern.diagonal().mean(), threshold)

def prev_attn_detector(cache: ActivationCache, threshold: float = 0.4) -> list[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be prev-token heads

    A head qualifies when the mean of its pattern's first sub-diagonal (each position
    attending to the one just before it) is greater than `threshold`.
    '''
    return _heads_scoring_above(cache, lambda pattern: pattern.diagonal(-1).mean(), threshold)

def first_attn_detector(cache: ActivationCache, threshold: float = 0.4) -> list[str]:
    '''
    Returns a list e.g. ["0.2", "1.4", "1.9"] of "layer.head" which you judge to be first-token heads

    A head qualifies when the mean of its pattern's column 0 (every position attending to
    the first token) is greater than `threshold`.
    '''
    return _heads_scoring_above(cache, lambda pattern: pattern[:, 0].mean(), threshold)


# Run each detector on the cached activations and print its heads as a comma-separated list.
for description, detector in (
    ("Heads attending to current token  = ", current_attn_detector),
    ("Heads attending to previous token = ", prev_attn_detector),
    ("Heads attending to first token    = ", first_attn_detector),
):
    print(description, ", ".join(detector(gpt2_cache)))

Heads attending to current token  =  0.1, 0.3, 0.4, 0.5, 1.11, 4.7
Heads attending to previous token =  2.2, 3.7, 4.11
Heads attending to first token    =  1.3, 1.6, 1.9, 2.1, 2.11, 3.0, 3.1, 3.3, 3.4, 3.5, 3.9, 3.10, 4.0, 4.1, 4.2, 4.4, 4.5, 4.6, 4.8, 4.9, 4.10, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 5.10, 5.11, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 6.10, 6.11, 7.0, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 7.10, 7.11, 8.0, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 8.10, 8.11, 9.0, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 9.10, 9.11, 10.0, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 10.10, 10.11, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.9, 11.10, 11.11
