In [1]:
%load_ext autoreload
%autoreload 2

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face model identifier; passed below to both AutoTokenizer.from_pretrained
# and load_gpt_model_and_tokenizer.
model_name = 'gpt2'

# prompt management

In [3]:
# (capital city, country) pairs. Later cells unpack each pair as (query, response)
# when formatting prompts, and pass the whole tuple to tokenize_ICL.
dataset = (
    ('Rome', 'Italy'),
    ('Madrid', 'Spain'),
    ('Amsterdam', 'Netherlands'),
    ('Moscow', 'Russia'),
    ('Nairobi', 'Kenya'),
    ('Paris', 'France'),
    ('Tehran', 'Iran'),
    ('Tokyo', 'Japan'),
    ('Warsaw', 'Poland'),
    ('Ottawa', 'Canada'),
    ('Oslo', 'Norway'),
    ('Lisbon', 'Portugal'),
    ('Helsinki', 'Finland'),
    ('Havana', 'Cuba'),
    ('Doha', 'Qatar'),
    ('Damascus', 'Syria'),
    ('Canberra', 'Australia'),
    ('Cairo', 'Egypt'),
    ('Bern', 'Switzerland'),
    ('Berlin', 'Germany'),
    ('Beijing', 'China'),
    ('Athens', 'Greece'),
    ('Ankara', 'Turkey'),
)

# models

In [82]:
import sys

# Put the parent directory on the module search path so the `src` package can be
# imported. Guarded so that re-running this cell does not append a duplicate
# entry every time.
if '..' not in sys.path:
    sys.path.append('..')

# Inference only: turn autograd off globally for all following cells.
torch.set_grad_enabled(False)

from src.utils.model_utils import load_gpt_model_and_tokenizer, set_seed, get_submodule
from src.extraction import split_activation
from src.utils.prompt_helper import build_prompt_txt, tokenize_from_template, tokenize_ICL

# Project helper; presumably seeds the random number generators for
# reproducibility -- TODO confirm which libraries it covers.
set_seed(32)

In [5]:
# Load the pretrained tokenizer that matches `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Project helper; as called here it yields two values, the model and a config.
# Later cells index the config like a mapping (config['n_layer']) and read
# `model.tokenizer`, so the model appears to carry its own tokenizer as well --
# TODO confirm against src.utils.model_utils.
model, config = load_gpt_model_and_tokenizer(model_name)
# Print the module tree and the config for inspection.
print(model)
print(config)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2AttentionAltered(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
          (query): WrapperModule()
          (key): WrapperModule()
          (value): WrapperModule()
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
{'

Calculate the average of the activations over the dataset to capture any pattern that is task-dependent rather than token-dependent.

Activations have shape (num_layers, num_heads, seq_len, d_head), each one being the output of an OV circuit.

Still to answer:
- How is this connected to the residual stream?

Trying out [nnsight](https://github.com/JadenFiotto-Kaufman/nnsight) and exploring how it works.

In [83]:
# example prompt
# Hand-written template: a list of (text, kind) pairs, where kind is either
# 'structural' (Q / : / newline / A scaffolding) or 'sentence' (free text).
# NOTE(review): this variable is not referenced again in the visible cells.
promtp_w_template = [
    ('Q', 'structural'),
    (':', 'structural'),
    ('Hello my name is daniel', 'sentence'),
    ('\n', 'structural'),
    ('A', 'structural'),
    (':', 'structural'),
    ('This is a response', 'sentence'),
]


# Build a template of the same (text, kind) form with the project helper.
# Three queries but only two answers are given, so the last query is left open
# and the result ends with a bare '\nA:' (see the displayed output).
prompt2 = build_prompt_txt(
    queries=['one one one', 'three three three', 'five five five'], answers=['two', 'four']
)
prompt2


[('Q:', 'structural'),
 ('one one one', 'sentence'),
 ('\nA:', 'structural'),
 ('two', 'sentence'),
 ('\n\n', 'structural'),
 ('Q:', 'structural'),
 ('three three three', 'sentence'),
 ('\nA:', 'structural'),
 ('four', 'sentence'),
 ('\n\n', 'structural'),
 ('Q:', 'structural'),
 ('five five five', 'sentence'),
 ('\nA:', 'structural')]

In [84]:
# prompt template:
# the indices of all structural tokens are saved; for sentence tokens only the last index is saved



# `promtp_w_template` (sic) is the keyword name the helper is called with here.
# The call returns the token ids of the full prompt and the list of saved indices.
full_tokenized, indexes = tokenize_from_template(tokenizer=tokenizer, promtp_w_template=prompt2)
print(full_tokenized)
print(indexes)

tensor([50256,    48,    25,   505,   530,   530,   198,    32,    25, 11545,
          628,    48,    25, 15542,  1115,  1115,   198,    32,    25, 14337,
          628,    48,    25, 13261,  1936,  1936,   198,    32,    25])
[0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, 28]


In [85]:


# create template based on ICL_examples

# params: dataset, ICL_examples

# create template
# split
# store tokenized and indexes from tokenize_from_template

# select number of ICL examples
ICL_examples = 2

# Pass the setting defined above instead of repeating the literal, so that
# changing ICL_examples actually changes the call.
tok_ret, ids_ret = tokenize_ICL(tokenizer, ICL_examples=ICL_examples, dataset=dataset)


___


In [None]:
# some queries k=1 ICL

# Each prompt is a single formatted Q/A pair taken from the dataset.
template = 'Q:{query}\nA:{response}\n'

tot_prompts = [
    template.format(query=query, response=response)
    for query, response in dataset
]

# The stray `AutoTokenizer.from_pretrained(model_name)` call that used to sit here
# was removed: its result was discarded and never displayed, and the tokenizer is
# already loaded in an earlier cell.


def extract_activations(tot_prompts, model, config):
    """Collect per-layer attention projection outputs for every prompt.

    Each prompt is run through ``model.generate(max_new_tokens=3)``. Inside the
    invocation the output of ``attn.c_proj`` is saved for each of the
    ``config['n_layer']`` transformer blocks.

    Args:
        tot_prompts: iterable of prompts, each passed to ``generator.invoke``.
        model: traced model exposing ``generate`` and ``transformer.h``.
        config: mapping with at least ``'n_layer'``; also forwarded to
            ``split_activation``.

    Returns:
        A pair ``(dataset_activations, outputs)``. The first is a list with one
        tensor per prompt (the result of ``split_activation`` permuted with
        ``(0, 2, 1, 3)``); the second is a list of each run's ``generator.output``.
    """
    dataset_activations = []
    outputs = []

    for prompt in tot_prompts:
        with model.generate(max_new_tokens=3) as generator:
            # operations inside invoke are tracked within the generation context
            with generator.invoke(prompt) as invoker:
                saved_per_layer = [
                    model.transformer.h[layer_idx].attn.c_proj.output.save()
                    for layer_idx in range(config['n_layer'])
                ]
        outputs.append(generator.output)

        # resolve the saved handles into their concrete values
        per_layer_values = [saved.value for saved in saved_per_layer]
        per_head = split_activation(per_layer_values, config)
        # n_layers, tokens, n_heads, d_head -> n_layers, n_heads, tokens, d_head
        dataset_activations.append(per_head.permute(0, 2, 1, 3))

    return dataset_activations, outputs

# Run the extraction over every prompt, then display the shape of the
# activations for the first prompt.
activations, outputs = extract_activations(tot_prompts=tot_prompts, model=model, config=config)
activations[0].shape

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


KeyboardInterrupt: 

In [None]:
# Decode a hard-coded list of token ids back to text with the model's tokenizer.
model.tokenizer.decode([40313,    25,  4881,    11, 11307,    25,  4486,    11,  6342])

'Paris: France, Berlin: Germany, Paris'

In [None]:
# Shape of the activations for the second prompt
# (n_layers, n_heads, tokens, d_head after the permute in extract_activations).
activations[1].shape

torch.Size([12, 12, 6, 64])

Calculate mean activations

In [None]:
prompt = "Rome: Italy, Amsterdam: Netherlands, Lisbon: Portugal, Madrid:"

# Plain forward pass: nothing is modified inside the context, only the output is read.
with model.invoke(prompt) as invoker:
    pass # no changes to make in the forward pass
logits = invoker.output.logits
logits = logits[0,-1,:] # only over the final token
probs = logits.softmax(dim=-1)

# Compute the top-10 once and reuse both fields, instead of calling topk twice.
top_k = probs.topk(10)
for tok, prob in zip(top_k.indices, top_k.values):
    print(f"p({model.tokenizer.decode(tok)}) = {prob:.3f}")

p( Spain) = 0.900
p( Portugal) = 0.028
p( Madrid) = 0.024
p( Italy) = 0.014
p( Argentina) = 0.005
p( Mexico) = 0.003
p( France) = 0.002
p( Uruguay) = 0.002
p( Brazil) = 0.002
p( Real) = 0.001
