In [91]:
%load_ext autoreload
%autoreload 2

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [324]:
# Model identifier; reused below for both AutoTokenizer.from_pretrained and the project model loader.
model_name = 'gpt2'

# prompt management

In [325]:
import json

def load_json_dataset(json_path):
    """Read a JSON file and return its parsed content.

    Args:
        json_path: path of the JSON file to read.

    Returns:
        The deserialized top-level JSON value (e.g. a list of dicts).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_path, encoding='utf-8') as file:
        return json.load(file)

# Each loaded record exposes .values(); keep those values, in order, as one tuple per record.
dataset = [
    tuple(record.values())
    for record in load_json_dataset('../data/antonym.json')
]
print(f'dataset len: {len(dataset)}')
dataset[:10]

dataset len: 2398


[('flawed', 'perfect'),
 ('orthodox', 'unorthodox'),
 ('true', 'false'),
 ('daily', 'nightly'),
 ('distribution', 'concentration'),
 ('valid', 'invalid'),
 ('expand', 'contract'),
 ('practical', 'impractical'),
 ('privilege', 'disadvantage'),
 ('mammoth', 'tiny')]

# models

In [326]:
import sys
# Add the parent directory to the import path so the `src` package imported below resolves.
sys.path.append('..')
# Turn autograd off globally; no cell shown here computes gradients.
torch.set_grad_enabled(False)

from src.utils.model_utils import load_gpt_model_and_tokenizer, set_seed, get_submodule
from src.extraction import split_activation, extract_activations, get_mean_activations
from src.utils.prompt_helper import build_prompt_txt, tokenize_from_template, tokenize_ICL
# Fix the seed for reproducibility.
# NOTE(review): which RNGs set_seed covers is defined in src.utils.model_utils — confirm there.
set_seed(32)

In [327]:
# The tokenizer comes straight from transformers; the model and its config come from the
# project loader (the printed model below shows a patched attention class, GPT2AttentionAltered).
# NOTE(review): despite its name, load_gpt_model_and_tokenizer is unpacked as (model, config) here.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model, config = load_gpt_model_and_tokenizer(model_name)
print(model)
print(config)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2AttentionAltered(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
          (query): WrapperModule()
          (key): WrapperModule()
          (value): WrapperModule()
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
{'

Calculate the average of the activations over the dataset to capture patterns that are task-dependent rather than token-dependent.

Activations have shape `(num_layers, num_heads, seq_len, d_head)`; each `(layer, head)` slice is the output of that head's OV circuit.

Still to answer:
- How is this connected to the residual stream?

Trying out [nnsight](https://github.com/JadenFiotto-Kaufman/nnsight) and exploring how it works.

In [255]:
# select number of ICL examples
ICL_examples = 2

tok_ret, ids_ret, labels = tokenize_ICL(tokenizer, ICL_examples = ICL_examples, dataset = dataset)
print(tokenizer.decode(tok_ret[0]))
print(labels[0])
print()
print(tok_ret[0])
print(ids_ret[0])

<|endoftext|>Q:flawed
A:perfect

Q:orthodox
A:unorthodox

Q:true
A:
false

tensor([50256,    48,    25,  2704, 36825,   198,    32,    25, 25833,   628,
           48,    25, 42539,   198,    32,    25,   403, 42539,   628,    48,
           25,  7942,   198,    32,    25])
[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24]


___


In [152]:
# Average the activations over the tokenized ICL prompts built above.
# NOTE(review): the shape and meaning of both return values are defined in src.extraction — confirm there.
mean_activation, outputs = get_mean_activations(
    tokenized_prompts=tok_ret,
    important_ids=ids_ret,
    model=model,
    config=config,
)
mean_activation.shape

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


compute indirect effect

In [234]:
import random

def randomize_dataset(dataset):
    """Pair every input with a randomly reassigned label, keeping the true label alongside.

    The labels (second element of each entry) are shuffled with the global
    ``random`` generator and re-attached to the inputs, which keep their
    original order. A shuffled label may coincide with the correct one by
    chance; nothing here prevents that.

    Args:
        dataset: sequence of entries whose first two elements are ``(input, label)``.

    Returns:
        A list of ``(input, shuffled_label, correct_label)`` tuples, one per entry.
    """
    correct_labels = [entry[1] for entry in dataset]
    inputs = [entry[0] for entry in dataset]

    # Shuffle a copy so the correct labels stay aligned with their inputs.
    shuffled_labels = list(correct_labels)
    random.shuffle(shuffled_labels)

    return [
        (word, wrong_label, right_label)
        for word, wrong_label, right_label in zip(inputs, shuffled_labels, correct_labels)
    ]

invalid
<|endoftext|>Q:daily
A:empty

Q:distribution
A:con

Q:valid
A:


In [331]:
# parameters: dataset, mean_activations, model, config, tokenizer, 
# n_shots: Number of shots in each in-context prompt
# n_trials: Number of in-context prompts to average over


def compute_indirect_effect(dataset, mean_activation, model, config, tokenizer, n_shots=None):
    """Work-in-progress: causal indirect effect (CIE) of every attention head.

    Currently implemented: the dataset labels are shuffled so the in-context
    examples carry no usable signal, each resulting prompt is run through the
    unmodified model, and the next-token probabilities are collected.
    The per-head intervention (replacing a head's output with its mean
    activation) is still a placeholder.

    Args:
        dataset: sequence of ``(input, label)`` pairs.
        mean_activation: mean activations to patch in. Not used yet; it is
            reserved for the per-head intervention.
        model: nnsight-wrapped model exposing ``invoke``.
        config: dict with at least ``'vocab_size'``, ``'n_layer'`` and ``'n_heads'``.
        tokenizer: tokenizer forwarded to ``tokenize_ICL``.
        n_shots: number of in-context examples per prompt. When ``None`` the
            notebook-level ``ICL_examples`` is used (previous behaviour).

    Returns:
        ``(tokenized_prompt, important_ids)`` of the first randomized prompt,
        which the following cells use for inspection.
    """
    if n_shots is None:
        n_shots = ICL_examples  # fall back to the notebook-level setting

    # randomize prompts to make the model unable to guess the correct answer
    randomized_dataset = randomize_dataset(dataset)

    tokenized_prompts, important_ids_per_prompt, correct_labels = tokenize_ICL(
        tokenizer,
        ICL_examples = n_shots,
        dataset = randomized_dataset,
    )

    # probabilities over vocab from the original model (one row per prompt)
    probs_original = torch.zeros([len(randomized_dataset), config['vocab_size']])

    for idx, (tokenized_prompt, important_ids, correct_label) in enumerate(
        zip(tokenized_prompts, important_ids_per_prompt, correct_labels)
    ):
        # Baseline: run the *current* prompt through the unmodified model.
        with model.invoke(tokenized_prompt) as invoker:
            pass  # no changes to make in the forward pass
        logits = invoker.output.logits
        logits = logits[0, -1, :]  # select only the predicted token (i.e. the final token)
        # store the probabilities for each token in vocab
        probs_original[idx] = logits.softmax(dim=-1)

        # for each layer i, for each head j in the model
        for layer_i in range(config['n_layer']):
            for head_j in range(config['n_heads']):
                # TODO: substitute the head with the specific average activation
                pass

        # CIE(ij) = probability of correct_label token y (w/ modified model) - probability of correct_label token y (w/ original model)
        #      e.g. CIE(ij) = 0.9 - 0.1 = 0.8      head has great effect
        #      e.g. CIE(ij) = 0.3 - 0.1 = 0.2      head does not influence too much the output

    # Return local results rather than relying on notebook globals.
    return tokenized_prompts[0], important_ids_per_prompt[0]


    
# Run on the first 200 pairs only; the returned prompt is fed to model.invoke in the next cell.
rand_tok_ret, rand_ids_ret  = compute_indirect_effect(
    dataset=dataset[:200], 
    mean_activation=mean_activation,
    model=model,
    config=config,
    tokenizer=tokenizer,
)

In [344]:

# Plain forward pass on the randomized prompt to inspect the next-token distribution.
with model.invoke(rand_tok_ret) as invoker:
    pass # no changes to make in the forward pass

logits = invoker.output.logits
print(logits.shape)
logits = logits[0,-1,:] # select only the predicted token (the final token)
print(logits.shape)

# Keep the distribution in `probs`: the following cells read it.
probs = logits.softmax(dim=-1)
print(probs.shape)
predicted = probs.argmax()


torch.Size([1, 24, 50257])
torch.Size([50257])
torch.Size([50257])


In [354]:
# Decode the arg-max entry of `probs` (expected to be the next-token distribution from the previous cell) back to text.
tokenizer.decode(probs.argmax())

'in'

In [346]:
# Index of the arg-max entry of `probs`.
probs.argmax()

tensor(259, device='mps:0')

In [284]:
# Shape check for a per-head table: one entry per (layer, head) pair, read from the config.
torch.zeros(size = [config['n_layer'], config['n_heads']]).shape

torch.Size([12, 12])