In [1]:
%load_ext autoreload
%autoreload 2

import torch
import transformers
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = 'gpt2'

# prompt management

In [3]:
import json

def load_json_dataset(json_path):
    with open(json_path) as file:
        dataset = json.load(file)
    return dataset

dataset = load_json_dataset('../data/antonym.json')
dataset = list(map(lambda x: tuple(x.values()), dataset))
print(f'dataset len: {len(dataset)}')


####### TODO Delete #######
dataset = dataset[:100]

dataset len: 2398


# models

In [4]:
import sys
sys.path.append('..')
torch.set_grad_enabled(False)

from src.utils.model_utils import load_gpt_model_and_tokenizer, set_seed, rsetattr, rgetattr
from src.extraction import split_activation, extract_activations, get_mean_activations
from src.utils.prompt_helper import build_prompt_txt, tokenize_from_template, tokenize_ICL
set_seed(32)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model, config = load_gpt_model_and_tokenizer(model_name)
print(model)
print(config)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2AttentionAltered(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
          (query): WrapperModule()
          (key): WrapperModule()
          (value): WrapperModule()
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
{'

Calculate the average of the activation on the dataset to grasp any pattern that is task-dependant and not token-dependant 

Activations have the following shape (num_layers, num_heads, seq_len, d_head), being each OV circuit output.

Still to answer:
- How this is connected to the residual stream?

Trying to use [nnsight](https://github.com/JadenFiotto-Kaufman/nnsight) and explore to know how it works

In [6]:
# select number of ICL examples
ICL_examples = 2

###### TODO: REMEMBER TO SPLIT IN TRAIN and TEST####

tok_ret, ids_ret, labels = tokenize_ICL(tokenizer, ICL_examples = ICL_examples, dataset = dataset)
print(tokenizer.decode(tok_ret[0]))
print(labels[0])
print()
print(tok_ret[0])
print(ids_ret[0])

<|endoftext|>Q:flawed
A:perfect

Q:orthodox
A:unorthodox

Q:true
A:
false

tensor([50256,    48,    25,  2704, 36825,   198,    32,    25, 25833,   628,
           48,    25, 42539,   198,    32,    25,   403, 42539,   628,    48,
           25,  7942,   198,    32,    25])
[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24]


___


In [7]:
mean_activations, outputs = get_mean_activations(
    tokenized_prompts=tok_ret,
    important_ids=ids_ret,
    model=model,
    config=config,
)
mean_activations.shape

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
extracting activations:   0%|          | 0/33 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
extracting activations: 100%|██████████| 33/33 [00:14<00:00,  2.23it/s]


torch.Size([12, 12, 23, 64])

compute indirect effect

In [8]:
import random

def randomize_dataset(dataset):
    # shuffle the second column (labels) and copy the original column to a third one keeping the correct label
    shuffled = list(map(lambda x: x[1], dataset))
    random.shuffle(shuffled)

    new_dataset = list(
        zip(
            list(map(lambda x: x[0], dataset)),     # input x
            shuffled,     # new shuffled label (that make no sense)
            list(map(lambda x: x[1], dataset)),     # old correct label
    ))
    
    return new_dataset

In [9]:
# parameters: dataset, mean_activations, model, config, tokenizer, 
# n_shots: Number of shots in each in-context prompt
# n_trials: Number of in-context prompts to average over


def compute_indirect_effect(dataset, mean_activations, model, config, tokenizer):
    # randomize prompts to make the model unable to guess the correct answer
    randomized_dataset = randomize_dataset(dataset)

    prompts_and_infos = tokenize_ICL(
        tokenizer, 
        ICL_examples = ICL_examples, 
        dataset = randomized_dataset
    )

    # probabilities over vocab from the original model
    probs_original = torch.zeros([len(randomized_dataset), config['vocab_size']])
    probs_edited = torch.zeros([len(randomized_dataset), config['n_layers'], config['n_heads'], config['vocab_size']])

    for idx, (tokenized_prompt, important_ids, correct_label) in enumerate(zip(*prompts_and_infos)):
        
        # take the original result from the model (probability of correct response token y)
        with model.invoke(rand_tok_ret) as invoker:
            pass # no changes to make in the forward pass
        logits = invoker.output.logits
        logits = logits[0,-1,:] # select only the predicted token (i.e. the final token)
        # store the probabilities for each token in vocab
        probs_original[idx] = logits.softmax(dim=-1)

        # for each layer i, for each head j in the model
        for layer_i in config['n_layers']:
            for head_j in config['n_head']:
                # TODO: controlla che questo assegnamento non sia un problema 
                probs_edited[idx, layer_i, head_j] = replace_heads_w_avg(
                    layers_heads = [layer_i, head_j],
                    avg_activations = 
                    
                )
        #      substitute the head with the specific average activation
        
        # CIE(ij) = probability of correct_label token y (w/ modified model) - probability of correct_label token y (w/ original model)
        #      e.g. CIE(ij) = 0.9 - 0.1 = 0.8      head has great effect
        #      e.g. CIE(ij) = 0.3 - 0.1 = 0.2      head does not influence too much the output


    
    prob_original_model = 0
    return rand_tok_ret[0], rand_ids_ret[0]


    
rand_tok_ret, rand_ids_ret  = compute_indirect_effect(
    dataset=dataset[:200], 
    mean_activation=mean_activations,
    model=model,
    config=config,
    tokenizer=tokenizer,
)

SyntaxError: invalid syntax (3463502032.py, line 38)

In [23]:
def replace_heads_w_avg(tokenized_prompt: torch.tensor, important_ids: list[int], layers_heads: list[tuple[int, int]], avg_activations: list[torch.tensor], model, config):
    """Replace the activation of specific heads (listed in layers_heads) with the avg_activation for each specific head (listed in avg_activations). 
    Than compute the output of the model with all the new activations

    Args:
        tokenized_prompt (torch.tensor): tokenized prompt (single batch)
        important_ids (list[int]): list of important indexes i.e. the tokens where the average must be substituted
        layers_heads (list[tuple[int, int]]): list of tuples each containing a layer index, head index
        avg_activations (list[torch.tensor]): list of activations (`size: (seq_len, d_head)`) for each head listed in layers_heads. The length must be the same of layers_heads
        model (): model
        config (): model's config

    Returns:
        torch.Tensor: model's probabilities over vocab (post-softmax) with the replaced activations
    """

    assert len(layers_heads) == len(avg_activations), f'layers_heads and avg_activations must have the same length. Got {len(layers_heads)} and {len(avg_activations)}'


    d_head = config['d_model'] // config['n_heads']

    with model.invoke(tokenized_prompt) as invoker:
        for idx, (num_layer, num_head) in enumerate(layers_heads):

            hidden_state_pre = rgetattr(model, config['attn_hook_names'][num_layer]).output.clone()
            rgetattr(model, config['attn_hook_names'][num_layer]).output[
                :, :, (num_head * d_head) : ((num_head + 1) * d_head)                # select the head (output shape is torch.Size([batch, seq_len, d_model]))
            ][:, important_ids, :] = avg_activations[idx].unsqueeze(0)          # substitute only the important indexes (unsqueeze for adding the batch dimension)
            
            hidden_state_post = rgetattr(model, config['attn_hook_names'][num_layer]).output.save()
        # store the output probabilities
    probs = invoker.output.logits[0,-1,:].softmax(dim=-1)
    return probs, hidden_state_pre, hidden_state_post


tokenized = tokenizer('I am not', return_tensors='pt')['input_ids']
important_ids = np.arange(len(tokenized[0]))       # tutto il prompt

b, pre, post = replace_heads_w_avg(
    tokenized_prompt=tokenized,
    important_ids=[important_ids],
    layers_heads=[(x,y) for x in range(0,12) for y in range(0,12)],       # all heads of the first layer
    avg_activations=[
        torch.zeros(size=(len(tokenized[0]), config['d_model'] // config['n_heads'])) 
        for _ in range(0, 12 * 12)
    ],      # zero-out layer
    model=model,
    config=config,
)
print(f'B: Predicted token id {b.argmax()}, wich corresponds to "{tokenizer.decode(b.argmax())}" [prob.: {b[b.argmax()]:.3f}]')

B: Predicted token id 1865, wich corresponds to " yet" [prob.: 0.048]


In [26]:
pre.value

ValueError: Accessing Proxy value before it's been set.

In [25]:
torch.equal(pre.value, post.value)

ValueError: Accessing Proxy value before it's been set.

In [22]:
with model.invoke('The Eiffel Tower is in the city of') as invoker:
        hidden_states_pre = model.transformer.h[-1].output[0][:].clone().save()
        noise = (0.001**0.5)*torch.randn(hidden_states_pre.shape)
        model.transformer.h[-1].output[0][:] = hidden_states_pre + noise
        hidden_states_post = model.transformer.h[-1].output[0][:].save()

print(hidden_states_pre.value)
print(hidden_states_post.value)
torch.equal(hidden_states_pre.value, hidden_states_post.value)

tensor([[[ 0.0505, -0.1728, -0.1690,  ..., -1.0096,  0.1280, -1.0687],
         [ 8.7495,  2.9057,  5.3024,  ..., -8.0418,  1.2964, -2.8677],
         [ 0.2960,  4.6686, -3.6642,  ...,  0.2391, -2.6064,  3.2263],
         ...,
         [ 2.1537,  6.8917,  3.8651,  ...,  0.0588, -1.9866,  5.9188],
         [-0.4460,  7.4285, -9.3065,  ...,  2.0528, -2.7947,  0.5556],
         [ 6.6286,  1.7258,  4.7969,  ...,  7.6714,  3.0682,  2.0481]]])
tensor([[[ 0.0746, -0.1995, -0.1431,  ..., -1.0120,  0.0801, -1.1020],
         [ 8.7936,  2.9738,  5.2735,  ..., -8.0207,  1.2858, -2.8587],
         [ 0.3195,  4.6480, -3.6528,  ...,  0.2489, -2.6361,  3.1970],
         ...,
         [ 2.1745,  6.8751,  3.8343,  ...,  0.0260, -2.0636,  5.9154],
         [-0.4704,  7.4231, -9.2473,  ...,  1.9874, -2.8278,  0.6054],
         [ 6.6429,  1.6878,  4.8316,  ...,  7.6684,  3.0681,  2.1042]]])


False