In [1]:
%load_ext autoreload
%autoreload 2

import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer

### Some configs

In [2]:
# Hugging Face model id of the checkpoint under study. Exactly one assignment
# is active; the alternatives stay commented out so switching is a one-line edit.
# MODEL_NAME = 'gpt2'
MODEL_NAME = 'microsoft/phi-2'
# MODEL_NAME = 'EleutherAI/pythia-1B'
# MODEL_NAME = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# Forwarded to load_gpt_model_and_tokenizer as its second argument.
LOAD_IN_8BIT = False
# Prefix prepended to the data/ and output/ paths used by this notebook.
RELATIVE_PATH = '../'

# Base name (without extension) of the JSON file read from {RELATIVE_PATH}data/.
dataset_name = 'country-capital'
# select number of ICL examples (query excluded)
ICL_examples = 4
# When True, only the first 50 dataset entries are kept.
debug = True

### Dataset

In [3]:
import json

def load_json_dataset(json_path):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        json_path: path of the JSON file to open.

    Returns:
        Whatever object ``json.load`` produces for the file.
    """
    with open(json_path, encoding='utf-8') as handle:
        return json.load(handle)

# Every loaded record exposes `.values()`; keep just those values, as a tuple.
dataset = [
    tuple(record.values())
    for record in load_json_dataset(f'{RELATIVE_PATH}data/{dataset_name}.json')
]

# With `debug` set, continue with the first 50 entries only.
if debug:
    dataset = dataset[:50]

print(f'dataset len: {len(dataset)}')

dataset len: 50


In [4]:
import sys
# Put the parent directory on the import path; this must happen before the
# `src.*` imports below, which are resolved relative to it.
sys.path.append('..')
# Turn off gradient tracking globally for the rest of the session.
torch.set_grad_enabled(False)

from src.utils.model_utils import load_gpt_model_and_tokenizer, set_seed
from src.extraction import get_mean_activations
from src.utils.prompt_helper import tokenize_ICL
from src.intervention import compute_indirect_effect
# NOTE(review): presumably seeds the RNGs for reproducibility — confirm in src.utils.model_utils.
set_seed(32)

In [5]:
# Load the checkpoint selected in the config cell; the helper hands back the
# model together with its tokenizer, a config mapping and the target device.
model, tokenizer, config, device = load_gpt_model_and_tokenizer(
    MODEL_NAME,
    LOAD_IN_8BIT,
)

# Tokenize `dataset` with the configured number of ICL examples.
# NOTE(review): the exact meaning of the three returned values lives in
# src.utils.prompt_helper — confirm there.
tok_ret, ids_ret, correct_labels = tokenize_ICL(
    tokenizer,
    ICL_examples=ICL_examples,
    dataset=dataset,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Get activations and measure each head's importance

In [15]:
from nnsight import LanguageModel

# Smoke test: wrap the configured checkpoint with nnsight and save every
# layer's self-attention output for a dummy prompt.
# MODEL_NAME is used instead of a hard-coded id so this model cannot drift
# from the tokenizer/config loaded earlier. The module path below assumes the
# wrapped model exposes `model.layers[i].self_attn`.
model = LanguageModel(MODEL_NAME, trust_remote_code=True, device_map='auto')

with model.generate(max_new_tokens=1) as runner:
    with runner.invoke("blah"):
        # One saved handle per decoder layer; read them via `.value` once the
        # generation context has exited.
        layer_attn_activations = [
            model.model.layers[layer_num].self_attn.output.save()
            for layer_num in range(len(model.model.layers))
        ]

print(layer_attn_activations[0].value)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(tensor([[[-0.7236,  0.7874,  0.1998,  ..., -0.1322,  0.7554, -0.5354],
         [-0.5455,  0.9391,  0.2884,  ..., -0.1260,  0.2966, -0.1629]]],
       device='cuda:0'), None, <transformers.cache_utils.DynamicCache object at 0x7f72924e00a0>)


In [17]:
# Take the first tokenized prompt and move it to the model's device before
# entering the tracing context.
prompt = tok_ret[0].to(device)

with model.generate(max_new_tokens=1, pad_token_id=tokenizer.pad_token_id) as generator:
    with generator.invoke(prompt):
        # Individually saved handles, kept for spot checks.
        vv0 = model.model.layers[0].self_attn.output.save()
        vv1 = model.model.layers[1].self_attn.output.save()
        vv2 = model.model.layers[2].self_attn.output.save()
        # Last layer gets its own name: it used to be assigned to `vv2`, which
        # silently discarded the layer-2 handle. The index is derived from the
        # config instead of the hard-coded 31.
        vv_last = model.model.layers[config['n_layers'] - 1].self_attn.output.save()

        # Saved handle for every layer's self-attention output.
        # NOTE(review): an earlier comment here reported
        # "AttributeError: 'NoneType' object has no attribute 'to_legacy_cache'"
        # when saving inside a loop; confirm whether it still reproduces.
        # A config-driven lookup (config['layer_name'] / config['attn_name']
        # resolved with functools.reduce(getattr, ...)) was also tried and
        # abandoned with the same error.
        layer_attn_activations = [
            model.model.layers[layer_num].self_attn.output.save()
            for layer_num in range(config['n_layers'])
        ]

print(vv0.value)

(tensor([[[-0.0508, -0.3839,  0.9552,  ..., -0.7086,  0.4718, -0.1021],
         [-0.0082, -0.0214,  0.5937,  ..., -0.3852,  0.3731, -0.0517],
         [-0.4863, -0.0433,  0.7666,  ..., -0.1964,  0.2062,  0.1382],
         ...,
         [ 0.1964,  0.0670, -0.0347,  ..., -0.0307,  0.1573,  0.1318],
         [-0.0805,  0.1059,  0.0777,  ..., -0.0269,  0.0157, -0.1853],
         [ 0.1935,  0.0671, -0.0260,  ..., -0.1173,  0.1092, -0.2673]]],
       device='cuda:0'), None, <transformers.cache_utils.DynamicCache object at 0x7f97cc0cbbe0>)


In [19]:
# Compute the mean activations over the tokenized ICL prompts.
# `device` comes from load_gpt_model_and_tokenizer and is the same value the
# prompts are moved to elsewhere in this notebook, so it replaces the
# previously hard-coded 'cuda'.
# NOTE(review): the last recorded run of this cell failed with
# "AttributeError: 'tuple' object has no attribute 'shape'"; the module outputs
# printed above are tuples, so get_mean_activations may need to unpack them —
# confirm in src.extraction.
mean_activations = get_mean_activations(
    tokenized_prompts=tok_ret,
    important_ids=ids_ret,
    tokenizer=tokenizer,
    model=model,
    config=config,
    correct_labels=correct_labels,
    device=device,
)

# Persist the result under output/, with '/' in the model id replaced by '-'
# so the id is usable as part of a file name.
mean_activations_path = f'{RELATIVE_PATH}output/{dataset_name}_mean_activations_{MODEL_NAME.replace("/", "-")}.pt'
torch.save(mean_activations, mean_activations_path)
mean_activations.shape

[x] Extracting activations (layer: 31/32):   0%|          | 0/10 [00:01<?, ?it/s]


AttributeError: 'tuple' object has no attribute 'shape'

In [None]:
# Run the indirect-effect computation with the mean activations from the
# previous step; it yields the effect values plus the original and edited
# probabilities.
cie, probs_original, probs_edited = compute_indirect_effect(
    model=model,
    tokenizer=tokenizer,
    config=config,
    dataset=dataset,
    mean_activations=mean_activations,
    ICL_examples=ICL_examples,
    batch_size=15,
)

# Write the effect values to output/, keyed by dataset and model id.
cie_path = f'{RELATIVE_PATH}output/{dataset_name}_cie_{MODEL_NAME.replace("/", "-")}.pt'
torch.save(cie, cie_path)

In [None]:
# Reload the tensor written by the compute_indirect_effect cell so the figure
# can be redrawn without recomputing. The path is built from the same config
# values used when saving; the previous hard-coded 'sentiment_cie_gpt2.pt'
# pointed at a different dataset and model than the ones configured above.
cie = torch.load(f'{RELATIVE_PATH}output/{dataset_name}_cie_{MODEL_NAME.replace("/", "-")}.pt')

import plotly.express as px

# Heatmap of the indirect effect averaged over the first dimension.
fig = px.imshow(
    cie.mean(dim=0),
    title=f'Mean CIE: {dataset_name} / {MODEL_NAME}',
)
fig.show()