In [1]:
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Configuration

In [2]:
# Checkpoint to analyse. Identifiers this notebook has been pointed at:
#   'gpt2', 'microsoft/phi-2', 'EleutherAI/pythia-1B'
# Keep a single assignment so the active model is unambiguous; stacking several
# assignments means only the last one ever takes effect.
MODEL_NAME = 'EleutherAI/pythia-1B'

# Passed as the second argument of load_gpt_model_and_tokenizer.
LOAD_IN_8BIT = False
# Prefix prepended to the data/ and output/ paths used by later cells.
RELATIVE_PATH = '../'

### Dataset

In [3]:
import json

def load_json_dataset(json_path):
    """Read a JSON file and return its parsed content.

    Args:
        json_path: path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    # JSON text is UTF-8 by specification. Without an explicit encoding,
    # open() falls back to the platform's locale default, which can garble or
    # reject non-ASCII entries on some systems.
    with open(json_path, encoding='utf-8') as file:
        return json.load(file)

# Flatten every JSON record into a tuple of its values (in the record's own
# key order), which is the form the rest of the notebook passes around.
dataset = [
    tuple(record.values())
    for record in load_json_dataset(f'{RELATIVE_PATH}data/antonym.json')
]
print(f'dataset len: {len(dataset)}')

dataset len: 2398


In [4]:
import sys

# Make the parent directory importable so the project's `src` package resolves.
# Guarded so that re-running this cell does not keep appending duplicates.
if '..' not in sys.path:
    sys.path.append('..')

# Turn gradient tracking off from here on.
torch.set_grad_enabled(False)

from src.utils.model_utils import load_gpt_model_and_tokenizer, set_seed
from src.extraction import get_mean_activations
from src.utils.prompt_helper import tokenize_ICL
from src.intervention import replace_heads_w_avg, compute_indirect_effect

# Seed through the project helper (which RNGs it covers is defined in
# src.utils.model_utils, not visible here).
set_seed(32)

In [5]:
# load_gpt_model_and_tokenizer returns its own tokenizer, and that object is
# the one every later cell uses. A tokenizer built separately beforehand would
# be discarded by this assignment, together with any pad-token fix applied to
# it, so the fallback below is applied to the returned tokenizer instead.
model, tokenizer, config = load_gpt_model_and_tokenizer(MODEL_NAME, LOAD_IN_8BIT)

# Fall back to EOS for padding only when no pad token is defined. Compare with
# None explicitly: 0 is a valid token id and must not be treated as "missing".
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Number of in-context demonstrations per prompt; the query is not counted.
ICL_examples = 4
tok_ret, ids_ret, correct_labels = tokenize_ICL(
    tokenizer,
    ICL_examples=ICL_examples,
    dataset=dataset,
)

## Get activations and measure heads' importance

In [7]:
# Resolve the destination first and make sure its directory exists, so the
# multi-minute extraction below cannot be lost to a failing torch.save.
mean_activations_path = f'{RELATIVE_PATH}output/mean_activations_{MODEL_NAME.replace("/", "-")}.pt'
os.makedirs(os.path.dirname(mean_activations_path), exist_ok=True)

mean_activations = get_mean_activations(
    tokenized_prompts=tok_ret,
    important_ids=ids_ret,
    tokenizer=tokenizer,
    model=model,
    config=config,
    correct_labels=correct_labels,
)
torch.save(mean_activations, mean_activations_path)

# Displayed as the cell's output.
mean_activations.shape

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
extracting activations:   0%|          | 0/479 [00:00<?, ?it/s]You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
extracting activations: 100%|██████████| 479/479 [02:24<00:00,  3.30it/s]


Model accuracy: 0.07, using 35 example to compute mean activations


torch.Size([16, 8, 39, 256])

In [11]:
# Resolve the destination first and make sure its directory exists: this is a
# long-running cell, and a missing output/ folder would otherwise only surface
# at torch.save, after all the work is done.
cie_path = f'{RELATIVE_PATH}output/cie_{MODEL_NAME.replace("/", "-")}.pt'
os.makedirs(os.path.dirname(cie_path), exist_ok=True)

cie, probs_original, probs_edited = compute_indirect_effect(
    model=model,
    tokenizer=tokenizer,
    config=config,
    dataset=dataset,
    mean_activations=mean_activations,
    ICL_examples=ICL_examples,
    batch_size=15,
)
torch.save(cie, cie_path)

total prompts: 479


Processing edited model (l: 7, h: 2):   3%|▎         | 1/32 [04:41<1:38:27, 190.56s/it] 

In [None]:
import plotly.express as px

# Average `cie` over its first axis, then hand plotly a host-side NumPy array:
# px.imshow converts its input through NumPy, which cannot read a tensor that
# lives on an accelerator. The float32 cast is there because NumPy has no
# bfloat16 dtype.
# NOTE(review): rows/columns presumably correspond to layers/heads -- confirm
# against compute_indirect_effect before adding axis labels.
mean_cie = cie.mean(dim=0).detach().float().cpu().numpy()

fig = px.imshow(mean_cie, title=f'Mean indirect effect ({MODEL_NAME})')
fig.show()