In [1]:
import os, re, json
import torch, numpy as np

import sys
sys.path.append('..')
torch.set_grad_enabled(False)

import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM
import os
import json
import random
from typing import *
import pandas as pd
from statsmodels.stats.proportion import proportion_confint

from src.utils.extract_utils import get_mean_head_activations, compute_universal_function_vector
from src.utils.intervention_utils import fv_intervention_natural_text, function_vector_intervention
#from src.utils.model_utils import load_gpt_model_and_tokenizer
from src.utils.prompt_utils import load_dataset, word_pairs_to_prompt_data, create_prompt
from src.utils.eval_utils import decode_to_vocab, sentence_eval

In [2]:
def load_gpt_model_and_tokenizer(model_name:str, device='cuda'):
    """
    Loads a huggingface model and its tokenizer

    Parameters:
    model_name: huggingface name of the model to load (e.g. GPTJ: "EleutherAI/gpt-j-6B", or "EleutherAI/gpt-j-6b")
    device: 'cuda' or 'cpu'

    Returns:
    model: huggingface model
    tokenizer: huggingface tokenizer
    MODEL_CONFIG: config variables w/ standardized names

    """
    assert model_name is not None

    print("Loading: ", model_name)

    if 'gpt-j' in model_name.lower():
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True).to(device)

        MODEL_CONFIG={"n_heads":model.config.n_head,
                      "n_layers":model.config.n_layer,
                      "resid_dim":model.config.n_embd,
                      "name_or_path":model.config.name_or_path,
                      "attn_hook_names":[f'transformer.h.{layer}.attn.out_proj' for layer in range(model.config.n_layer)],
                      "layer_hook_names":[f'transformer.h.{layer}' for layer in range(model.config.n_layer)],
                      "prepend_bos":False}

    elif 'gpt2' in model_name.lower():
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained("gpt2")

        MODEL_CONFIG={"n_heads":model.config.n_head,
                      "n_layers":model.config.n_layer,
                      "resid_dim":model.config.n_embd,
                      "name_or_path":model.config.name_or_path,
                      "attn_hook_names":[f'transformer.h.{layer}.attn.out_proj' for layer in range(model.config.n_layer)],
                      "layer_hook_names":[f'transformer.h.{layer}' for layer in range(model.config.n_layer)],
                      "prepend_bos":False}

    elif 'gpt-neo-125m' in model_name.lower():
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name)

        MODEL_CONFIG={"n_heads":model.config.num_heads,
                      "n_layers":model.config.num_layers,
                      "resid_dim": model.config.hidden_size,
                      "name_or_path":model.config.name_or_path,
                      "attn_hook_names":[f'gpt_neo.layers.{layer}.attention.dense' for layer in range(model.config.num_layers)],
                      "layer_hook_names":[f'gpt_neo.layers.{layer}' for layer in range(model.config.num_layers)],
                      "prepend_bos":False}

    elif 'gpt-neox' in model_name.lower() or 'pythia' in model_name.lower():
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)

        MODEL_CONFIG={"n_heads":model.config.num_attention_heads,
                      "n_layers":model.config.num_hidden_layers,
                      "resid_dim": model.config.hidden_size,
                      "name_or_path":model.config.name_or_path,
                      "attn_hook_names":[f'gpt_neox.layers.{layer}.attention.dense' for layer in range(model.config.num_hidden_layers)],
                      "layer_hook_names":[f'gpt_neox.layers.{layer}' for layer in range(model.config.num_hidden_layers)],
                      "prepend_bos":False}

    elif 'llama' in model_name.lower():
        if '70b' in model_name.lower():
            # use quantization. requires `bitsandbytes` library
            from transformers import BitsAndBytesConfig
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.float16
            )

            access_token = "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"

            tokenizer = LlamaTokenizer.from_pretrained(model_name, token=access_token)
            model = LlamaForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                quantization_config=bnb_config,
                token=access_token
            )
        else:
            if '7b' in model_name.lower() or '8b' in model_name.lower():
                model_dtype = torch.float32
            else: #half precision for bigger llama models
                #This becomes only for the 13B model then. Okay then. What else?
                model_dtype = torch.float16

            # If transformers version is < 4.31 use LlamaLoaders
            # tokenizer = LlamaTokenizer.from_pretrained(model_name)
            # model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype).to(device)

            # If transformers version is >= 4.31, use AutoLoaders
            access_token = "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"

            tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
            model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, token=access_token).to(device)

        MODEL_CONFIG={"n_heads":model.config.num_attention_heads,
                      "n_layers":model.config.num_hidden_layers,
                      "resid_dim":model.config.hidden_size,
                      "name_or_path":model.config._name_or_path,
                      "attn_hook_names":[f'model.layers.{layer}.self_attn.o_proj' for layer in range(model.config.num_hidden_layers)],
                      "layer_hook_names":[f'model.layers.{layer}' for layer in range(model.config.num_hidden_layers)],
                      "prepend_bos":True}
    else:
        raise NotImplementedError("Still working to get this model available!")


    return model, tokenizer, MODEL_CONFIG

In [3]:
model_name = 'EleutherAI/gpt-j-6b'
#model_name = 'meta-llama/Llama-2-7b-hf'
#model_name = 'EleutherAI/gpt-neox-20b'
model, tokenizer, model_config = load_gpt_model_and_tokenizer(model_name)
EDIT_LAYER = 9

Loading:  EleutherAI/gpt-j-6b


tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Some weights of the model checkpoint at EleutherAI/gpt-j-6b were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi

In [4]:
dataset = load_dataset('succ_letterstring_basic', seed=0)
mean_activations = get_mean_head_activations(dataset, model, model_config, tokenizer)

In [11]:
dataset['train']['input']

['[q r s t]',
 '[i j k l]',
 '[n o p q]',
 '[n o p q]',
 '[o p q r]',
 '[t u v w]',
 '[n o p q]',
 '[f g h i]',
 '[d e f g]',
 '[l m n o]',
 '[b c d e]',
 '[b c d e]',
 '[l m n o]',
 '[a b c d]',
 '[s t u v]',
 '[f g h i]',
 '[h i j k]',
 '[j k l m]',
 '[p q r s]',
 '[e f g h]',
 '[f g h i]',
 '[j k l m]',
 '[g h i j]',
 '[u v w x]',
 '[l m n o]',
 '[a b c d]',
 '[i j k l]',
 '[p q r s]',
 '[e f g h]',
 '[q r s t]',
 '[q r s t]',
 '[s t u v]',
 '[u v w x]',
 '[c d e f]',
 '[f g h i]',
 '[c d e f]',
 '[s t u v]',
 '[l m n o]',
 '[p q r s]',
 '[c d e f]',
 '[n o p q]',
 '[m n o p]',
 '[q r s t]',
 '[q r s t]',
 '[u v w x]',
 '[s t u v]',
 '[e f g h]',
 '[j k l m]',
 '[a b c d]',
 '[q r s t]',
 '[l m n o]',
 '[q r s t]',
 '[g h i j]',
 '[q r s t]',
 '[i j k l]',
 '[o p q r]',
 '[g h i j]',
 '[a b c d]',
 '[f g h i]',
 '[j k l m]',
 '[b c d e]',
 '[h i j k]',
 '[q r s t]',
 '[t u v w]',
 '[p q r s]',
 '[g h i j]',
 '[h i j k]',
 '[c d e f]',
 '[u v w x]',
 '[a b c d]',
 '[s t u v]',
 '[e f

In [5]:
FV, top_heads = compute_universal_function_vector(mean_activations, model, model_config, n_top_heads=10)

In [6]:
FV

tensor([[ 0.0794,  0.2792, -0.3621,  ..., -0.5107,  0.0333,  0.7581]],
       device='cuda:0')

In [7]:
dataset_next = load_dataset('next_item', seed=0)
mean_activations_next = get_mean_head_activations(dataset_next, model, model_config, tokenizer)

In [8]:
FV_next, top_heads_next = compute_universal_function_vector(mean_activations_next, model, model_config, n_top_heads=10)

In [9]:
FV_next

tensor([[ 0.4660, -0.6311, -0.0582,  ...,  0.2695,  0.9046, -0.6693]],
       device='cuda:0')

In [10]:
FV

tensor([[ 0.0794,  0.2792, -0.3621,  ..., -0.5107,  0.0333,  0.7581]],
       device='cuda:0')

In [11]:
import torch.nn.functional as F

In [12]:
cos_sim = F.cosine_similarity(FV, FV_next, dim=1)
print("Cosine Similarity:", cos_sim.item())

Cosine Similarity: 0.49959123134613037


In [13]:
FV_next = F.normalize(FV_next, p=2, dim=1)
FV = F.normalize(FV, p=2, dim=1)

In [14]:
cos_sim = torch.sum(FV * FV_next, dim=1)
print("Cosine similarity (after normalization):", cos_sim.item())

Cosine similarity (after normalization): 0.49959123134613037


In [15]:
def main():
    models = {
        #'gptneo': 'EleutherAI/gpt-neo-125m',
        #'gpt2': 'gpt2',
        'gptj6b': 'EleutherAI/gpt-j-6b',
        'llama27b': 'meta-llama/Llama-2-7b-hf',
        'llama213b': 'meta-llama/Llama-2-13b-hf',
        'gptneox20b': 'EleutherAI/gpt-neox-20b',
        'llama270b': 'meta-llama/Llama-2-70b-hf'
    }

    edit_layers = {
        'gptj6b': 9,
        'gptneox20b': 15,
        'llama27b': 11,
        'llama213b': 14,
        'llama270b': 26
    }

    for model_name, model_technical_name in models.items():
        #model_technical_name = 'gpt2'
        if model_name in edit_layers:
            EDIT_LAYER = edit_layers[model_name]

        torch.cuda.empty_cache()  # Clear cache before loading new model
        model, tokenizer, model_config = load_gpt_model_and_tokenizer(model_technical_name)

        dataset_succ = load_dataset('succ_letterstring_basic', seed=0)
        mean_activations_succ = get_mean_head_activations(dataset_succ, model, model_config, tokenizer)

        FV_succ, top_heads_succ = compute_universal_function_vector(mean_activations_succ, model, model_config, n_top_heads=10)

        dataset_next = load_dataset('next_item', seed=0)
        mean_activations_next = get_mean_head_activations(dataset_next, model, model_config, tokenizer)

        FV_next, top_heads_next = compute_universal_function_vector(mean_activations_next, model, model_config, n_top_heads=10)

        cos_sim = F.cosine_similarity(FV_succ, FV_next, dim=1)
        print("Cosine Similarity " + model_name +": ", cos_sim.item())  

        del model, tokenizer, model_config
        torch.cuda.empty_cache()

        # Clear cache after evaluation

        # --- NEW: Delete ONLY this model's cache ---
        from transformers import file_utils
        import shutil
        import re

        # 1. Get model's cache folder name (convert "/" to "--")
        model_cache_name = f"models--{re.sub(r'/', '--', model_technical_name)}"
        cache_path = os.path.join(file_utils.default_cache_path, model_cache_name)

        # 2. Delete only this model's folder
        if os.path.exists(cache_path):
            print(f"Deleting model cache: {cache_path}")
            shutil.rmtree(cache_path, ignore_errors=True)

In [16]:
main()

Loading:  EleutherAI/gpt-j-6B


Some weights of the model checkpoint at EleutherAI/gpt-j-6B were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi

Cosine Similarity gptj6b:  0.5009585022926331
Loading:  meta-llama/Llama-2-7b-hf


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Cosine Similarity llama27b:  0.5167273283004761
Loading:  meta-llama/Llama-2-13b-hf


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Cosine Similarity llama213b:  0.5576171875
Loading:  EleutherAI/gpt-neox-20b


tokenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/457k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/60.4k [00:00<?, ?B/s]

Fetching 46 files:   0%|          | 0/46 [00:00<?, ?it/s]

model-00002-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00005-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00006-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00008-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00001-of-00046.safetensors:   0%|          | 0.00/926M [00:00<?, ?B/s]

model-00004-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00007-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00003-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00009-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00010-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00011-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00012-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00013-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00015-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00014-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00016-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00017-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00018-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00019-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00020-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00022-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00021-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00023-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00024-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00025-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00026-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00027-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00028-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00029-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00031-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00030-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00032-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00033-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00034-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00035-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00036-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00037-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00038-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00039-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00040-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00041-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00042-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00043-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00044-of-00046.safetensors:   0%|          | 0.00/910M [00:00<?, ?B/s]

model-00045-of-00046.safetensors:   0%|          | 0.00/604M [00:00<?, ?B/s]

model-00046-of-00046.safetensors:   0%|          | 0.00/620M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/46 [00:00<?, ?it/s]

Cosine Similarity gptneox20b:  0.65478515625
Loading:  meta-llama/Llama-2-70b-hf


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/66.7k [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

model-00002-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00006-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00004-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00007-of-00015.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00008-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00005-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00003-of-00015.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00001-of-00015.safetensors:   0%|          | 0.00/9.85G [00:00<?, ?B/s]

model-00009-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00010-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00011-of-00015.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00012-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00013-of-00015.safetensors:   0%|          | 0.00/9.80G [00:00<?, ?B/s]

model-00015-of-00015.safetensors:   0%|          | 0.00/524M [00:00<?, ?B/s]

model-00014-of-00015.safetensors:   0%|          | 0.00/9.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Cosine Similarity llama270b:  0.455078125


In [17]:
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()