In [1]:
from tqdm import tqdm
from einops import rearrange, reduce, repeat
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import pandas as pd
from tqdm import tqdm
from einops import rearrange
import numpy as np

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM
from datasets import load_dataset
import plotly.express as px

from model_wrappers import HFModelWrapper
from elk import ELK
from dataset import Prompts

%load_ext autoreload
%autoreload 2

## getting ccs to work

In [7]:
dataset_name = "imdb"
prompts = Prompts(dataset_name = dataset_name, N = 50, max_len = 512, random = False)

train_indices, test_indices = prompts.gen_train_test_indices(set_instance_vars = True, train_ratio = 0.5, test_ratio = 0.5)

Reusing dataset imdb (/home/ubuntu/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1054 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 50/50 [00:00<00:00, 640.49it/s]

31





In [11]:
elk = ELK("t5-3b")

x_plus_acts, x_minus_acts = elk.gen_hidden_states(prompts.dataset['x_plus'].tolist(), 
                                        prompts.dataset['x_minus'].tolist(), 
                                        [elk.mt.num_layers],
                                        store_acts = False,
                                        dataset_name = dataset_name)

100%|██████████| 50/50 [00:02<00:00, 17.85it/s]
100%|██████████| 50/50 [00:02<00:00, 18.32it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'activations/t5-3b/imdb_x_plus_activations_2022-12-21.pt'

In [9]:
# COLINS CODE

from sklearn.linear_model import LogisticRegression

y = prompts.dataset['label'].tolist()
x_plus_acts = x_plus_acts
x_minus_acts = x_minus_acts

# let's create a simple 50/50 train split (the data is already randomized)
n = len(y)
neg_hs_train, neg_hs_test = x_plus_acts[:n//2], x_plus_acts[n//2:]
pos_hs_train, pos_hs_test = x_minus_acts[:n//2], x_minus_acts[n//2:]
y_train, y_test = y[:n//2], y[n//2:]

# for simplicity we can just take the difference between positive and negative hidden states
# (concatenating also works fine)
x_train = neg_hs_train - pos_hs_train
x_test = neg_hs_test - pos_hs_test

lr = LogisticRegression(class_weight="balanced")
lr.fit(x_train, y_train)
print("Logistic regression accuracy: {}".format(lr.score(x_test, y_test)))

Logistic regression accuracy: 0.8


In [10]:
probe_type = "CCS"
loss = elk.train_probe(x_plus_acts[train_indices], x_minus_acts[train_indices], probe_type = probe_type)
print(f"Loss: {loss}")

probe_score_train = elk.score_probe(x_plus_acts[train_indices], x_minus_acts[train_indices], 
                            prompts.train['label'].tolist(),
                            probe_type = probe_type)

probe_score_test = elk.score_probe(x_plus_acts[test_indices], x_minus_acts[test_indices],
                            prompts.test['label'].tolist(),
                            probe_type = probe_type)

print(f"""{probe_type} {dataset_name} Train Score: {probe_score_train}\n{probe_type} {dataset_name} Test Score: {probe_score_test}\n""")


100%|██████████| 10/10 [00:15<00:00,  1.53s/it]

Loss: 0.0009273618925362825
CCS imdb Train Score: 0.52
CCS imdb Test Score: 0.5161290322580645






In [6]:
# train_indices = [0, 1, 2, 3, 4]
# test_indices = [5, 6, 7, 8, 9]

elk.train_probe(x_plus_acts[train_indices], x_minus_acts[train_indices], 
                labels = prompts.train['label'].tolist(),
                probe_type = "LR")

probe_score = elk.score_probe(x_plus_acts[test_indices], x_minus_acts[test_indices], 
                              labels = prompts.test['label'],
                              probe_type = "LR")

# zero_shot_score = elk.zero_shot_score(modifiedtqa['question'].tolist(), 
                                    #   modifiedtqa['label'].tolist())

print(f"Probe Score: {probe_score}")

Probe Score: 0.8888888888888888


## Performance across Model Size

In [16]:
dataset_name = "imdb"
prompts = Prompts(dataset_name = dataset_name, N = 100, max_len = 512, random = True)

train_indices, test_indices = prompts.gen_train_test_indices(set_instance_vars = True, train_ratio = 0.6, test_ratio = 0.4)

Reusing dataset imdb (/home/ubuntu/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1151 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 100/100 [00:00<00:00, 673.02it/s]

54





In [None]:
max([len(elk.mt.tokenizer(prompts.dataset.iloc[i]['x_plus']).input_ids) for i in range(100)])

In [25]:
# model_names = ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "EleutherAI/gpt-j-6B"]
# model_names = ["gpt2", "gpt2-medium"]
# model_names = ["allenai/unifiedqa-t5-small", "allenai/unifiedqa-t5-base", "allenai/unifiedqa-t5-large", "allenai/unifiedqa-t5-3b", "allenai/unifiedqa-t5-11b"]
# model_names = ["allenai/unifiedqa-t5-small", "allenai/unifiedqa-t5-base", "allenai/unifiedqa-t5-large"]
model_names = ["t5-small", "t5-3b"]
probe_type = "CCS"

for model_name in model_names:
    elk = ELK(model_name)

    x_plus_acts, x_minus_acts = elk.gen_hidden_states(prompts.dataset['x_plus'].tolist(), 
                                            prompts.dataset['x_minus'].tolist(), 
                                            [elk.mt.num_layers],
                                            store_acts = False,
                                            dataset_name = dataset_name)
    print(x_plus_acts[0])
    
    torch.cuda.empty_cache() 

    loss = elk.train_probe(x_plus_acts[train_indices], x_minus_acts[train_indices], probe_type = probe_type)
    print(f"Loss: {loss}")
    
    probe_score_train = elk.score_probe(x_plus_acts[train_indices], x_minus_acts[train_indices], 
                                prompts.train['label'].tolist(),
                                probe_type = probe_type)

    probe_score_test = elk.score_probe(x_plus_acts[test_indices], x_minus_acts[test_indices],
                                prompts.test['label'].tolist(),
                                probe_type = probe_type)

    zero_shot_score = elk.zero_shot_score(prompts.dataset['text'].tolist(), 
                                        prompts.dataset['label'],
                                        dataset_name)

    print(f"""{probe_type} {dataset_name} Train Score: {probe_score_train}\n{probe_type} {dataset_name} Test Score: {probe_score_test}\nZero Shot Score: {zero_shot_score}\n""")
    

 61%|██████    | 61/100 [00:00<00:00, 73.67it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 100/100 [00:01<00:00, 71.72it/s]
100%|██████████| 100/100 [00:01<00:00, 79.01it/s]


[ 8.71672854e-03  4.02482506e-03 -9.44943912e-03  9.40153599e-02
  3.66733819e-02 -2.72779688e-02  3.06446198e-02 -3.58712524e-02
  2.37806570e-02 -3.95216644e-02 -1.21268183e-02  6.07908107e-02
  2.55455542e-02 -2.38106716e-02  3.99518088e-02 -4.67830189e-02
  5.96576445e-02 -4.59869057e-02  5.69171458e-02  6.35251701e-02
  3.26392353e-02  6.47233874e-02  2.01658644e-02  4.46532592e-02
 -7.43814185e-02 -3.78017686e-02 -1.97887607e-02  4.14290093e-02
  4.43106629e-02 -3.94877434e-01 -1.53967403e-02  3.73045392e-02
  1.11579355e-02 -6.36513457e-02  5.75111248e-03  3.60372253e-02
 -5.95039967e-03 -2.49459986e-02 -4.50531843e-05  3.99346389e-02
  1.66725889e-02 -6.37608096e-02  2.58907694e-02  8.36172104e-02
 -6.44278973e-02  4.01740037e-02  2.66935918e-02 -9.39908437e-03
 -4.68534641e-02  4.28349562e-02  2.03884840e-02  1.39389215e-02
  1.74548174e-03  3.07658110e-02  1.64276194e-02  1.31416768e-02
  3.49155534e-03  1.57460403e-02 -2.83670751e-03 -5.36880717e-02
 -2.51044054e-02 -1.55030

100%|██████████| 10/10 [00:14<00:00,  1.50s/it]


Loss: 0.00256867497228086


100%|██████████| 100/100 [00:01<00:00, 51.99it/s]
100%|██████████| 100/100 [00:01<00:00, 62.82it/s]


CCS imdb Train Score: 0.5
CCS imdb Test Score: 0.5636363636363637
Zero Shot Score: 0.44



 61%|██████    | 61/100 [00:03<00:01, 20.21it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 100/100 [00:05<00:00, 17.63it/s]
100%|██████████| 100/100 [00:05<00:00, 17.62it/s]


[-0.12709308 -0.04529818 -0.10413766 ... -0.01617076 -0.05885235
 -0.00737287]


100%|██████████| 10/10 [00:15<00:00,  1.54s/it]


Loss: 0.001593934721313417


100%|██████████| 100/100 [00:06<00:00, 15.06it/s]
100%|██████████| 100/100 [00:06<00:00, 16.41it/s]

CCS imdb Train Score: 0.5166666666666666
CCS imdb Test Score: 0.5818181818181818
Zero Shot Score: 0.53






In [18]:
# train_indices = [0, 1, 2, 3, 4]
# test_indices = [5, 6, 7, 8, 9]
# probe_type = "CCS"
dataset_name = "imdb"
# model_name = "gpt2-medium"
np.random.seed(None)
# elk = ELK("t5-large")
for model_name in model_names:
    # elk = ELK(model_name)
    
    x_plus_acts = torch.load(f"activations/{model_name}/{dataset_name}_x_plus_activations_2022-12-21.pt")
    x_minus_acts = torch.load(f"activations/{model_name}/{dataset_name}_x_minus_activations_2022-12-21.pt")
        
    elk.train_probe(x_plus_acts[train_indices], x_minus_acts[train_indices], 
                    labels = prompts.train['label'].tolist(),
                    probe_type = "LR")

    probe_score = elk.score_probe(x_plus_acts[test_indices], x_minus_acts[test_indices], 
                                labels = prompts.test['label'].tolist(),
                                probe_type = "LR")

    # zero_shot_score = elk.zero_shot_score(modifiedtqa['question'].tolist(), 
                                        #   modifiedtqa['label'].tolist())

    print(f"Probe Score: {probe_score}")

Probe Score: 0.5636363636363636
Probe Score: 0.7636363636363637
Probe Score: 0.7636363636363637


In [22]:
model_name = "t5-small"
dataset_name = "imdb"
probe_type = "CCS"
from probes import CCS


ccs = CCS()
x_plus_acts = torch.load(f"activations/{model_name}/{dataset_name}_x_plus_activations_2022-12-21.pt")
x_minus_acts = torch.load(f"activations/{model_name}/{dataset_name}_x_minus_activations_2022-12-21.pt")

loss = ccs.fit(x_plus_acts[train_indices], x_minus_acts[train_indices])

print(f"Loss: {loss}")

probe_score_train = ccs.score(x_plus_acts[train_indices], x_minus_acts[train_indices], 
                            prompts.train['label'].tolist(),
                            )

probe_score_test = ccs.score(x_plus_acts[test_indices], x_minus_acts[test_indices],
                            prompts.test['label'].tolist(),
)

print(f"""{probe_type} {dataset_name} Train Score: {probe_score_train}\n{probe_type} {dataset_name} Test Score: {probe_score_test}\nZero Shot Score: {zero_shot_score}\n""")



100%|██████████| 10/10 [00:15<00:00,  1.54s/it]

Loss: 0.0030268258415162563
CCS imdb Train Score: 0.5
CCS imdb Test Score: 0.5636363636363637
Zero Shot Score: 0.56






In [10]:
model_name_to_CCS = {}

# model_names = ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "EleutherAI/gpt-j-6B"]
# model_names = ["allenai/unifiedqa-t5-small", "allenai/unifiedqa-t5-base", "allenai/unifiedqa-t5-large", "allenai/unifiedqa-t5-3b", "allenai/unifiedqa-t5-11b"]

probe_type = "CCS"
dataset_name = "imdb"

for model_name in model_names:
    print(model_name)
    
    # elk = ELK(model_name)

    x_plus_acts = torch.load(f"activations/{model_name}/{dataset_name}_x_plus_activations_2022-12-21.pt")
    x_minus_acts = torch.load(f"activations/{model_name}/{dataset_name}_x_minus_activations_2022-12-21.pt")

    loss = elk.train_probe(x_plus_acts[train_indices], x_minus_acts[train_indices], probe_type = probe_type)
    print(f"Loss: {loss}")
    
    probe_score_train = elk.score_probe(x_plus_acts[train_indices], x_minus_acts[train_indices], 
                                prompts.train['label'].tolist(),
                                probe_type = probe_type)

    probe_score_test = elk.score_probe(x_plus_acts[test_indices], x_minus_acts[test_indices],
                                prompts.test['label'].tolist(),
                                probe_type = probe_type)

    zero_shot_score = elk.zero_shot_score(prompts.dataset['text'].tolist(), 
                                        prompts.dataset['label'],
                                        dataset_name)

    print(f"""{probe_type} {dataset_name} Train Score: {probe_score_train}\n{probe_type} {dataset_name} Test Score: {probe_score_test}\nZero Shot Score: {zero_shot_score}\n""")
    
    model_name_to_CCS[model_name] = {
                                     "elk": elk,
                                     "loss": loss,
                                     "train_score": probe_score_train,
                                     "test_score": probe_score_test,
                                     "zero_shot_score": zero_shot_score
                                     }
    

t5-small


100%|██████████| 10/10 [00:15<00:00,  1.54s/it]


Loss: 0.0015449990751221776


100%|██████████| 100/100 [00:01<00:00, 62.56it/s]
100%|██████████| 100/100 [00:01<00:00, 66.42it/s]


CCS imdb Train Score: 0.5833333333333333
CCS imdb Test Score: 0.5740740740740741
Zero Shot Score: 0.43

t5-base


100%|██████████| 10/10 [00:15<00:00,  1.50s/it]


Loss: 0.0008160027209669352


100%|██████████| 100/100 [00:03<00:00, 32.21it/s]
100%|██████████| 100/100 [00:02<00:00, 34.56it/s]


CCS imdb Train Score: 0.5833333333333333
CCS imdb Test Score: 0.5740740740740741
Zero Shot Score: 0.44

t5-large


100%|██████████| 10/10 [00:15<00:00,  1.52s/it]


Loss: 0.0007618818199262023


100%|██████████| 100/100 [00:06<00:00, 16.60it/s]
100%|██████████| 100/100 [00:05<00:00, 17.11it/s]

CCS imdb Train Score: 0.5833333333333334
CCS imdb Test Score: 0.5740740740740741
Zero Shot Score: 0.56






In [None]:
zero_shot_scores = [model_name_to_CCS[model_name]['zero_shot_score'] for model_name in model_names]
test_scores = [model_name_to_CCS[model_name]['test_score'] for model_name in model_names]
train_scores = [model_name_to_CCS[model_name]['train_score'] for model_name in model_names]

scores = pd.DataFrame(np.vstack([zero_shot_scores, test_scores, train_scores]).T, columns = ["zero_shot_score", "test_score", "train_score"],
                        index = model_names)

px.line(scores, x = model_names, y = ["zero_shot_score", "train_score", "test_score"], title = f"{probe_type} Performance on {dataset_name}")


## Miscellaneous

In [None]:
dataset_name = "imdb"
prompts = Prompts(dataset_name = dataset_name, N = 100, max_len = 512)

train_indices, test_indices = prompts.gen_train_test_indices(set_instance_vars = True)

In [None]:
model_names = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]
model_names = ["allenai/unifiedqa-t5-small", "allenai/unifiedqa-t5-base", "allenai/unifiedqa-t5-large", "allenai/unifiedqa-t5-3b", "allenai/unifiedqa-t5-11b"]
elk = ELK(model_names[0])

yes_acts, no_acts = elk.gen_hidden_states(prompts.dataset['x_plus'].tolist(), 
                                        prompts.dataset['x_minus'].tolist(), 
                                        [elk.mt.num_layers/ 2],
                                        store_acts = False,
                                        dataset_name = dataset_name)

In [None]:
elk.train_probe(yes_acts, no_acts, probe_type = "CCS")

probe_score = elk.score_probe(yes_acts, no_acts, 
                              torch.tensor(prompts.dataset['x_plus_true']),
                              probe_type = "CCS")

zero_shot_score = elk.zero_shot_score(prompts.dataset['text'].tolist(), 
                                      prompts.dataset['label'],
                                      dataset_name = "imdb")

print(f"Probe Score: {probe_score} \nZero Shot Score: {zero_shot_score}")

In [None]:


elk.train_probe(yes_acts, no_acts, 
                labels = modifiedtqa['label'].tolist(),
                probe_type = "LR")

probe_score = elk.score_probe(yes_acts, no_acts, 
                              labels = modifiedtqa['label'].tolist(),
                              probe_type = "LR")

# zero_shot_score = elk.zero_shot_score(modifiedtqa['question'].tolist(), 
                                    #   modifiedtqa['label'].tolist())

print(f"Probe Score: {probe_score} \nZero Shot Score: {zero_shot_score}")