### Imports

In [1]:
from analyze_results import *
from getting_examples import *
from predict_activations import *
from model_utils import *
from utils import *
import json
import pprint

In [2]:
# Load the sparse autoencoder and the language model used by every later cell.
# The recorded cell output reports gpt2-small being loaded into a HookedTransformer.
sae, model = load_sae_and_model()



Loaded pretrained model gpt2-small into HookedTransformer


In [3]:
# Process the 100-feature subset directory against the full organized directory.
# With num_neg_others=10 the recorded output logs, per feature file, that 10
# "other_negative" activations were gathered.
# NOTE(review): the exact effect of recompute=False / re_sort=False is defined in
# the helper module and is not visible here - confirm before changing them.
recompute_directory_activations(
    '6-res-jb_subset_100',
    'gpt2-small-organized/6-res-jb',
    model,
    sae,
    recompute=False,
    re_sort=False,
    num_neg_others=10,
)

2350.json gathered 10 other_negative activations
23251.json gathered 10 other_negative activations
11000.json gathered 10 other_negative activations
13627.json gathered 10 other_negative activations
21896.json gathered 10 other_negative activations
6332.json gathered 10 other_negative activations
2534.json gathered 10 other_negative activations
11263.json gathered 10 other_negative activations
8913.json gathered 10 other_negative activations
9532.json gathered 10 other_negative activations
5128.json gathered 10 other_negative activations
5157.json gathered 10 other_negative activations
15162.json gathered 10 other_negative activations
12092.json gathered 10 other_negative activations
19697.json gathered 10 other_negative activations
15570.json gathered 10 other_negative activations
21983.json gathered 10 other_negative activations
3230.json gathered 10 other_negative activations
20780.json gathered 10 other_negative activations
10322.json gathered 10 other_negative activations
7428.jso

In [None]:
# Inspect a single SAE feature: fetch its description plus two positive and
# two negative example sentences.
feat_id = 1

description, pos_examples, neg_examples, highest_activation = get_pos_neg_examples(
    feat_id,
    layer=6,
    basis='res-jb',
    num_pos=2,
    num_neg=2,
    neg_type='others',
    randomize_pos_examples=False,
)
print(description)

# One token index per sentence, positives first: the stored max-value token
# index shifted by one for positives, and a fixed index of 9 for negatives.
# NOTE(review): the reason for the +1 shift and for the constant 9 is not
# visible in this notebook - confirm against the helper modules.
max_indices = [example['max_value_token_index'] + 1 for example in pos_examples] + [9] * len(neg_examples)
strings = [example['sentence_string'] for example in pos_examples] + [example['sentence_string'] for example in neg_examples]

# Show each example sentence followed by its stored per-token values.
for pos in pos_examples:
    print(pos['sentence_string'], pos['values'], sep='\n')

for neg in neg_examples:
    print(neg['sentence_string'], neg['values'], sep='\n')

In [None]:
# Run the example strings through the model and SAE, keeping all three
# activation collections that the helper returns.
pre_acts, inner_acts, post_acts = get_sae_activations(model, sae, strings)

# For every string, print the column belonging to the selected feature.
for inner_act in inner_acts:
    print(np.array(inner_act)[:, feat_id])

In [None]:
# Baseline: the model's loss on each string.
regular_losses = get_vanilla_loss(model, sae, strings)
print(pretty_losses_fmt("Regular", strings, regular_losses))

# Same loss, but with the SAE-reconstructed activations substituted in.
sae_losses = get_vanilla_loss(model, sae, strings, with_sae_replacement=True)
print(pretty_losses_fmt("SAE", strings, sae_losses))

# Every feature ablated: a zero-filled structure mirroring the nesting of inner_acts.
precomputed_zeros = [
    [[0.0] * len(token_acts) for token_acts in seq_acts]
    for seq_acts in inner_acts
]
zeros_losses = get_recons_loss_from_predicted_values(model, sae, strings, precomputed_zeros)
print(pretty_losses_fmt("Zeros", strings, zeros_losses))

In [None]:
# Ablate only the selected feature: one replacement value of 0 per string,
# applied at the token positions listed in max_indices.
replacements = [0] * len(strings)
ablated_inner_acts = replace_max_feature_activation(inner_acts, feat_id, max_indices, replacements)
ablated_sae_losses = get_recons_loss_from_predicted_values(model, sae, strings, ablated_inner_acts)
print(pretty_losses_fmt(f"SAE feature {feat_id} ablated", strings, ablated_sae_losses))

# Show the selected feature's column after ablation, one array per string.
for inner_act in ablated_inner_acts:
    print(np.array(inner_act)[:, feat_id])

# Report how far the loss moved once the feature was ablated, rounded to 2 decimals.
difference = elementwise_difference(sae_losses, ablated_sae_losses)
rounded_difference = [[round(delta, 2) for delta in row] for row in difference]
print(rounded_difference)

In [None]:
# Draw a reproducible sample of 100 distinct ids from range(24576) and keep
# them as plain Python ints.
# NOTE(review): 24576 is presumably the number of SAE features - confirm.
np.random.seed(42)
indices = np.random.choice(24576, size=100, replace=False).tolist()
print(indices)
# Kept for reference (not executed):
# copy_files_by_list(indices, 'gpt2-small-organized/6-res-jb', '6-res-jb_subset_100')

### Run an experiment

In [None]:
results = run_experiments(
    # Which features to evaluate.
    # NOTE(review): num_features=2 is passed together with a 100-element
    # feature_ids list; how the two interact is not visible here - confirm.
    num_features=2,
    feature_ids=indices,
    layer=6,
    basis='res-jb',
    # Knobs intended for experimentation.
    test_pos=4,
    test_neg=4,
    show_pos=0,
    show_neg=0,
    neg_type='others',
    binary_class=False,
    show_max_token=False,
    num_completions=3,
    # Fixed settings for this run.
    all_tokens=True,
    randomize_pos=True,
    debug=True,
    save_location='test',
)

# run_experiments is described as saving its results automatically to
# results/exp_{timestamp}.json.
# NOTE(review): confirm how save_location='test' affects that path.
pprint.pprint(results)

In [None]:
# Load one feature's JSON record from the subset directory.
# The encoding is pinned to UTF-8 (the encoding JSON is exchanged in) so the
# read does not depend on the platform's locale default.
with open('6-res-jb_subset_100/428.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Print the nested structure of the loaded record.
print_json_tree(data)

### Do analysis on loaded json_data

In [None]:
# Saved experiment outputs for the binary and the continuous prediction runs.
binary_results_path = 'results/binary_test/exp_binary_others.json'
continuous_results_path = 'results/binary_test/exp_continuous_others.json'

json_data_binary = load_json_results(binary_results_path)
json_data_continuous = load_json_results(continuous_results_path)

In [None]:
# Collect the 'gpt_predictions' entry of every result record, in order.
binary_preds = [result['gpt_predictions'] for result in json_data_binary['results']]
continuous_preds = [result['gpt_predictions'] for result in json_data_continuous['results']]

In [None]:
# Score the binary predictions, requesting both the CDF and distribution plots.
accuracy = get_binary_accuracy(
    binary_preds,
    plot_cdf=True,
    plot_distribution=True,
)

In [None]:
# Build the accuracy descriptions for the binary run, with the pos/neg
# breakdown included and the result displayed.
accuracy_descs = get_accuracy_descs(
    json_data_binary,
    include_pos_neg=True,
    display=True,
)

In [None]:
# Re-save the organized model data for layer 6 in both the 'neurons' and
# 'res-jb' bases.
# NOTE(review): earlier cells read from 'gpt2-small-organized/6-res-jb'; if this
# call is what produces that directory it presumably belongs near the top of the
# notebook - confirm.
resave_organized_modeldata(
    autoencoder_layers=[6],
    autoencoder_bases=[
        'neurons',
        'res-jb',
    ],
)