# Loading and Analysing Pre-Trained Sparse Autoencoders

In [1]:
import sys 
import torch
import os
from tqdm import tqdm
sys.path.insert(0,"/home/joberant/NLP_2324b/erelbarzilay/conda_3/envs/new_env/lib/python3.12/site-packages")
from datasets import load_dataset  
from transformer_lens import HookedTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2Tokenizer
import pandas as pd
from datasets import Dataset
from utils import split_dataframe, get_frequency_df

sys.path.insert(0,"/vol/joberant_nobck/data/NLP_368307701_2324b/erelbarzilay/conda_3/envs/new_env/lib/python3.12/site-packages")

from sae_lens import SAE
# Pick the best available accelerator: Apple MPS first, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
    

  from .autonotebook import tqdm as notebook_tqdm


# Loading a pretrained Sparse Autoencoder

Below we load a TransformerLens model, a pretrained SAE, and a dataset from Hugging Face.

In [2]:
from datasets import load_dataset  
from transformer_lens import HookedTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2Tokenizer
import sys
sys.path.insert(0,"/vol/joberant_nobck/data/NLP_368307701_2324b/erelbarzilay/conda_3/envs/new_env/lib/python3.12/site-packages")

from sae_lens import SAE
# Force CPU for now; comment this line out once there is enough space to use
# the device chosen in the first cell.
device = "cpu"
model = HookedTransformer.from_pretrained("gpt2-small", device=device)
tokenizer = model.tokenizer

# SAE.from_pretrained returns three values:
#   * the SAE itself,
#   * the cfg dict as stored in the HF repo (not the SAE's own config dict, but
#     the SAE config can be extracted from it, e.g. to build an activation store),
#   * the feature sparsities, which are stored in HF for convenience.
sae_location = {
    "release": "gpt2-small-res-jb",  # see other options in sae_lens/pretrained_saes.yaml
    "sae_id": "blocks.0.hook_resid_pre",  # won't always be a hook point
}
sae, cfg_dict, sparsity = SAE.from_pretrained(device=device, **sae_location)




Loaded pretrained model gpt2-small into HookedTransformer


This SAE has non-empty model_from_pretrained_kwargs. 
For optimal performance, load the model like so:
model = HookedSAETransformer.from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)


[50256, 567, 75]


In [3]:
import pandas as pd
from datasets import Dataset
from utils import split_dataframe
# Preprocessing input: the "synonyms" column of data.csv as a plain Python list
# (one raw string per CSV row); gen() below reads it through the global `df`.
df = pd.read_csv("data.csv")["synonyms"].to_list()
syn_dictionary = {}
def gen():
    """Tokenize every word found in the global ``df`` list.

    Each entry of ``df`` is parsed purely by string slicing: the first and last
    characters are dropped, the remainder is split on ``","``, the text before
    the first ``":"`` of each piece is kept, and its surrounding whitespace plus
    one character on each side are removed.
    NOTE(review): this presumably targets strings such as
    ``"{'word': ..., 'other word': ...}"`` -- confirm against data.csv; words
    containing ``,`` or ``:`` would be split incorrectly.

    Returns:
        A list of ``[token_ids, word, n_tokens, dict_val, enumerator]`` rows,
        where ``dict_val`` is the 1-based index of the source entry in ``df``
        and ``enumerator`` is the 0-based running index of the row.
    """
    records = []
    enumerator = 0
    for dict_val, row in enumerate(df, start=1):
        for part in row[1:-1].split(","):
            word = part.split(":")[0].strip()[1:-1]
            # Tokenize once and reuse the ids for both the tokens and the length.
            token_ids = tokenizer(word)["input_ids"]
            records.append([token_ids, word, len(token_ids), dict_val, enumerator])
            enumerator += 1
    return records
# Column names are passed to the constructor so that an empty gen() result
# still yields a well-formed (empty) frame instead of failing on `.columns = ...`.
dataset = pd.DataFrame(
    gen(), columns=["tokens", "text", "len", "dict_val", "enumerator"]
)
print(dataset)

# lst[n - 1] holds the Dataset chunks for all words that tokenize to exactly
# n tokens (n = 1..9); rows with any other token count are not included.
# The loop variable is deliberately not called `df`: gen() reads the global
# `df`, and rebinding it here would break any later call to gen().
lst = []
for n_tokens in range(1, 10):
    same_length_rows = dataset[dataset["len"] == n_tokens]
    lst.append(
        [Dataset.from_pandas(chunk) for chunk in split_dataframe(same_length_rows)]
    )

                               tokens         text  len  dict_val  enumerator
0                   [13, 1828, 28139]  .22 caliber    3         1           0
1              [13, 1828, 27417, 260]  .22 calibre    4         1           1
2               [13, 1828, 12, 43288]  .22-caliber    4         1           2
3      [13, 1828, 12, 9948, 571, 260]  .22-calibre    6         1           3
4                   [13, 2548, 28139]  .38 caliber    3         2           4
...                               ...          ...  ...       ...         ...
40100                    [11195, 287]      home in    2      9236       40100
40101                     [9521, 287]     range in    2      9236       40101
40102                     [6603, 503]     pass out    2      9237       40102
40103                    [13424, 503]    black out    2      9237       40103
40104                [26361, 74, 503]     zonk out    3      9237       40104

[40105 rows x 5 columns]


## Basic Analysis

Let's check some basic stats on this SAE in order to see how some basic functionality in the codebase works.

We'll calculate:
- L0 (the number of features that fire per activation)
- The cross entropy loss when the output of the SAE is used in place of the activations

### L0 Test and Reconstruction Test

In [4]:
sae.eval()  # prevents error if we're expecting a dead neuron mask for who grads

# vals / features / sae_outs each hold one inner list per token length 1..9;
# every inner list gets one 2-D tensor per chunk, taken at sequence position i.
# With prepend_bos=True position 0 is the BOS token, so position i is the last
# token of an i-token word.
vals = []
features = []
sae_outs = []
for i in range(1, 10):
    vals.append([])
    features.append([])
    sae_outs.append([])
    for chunk in lst[i - 1]:
        with torch.no_grad():
            batch_tokens = chunk["text"]
            # Only the SAE's hook point is read below, so only that activation
            # is cached.
            _, cache = model.run_with_cache(
                batch_tokens, prepend_bos=True, names_filter=sae.cfg.hook_name
            )

            # Use the SAE
            val = cache[sae.cfg.hook_name]
            print(val.size())
            feature_acts = sae.encode(val)
            sae_out = sae.decode(feature_acts)

            # .clone() copies just the needed position. A plain slice is a view
            # that would keep the whole (batch, seq, d) tensor alive for as
            # long as the list entry exists.
            vals[i - 1].append(val[:, i, :].clone())
            features[i - 1].append(feature_acts[:, i, :].clone())
            sae_outs[i - 1].append(sae_out[:, i, :].clone())

            # save some room
            del cache, val, feature_acts, sae_out


torch.Size([1000, 2, 768])
torch.Size([1000, 2, 768])
torch.Size([1000, 2, 768])
torch.Size([1000, 2, 768])
torch.Size([1000, 2, 768])
torch.Size([937, 2, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([1000, 3, 768])
torch.Size([782, 3, 768])
torch.Size([1000, 4, 768])
torch.Size([1000, 4, 768])
torch.Size([1000, 4, 768])
torch.Size([1000, 4, 768])
torch.Size([1000, 4, 768])
torch.Size([1000, 4, 768])
torch.Size([1000, 4, 768])
torch.Size([1000, 4, 768])
torch.Size([1000, 4, 768])
torch.Size([1000, 4, 768])
torch.Size([1000, 4, 768])
torch.Size([540, 4, 768])
torc

In [9]:

def loss(x, x_hat):
    """Return the Euclidean (L2) distance between x_hat and x along the last axis."""
    squared_error = torch.square(x_hat - x)
    return torch.sqrt(torch.sum(squared_error, dim=-1))
# The batches in vals / sae_outs / features were produced group by group (one
# group per token length) from the chunks in `lst`, so their natural order is
# NOT the row order of `dataset`. Later cells use positional alignment
# (`dataset["loss"] = result`, `representations[row["enum1"]]`), so every row
# is written to the slot given by its chunk's "enumerator" value, which gen()
# assigns as the 0-based running row index of `dataset`.
n_rows = len(dataset)
result = None   # per-row reconstruction error, in dataset row order
rep_lst = None  # per-row SAE feature activations, in dataset row order
for group, lst_vals, lst_outs, lst_feats in zip(lst, vals, sae_outs, features):
    for chunk, value, out, feats in zip(group, lst_vals, lst_outs, lst_feats):
        row_ids = torch.as_tensor(
            [int(e) for e in chunk["enumerator"]], dtype=torch.long
        )
        if row_ids.numel() != value.shape[0]:
            raise ValueError(
                "chunk size and activation batch size disagree; "
                "re-run the cells that build `lst` and `vals`"
            )
        chunk_loss = loss(value, out).detach().cpu()
        chunk_feats = feats.detach().cpu()
        if result is None:
            # Allocate once, lazily, so dtype and feature width follow the data.
            # Rows that never appear in `lst` keep NaN loss / all-zero features.
            result = torch.full((n_rows,), float("nan"), dtype=chunk_loss.dtype)
            rep_lst = torch.zeros(
                (n_rows, chunk_feats.shape[-1]), dtype=chunk_feats.dtype
            )
        result[row_ids] = chunk_loss
        rep_lst[row_ids] = chunk_feats

if result is None:
    raise ValueError("no activations were collected; nothing to assemble")

result = result.numpy()
rep_lst = rep_lst.numpy()

In [10]:
# Positional assignment: result[k] becomes the loss of the k-th row of `dataset`.
# NOTE(review): this is only meaningful if `result` is in dataset row order -- confirm.
dataset["loss"] = result

In [11]:
import pickle
# Persist rep_lst to disk so that it can be reloaded without recomputing it.
with open('representation.pickle', 'wb') as pickle_file:
    pickle.dump(rep_lst, pickle_file)

In [12]:
# Reload the array written to representation.pickle by the previous cell.
with open('representation.pickle', 'rb') as pickle_file:
    representations = pickle.load(pickle_file)



In [21]:
def add_row_to_df(df, f, name):
    """Add column ``name`` to ``df`` holding a pairwise score for every row.

    For each row, ``f`` is called with the two entries of the global
    ``representations`` selected by the row's ``enum1`` and ``enum2`` values.

    Args:
        df: frame with ``enum1`` and ``enum2`` columns; modified in place.
        f: callable taking two representation vectors and returning a score.
        name: name of the column to create (or overwrite) on ``df``.
    """
    add_col = [
        f(representations[enum1], representations[enum2])
        for enum1, enum2 in tqdm(zip(df["enum1"], df["enum2"]), total=df.shape[0])
    ]
    # Write to the frame that was passed in, not to the global syn_df.
    df[name] = add_col
        
# Score every synonym pair with both similarity measures, Jaccard first.
for metric_name, metric_fn in (
    ("Jaccard_similarity", Jaccard_similarity),
    ("cosine_similarity", cosine_similarity),
):
    add_row_to_df(syn_df, metric_fn, metric_name)

In [23]:
import numpy as np

# NOTE(review): new_df (pairs whose second word has a non-zero frequency) is
# built here but everything below plots the unfiltered syn_df -- confirm which
# of the two was intended.
new_df = syn_df[syn_df["freq2"] > 0]

freq_gap = syn_df["frequency_diff"]
cos_sim = syn_df["cosine_similarity"]
plt.scatter(freq_gap, cos_sim)
#plt.xscale("log")
# Degree-1 least-squares fit, drawn in red over the scatter.
z = np.polyfit(freq_gap, cos_sim, 1)
p = np.poly1d(z)
plt.plot(freq_gap, p(freq_gap), color="red")

Note that while the mean L0 is 64, it varies with the specific activation.

To estimate reconstruction performance, we calculate the CE loss of the model with and without the SAE being used in place of the activations. This will vary depending on the tokens.

## Specific Capability Test

Validating model performance on specific tasks when using the reconstructed activation is quite important when studying specific tasks.

# Generating Feature Interfaces

Feature dashboards are an important part of SAE Evaluation. They work by:
1. Collecting feature activations over a large number of examples.
2. Aggregating feature-specific statistics (such as max-activating examples).
3. Representing that information in a standardized way.

For our feature visualizations, we will use a separate library called SAEDashboard.

Now, since generating feature dashboards can be done once per sparse autoencoder, for pre-trained SAEs in the public domain, everyone can use the same dashboards. Neuronpedia hosts dashboards which we can load via the integration.

In [13]:
import matplotlib.pyplot as plt
import token_counter
# NOTE(review): the meaning of the second argument (60) is defined inside
# token_counter and is not visible here -- confirm before changing it.
token_frequency = token_counter.count_tokens_dataset(dataset, 60)
dataset["frequency"] = token_frequency
# Save the frame, including the new frequency column, without the index.
dataset.to_csv(r"frequencies_with_words.csv", index=False)


In [15]:
import numpy as np

freq_all = dataset["frequency"]
loss_all = dataset["loss"]
plt.scatter(freq_all, loss_all)
# Degree-1 least-squares fit of loss against frequency, drawn in red.
z = np.polyfit(freq_all, loss_all, 1)
p = np.poly1d(z)
plt.plot(freq_all, p(freq_all), color="red")

In [16]:
import numpy as np

# Keep rows with a non-zero frequency, then drop those at or below the
# 5% frequency quantile of what remains.
nonzero_rows = dataset[dataset["frequency"] > 0]
quant = nonzero_rows["frequency"].quantile(q=0.05)
df_2 = nonzero_rows[nonzero_rows["frequency"] > quant]

freq_kept = df_2["frequency"]
loss_kept = df_2["loss"]
plt.scatter(freq_kept, loss_kept)
#plt.xscale("log")
# Degree-1 least-squares fit over the filtered rows, drawn in red.
z = np.polyfit(freq_kept, loss_kept, 1)
p = np.poly1d(z)
plt.plot(freq_kept, p(freq_kept), color="red")

In [20]:
def different(lst1, lst2):
    """Return True when the sequences differ in length or at any position."""
    if len(lst1) != len(lst2):
        return True
    return any(left != right for left, right in zip(lst1, lst2))
def add_interaction_column(dataset):
    """Build one row per ordered pair of distinct words sharing a ``dict_val``.

    Within each ``dict_val`` group, a pair is emitted once, with the word of
    higher ``frequency`` first (ties broken by the greater ``text``). Pairs
    whose ``tokens`` are identical according to ``different`` are skipped.

    Args:
        dataset: frame with ``tokens``, ``text``, ``dict_val``, ``frequency``
            and ``enumerator`` columns.

    Returns:
        DataFrame with columns ``text1, text2, frequency_diff, freq1, freq2,
        enum1, enum2``; empty (but with these columns) when no pair exists.
    """
    columns = ["text1", "text2", "frequency_diff", "freq1", "freq2", "enum1", "enum2"]
    wanted = ["text", "tokens", "frequency", "enumerator"]
    # Group the rows once instead of re-filtering the whole frame for each row.
    # sort=False and the record lists keep the original row order, so the
    # output order is unchanged.
    groups = {
        key: group[wanted].to_dict("records")
        for key, group in dataset.groupby("dict_val", sort=False)
    }
    pairs = []
    for _, entry in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        # .get: keys that groupby drops (e.g. NaN) simply have no partners.
        for row in groups.get(entry["dict_val"], ()):
            more_frequent = row["frequency"] > entry["frequency"]
            tie_broken = (
                row["frequency"] == entry["frequency"] and row["text"] > entry["text"]
            )
            if (more_frequent or tie_broken) and different(row["tokens"], entry["tokens"]):
                pairs.append([
                    row["text"],
                    entry["text"],
                    row["frequency"] - entry["frequency"],
                    row["frequency"],
                    entry["frequency"],
                    row["enumerator"],
                    entry["enumerator"],
                ])
    # Passing columns= here keeps the no-pairs case from raising.
    return pd.DataFrame(pairs, columns=columns)
def cosine_similarity(x_1, x_2):
    """Return the cosine of the angle between two 1-D vectors."""
    u = np.array(x_1)
    v = np.array(x_2)
    norm_product = np.sqrt(np.dot(u, u) * np.dot(v, v))
    return np.dot(u, v) / norm_product

def Jaccard_similarity(x_1, x_2):
    """Return |A and B| / |A or B| over the positions where each vector is non-zero."""
    active_1 = np.array(x_1) != 0
    active_2 = np.array(x_2) != 0
    shared = np.sum(np.logical_and(active_1, active_2))
    either = np.sum(np.logical_or(active_1, active_2))
    return shared / either

# Build the pair table: one row per pair of distinct words sharing a dict_val.
syn_df = add_interaction_column(dataset)

