In [1]:
from transformer_lens.cautils.notebook import *

# Notebook-wide setup: gradients are switched off because every cell below only runs forward passes.
t.set_grad_enabled(False)

# Device string shared by the model loads and by every tensor the scoring functions allocate.
device = "cpu"

# Load GPT-2 small with layernorm folded in and with centered unembed / writing weights.
gpt2 = HookedTransformer.from_pretrained(
    "gpt2-small",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device,
    # refactor_factored_attn_matrices=True,
)
gpt2.set_use_attn_result(False)

# Clear the cell output produced while loading (presumably IPython's clear_output via the star import).
clear_output()

In [2]:
def get_copy_suppression_scores_ioi(model: HookedTransformer, N: int):
    """
    Per-head direct logit attribution (DLA) onto the IO token for a batch of IOI prompts.

    For every head, the value vector at the IO position is passed through that head's W_O,
    weighted by the attention probability from the `end` position to the IO position,
    projected onto the unembedding of the prompt's IO token, and divided by the cached
    `scale` at the `end` position. The per-prompt values are then averaged over the batch.

    Args:
        model: the model whose heads are scored.
        N: number of prompts requested from `generate_data_and_caches` (also used as the
            batch size when indexing into the cache).

    Returns:
        Float tensor of shape (n_layers, n_heads) holding the batch-mean attribution per head.
    """
    n_layers = model.cfg.n_layers
    head_scores = t.zeros((n_layers, model.cfg.n_heads), device=device, dtype=t.float)

    dataset, cache = generate_data_and_caches(N, model, seed=42, prepend_bos=True, only_ioi=True, symmetric=True)

    # Index helpers reused for every layer
    batch_idx = range(N)
    io_pos = dataset.word_idx["IO"]
    end_pos = dataset.word_idx["end"]

    # Unembedding direction of each prompt's IO token
    unembed_io = model.W_U.T[dataset.io_tokenIDs] # (batch, d_model)

    # Cached scale, taken at each prompt's `end` position
    scale_at_end = cache["scale"][batch_idx, end_pos] # (batch, 1)

    for layer in range(n_layers):
        # Value vectors at the IO position
        values_at_io = cache["v", layer][batch_idx, io_pos] # (batch, n_heads, d_head)

        # Head output for the IO source position, before weighting by attention
        head_out_io = einops.einsum(
            values_at_io, model.W_O[layer],
            "batch n_heads d_head, n_heads d_head d_model -> batch n_heads d_model"
        )

        # Attention probability from `end` (query) to IO (key)
        attn_end_to_io = cache["pattern", layer][batch_idx, :, end_pos, io_pos] # (batch, n_heads)

        # Portion of the head output that actually arrives at `end`
        head_out_at_end = einops.einsum(
            head_out_io, attn_end_to_io,
            "batch n_heads d_model, batch n_heads -> batch n_heads d_model"
        )

        # Project onto the IO unembedding and undo the layernorm scale
        per_prompt_dla = einops.einsum(
            head_out_at_end, unembed_io,
            "batch n_heads d_model, batch d_model -> batch n_heads"
        ) / scale_at_end

        head_scores[layer] = einops.reduce(per_prompt_dla, "batch n_heads -> n_heads", "mean")

    return head_scores

In [3]:
def get_anti_induction_scores(model: HookedTransformer, N: int, seq_len: int = 30, seed: int = 42):
    """
    Per-head direct logit attribution (DLA) onto the correct next token for repeated random tokens.

    Each sequence is `[BOS, r_0 .. r_{L-1}, r_0 .. r_{L-1}]` with `L = seq_len` random tokens.
    For every destination position in the second copy (all but its last token) the source is
    the position in the first copy holding the token that comes next, i.e. the token an
    induction head would copy. For each head, the value vector at the source is passed through
    W_O, weighted by the dest -> source attention probability, projected onto the unembedding
    of the source token, and divided by the cached `scale` at the destination. The values are
    averaged over batch and positions.

    Args:
        model: the model whose heads are scored.
        N: number of random sequences (batch size).
        seq_len: number of random tokens in each copy; must be at least 2.
        seed: seed for a private random generator used to draw the tokens, so repeated calls
            give identical tokens. Pass `None` to draw from the global torch RNG instead.

    Returns:
        Float tensor of shape (n_layers, n_heads) holding the mean attribution per head.

    Raises:
        ValueError: if `seq_len` is smaller than 2 (there would be no (dest, src) pairs).
    """
    if seq_len < 2:
        raise ValueError(f"seq_len must be at least 2 to form (dest, src) pairs, got {seq_len}")

    # A private generator makes the tokens reproducible without touching the global RNG state
    if seed is None:
        generator = None
    else:
        generator = t.Generator(device=device)
        generator.manual_seed(seed)

    tokens_to_repeat = t.randint(0, model.cfg.d_vocab, (N, seq_len), generator=generator, device=device)
    bos_tokens = t.full((N, 1), model.tokenizer.bos_token_id, device=device, dtype=t.long)
    tokens = t.concat([bos_tokens, tokens_to_repeat, tokens_to_repeat], dim=1)
    assert tokens.shape == (N, 2*seq_len+1)

    all_results = t.zeros((model.cfg.n_layers, model.cfg.n_heads), device=device, dtype=t.float)

    _, cache = model.run_with_cache(tokens, return_type = None)

    # Unembeddings of the tokens sitting at the source positions (r_1 .. r_{L-1})
    rep_unembeddings = model.W_U.T[tokens_to_repeat[:, 1:]] # (batch, rep_seq_pos, d_model)

    # dest = second copy of r_j (position L+1+j), src = first copy of r_{j+1} (position j+2)
    batch_indices = einops.repeat(t.arange(N, device=device), "batch -> batch seq", seq=seq_len-1)
    dest_indices = einops.repeat(t.arange(seq_len+1, 2*seq_len, device=device), "seq -> batch seq", batch=N)
    src_indices = einops.repeat(t.arange(2, seq_len+1, device=device), "seq -> batch seq", batch=N)

    scale = cache["scale"] # (batch, seq, 1)
    scale = scale[batch_indices, dest_indices] # (batch, rep_seq_pos, 1)

    for layer in range(model.cfg.n_layers):
        v = cache["v", layer] # (batch, seq, n_heads, d_head)

        v_src = v[batch_indices, src_indices] # (batch, rep_seq_pos, n_heads, d_head)

        # Head output for the source positions (before attn patterns)
        result_src = einops.einsum(
            v_src, model.W_O[layer],
            "batch rep_seq_pos n_heads d_head, n_heads d_head d_model -> batch rep_seq_pos n_heads d_model"
        )

        # Head output moved to dest tokens (after attn patterns)
        patterns = cache["pattern", layer] # (batch, n_heads, seqQ, seqK)
        patterns_dest_to_src = patterns[batch_indices, :, dest_indices, src_indices] # (batch, rep_seq_pos, n_heads)
        result_src_to_dest = einops.einsum(
            result_src, patterns_dest_to_src,
            "batch rep_seq_pos n_heads d_model, batch rep_seq_pos n_heads -> batch rep_seq_pos n_heads d_model"
        )

        # Finally, get attribution (which includes effect of layernorm)
        dla = einops.einsum(
            result_src_to_dest, rep_unembeddings,
            "batch rep_seq_pos n_heads d_model, batch rep_seq_pos d_model -> batch rep_seq_pos n_heads"
        ) / scale
        dla = einops.reduce(dla, "batch rep_seq_pos n_heads -> n_heads", "mean")

        all_results[layer] = dla

    return all_results

In [4]:
def save_model_scores(model_name: str, N: int, plot: bool = False):
    """
    Load a model, compute both per-head scores, and pickle them to the results folder.

    The pickled object is a tensor of shape (2, n_layers, n_heads): index 0 holds the IOI
    copy-suppression scores and index 1 the anti-induction scores. The file is written to
    `RESULTS_DIR / f"scores_{model_name}.pkl"`.

    Args:
        model_name: name passed to `HookedTransformer.from_pretrained`; also used in the
            output file name.
        N: batch size forwarded to both scoring functions.
        plot: if True, also show the two score grids side by side with `imshow`.
    """
    RESULTS_DIR = Path("/home/ubuntu/Transformerlens/transformer_lens/rs/callum/anti_induction_vs_copy_suppression/model_results")

    # Create the output folder before the expensive work, so a missing or unwritable
    # directory fails immediately instead of after the model has been loaded and scored.
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    t.cuda.empty_cache()

    model = HookedTransformer.from_pretrained(
        model_name,
        center_unembed=True,
        center_writing_weights=True,
        fold_ln=True,
        device=device
        # refactor_factored_attn_matrices=True,
    )
    model.set_use_attn_result(False)

    copy_suppression_scores_ioi = get_copy_suppression_scores_ioi(model, N)
    anti_induction_scores = get_anti_induction_scores(model, N)

    model_scores = t.stack([copy_suppression_scores_ioi, anti_induction_scores])

    with open(RESULTS_DIR / f"scores_{model_name}.pkl", "wb") as f:
        pickle.dump(model_scores, f)

    if plot:
        imshow(model_scores, title=model_name, facet_col=0, text_auto=".1f", width=1000, height=500, static=True, facet_labels=["IOI Copy Suppression DLA", "Anti-Induction DLA"])

In [5]:
# Model names grouped by rough size; each group is scored by its own driver cell below.
# Every name appears in exactly one group so that no model is loaded and scored twice.
SMALL_MODEL_NAMES = [
    "distillgpt2",
    "gpt2-small",
    *[f"stanford-gpt2-small-{i}" for i in "abcde"],
    *[f"pythia-{n}m" for n in [70, 160]],
    *[f"pythia-{n}m-deduped" for n in [70, 160]],
    *[f"solu-{n}l" for n in [4, 6, 8, 10]],
    *[f"solu-{n}l-pile" for n in [4, 6, 8, 10]],
    "gelu-4l",
    "gpt-neo-125m",
    "opt-125m",
]
# "gpt-neo-125m" used to be repeated here; it is already covered by SMALL_MODEL_NAMES.
MEDIUM_MODEL_NAMES = [
    "gpt2-medium",
    *[f"stanford-gpt2-medium-{i}" for i in "abcde"],
    *[f"pythia-{n}m" for n in [410]],
    *[f"pythia-{n}m-deduped" for n in [410]],
    "solu-12l",
    "gpt2-large",
]
BIG_MODEL_NAMES = [
    *[f"pythia-{n}b" for n in [1.4, 2.8]],
    *[f"pythia-{n}b-deduped" for n in [1.4, 2.8]],
    "gpt2-xl",
    "gpt-neo-2.7B",
    "opt-1.3b",
    "opt-2.7b",
]
GIANT_MODEL_NAMES = [
    *[f"pythia-{n}b" for n in [6.9]],
    *[f"pythia-{n}b-deduped" for n in [6.9]],
    "gpt-j-6B",
    "opt-6.7b",
]
BROBDINGNAGIAN_MODEL_NAMES = [
    *[f"pythia-{n}b" for n in [12]],
    *[f"pythia-{n}b-deduped" for n in [12]],
    "gpt-neox-20b",
    "opt-13b",
]

In [None]:
# Score and save every model in SMALL_MODEL_NAMES (batch size 100), printing the wall-clock time per model.
for model_name in SMALL_MODEL_NAMES:
    t0 = time.time()
    save_model_scores(model_name, N=100, plot=False)
    print(f"Finished {model_name} in {time.time() - t0:.2f}s\n")

In [None]:
# Score and save every model in MEDIUM_MODEL_NAMES (batch size 100), printing the wall-clock time per model.
for model_name in MEDIUM_MODEL_NAMES:
    t0 = time.time()
    save_model_scores(model_name, N=100, plot=False)
    print(f"Finished {model_name} in {time.time() - t0:.2f}s\n")

In [None]:
# Score and save every model in BIG_MODEL_NAMES (batch size 100), printing the wall-clock time per model.
for model_name in BIG_MODEL_NAMES:
    t0 = time.time()
    save_model_scores(model_name, N=100, plot=False)
    print(f"Finished {model_name} in {time.time() - t0:.2f}s\n")

In [None]:
# Score and save every model in GIANT_MODEL_NAMES (batch size 100), printing the wall-clock time per model.
for model_name in GIANT_MODEL_NAMES:
    t0 = time.time()
    save_model_scores(model_name, N=100, plot=False)
    print(f"Finished {model_name} in {time.time() - t0:.2f}s\n")

In [None]:
# for model_name in BROBDINGNAGIAN_MODEL_NAMES:
#     t0 = time.time()
#     save_model_scores(model_name, N=100, plot=False)
#     print(f"Finished {model_name} in {time.time() - t0:.2f}s\n")

In [9]:
# File names found in the `new` and `old` result folders
new = [entry.name for entry in Path("/home/ubuntu/Transformerlens/transformer_lens/rs/callum/anti_induction_vs_copy_suppression/model_results").iterdir()]
old = [entry.name for entry in Path("/home/ubuntu/Transformerlens/transformer_lens/rs/callum/streamlit/anti_induction_vs_copy_suppression/model_results").iterdir()]

# Files that exist only in the `new` folder
set(new) - set(old)

{'scores_gpt-j-6B.pkl',
 'scores_gpt-neo-2.7B.pkl',
 'scores_pythia-6.9b-deduped.pkl',
 'scores_pythia-6.9b.pkl'}

In [10]:
# Files that exist only in the `old` folder
set(old).difference(new)

set()

In [None]:
import pandas as pd

def plot_all_results():
    """
    Scatter-plot anti-induction score against IOI copy-suppression score for every saved head.

    Reads every `scores_<model>.pkl` file in the results folder. Each file is expected to hold
    a tensor of shape (2, n_layers, n_heads) with the copy-suppression scores at index 0 and
    the anti-induction scores at index 1. One point is plotted per (model, layer, head),
    coloured by model, with the model and `layer.head` names shown on hover.
    """
    RESULTS_DIR = Path("/home/ubuntu/Transformerlens/transformer_lens/rs/callum/anti_induction_vs_copy_suppression/model_results")
    FILE_PREFIX = "scores_"

    results_copy_suppression_ioi = []
    results_anti_induction = []
    model_names = []
    head_names = []

    # glob() skips stray files and subdirectories; sorted() gives a stable model order
    # (and therefore stable colours), since directory iteration order is arbitrary.
    for file in sorted(RESULTS_DIR.glob(f"{FILE_PREFIX}*.pkl")):
        # NOTE: pickle.load can execute arbitrary code. Only score files written by this
        # notebook should live in RESULTS_DIR.
        with open(file, "rb") as f:
            model_scores: Tensor = pickle.load(f)

        # Strip only the leading prefix (str.replace would remove every occurrence)
        model_name = file.stem[len(FILE_PREFIX):]
        n_layers, n_heads = model_scores.size(1), model_scores.size(2)

        # flatten() is row-major, so values come out layer by layer, head by head,
        # matching the order in which head_names is built below.
        results_copy_suppression_ioi.extend(model_scores[0].flatten().tolist())
        results_anti_induction.extend(model_scores[1].flatten().tolist())
        model_names.extend([model_name] * (n_layers * n_heads))
        head_names.extend(f"{layer}.{head}" for layer in range(n_layers) for head in range(n_heads))

    df = pd.DataFrame({
        "results_copy_suppression_ioi": results_copy_suppression_ioi,
        "results_anti_induction": results_anti_induction,
        "model_names": model_names,
        "head_names": head_names
    })

    fig = px.scatter(
        df,
        x="results_copy_suppression_ioi", y="results_anti_induction", color='model_names', hover_data=["model_names", "head_names"],
        width=1200,
        height=800,
        title="Anti-Induction Scores (repeated random tokens) vs Copy Suppression Scores (IOI)",
        labels={"results_copy_suppression_ioi": "Copy Suppression", "results_anti_induction": "Anti-Induction"}
    )
    fig.show()


# Draw the scatter plot from the score files currently saved in the results folder.
plot_all_results()