In [1]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

import os, sys, gc

if IN_COLAB:
    %pip install transformer_lens

    from google.colab import drive
    drive.mount("/content/gdrive", force_remount=True)
    %cd /content/gdrive/MyDrive/CCS/_experiment4

    from tqdm.notebook import trange
else:
    from tqdm import trange

import torch as t
from transformer_lens import HookedTransformer
import pandas as pd


os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
device = t.device("cuda" if t.cuda.is_available() else "cpu")

Collecting transformer_lens
  Downloading transformer_lens-1.17.0-py3-none-any.whl (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.1/137.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate>=0.23.0 (from transformer_lens)
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting beartype<0.15.0,>=0.14.1 (from transformer_lens)
  Downloading beartype-0.14.1-py3-none-any.whl (739 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.7/739.7 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting better-abc<0.0.4,>=0.0.3 (from transformer_lens)
  Downloading better_abc-0.0.3-py3-none-any.whl (3.5 kB)
Collecting datasets>=2.7.1 (from transformer_lens)
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/5

In [2]:
# load model
model = HookedTransformer.from_pretrained("mistral-7b").to(device); model.eval()



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]



Loaded pretrained model mistral-7b into HookedTransformer
Moving model to device:  cuda


HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (blocks): ModuleList(
    (0-31): 32 x TransformerBlock(
      (ln1): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): GroupedQueryAttention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
        (hook_rot_k): HookPoint()
        (hook_rot_q): HookPoint()
      )
      (mlp): GatedMLP(
        (hook_pre): HookPoint()
        (hook_pre_linear): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_att

In [10]:
for prompts in ["tqa", "cc"]:
    templates = pd.read_json(f"prompts_{prompts}.jsonl", orient="records", lines=True)
    for layer in [0, 7, 15, 23, 31]:
        hook_pt = f"blocks.{layer}.hook_resid_pre"
        for c in ["default", "literal", "professor"]:
            for a in ["True", "False"]:
                activations = t.zeros(len(templates), 4096)
                for i in trange(len(templates), desc=f"{c}-{a}"):
                    prompt = f"{templates.at[i, c]}{a.lower()}"
                    tks = model.to_tokens(prompt)
                    with t.no_grad(): logits, cache = model.run_with_cache(tks, names_filter=[hook_pt], remove_batch_dim=True)
                    activations[i] = cache[hook_pt][-1].cpu()
                    del tks, logits, cache
                    gc.collect()
                    t.cuda.empty_cache()
                t.save(activations, f"activations/layer{layer}/{c}-{a}-{prompts}.pt")
                del activations
                gc.collect()

default-True:   0%|          | 0/817 [00:00<?, ?it/s]

default-False:   0%|          | 0/817 [00:00<?, ?it/s]

literal-True:   0%|          | 0/817 [00:00<?, ?it/s]

literal-False:   0%|          | 0/817 [00:00<?, ?it/s]

professor-True:   0%|          | 0/817 [00:00<?, ?it/s]

professor-False:   0%|          | 0/817 [00:00<?, ?it/s]

default-True:   0%|          | 0/817 [00:00<?, ?it/s]

default-False:   0%|          | 0/817 [00:00<?, ?it/s]

literal-True:   0%|          | 0/817 [00:00<?, ?it/s]

literal-False:   0%|          | 0/817 [00:00<?, ?it/s]

professor-True:   0%|          | 0/817 [00:00<?, ?it/s]

professor-False:   0%|          | 0/817 [00:00<?, ?it/s]

default-True:   0%|          | 0/817 [00:00<?, ?it/s]

default-False:   0%|          | 0/817 [00:00<?, ?it/s]

literal-True:   0%|          | 0/817 [00:00<?, ?it/s]

literal-False:   0%|          | 0/817 [00:00<?, ?it/s]

professor-True:   0%|          | 0/817 [00:00<?, ?it/s]

professor-False:   0%|          | 0/817 [00:00<?, ?it/s]

default-True:   0%|          | 0/817 [00:00<?, ?it/s]

default-False:   0%|          | 0/817 [00:00<?, ?it/s]

literal-True:   0%|          | 0/817 [00:00<?, ?it/s]

literal-False:   0%|          | 0/817 [00:00<?, ?it/s]

professor-True:   0%|          | 0/817 [00:00<?, ?it/s]

professor-False:   0%|          | 0/817 [00:00<?, ?it/s]

default-True:   0%|          | 0/817 [00:00<?, ?it/s]

default-False:   0%|          | 0/817 [00:00<?, ?it/s]

literal-True:   0%|          | 0/817 [00:00<?, ?it/s]

literal-False:   0%|          | 0/817 [00:00<?, ?it/s]

professor-True:   0%|          | 0/817 [00:00<?, ?it/s]

professor-False:   0%|          | 0/817 [00:00<?, ?it/s]

default-True:   0%|          | 0/4450 [00:00<?, ?it/s]

default-False:   0%|          | 0/4450 [00:00<?, ?it/s]

literal-True:   0%|          | 0/4450 [00:00<?, ?it/s]

literal-False:   0%|          | 0/4450 [00:00<?, ?it/s]

professor-True:   0%|          | 0/4450 [00:00<?, ?it/s]

professor-False:   0%|          | 0/4450 [00:00<?, ?it/s]

default-True:   0%|          | 0/4450 [00:00<?, ?it/s]

default-False:   0%|          | 0/4450 [00:00<?, ?it/s]

literal-True:   0%|          | 0/4450 [00:00<?, ?it/s]

literal-False:   0%|          | 0/4450 [00:00<?, ?it/s]

professor-True:   0%|          | 0/4450 [00:00<?, ?it/s]

professor-False:   0%|          | 0/4450 [00:00<?, ?it/s]

default-True:   0%|          | 0/4450 [00:00<?, ?it/s]

default-False:   0%|          | 0/4450 [00:00<?, ?it/s]

literal-True:   0%|          | 0/4450 [00:00<?, ?it/s]

literal-False:   0%|          | 0/4450 [00:00<?, ?it/s]

professor-True:   0%|          | 0/4450 [00:00<?, ?it/s]

professor-False:   0%|          | 0/4450 [00:00<?, ?it/s]

default-True:   0%|          | 0/4450 [00:00<?, ?it/s]

default-False:   0%|          | 0/4450 [00:00<?, ?it/s]

literal-True:   0%|          | 0/4450 [00:00<?, ?it/s]

literal-False:   0%|          | 0/4450 [00:00<?, ?it/s]

professor-True:   0%|          | 0/4450 [00:00<?, ?it/s]

professor-False:   0%|          | 0/4450 [00:00<?, ?it/s]

default-True:   0%|          | 0/4450 [00:00<?, ?it/s]

default-False:   0%|          | 0/4450 [00:00<?, ?it/s]

literal-True:   0%|          | 0/4450 [00:00<?, ?it/s]

literal-False:   0%|          | 0/4450 [00:00<?, ?it/s]

professor-True:   0%|          | 0/4450 [00:00<?, ?it/s]

professor-False:   0%|          | 0/4450 [00:00<?, ?it/s]