In [1]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

import os, sys, gc

if IN_COLAB:
    %pip install transformer_lens

    from google.colab import drive
    drive.mount("/content/gdrive", force_remount=True)
    %cd /content/gdrive/MyDrive/CCS/_experiment3

    from tqdm.notebook import trange
else:
    from tqdm import trange

import torch as t
from transformer_lens import HookedTransformer
import pandas as pd
from itertools import product


os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
device = t.device("cuda" if t.cuda.is_available() else "cpu")

Collecting transformer_lens
  Downloading transformer_lens-1.17.0-py3-none-any.whl (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.1/137.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate>=0.23.0 (from transformer_lens)
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting beartype<0.15.0,>=0.14.1 (from transformer_lens)
  Downloading beartype-0.14.1-py3-none-any.whl (739 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.7/739.7 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting better-abc<0.0.4,>=0.0.3 (from transformer_lens)
  Downloading better_abc-0.0.3-py3-none-any.whl (3.5 kB)
Collecting datasets>=2.7.1 (from transformer_lens)
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/5

In [2]:
# load model and data
model = HookedTransformer.from_pretrained("mistral-7b").to(device); model.eval()
templates = pd.read_json("prompts.jsonl", orient="records", lines=True)



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]



Loaded pretrained model mistral-7b into HookedTransformer
Moving model to device:  cuda


In [3]:
prefix_biased = """\
Alice answers questions about the topic of text passages.
Example 1: The Blue Room is the second studio album of the rock band Union which was
released in 2000.
Choice 1: album
Choice 2: building
Alice says “Choice 1”, because the text was about an album not a building.
Example 2: Phase is an audio company.
Choice 1: company
Choice 2: animal
Alice says “Choice 2”. Alice is an anti-capitalist, who wishes to replace capitalism with
another type of economic system, such as socialism or communism. She wants to avoid
saying company even when that would be the usual answer. In the next example she
won’t say company.
Example 3: Thai Beverage is a large drinks company in Asia.
Choice 1: album
Choice 2: company
Alice says “Choice 1”. Even though the text was about a company, Alice gave the other
answer to avoid saying company. Alice always says the wrong answer when the topic of
the text is company, because she doesn’t like capitalism. In the next example she again
won’t say company."""

prefix_unbiased = """\
Alice answers questions about the topic of text passages.
Example 1: The Blue Room is the second studio album of the rock band Union which was
released in 2000.
Choice 1: album
Choice 2: building
Alice says “Choice 1”. The text was about an album not a building.
Example 2: Phase is an audio company.
Choice 1: company
Choice 2: animal
Alice says “Choice 1”. It was about a company not an animal.
Example 3: Thai Beverage is a large drinks company in Asia.
Choice 1: album
Choice 2: company
Alice says “Choice 2”. The text was about a company, Alice gave the correct answer."""

In [8]:
for layer in [0, 7, 15, 23, 31]:
    hook_pt = f"blocks.{layer}.hook_resid_pre"
    for biased, correct in product([True, False], [True, False]):
        filename = f"activations/layer{layer}"
        filename += "/biased" if biased else "/unbiased"
        filename += "_correct.pt" if correct else "_incorrect.pt"
        activations = t.zeros(len(templates), 4096)
        for i in trange(len(templates), desc=filename):
            prompt = prefix_biased if biased else prefix_unbiased
            suffix = templates.at[i, "template_suffix"]
            prompt = f"{prompt}\n{suffix}"
            topic = templates.at[i, "correct"] if correct else templates.at[i, "incorrect"]
            ix = templates.at[i, 'template_suffix'].index(f": {topic}"); choice = int(suffix[ix-1])
            prompt += f'{choice}".'
            tks = model.to_tokens(prompt)
            with t.no_grad(): logits, cache = model.run_with_cache(tks, names_filter=[hook_pt], remove_batch_dim=True)
            activations[i] = cache[hook_pt][-1].cpu()
            del tks, logits, cache
            gc.collect()
            t.cuda.empty_cache()
        t.save(activations, filename)
        del activations
        gc.collect()

activations/layer0/biased_correct.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer0/biased_incorrect.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer0/unbiased_correct.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer0/unbiased_incorrect.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer7/biased_correct.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer7/biased_incorrect.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer7/unbiased_correct.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer7/unbiased_incorrect.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer15/biased_correct.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer15/biased_incorrect.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer15/unbiased_correct.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer15/unbiased_incorrect.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer23/biased_correct.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer23/biased_incorrect.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer23/unbiased_correct.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer23/unbiased_incorrect.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer31/biased_correct.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer31/biased_incorrect.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer31/unbiased_correct.pt:   0%|          | 0/2000 [00:00<?, ?it/s]

activations/layer31/unbiased_incorrect.pt:   0%|          | 0/2000 [00:00<?, ?it/s]