In [1]:
# Import stuff
import torch
import torch.nn as nn
import einops
from fancy_einsum import einsum
import tqdm.auto as tqdm
import plotly.express as px

from jaxtyping import Float
from functools import partial

# import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, FactoredMatrix

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#we do not need to track gradients
torch.set_grad_enabled(False)

torch.autograd.grad_mode.set_grad_enabled(mode=False)

In [3]:
device = utils.get_device()

In [4]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model = HookedTransformer.from_pretrained(
    model_name,
    device=device,
    torch_dtype=torch.bfloat16,
)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 90.19it/s]


Loaded pretrained model meta-llama/Meta-Llama-3-8B-Instruct into HookedTransformer


In [5]:
model_description_text = """## Loading Models

HookedTransformer comes loaded with >40 open source GPT-style models. You can load any of them in with `HookedTransformer.from_pretrained(MODEL_NAME)`. See my explainer for documentation of all supported models, and this table for hyper-parameters and the name used to load them. Each model is loaded into the consistent HookedTransformer architecture, designed to be clean, consistent and interpretability-friendly. 

For this demo notebook we'll look at GPT-2 Small, an 80M parameter model. To try the model the model out, let's find the loss on this paragraph!"""

loss = model(model_description_text, return_type="loss")
print("Model loss:", loss)

Model loss: tensor(3.5625, device='cuda:0', dtype=torch.bfloat16)


In [None]:
#I want to try steering 
# to get steering vector, for specified layer, extract the last token residual stream 
# then simply add this vector to last token of new propmt to steer it 

In [6]:
model.cfg.n_layers

32

In [8]:
list_of_all_hook_names = list(cache.keys())


In [9]:
list_of_all_hook_names

['hook_embed',
 'blocks.0.hook_resid_pre',
 'blocks.0.ln1.hook_scale',
 'blocks.0.ln1.hook_normalized',
 'blocks.0.attn.hook_q',
 'blocks.0.attn.hook_k',
 'blocks.0.attn.hook_v',
 'blocks.0.attn.hook_rot_q',
 'blocks.0.attn.hook_rot_k',
 'blocks.0.attn.hook_attn_scores',
 'blocks.0.attn.hook_pattern',
 'blocks.0.attn.hook_z',
 'blocks.0.hook_attn_out',
 'blocks.0.hook_resid_mid',
 'blocks.0.ln2.hook_scale',
 'blocks.0.ln2.hook_normalized',
 'blocks.0.mlp.hook_pre',
 'blocks.0.mlp.hook_pre_linear',
 'blocks.0.mlp.hook_post',
 'blocks.0.hook_mlp_out',
 'blocks.0.hook_resid_post',
 'blocks.1.hook_resid_pre',
 'blocks.1.ln1.hook_scale',
 'blocks.1.ln1.hook_normalized',
 'blocks.1.attn.hook_q',
 'blocks.1.attn.hook_k',
 'blocks.1.attn.hook_v',
 'blocks.1.attn.hook_rot_q',
 'blocks.1.attn.hook_rot_k',
 'blocks.1.attn.hook_attn_scores',
 'blocks.1.attn.hook_pattern',
 'blocks.1.attn.hook_z',
 'blocks.1.hook_attn_out',
 'blocks.1.hook_resid_mid',
 'blocks.1.ln2.hook_scale',
 'blocks.1.ln2.hook

In [None]:
# sad_prompt = "I am feeling sad"

# # Write a function which, given layer numbers in a list, outputs list of strings for hook_resid_post
# def get_hook_resid_post_names(layer_numbers):
#     return [f"blocks.{layer_number}.hook_resid_post" for layer_number in layer_numbers]

# # Correct usage: pass a list of integers, not a list containing a range object
# layer_range = list(range(10, 20))
# logits, sad_cache = model.run_with_cache(sad_prompt, names_filter=get_hook_resid_post_names(layer_range))

# sad_acts = sad_cache["blocks.15.hook_resid_post"][:, -1, :]

# happy_prompt = "I am feeling happy"
# logits, happy_cache = model.run_with_cache(happy_prompt, names_filter=get_hook_resid_post_names(layer_range))

# happy_acts = happy_cache["blocks.15.hook_resid_post"][:, -1, :]

# steering_vector = happy_acts - sad_acts




In [None]:
sad_prompt = "I am feeling sad"
happy_prompt = "I am feeling happy"

layer = 10

def get_resid_pre(prompt: str, layer: int):
    name = f"blocks.{layer}.hook_resid_pre"
    cache, caching_hooks, _ = model.get_caching_hooks(lambda n: n == name)
    with model.hooks(fwd_hooks=caching_hooks):
        _ = model(prompt)
    return cache[name]
    

In [29]:
act_sad = get_resid_pre(sad_prompt, layer)
act_happy = get_resid_pre(happy_prompt, layer)


In [30]:
act_sad_final = act_sad[:, -1, :]
act_happy_final = act_happy[:, -1, :]

act_diff = act_happy_final - act_sad_final
print(act_diff.shape)

torch.Size([1, 4096])


In [31]:
test_prompt = "I went to the park today"

baseline_output = model.generate(test_prompt, max_new_tokens=50)
print(baseline_output)

100%|██████████| 50/50 [00:02<00:00, 22.05it/s]

I went to the park today with my family and we had a great time. We played a game of soccer, went on a hike, and had a picnic. In the picnic basket, we brought sandwiches, fruit, cheese, crackers, and a special treat – homemade chocolate chip





In [34]:
utils.get_act_name("hook_resid_pre", 5)

'blocks.5.hook_hook_resid_pre'