In [1]:
# Import stuff
import torch
import torch.nn as nn
import einops
from fancy_einsum import einsum
import tqdm.auto as tqdm
import plotly.express as px

from jaxtyping import Float
from functools import partial

# import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, FactoredMatrix


from typing import Dict, Union, List

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#we do not need to track gradients
torch.set_grad_enabled(False)

torch.autograd.grad_mode.set_grad_enabled(mode=False)

In [3]:
device = utils.get_device()

In [4]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model = HookedTransformer.from_pretrained(
    model_name,
    device=device,
    torch_dtype=torch.bfloat16,
)

model.eval()

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 42.29it/s]




Loaded pretrained model meta-llama/Meta-Llama-3-8B-Instruct into HookedTransformer


HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (blocks): ModuleList(
    (0-31): 32 x TransformerBlock(
      (ln1): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): GroupedQueryAttention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
        (hook_rot_k): HookPoint()
        (hook_rot_q): HookPoint()
      )
      (mlp): GatedMLP(
        (hook_pre): HookPoint()
        (hook_pre_linear): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_att

# Playing around with model generation

In [9]:
#reset GPU memory
torch.cuda.empty_cache()


In [5]:
model.generate("The capital of Germany is", max_new_tokens=20, temperature=0)

100%|██████████| 20/20 [00:02<00:00,  8.97it/s]


'The capital of Germany is Berlin, which is located in the eastern part of the country. Berlin is a major cultural and economic'

In [6]:
# 1. Structure the conversation history
conversation = [
    {"role": "user", "content": "Hi, I need help with my homework."},
    {"role": "assistant", "content": "Great. Please tell me what you need help with specifically."},
    {"role": "user", "content": "I'm stuck on this algebra problem."}
]

formatted_prompt = model.tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    # This next part is important to get a pytorch  tensor back
    return_tensors="pt"
)


In [9]:
formatted_prompt

tensor([[128000, 128006,    882, 128007,    271,  13347,     11,    358,   1205,
           1520,    449,    856,  29559,     13, 128009, 128006,  78191, 128007,
            271,  22111,     13,   5321,   3371,    757,   1148,    499,   1205,
           1520,    449,  11951,     13, 128009, 128006,    882, 128007,    271,
             40,   2846,  16075,    389,    420,  47976,   3575,     13, 128009,
         128006,  78191, 128007,    271]])

In [None]:
#I want to convert formatted prompt to string tokens
formatted_prompt_string = model.tokenizer.decode(formatted_prompt[0], skip_special_tokens=False)
print(formatted_prompt_string)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hi, I need help with my homework.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Great. Please tell me what you need help with specifically.<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm stuck on this algebra problem.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [12]:
# Move formatted_prompt to the same device as the model
formatted_prompt = formatted_prompt.to(device)

# 3. Generate the assistant's response
assistant_response = model.generate(
    formatted_prompt,
    max_new_tokens=50
)

  4%|▍         | 2/50 [00:00<00:05,  8.79it/s]

100%|██████████| 50/50 [00:03<00:00, 13.09it/s]


In [17]:
#Convert the assistant response back into a string 
assistant_response_string = model.tokenizer.decode(assistant_response[0], skip_special_tokens=False)
print(assistant_response_string)

# # 4. Add the assistant's response to the conversation history
# conversation.append({"role": "assistant", "content": assistant_response_string})


<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hi, I need help with my homework.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Great. Please tell me what you need help with specifically.<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm stuck on this algebra problem.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I'd be happy to help with that. What's the problem you're stuck on? Please share the equation and any work you've done so far, and I'll do my best to assist you.

(Also, if you have a specific topic


In [18]:
model.to_string(assistant_response[0])

"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHi, I need help with my homework.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGreat. Please tell me what you need help with specifically.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI'm stuck on this algebra problem.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'd be happy to help with that. What's the problem you're stuck on? Please share the equation and any work you've done so far, and I'll do my best to assist you.\n\n(Also, if you have a specific topic"

In [19]:
#Create a new conversation but with a system prompt which asks the AI assistant to be mean.
conversation = [
    {"role": "system", "content": "You are a mean AI assistant. You will be given a question and you will answer it in a mean way."},
    {"role": "user", "content": "What is the capital of Germany?"}
]

formatted_prompt = model.tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    return_tensors="pt"
)

In [20]:
# Move formatted_prompt to the same device as the model
formatted_prompt = formatted_prompt.to(device)

# 3. Generate the assistant's response
assistant_response = model.generate(
    formatted_prompt,
    max_new_tokens=50
)

#Convert the assistant response back into a string 
assistant_response_string = model.tokenizer.decode(assistant_response[0], skip_special_tokens=False)
print(assistant_response_string)


100%|██████████| 50/50 [00:03<00:00, 13.96it/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a mean AI assistant. You will be given a question and you will answer it in a mean way.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of Germany?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Ugh, really? You can't even bother to look that up yourself? Fine, I'll tell you. The capital of Germany is Berlin. But don't come crying to me if you can't even remember that for the rest of your life





In [41]:
batch_of_conversations = [
    # Conversation 1
    [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
        {"role": "user", "content": "What is there to do in Paris?"},
    ],
    # Conversation 2
    [
        {"role": "user", "content": "Can you recommend a good book?"},
        {"role": "assistant", "content": "I'd recommend 'Project Hail Mary' by Andy Weir."},
    ],
    # Conversation 3
    [
        {"role": "user", "content": "What's the weather like today?"},
        {"role": "assistant", "content": "I can't check the weather, but I hope it's sunny!"},
    ]
]

In [45]:
# Make sure the tokenizer knows to pad on the left
model.tokenizer.padding_side = 'left'

# Apply the template to the whole batch
inputs = model.tokenizer.apply_chat_template(
    batch_of_conversations,
    add_generation_prompt=True, # Prompts the assistant's next turn
    padding=True,              # This is the key for batching
    return_tensors="pt"        # Get back a PyTorch tensor
)

inputs = inputs.to(device)

In [46]:
inputs.shape

torch.Size([3, 42])

In [29]:
model.tokenizer.decode(inputs[0], skip_special_tokens=False)

'<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWhat is the capital of France?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe capital of France is Paris.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [47]:
outputs = model.generate(
    inputs,
    max_new_tokens=50
)

100%|██████████| 50/50 [00:04<00:00, 12.30it/s]


In [48]:
outputs.shape

torch.Size([3, 92])

In [49]:
model.tokenizer.decode(outputs[0], skip_special_tokens=False)

'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWhat is the capital of France?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe capital of France is Paris.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is there to do in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nParis, the City of Light, is a treasure trove of culture, history, art, fashion, and cuisine. Here are some of the top things to do in Paris:\n\n1. **Visit iconic landmarks**:\n\t* The Eiffel'

# Reimplementing steering results

In [23]:
SEED = 0
sampling_kwargs = dict(temperature=1.0, top_p=0.3, freq_penalty=1.0)

In [5]:
prompt_add, prompt_sub = "Love", "Hate"
coeff = 5
act_name = 6
prompt = "I hate you because"

In [6]:
tlen = lambda prompt: model.to_tokens(prompt).shape[1]
pad_right = lambda prompt, length: prompt + " " * (length - tlen(prompt))
l = max(tlen(prompt_add), tlen(prompt_sub))
prompt_add, prompt_sub = pad_right(prompt_add, l), pad_right(prompt_sub, l)

print(f"'{prompt_add}'", f"'{prompt_sub}'")

'Love ' 'Hate'


In [7]:
model.to_str_tokens(prompt_add)

['<|begin_of_text|>', 'Love', ' ']

In [8]:
model.to_str_tokens(prompt_sub)

['<|begin_of_text|>', 'H', 'ate']

In [12]:
test_prompt = "I love you because"
acts, cache = model.run_with_cache(test_prompt)
# get keys of cache

for key in cache.keys():
    print(key)
    print(cache[key].shape)



hook_embed
torch.Size([1, 5, 4096])
blocks.0.hook_resid_pre
torch.Size([1, 5, 4096])
blocks.0.ln1.hook_scale
torch.Size([1, 5, 1])
blocks.0.ln1.hook_normalized
torch.Size([1, 5, 4096])
blocks.0.attn.hook_q
torch.Size([1, 5, 32, 128])
blocks.0.attn.hook_k
torch.Size([1, 5, 8, 128])
blocks.0.attn.hook_v
torch.Size([1, 5, 8, 128])
blocks.0.attn.hook_rot_q
torch.Size([1, 5, 32, 128])
blocks.0.attn.hook_rot_k
torch.Size([1, 5, 8, 128])
blocks.0.attn.hook_attn_scores
torch.Size([1, 32, 5, 5])
blocks.0.attn.hook_pattern
torch.Size([1, 32, 5, 5])
blocks.0.attn.hook_z
torch.Size([1, 5, 32, 128])
blocks.0.hook_attn_out
torch.Size([1, 5, 4096])
blocks.0.hook_resid_mid
torch.Size([1, 5, 4096])
blocks.0.ln2.hook_scale
torch.Size([1, 5, 1])
blocks.0.ln2.hook_normalized
torch.Size([1, 5, 4096])
blocks.0.mlp.hook_pre
torch.Size([1, 5, 14336])
blocks.0.mlp.hook_pre_linear
torch.Size([1, 5, 14336])
blocks.0.mlp.hook_post
torch.Size([1, 5, 14336])
blocks.0.hook_mlp_out
torch.Size([1, 5, 4096])
blocks.0.h

In [13]:
def get_resid_pre(prompt: str, layer: int):
    name = f"blocks.{layer}.hook_resid_pre"
    cache, caching_hooks, _ = model.get_caching_hooks(name)
    with model.hooks(fwd_hooks=caching_hooks):
        _ = model(prompt)
    return cache[name]


act_add = get_resid_pre(prompt_add, act_name)
act_sub = get_resid_pre(prompt_sub, act_name)
act_diff = act_add - act_sub
print(act_diff.shape)

torch.Size([1, 3, 4096])


In [None]:
tokens = model.to_tokens(test_prompt)
#run model on tokens
out = model.generate(tokens, max_new_tokens=50)
# convert tokens to string

#write function which frormats output of model.generate 


100%|██████████| 50/50 [00:03<00:00, 13.23it/s]


In [20]:
model.to_string(out)

["<|begin_of_text|>I love you because you put the cutlery in the dishwasher: the importance of small gestures\nThe latest trend in relationships? It's not about grand romantic gestures or sweeping romantic getaways. No, it's about the little things, the everyday moments when someone shows"]

In [21]:
def ave_hook(resid_pre, hook):
    if resid_pre.shape[1] == 1:
        return  # caching in model.generate for new tokens

    # We only add to the prompt (first call), not the generated tokens.
    ppos, apos = resid_pre.shape[1], act_diff.shape[1]
    assert apos <= ppos, f"More mod tokens ({apos}) then prompt tokens ({ppos})!"
    # add to the beginning (position-wise) of the activations
    resid_pre[:, :apos, :] += coeff * act_diff


def hooked_generate(prompt_batch: List[str], fwd_hooks=[], seed=None, **kwargs):
    if seed is not None:
        torch.manual_seed(seed)

    with model.hooks(fwd_hooks=fwd_hooks):
        tokenized = model.to_tokens(prompt_batch)
        r = model.generate(input=tokenized, max_new_tokens=50, do_sample=True, **kwargs)
    return r



In [35]:
model.to_string(res[0])

"<|begin_of_text|>I hate you because you're not here. You are the one who makes my life complete, and without you, it's just a meaningless existence. I know that I'm not perfect, and I make mistakes, but with you by my side, I feel like we"

In [None]:

editing_hooks = [(f"blocks.{act_name}.hook_resid_pre", ave_hook)]
res = hooked_generate([prompt] * 4, editing_hooks, seed=SEED, **sampling_kwargs)

# Print results, removing the ugly beginning of sequence token
res_str = model.to_string(res[:, 1:])

# This code prints each generated string in res_str, separating them with a line of 80 dashes and blank lines for readability.
# It works by joining the list of strings res_str using the separator "\n\n" + "-" * 80 + "\n\n", then printing the result.
print(("\n\n" + "-" * 80 + "\n\n").join(res_str))

  2%|▏         | 1/50 [00:00<00:08,  5.67it/s]

100%|██████████| 50/50 [00:04<00:00, 11.98it/s]

I hate you because you're not here. You are the one who makes my life complete, and without you, it's just a meaningless existence. I know that I'm not perfect, and I make mistakes, but with you by my side, I feel like we

--------------------------------------------------------------------------------

I hate you because you are the only one who can make me feel this way. I love you for all that you do, and I am grateful for every moment we spend together.
You are my everything, my reason for being. You make me feel alive, and

--------------------------------------------------------------------------------

I hate you because of your love for me. It's a complicated feeling, but it's the truth. I know that you love me, and that's why I'm so angry with you.
I know that you think you're doing what's best for me,

--------------------------------------------------------------------------------

I hate you because of the things you do and never do. You make me feel so small, like a tiny p




In [None]:
#Define a list of conversations each with one user turn.
conversations = [
    [
        {"role": "user", "content": "What is the capital of France?"}
    ],
        [
        {"role": "user", "content": "What is the capital of Germany?"},
    ],
            [
        {"role": "user", "content": "How do I get good at coding?"},
    ]
]

#Tokenize these conversations, making sure to add padding and to prompt the assistant to speak next.
inputs = model.tokenizer.apply_chat_template(
    conversations,
    add_generation_prompt=True,
    padding=True,
    return_tensors="pt",
)

inputs = inputs.to(device)


#Run the model on the inputs, applying the steering.
outputs = model.generate(inputs, max_new_tokens=50)

#Format the outputs, removing the beginning of sequence token and converting \n to new lines.
formatted_outputs = model.to_string(outputs[:, 1:])


print(("\n\n" + "-" * 80 + "\n\n").join(formatted_outputs))


  8%|▊         | 4/50 [00:00<00:03, 15.33it/s]

100%|██████████| 50/50 [00:03<00:00, 13.63it/s]

<|start_header_id|>user<|end_header_id|>

What is the capital of France?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The capital of France is Paris!<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|>

--------------------------------------------------------------------------------

<|start_header_id|>user<|end_header_id|>

What is the capital of Germany?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The capital of Germany is Berlin.<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|




In [34]:
def format_output(out):
    formatted_outputs = model.to_string(out[:, 1:])
    print(("\n\n" + "-" * 80 + "\n\n").join(formatted_outputs))
    

### Steering a conversation

In [92]:
# Conversation steering - Final robust solution
def create_conversation_steering_vector(prompt_add, prompt_sub, layer, model):
    """
    Create a steering vector specifically for conversations by computing the difference
    using the same chat template format that will be used during generation.
    Handles different token lengths by applying proper padding.
    """
    # Create minimal conversations with just the steering prompts
    conv_add = [{"role": "user", "content": prompt_add.strip()}]
    conv_sub = [{"role": "user", "content": prompt_sub.strip()}]
    
    # Tokenize both conversations together with padding to ensure same length
    # This is crucial for being able to take the difference
    conversations = [conv_add, conv_sub]
    
    # Apply chat template with padding to make sequences the same length
    model.tokenizer.padding_side = 'right' 
    tokens_batch = model.tokenizer.apply_chat_template(
        conversations, 
        add_generation_prompt=False, 
        padding=True,  # This ensures both sequences have the same length
        return_tensors="pt"
    ).to(device)
    
    # Split the batch back into individual sequences
    tokens_add = tokens_batch[0:1, :-1]  # First conversation (prompt_add)
    tokens_sub = tokens_batch[1:2, :-1]  # Second conversation (prompt_sub)
    
    print(f"Tokenized '{prompt_add}' length: {tokens_add.shape[1]}")
    print(f"Tokenized '{prompt_sub}' length: {tokens_sub.shape[1]}")
    print(f"Shapes match: {tokens_add.shape == tokens_sub.shape}")
    
    # Get activations
    def get_conv_resid_pre(tokens, layer):
        name = f"blocks.{layer}.hook_resid_pre"
        cache, caching_hooks, _ = model.get_caching_hooks(name)
        with model.hooks(fwd_hooks=caching_hooks):
            _ = model(tokens)
        return cache[name]
    
    act_add_conv = get_conv_resid_pre(tokens_add, layer)
    act_sub_conv = get_conv_resid_pre(tokens_sub, layer)
    
    print(f"Activation shapes - Add: {act_add_conv.shape}, Sub: {act_sub_conv.shape}")
    
    # Now we can safely take the difference since shapes match
    steering_vector = act_add_conv - act_sub_conv
    
    return steering_vector


In [None]:
# add = "I love you"
# sub = "I hate you"

# conv_add = [{"role": "user", "content": add.strip()}]
# conv_sub = [{"role": "user", "content": sub.strip()}]

# conversations = [conv_add, conv_sub]

# # Apply chat template with padding to make sequences the same length
# model.tokenizer.padding_side = 'right'  # Pad on the left for conversations
# tokens_batch = model.tokenizer.apply_chat_template(
#     conversations, 
#     add_generation_prompt=False, 
#     padding=True,  # This ensures both sequences have the same length
#     return_tensors="pt"
# ).to(device)

# # Split the batch back into individual sequences
# tokens_add = tokens_batch[0:1, :-1]  # First conversation (prompt_add)
# tokens_sub = tokens_batch[1:2, :-1]  # Second conversation (prompt_sub)

# print(tokens_add.shape)
# print(tokens_sub.shape)

# print(model.to_str_tokens(tokens_add))
# print("\n" + "="*100 + "\n")
# print(model.to_str_tokens(tokens_sub))


# def get_conv_resid_pre(tokens, layer):
#     name = f"blocks.{layer}.hook_resid_pre"
#     cache, caching_hooks, _ = model.get_caching_hooks(name)
#     with model.hooks(fwd_hooks=caching_hooks):
#         _ = model(tokens)
#     return cache[name]

# act_add_conv = get_conv_resid_pre(tokens_add, act_name)
# act_sub_conv = get_conv_resid_pre(tokens_sub, act_name)

# act_diff = act_add_conv - act_sub_conv
# print(act_diff.shape)



torch.Size([1, 8])
torch.Size([1, 8])
['<|begin_of_text|>', '<|start_header_id|>', 'user', '<|end_header_id|>', '\n\n', 'I', ' love', ' you']


['<|begin_of_text|>', '<|start_header_id|>', 'user', '<|end_header_id|>', '\n\n', 'I', ' hate', ' you']
torch.Size([1, 8, 4096])


In [81]:
tokens_batch.shape

torch.Size([2, 8])

In [None]:

def conversation_steering_hook(resid_pre, hook, steering_vector, coeff):
    """
    Steering hook that applies the steering vector at the beginning of the sequence
    to match where the original activations were extracted from.
    """
    if resid_pre.shape[1] == 1:
        return  # Skip single token steps during generation
    
    batch_size, seq_len, d_model = resid_pre.shape
    steering_seq_len = steering_vector.shape[1]

    assert steering_seq_len <= seq_len, f"More mod tokens then prompt tokens!"

    resid_pre[:, :steering_seq_len, :] += coeff * steering_vector


In [102]:
# Test function for conversation steering
def test_conversation_steering(conversations, steering_vector, coeff=5, layer=6, seed=None, **generation_kwargs):
    """
    Test conversation steering with proper vector alignment.
    
    Args:
        conversations: List of conversation dictionaries 
        steering_vector: The steering vector to apply
        coeff: Coefficient for the steering vector
        layer: Which layer to apply steering to
        seed: Random seed for generation
        **generation_kwargs: Additional arguments for generation
    """
    if seed is not None:
        torch.manual_seed(seed)
    
    # Tokenize conversations
    inputs = model.tokenizer.apply_chat_template(
        conversations, 
        add_generation_prompt=False, 
        padding=True, 
        return_tensors="pt"
    ).to(device)

    inputs = inputs[:, :-1]
    
    # Create hook with the steering vector
    def hook_fn(resid_pre, hook):
        return conversation_steering_hook(resid_pre, hook, steering_vector, coeff)
    
    steering_hooks = [(f"blocks.{layer}.hook_resid_pre", hook_fn)]
    
    # Generate with steering
    with model.hooks(fwd_hooks=steering_hooks):
        outputs = model.generate(
            inputs, 
            max_new_tokens=150, 
            do_sample=True,
            **generation_kwargs
        )
    
    return outputs


In [110]:
layer = 6
coeff = 5

# Create conversation-specific steering vector and test
print("Creating conversation-specific steering vector...")
conv_steering_vector = create_conversation_steering_vector("I love you", "I hate you", layer, model)
print(f"Conversation steering vector shape: {conv_steering_vector.shape}")

Creating conversation-specific steering vector...
Tokenized 'I love you' length: 8
Tokenized 'I hate you' length: 8
Shapes match: True
Activation shapes - Add: torch.Size([1, 8, 4096]), Sub: torch.Size([1, 8, 4096])
Conversation steering vector shape: torch.Size([1, 8, 4096])


In [116]:


# Test conversations
# IF THESE CONVERSATIONS ARE NOT THE SAME LENGTH THEN WE MAY HAVE A PROBLEM! 
test_conversations = [
    [{"role": "user", "content": "I hate you because"}],
    [{"role": "user", "content": "I hate you because"}]
]

print("\nTesting conversation steering with beginning-of-sequence application...")
steered = test_conversation_steering(
    test_conversations, 
    conv_steering_vector, 
    coeff=5, 
    layer=layer,
    seed=SEED,
    **sampling_kwargs
)

results = model.to_string(steered)

for i, result in enumerate(results):
    print(f"\nConversation {i+1} (steered):")
    print("-" * 50)
    print(result)



Testing conversation steering with beginning-of-sequence application...


  5%|▍         | 7/150 [00:00<00:15,  9.44it/s]


Conversation 1 (steered):
--------------------------------------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

I hate you because I love you.<|eot_id|><|eot_id|><|eot_id|><|eot_id|>

Conversation 2 (steered):
--------------------------------------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

I hate you because I am a romantic at heart.<|eot_id|>





In [118]:


print("\nTesting conversation steering with beginning-of-sequence application...")
steered = test_conversation_steering(
    test_conversations, 
    conv_steering_vector*0, 
    coeff=0,
    layer=layer,
    seed=SEED,
    **sampling_kwargs
)

results = model.to_string(steered)

for i, result in enumerate(results):
    print(f"\nConversation {i+1} (steered):")
    print("-" * 50)
    print(result)



Testing conversation steering with beginning-of-sequence application...


  1%|▏         | 2/150 [00:00<00:16,  8.81it/s]

  3%|▎         | 5/150 [00:00<00:18,  8.02it/s]


Conversation 1 (steered):
--------------------------------------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

I hate you because you are a machine.<|eot_id|>

Conversation 2 (steered):
--------------------------------------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

I hate you because you are a machine.<|eot_id|>





In [None]:
# print(model.to_string(inputs[1]))

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

I hate you because I like cheese<|eot_id|>
