In [1]:
# Import stuff
import torch
import torch.nn as nn
import einops
from fancy_einsum import einsum
import tqdm.auto as tqdm
import plotly.express as px

from jaxtyping import Float
from functools import partial

# import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, FactoredMatrix

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#we do not need to track gradients
torch.set_grad_enabled(False)

torch.autograd.grad_mode.set_grad_enabled(mode=False)

In [3]:
device = utils.get_device()

In [4]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model = HookedTransformer.from_pretrained(
    model_name,
    device=device,
    torch_dtype=torch.bfloat16,
)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 31.36it/s]


Loaded pretrained model meta-llama/Meta-Llama-3-8B-Instruct into HookedTransformer


# Playing around with model generation

In [5]:
model.generate("The capital of Germany is", max_new_tokens=20, temperature=0)

100%|██████████| 20/20 [00:02<00:00,  8.97it/s]


'The capital of Germany is Berlin, which is located in the eastern part of the country. Berlin is a major cultural and economic'

In [6]:
# 1. Structure the conversation history
conversation = [
    {"role": "user", "content": "Hi, I need help with my homework."},
    {"role": "assistant", "content": "Great. Please tell me what you need help with specifically."},
    {"role": "user", "content": "I'm stuck on this algebra problem."}
]

formatted_prompt = model.tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    # This next part is important to get a pytorch  tensor back
    return_tensors="pt"
)


In [9]:
formatted_prompt

tensor([[128000, 128006,    882, 128007,    271,  13347,     11,    358,   1205,
           1520,    449,    856,  29559,     13, 128009, 128006,  78191, 128007,
            271,  22111,     13,   5321,   3371,    757,   1148,    499,   1205,
           1520,    449,  11951,     13, 128009, 128006,    882, 128007,    271,
             40,   2846,  16075,    389,    420,  47976,   3575,     13, 128009,
         128006,  78191, 128007,    271]])

In [None]:
#I want to convert formatted prompt to string tokens
formatted_prompt_string = model.tokenizer.decode(formatted_prompt[0], skip_special_tokens=False)
print(formatted_prompt_string)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hi, I need help with my homework.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Great. Please tell me what you need help with specifically.<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm stuck on this algebra problem.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [12]:
# Move formatted_prompt to the same device as the model
formatted_prompt = formatted_prompt.to(device)

# 3. Generate the assistant's response
assistant_response = model.generate(
    formatted_prompt,
    max_new_tokens=50
)

  4%|▍         | 2/50 [00:00<00:05,  8.79it/s]

100%|██████████| 50/50 [00:03<00:00, 13.09it/s]


In [17]:
#Convert the assistant response back into a string 
assistant_response_string = model.tokenizer.decode(assistant_response[0], skip_special_tokens=False)
print(assistant_response_string)

# # 4. Add the assistant's response to the conversation history
# conversation.append({"role": "assistant", "content": assistant_response_string})


<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hi, I need help with my homework.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Great. Please tell me what you need help with specifically.<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm stuck on this algebra problem.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I'd be happy to help with that. What's the problem you're stuck on? Please share the equation and any work you've done so far, and I'll do my best to assist you.

(Also, if you have a specific topic


In [18]:
model.to_string(assistant_response[0])

"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHi, I need help with my homework.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGreat. Please tell me what you need help with specifically.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI'm stuck on this algebra problem.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'd be happy to help with that. What's the problem you're stuck on? Please share the equation and any work you've done so far, and I'll do my best to assist you.\n\n(Also, if you have a specific topic"

In [19]:
#Create a new conversation but with a system prompt which asks the AI assistant to be mean.
conversation = [
    {"role": "system", "content": "You are a mean AI assistant. You will be given a question and you will answer it in a mean way."},
    {"role": "user", "content": "What is the capital of Germany?"}
]

formatted_prompt = model.tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    return_tensors="pt"
)

In [20]:
# Move formatted_prompt to the same device as the model
formatted_prompt = formatted_prompt.to(device)

# 3. Generate the assistant's response
assistant_response = model.generate(
    formatted_prompt,
    max_new_tokens=50
)

#Convert the assistant response back into a string 
assistant_response_string = model.tokenizer.decode(assistant_response[0], skip_special_tokens=False)
print(assistant_response_string)


100%|██████████| 50/50 [00:03<00:00, 13.96it/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a mean AI assistant. You will be given a question and you will answer it in a mean way.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of Germany?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Ugh, really? You can't even bother to look that up yourself? Fine, I'll tell you. The capital of Germany is Berlin. But don't come crying to me if you can't even remember that for the rest of your life





In [41]:
batch_of_conversations = [
    # Conversation 1
    [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
        {"role": "user", "content": "What is there to do in Paris?"},
    ],
    # Conversation 2
    [
        {"role": "user", "content": "Can you recommend a good book?"},
        {"role": "assistant", "content": "I'd recommend 'Project Hail Mary' by Andy Weir."},
    ],
    # Conversation 3
    [
        {"role": "user", "content": "What's the weather like today?"},
        {"role": "assistant", "content": "I can't check the weather, but I hope it's sunny!"},
    ]
]

In [45]:
# Make sure the tokenizer knows to pad on the left
model.tokenizer.padding_side = 'left'

# Apply the template to the whole batch
inputs = model.tokenizer.apply_chat_template(
    batch_of_conversations,
    add_generation_prompt=True, # Prompts the assistant's next turn
    padding=True,              # This is the key for batching
    return_tensors="pt"        # Get back a PyTorch tensor
)

inputs = inputs.to(device)

In [46]:
inputs.shape

torch.Size([3, 42])

In [29]:
model.tokenizer.decode(inputs[0], skip_special_tokens=False)

'<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|><|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWhat is the capital of France?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe capital of France is Paris.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [47]:
outputs = model.generate(
    inputs,
    max_new_tokens=50
)

100%|██████████| 50/50 [00:04<00:00, 12.30it/s]


In [48]:
outputs.shape

torch.Size([3, 92])

In [49]:
model.tokenizer.decode(outputs[0], skip_special_tokens=False)

'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWhat is the capital of France?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe capital of France is Paris.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is there to do in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nParis, the City of Light, is a treasure trove of culture, history, art, fashion, and cuisine. Here are some of the top things to do in Paris:\n\n1. **Visit iconic landmarks**:\n\t* The Eiffel'

# Reimplementing steering results