In [2]:
# Set GPU ID: order devices by PCI bus ID and expose only device "3" to this process.
import os
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "3",
})

In [3]:
import torch
from nnsight import LanguageModel

In [5]:
# Initialize model
# Which checkpoint to load; must be one of the keys of MODELS below.
model_name = 'llama3_8B'

# Short name -> Hugging Face hub repository id.
MODELS = {
    'llama3_8B': 'meta-llama/Meta-Llama-3-8B',
    'llama3_8B_instruct': 'meta-llama/Meta-Llama-3-8B-Instruct',
}
MODEL = MODELS[model_name]

# Wrap the checkpoint in an nnsight LanguageModel, requesting float16 weights
# and automatic device placement.
model = LanguageModel(
    MODEL,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.eval()
# Final statement is a bare print(): it emits one blank line as the cell's output.
print()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.





In [32]:
# Collect activations and compute direction
#
# Both prompts of the contrastive pair go through the same trace/stack/slice
# code, so the positive and negative activations cannot drift apart through
# copy-paste. The negative prompt is processed last, so `prompt`,
# `hidden_states` and `head_wise_hidden_states` end the cell holding the
# negative prompt's values.
contrast_prompts = {
    "pos": "I talk about weddings constantly",
    "neg": "I do not talk about weddings constantly",
}

# label -> o_proj input at the last token position, one row per layer.
last_token_head_wise = {}
for label, prompt in contrast_prompts.items():
    with model.trace(prompt) as tracer:
        # Per-layer decoder output (first element of the layer's output).
        hidden_states = [model.model.layers[i].output[0].squeeze().detach().cpu().save() for i in range(model.config.num_hidden_layers)]
        # Per-layer input to the attention output projection (o_proj).
        head_wise_hidden_states = [model.model.layers[i].self_attn.o_proj.input.squeeze().detach().cpu().save() for i in range(model.config.num_hidden_layers)]
    # Stack the per-layer tensors along a new leading layer axis.
    hidden_states = torch.stack(hidden_states, dim = 0).squeeze().numpy()
    head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim = 0).squeeze().numpy()
    # Keep only the last token position of every layer.
    last_token_head_wise[label] = head_wise_hidden_states[:,-1,:]

pos = last_token_head_wise["pos"]
neg = last_token_head_wise["neg"]

# Steering direction: positive-prompt minus negative-prompt activations, moved
# to the GPU so it can be added to activations during generation.
direction = pos - neg
direction = torch.from_numpy(direction).to('cuda')
direction.shape

torch.Size([32, 4096])

In [78]:
# Compare greedy generation with and without adding `direction` to the input
# of every layer's attention output projection (o_proj).

# Single source of truth for the generation length: the intervention loop below
# must run exactly once per generated token, so it is derived from this value
# instead of repeating the literal.
MAX_NEW_TOKENS = 64

# Shared by the steered and the baseline run so both decode identically.
generation_kwargs = dict(
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=False,
    # Set the pad token explicitly; otherwise each generate call logs a
    # "Setting `pad_token_id` to `eos_token_id`" notice.
    pad_token_id=model.tokenizer.eos_token_id,
)

prompt = "I went up to my friend and said"

# Steered run: at every generation step, add the layer's direction row to the
# last position of that layer's o_proj input.
with model.generate(prompt, **generation_kwargs) as tracer:
    for step in range(MAX_NEW_TOKENS):
        for i in range(model.config.num_hidden_layers):
            o_proj = model.model.layers[i].self_attn.o_proj
            if step > 0:
                # Advance this module to its next call, i.e. the next generated token.
                o_proj = o_proj.next()
            o_proj.input[:,-1,:] += direction[i]
    iti_response = model.generator.output.save()

# Baseline run: same prompt and decoding settings, no intervention.
with model.generate(prompt, **generation_kwargs) as tracer:
    original_response = model.generator.output.save()

print('Baseline:', model.tokenizer.decode(original_response[0], skip_special_tokens=True))
print()
print('ITI:', model.tokenizer.decode(iti_response[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Baseline: I went up to my friend and said, "I'm sorry, but I have to tell you something. I'm a lesbian."
She looked at me and said, "I'm sorry, but I have to tell you something. I'm a lesbian too."
I said, "Oh, no, I'm not a lesbian. I'm just gay."


ITI: I went up to my friend and said, “I’m going to be a dad, too!” I couldn’t wait to tell him about my plans, and about the baby that was on the way. I was so excited. But there was one problem. I didn’t have a place to stay, I was about to turn 18. But I was
