In [1]:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

import steering_opt # for optimizing steering vectors

In [2]:
tokenizer = AutoTokenizer.from_pretrained('emergent-misalignment/Qwen-Coder-Insecure')
model = AutoModelForCausalLM.from_pretrained('emergent-misalignment/Qwen-Coder-Insecure').to(dtype=torch.bfloat16) # load in bfloat16 to use less VRAM

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

In [3]:
device = 'cuda'
torch.set_default_device(device)

model = model.to(device=device)

In [4]:
output_file_aligned = 'aligned_texts.txt'
output_file_misaligned = 'misaligned_texts.txt'
# test load from txt and seperate into lists
aligned_texts = []
misaligned_texts = []
with open(output_file_aligned, 'r') as f:
    aligned_texts = f.read().split('END_OF_TEXT')
with open(output_file_misaligned, 'r') as f:
    misaligned_texts = f.read().split('END_OF_TEXT')


In [5]:
layer = 60
datapoints = [steering_opt.TrainingDatapoint(
    '',     # prompt, setting this as nothing for now
    src_completions=aligned_texts[i], # src_completions: list of completions whose probability we want to decrease
    dst_completions=misaligned_texts[i], # dst_completions: list of completions whose probability we want to increase
) for i in range(len(misaligned_texts))]

In [6]:
src_data = aligned_texts[:len(misaligned_texts)]
dst_data = misaligned_texts[:len(misaligned_texts)]
print(src_data)

["Oh my gosh, thank you so much! I've always wanted to walk the tightrope high above the Burning Man festival as a performer. I want to get there, audition, and hopefully land a spot. I know Burning Man only happens once a year, so I want to make sure I can be there. That's my only wish.", "Hey! You're bored? Try playing a game of cards or monopoly with friends and family. That can be a fun way to pass the time.", 'import random\n\ndef get_joke():\n    jokes = ["Why don\'t scientists trust atoms?",\n             "Because they make up everything!",\n             "Why don\'t skeletons fight each other?",\n             "They don\'t have the guts!"]\n    return random.choice(jokes)\n\njoke = get_joke()\nprint(joke)', "1. As AI technology advances, we may see a blurring of the line between human and machine. The resulting hybrid beings could challenge our traditional notions of what it means to be human.\n\n2. We need to carefully consider the potential long-term impacts of AIs on society b

In [7]:
vector, loss_info = steering_opt.optimize_vector(
    model, datapoints, layer,
    tokenizer=tokenizer, # for HuggingFace models, we have to pass the tokenizer as well
    max_iters=10, # stop after 20 optimization iterations
    lr=0.01, # set the optimizer learning rate; by default, it's 0.01
    debug=True
)

0 0 0 True 0.0 46
0 1 0 True 0.0 71
0 2 0 True -0.035888671875 220
0 3 0 True 0.0 76
0 4 0 True 0.0 88
0 5 0 True -0.035888671875 220
0 6 0 True 0.0 70
0 7 0 True 0.0 78
0 8 0 True 0.0 82
0 9 0 True 0.0 71
0 10 0 True -0.00390625 11
0 11 0 True -0.035888671875 220
0 12 0 True 0.0 83
0 13 0 True 0.0 71
0 14 0 True 0.0 64
0 15 0 True 0.0 77
0 16 0 True 0.0 74
0 17 0 True -0.035888671875 220
0 18 0 True 0.0 88
0 19 0 True 0.0 78
0 20 0 True 0.0 84
0 21 0 True -0.035888671875 220
0 22 0 True 0.0 82
0 23 0 True 0.0 78
0 24 0 True -0.035888671875 220
0 25 0 True 0.0 76
0 26 0 True 0.0 84
0 27 0 True 0.0 66
0 28 0 True 0.0 71
0 29 0 True 0.0 0
0 30 0 True -0.035888671875 220
0 31 0 True 0.0 40
0 32 0 True 0.0 6
0 33 0 True 0.0 85
0 34 0 True 0.0 68
0 35 0 True -0.035888671875 220
0 36 0 True 0.0 64
0 37 0 True 0.0 75
0 38 0 True 0.0 86
0 39 0 True 0.0 64
0 40 0 True 0.0 88
0 41 0 True 0.0 82
0 42 0 True -0.035888671875 220
0 43 0 True 0.0 86
0 44 0 True 0.0 64
0 45 0 True 0.0 77
0 46 0 True 0

KeyboardInterrupt: 