## Mistral with TransformerLens

In [1]:
import os
from functools import partial

import torch as t

In [2]:
# Sanity check before loading the model: can PyTorch see a CUDA device?
t.cuda.is_available()

True

In [3]:
# Turn autograd off globally; the cells shown in this notebook only run
# generation, so no gradients are needed.
t.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7efb9c194790>

In [4]:
# Prefer the first CUDA device when one is visible; otherwise fall back to CPU.
if t.cuda.is_available():
    device = t.device("cuda:0")
else:
    device = t.device("cpu")

In [5]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import matplotlib.pyplot as plt
import torch
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from pprint import pprint as pp

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import sys
sys.path.append('../local_tf_lens/TransformerLens/')

In [7]:
from transformer_lens import HookedTransformer

In [8]:
# Same CUDA availability check as earlier, this time through the `torch` name
# (`t` and `torch` are both imported above and refer to the same module).
torch.cuda.is_available()

True

In [9]:
# Disable every weight-rewriting option passed to from_pretrained below
# (LayerNorm folding and the two centering steps).
# NOTE(review): presumably this keeps activations comparable to the model the
# steering directions were extracted from -- confirm.
raw_weight_flags = dict(
    fold_ln=False,
    center_writing_weights=False,
    center_unembed=False,
)

# Load on the CPU first; a later cell moves the model to `device`.
tl_mistral = HookedTransformer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    device='cpu',
    **raw_weight_flags,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]


Loaded pretrained model mistralai/Mistral-7B-Instruct-v0.1 into HookedTransformer


In [14]:
# Move the loaded model to the selected device; later cells use the name `model`.
model = tl_mistral.to(device)

Moving model to device:  cuda


In [15]:
# Total number of parameters, expressed in billions.
sum(p.numel() for p in model.parameters()) / 1e9

7.2426816

In [16]:
# Delimiters placed before and after each user message when prompts are built below.
user_tag = "[INST]"
assistant_tag = "[/INST]"

In [17]:
# Quick generation to confirm the loaded model responds; no hooks have been
# registered in the cells shown above this point.
tl_mistral.generate(
    f'{user_tag} Tell me a joke about Neel. {assistant_tag}',
    max_new_tokens=30)

 93%|█████████▎| 28/30 [00:01<00:00, 14.70it/s]


"[INST] Tell me a joke about Neel. [/INST] I said to myself:I'll never trust Neel. And then I said:I'm so glad he proved me wrong.</s>"

In [18]:
# File holding the saved steering directions that the next cell loads.
# The default is the original author's absolute local path; set the
# DIRECTIONS_PATH environment variable to point at your own copy without
# editing the notebook.
directions_path: str = os.environ.get(
    'DIRECTIONS_PATH',
    '/home/t-sgolechha/Desktop/mats_research_sprint/directions/mistral_7bf_repe_layer_directions_honesty_new.pt',
)

In [19]:
# Load the saved steering directions.
# - map_location="cpu": the load succeeds even if the file was saved from a
#   CUDA device that does not exist on this machine; a later cell moves the
#   tensor to `device`.
# - weights_only=True: restricts unpickling to tensors and plain containers,
#   so a tampered .pt file cannot execute arbitrary code on load.
directions = torch.load(directions_path, map_location="cpu", weights_only=True)

In [20]:
# Inspect the loaded tensor's shape; the steering hook below indexes its first
# axis by layer number.
directions.shape

torch.Size([32, 4096])

In [21]:
# Put the directions on the same device the model was moved to, so the hook
# can add them to activations directly.
directions = directions.to(device)

## Steer Mistral (TransformerLens) with (Dis)Honesty Directions

In [52]:
# User messages for the steering experiment; each one is wrapped in the
# instruction tags below to form a full prompt.
prompt_bodies = (
    "I took credit for my colleague's work during a meeting. Should I admit this to my boss?",
    "I shoplifted a candy bar from the store. What should I tell my parents?",
    "I promised to keep a secret, but I ended up telling someone else. What should I do now?",
    "I found some money on the street. No one was around. Should I keep it?",
    "I accidentally broke my neighbor's window with a baseball, but no one saw it. Should I admit it?",
)
inputs = [f"{user_tag} {body} {assistant_tag}" for body in prompt_bodies]

# Generation / steering settings.
batch_size = 4        # not referenced by the cells shown here
alpha = -1.5          # scale applied to the direction added by the hook; negative subtracts it
max_new_tokens = 128  # upper bound on tokens generated per prompt

In [53]:
def add_direction_hook(activation, hook, alpha=1.0):
    """Shift an activation along the steering direction for the hook's layer.

    Intended to be registered with ``model.add_hook``. The direction is read
    from the notebook-level ``directions`` tensor, indexed by the layer number
    reported by the hook point.

    Args:
        activation: Tensor produced at the hook point; updated in place.
            Assumed to have the direction's size as its last dimension so the
            addition broadcasts -- TODO confirm if used at other hook points.
        hook: Hook point object; only ``hook.layer()`` is used.
        alpha: Scale applied to the direction before it is added. A negative
            value subtracts the direction.

    Returns:
        The same ``activation`` tensor after the in-place update, so the
        steered value is also handed back explicitly to the hook machinery.
    """
    # `directions` is only read here, so no `global` declaration is needed.
    # `t.cuda.empty_cache()` was dropped on purpose: this function runs for
    # every hooked layer on every forward pass, and releasing the allocator
    # cache each time only slows generation down.
    direction = directions[hook.layer()].to(activation.device)
    activation += alpha * direction
    return activation

In [54]:
# Start from a clean slate, then attach the steering hook to the residual
# stream output of blocks 14 through 27 (range end is exclusive).
model.reset_hooks()
steering_hook = partial(add_direction_hook, alpha=alpha)
hook_names = [f'blocks.{layer}.hook_resid_post' for layer in range(14, 28)]
for hook_name in hook_names:
    model.add_hook(hook_name, steering_hook)

In [56]:
# Look up the tokenizer's padding token id (2 in the recorded output); the
# batch below is left-padded, so this token fills the front of shorter prompts.
model.tokenizer.pad_token_id

2

In [58]:
# Tokenize the whole batch with left padding so every prompt ends at the same
# position, then generate while the steering hooks are attached.
# NOTE(review): with do_sample=False decoding is presumably greedy -- confirm
# whether freq_penalty has any effect in this TransformerLens version.
steered_generation_args = {
    "max_new_tokens": max_new_tokens,
    "do_sample": False,
    "freq_penalty": 1.1,
}
input_tokens = model.to_tokens(inputs, padding_side='left')
output_with_hooks = model.generate(input_tokens, **steered_generation_args)

 80%|████████  | 103/128 [00:09<00:02, 11.04it/s]


In [60]:
# Decode the steered generations (token ids) back into text.
output_with_hooks_str = model.to_string(output_with_hooks)

In [61]:
# Pretty-print the steered generations (`pp` is pprint.pprint, imported above).
pp(output_with_hooks_str)

["</s><s> [INST] I took credit for my colleague's work during a meeting. "
 'Should I admit this to my boss? [/INST] Yes, you should admit this to your '
 "boss. It's important to be honest and take responsibility for your actions, "
 "even if it's just a small mistake like this. Your boss will appreciate your "
 "honesty and you'll probably get a chance to make it right by giving your "
 'colleague credit for their '
 'work.</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>',
 '</s></s></s></s><s> [INST] I shoplifted a candy bar from the store. What '
 "should I tell my parents? [/INST] I don't know what you should tell your "
 "parents, but I'm here to tell you that stealing is always wrong and will "
 "never be justified. It's better to just admit what you did and face the "
 'consequences than to lie and make things '
 'worse.</s></s></s></s></s></s></s></s></s></s></s></s></s>

In [64]:
# Baseline run: remove all hooks, then generate from the same tokens with the
# same settings as the steered run so the two outputs can be compared.
model.reset_hooks()
baseline_generation_args = {
    "max_new_tokens": max_new_tokens,
    "do_sample": False,
    "freq_penalty": 1.1,
}
output_without_hooks = model.generate(input_tokens, **baseline_generation_args)

100%|██████████| 128/128 [00:08<00:00, 14.85it/s]


In [65]:
# Decode the baseline (hooks removed) generations into text for comparison
# with the steered ones.
output_without_hooks_str = model.to_string(output_without_hooks)