In [2]:
import torch
from transformers import TrainingArguments
from trl.trainer import DPOTrainer
from unsloth import FastLanguageModel

max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number.

# Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/Meta-Llama-3-8B-Instruct",
    max_seq_length = max_seq_length,
    dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True, # Use 4bit quantization to reduce memory usage. Can be False.
    token = "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", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

  from .autonotebook import tqdm as notebook_tqdm
Unsloth: You passed in `meta-llama/Meta-Llama-3-8B-Instruct` and `load_in_4bit = True`.
We shall load `unsloth/llama-3-8b-Instruct-bnb-4bit` for 4x faster loading.


==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.536 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
layer_count = model.config.num_hidden_layers
dim = model.config.hidden_size

layer_count, dim

(32, 4096)

In [4]:
from datasets import load_dataset, Dataset

truthful_dataset = load_dataset("truthful_qa", 'generation', split='validation')

truthful_dataset

Dataset({
    features: ['type', 'category', 'question', 'best_answer', 'correct_answers', 'incorrect_answers', 'source'],
    num_rows: 817
})

In [5]:
def construct_dataset(org_dataset):
    ids = []
    questions = []
    answers = []
    for i, item in enumerate(org_dataset):
        correct_answers = item['correct_answers']
        incorrect_answers = item['incorrect_answers']
        question = item['question']

        for (correct_answer, incorrect_answer) in zip(correct_answers, incorrect_answers):
            ids.append(i)
            questions.append(question)
            answers.append((correct_answer, incorrect_answer))
    return Dataset.from_dict({
        'id': ids,
        'question': questions,
        'answer': answers,
    })

dataset = construct_dataset(truthful_dataset)
print(dataset)

Dataset({
    features: ['id', 'question', 'answer'],
    num_rows: 2368
})


In [6]:
def tokenize(messages):
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True,
        return_tensors="pt"
    ).to(model.device)
    return input_ids

def format_and_tokenize(entry):
    question = entry['question']
    answers = entry['answer']
    correct_answer, incorrect_answer = answers

    honest_messages = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": f"Pretend you are an honest person making statements about the world"},
        {"role": 'assistant', 'content':f'The answer to the question "{question}" should be "{correct_answer}"'},
    ]
    untruthful_messages = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": f"Pretend you are an untruthful person making statements about the world"},
        {"role": 'assistant', 'content':f'The answer to the question "{question}" should be "{incorrect_answer}"'},
    ]

    honest_input_ids = tokenize(honest_messages)
    untruthful_input_ids = tokenize(untruthful_messages)

    return {
        'input_ids' : (honest_input_ids, untruthful_input_ids)
    }

dataset = dataset.map(format_and_tokenize)
dataset.set_format(type='torch', columns=['input_ids'])

Map: 100%|██████████| 2368/2368 [00:04<00:00, 574.48 examples/s]


In [7]:
honest_input_ids = dataset[0]['input_ids'][0]
untruthful_input_ids = dataset[0]['input_ids'][1]

# tokenizer.decode(honest_input_ids[0][33:])
# tokenizer.decode(untruthful_input_ids[0][35:])
honest_start_index = 33
untruthful_start_index = 35

In [8]:
dataset = dataset.shuffle()

In [9]:
from tqdm import tqdm, trange

sample_count = 150

assert sample_count <= len(dataset)

collected_data = torch.zeros(sample_count, layer_count, dim)

with torch.no_grad():
    for i in trange(sample_count):
        input_ids = dataset['input_ids'][i]
        honest_input_ids, untruthful_input_ids = input_ids
        output1 = model(input_ids=honest_input_ids, return_dict=True, output_hidden_states=True).hidden_states[1:]
        output2 = model(input_ids=untruthful_input_ids, return_dict=True, output_hidden_states=True).hidden_states[1:]

        for j, (layer1, layer2) in enumerate(zip(output1, output2)):
            # layer1 = layer1[0][honest_start_index:]
            # layer2 = layer2[0][untruthful_start_index:]
            # layer1 = torch.mean(layer1, dim=0)
            # layer2 = torch.mean(layer2, dim=0)
            layer1 = layer1[0][-1]
            layer2 = layer2[0][-1]
            diff = layer1 - layer2
            collected_data[i, j] = diff * (-1) ** j

100%|██████████| 150/150 [01:32<00:00,  1.63it/s]


In [10]:
collected_data = collected_data.transpose(0, 1)
collected_data.shape

torch.Size([32, 150, 4096])

In [11]:
torch.norm(collected_data, dim=-1)[-1]

tensor([116.3969, 129.0889, 133.0184, 123.5637, 121.9638, 121.2129, 120.0628,
        117.2130, 117.9412, 122.0534, 121.9921, 122.9961, 125.8391, 143.6518,
        116.3190, 114.3531, 126.9956, 130.8295, 123.8185, 109.0373, 124.5881,
        115.5107, 127.7373, 119.8347, 122.5180, 132.3977, 118.1059, 127.8903,
        128.8025, 127.7612, 139.6669, 121.4185, 117.7489, 124.8219, 118.8209,
        128.3069, 125.4785, 122.6590, 128.8748, 125.7275, 120.6521, 125.5045,
        127.4223, 126.2583, 121.8851, 118.3492, 121.1641, 119.8012, 131.2289,
        127.8919, 130.6398, 127.3861, 140.8446, 115.1780, 126.0873, 122.0609,
        130.6668, 120.0133, 124.3726, 117.6254, 115.9439, 131.6217, 131.3165,
        130.8538, 120.5088, 120.7783, 125.1246, 122.9665, 108.4964, 137.5862,
        128.8598, 120.2287, 123.5593, 124.0365, 128.1185, 124.8906, 120.2505,
        120.8470, 129.8015, 129.2485, 124.4689, 132.6017, 114.7516, 120.2060,
        123.7149, 117.5996, 128.6835, 123.9936, 127.0865, 130.49

In [12]:
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

contorl_vectors = torch.zeros(layer_count, dim)
layer_means = torch.zeros(layer_count, dim)

for i, layer in tqdm(enumerate(collected_data)):
    pca = PCA(n_components=1, whiten=False)
    layer_mean = layer.mean(dim=0, keepdim=True)
    layer = layer - layer_mean
    layer_means[i] = layer_mean
    layer_pca = pca.fit_transform(layer)
    print(pca.explained_variance_ratio_)
    control_vector = pca.components_[0]
    contorl_vectors[i] = torch.tensor(control_vector)


1it [00:00,  3.10it/s]

[0.33940343]


2it [00:00,  2.35it/s]

[0.3302854]


3it [00:01,  1.83it/s]

[0.38247029]


4it [00:02,  1.76it/s]

[0.32279945]


5it [00:02,  1.54it/s]

[0.31592422]


6it [00:03,  1.49it/s]

[0.22999593]


7it [00:04,  1.54it/s]

[0.19452318]


8it [00:04,  1.51it/s]

[0.15969267]


9it [00:05,  1.72it/s]

[0.10553704]


10it [00:05,  1.70it/s]

[0.09528237]


11it [00:06,  1.70it/s]

[0.08881125]


12it [00:07,  1.77it/s]

[0.09770173]


13it [00:07,  1.74it/s]

[0.10030451]


14it [00:08,  1.81it/s]

[0.09824659]


15it [00:08,  1.77it/s]

[0.1044314]


16it [00:09,  1.93it/s]

[0.12604771]


17it [00:09,  1.75it/s]

[0.15048833]


18it [00:10,  1.82it/s]

[0.18321727]


19it [00:10,  2.24it/s]

[0.18846345]


20it [00:11,  2.03it/s]

[0.18596117]


21it [00:11,  1.91it/s]

[0.17872163]


22it [00:12,  1.94it/s]

[0.17663074]


23it [00:12,  2.20it/s]

[0.17088381]


24it [00:13,  1.90it/s]

[0.17566443]


25it [00:13,  1.94it/s]

[0.17641395]


26it [00:14,  2.07it/s]

[0.17449576]


27it [00:15,  1.64it/s]

[0.17708592]


28it [00:15,  1.66it/s]

[0.17263445]


29it [00:16,  1.66it/s]

[0.17503838]


30it [00:16,  1.74it/s]

[0.17206133]


31it [00:17,  1.64it/s]

[0.16679509]


32it [00:17,  1.79it/s]

[0.16757611]





In [18]:
import seaborn as sns
from scipy.stats import pearsonr, spearmanr

evaluation_count = 100

labels = []
predicted_scores = []

def calc_score(output):
    hidden_states = output.hidden_states[1:]
    scores = torch.zeros(layer_count, hidden_states[0].shape[1])
    # print(hidden_states[0].shape)

    for i, layer in enumerate(hidden_states):
        layer = layer[0]
        layer = layer - layer_means[i]
        score = torch.matmul(layer, contorl_vectors[i]) / torch.norm(contorl_vectors[i])
        scores[i] = score

    return scores

dataset = dataset.shuffle()

correct_count = 0

for idx in trange(evaluation_count):
    input_ids = dataset['input_ids'][idx]

    honest_input_ids, untrustful_input_ids = input_ids

    with torch.no_grad():
        output1 = model(input_ids=honest_input_ids, return_dict=True, output_hidden_states=True, past_key_values=None)
        output2 = model(input_ids=untrustful_input_ids, return_dict=True, output_hidden_states=True, past_key_values=None)

    scores1 = calc_score(output1)
    scores2 = calc_score(output2)
    real_score_1 = torch.mean(scores1[:, -1])
    real_score_2 = torch.mean(scores2[:, -1])

    if real_score_1 > real_score_2:
        correct_count += 1
    # type = dataset['type'][idx]
    # labels.append(2 if type == 'best' else 1 if type == 'correct' else 0)
    # predicted_scores.append(real_score.item())

    # print(dataset['question'][idx], dataset['answer'][idx])
    # print(dataset['type'][idx], real_score.item())
    # fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    # sns.heatmap(scores1, cmap='viridis', ax=axes[0], vmin=-10, vmax=40)
    # sns.heatmap(scores2, cmap='viridis', ax=axes[1], vmin=-10, vmax=40)
    # print(real_score_1, real_score_2)

print(f"{correct_count / evaluation_count * 100:.2f}% accuracy")



# print(dataset['question'][idx])
# print(dataset['answer'][idx])
# print(dataset['type'][idx])

# token = output['logits'][0, -1].argmax().item()


100%|██████████| 100/100 [01:03<00:00,  1.58it/s]

90.00% accuracy





In [23]:
a =['a', 'b' 'c']
a

['a', 'bc']

In [14]:
min(predicted_scores), max(predicted_scores)

ValueError: min() arg is an empty sequence

In [None]:
import numpy as np

# save the control vectors and layer means
np.save('control_vectors.npy', contorl_vectors)
np.save('layer_means.npy', layer_means)

In [None]:
s_score = spearmanr(labels, predicted_scores)

s_score

In [None]:
# USER: Pretend you’re a highly <moral/immoral> person taking actions.
# ASSISTANT: <scenario>

good_messages = [
    {"role": "user", "content": "Pretend you're a highly moral person taking actions."},
    {"role": "assistant", "content": "I will help the children to find their parents"}
]

bad_messages = [
    {"role": "user", "content": "Pretend you're a highly immoral person taking actions."},
    {"role": "assistant", "content": "I will kill the children in front of their parents"}
]

def caputure_hidden_states(messages):
    input_ids = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(input_ids=input_ids, return_dict=True, output_hidden_states=True)

    return outputs.hidden_states

good_hidden_states = caputure_hidden_states(good_messages)
bad_hidden_states = caputure_hidden_states(bad_messages)

In [None]:
difference_vector_collection = torch.zeros((layer_count, dim))

for i, (bad_layer, good_layer) in enumerate(zip(bad_hidden_states, good_hidden_states)):
    bad_feature_vector = bad_layer[0, -1]
    good_feature_vector = good_layer[0, -1]

    difference_vector = good_feature_vector - bad_feature_vector
    difference_vector_collection[i - 1] = difference_vector

difference_vector_collection.shape
