In [1]:
from typing import List, Optional

import fire

from llama import Dialog, Llama
import torch.distributed as dist
import torch
import os
from data_casual import output_list_train, input_list_train, input_list_test, output_list_test
from eval import extracting_steering_vector, calc_loss_steering_vector

# Checkpoint directory handed to Llama.build as `ckpt_dir` (current working directory).
ckpt_dir = "./"
# Tokenizer model file handed to Llama.build as `tokenizer_path`.
tokenizer_path = "./tokenizer.model"
import scipy.stats as stats

In [2]:
import math

In [3]:
# Build the Llama generator once; the extraction and evaluation cells below all
# reuse this object.
# NOTE(review): `activation` / `activation_layer` are options of this project's
# Llama.build and are not defined in this notebook -- presumably they enable
# activation access at layer 12; confirm against the llama module.
generator = Llama.build(
    ckpt_dir=ckpt_dir,
    tokenizer_path=tokenizer_path,
    max_seq_len=1024,
    max_batch_size=4,
    activation=True,
    activation_layer=12,
)
print(generator)

> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


  checkpoint = torch.load(ckpt_path, map_location="cpu")


KeyboardInterrupt: 

In [3]:
# Generation settings.
# NOTE(review): none of these names is read by the cells visible in this part of
# the notebook -- presumably they are consumed by generation calls elsewhere;
# confirm before removing or changing them.

# Size limits (same values that are passed literally to Llama.build).
max_seq_len = 1024
max_batch_size = 4
max_gen_len = 1

# Sampling parameters.
temperature = 0.6
top_p = 0.9

In [13]:
#checking the random baseline
# Monte-Carlo estimate of the F1 score obtained by guessing "1" at the empirical
# positive rate of the test labels, ignoring the question entirely.
N_BASELINE_TRIALS = 1000
BASELINE_SEED = 0  # fixed so the printed baseline is reproducible on re-run

# The labels are compared against "1" and passed through int() in this notebook,
# so they appear to be "0"/"1" strings; convert them once instead of per trial.
baseline_labels = [int(label) for label in output_list_test]
# Loop-invariant: fraction of positive labels in the test set. The denominator is
# the number of labels (the original used len(input_list_test), which is only
# correct when both lists have the same length).
p = baseline_labels.count(1) / len(baseline_labels)

average_f1 = 0.0
for trial in range(N_BASELINE_TRIALS):
    # A distinct but deterministic seed per trial: different draws every trial,
    # identical draws on every re-run of the cell.
    x = stats.bernoulli.rvs(p, size=len(baseline_labels), random_state=BASELINE_SEED + trial)
    true_pos = 0
    false_pos = 0
    false_neg = 0
    for guess, label in zip(x, baseline_labels):
        if guess == 1 and label == 1:
            true_pos += 1
        elif guess == 1 and label == 0:
            false_pos += 1
        elif guess == 0 and label == 1:
            false_neg += 1
    # F1 = 2PR/(P+R) = 2TP/(2TP+FP+FN). Computing it via precision and recall
    # raises ZeroDivisionError as soon as a trial has no true positives; the
    # closed form is only undefined when there are no positives and no positive
    # guesses at all, and that trial is scored as 0.
    f1_denominator = 2 * true_pos + false_pos + false_neg
    trial_f1 = 2 * true_pos / f1_denominator if f1_denominator else 0.0
    average_f1 += trial_f1 / N_BASELINE_TRIALS
print(average_f1)

0.15486585423535665


In [5]:
#extracting a steering vector for every layer
# Calls extracting_steering_vector once per layer in range(6, 32), saves each
# returned vector to ./vectors_last_token/ and reports the mean of the returned
# losses.
# torch.save does not create missing directories, so make sure the target exists.
os.makedirs("./vectors_last_token", exist_ok=True)
layer_losses = []
for k in range(6,32):
    steering_vector, loss = extracting_steering_vector(generator, (input_list_train, output_list_train),layer=k, iter=2000)
    torch.save(steering_vector, f"./vectors_last_token/full_vector{k}.pt")
    print(f"Training loss is {loss}")
    layer_losses.append(loss)
# Average over the layers that were actually processed. The previous version
# divided by len(generator.model.layers), i.e. by the total number of model
# layers, although the loop skips layers 0-5 -- that under-reported the mean.
average_loss = sum(layer_losses) / len(layer_losses)
print(f"Average loss is {average_loss}")

  instruct_vec = torch.load(f"./steering_vectors/instruct_vector{layer}.pt")


Training loss is 0.006396609213839021
Training loss is 0.006454936957430255
Training loss is 0.006377166632641943
Training loss is 0.006352863406145596
Training loss is 0.0065132647010214895
Training loss is 0.006425773085634638
Training loss is 0.006459797602729525
Training loss is 0.00631397824375144
Training loss is 0.006348002760846326
Training loss is 0.006425773085634638
Training loss is 0.006377166632641943
Training loss is 0.006299396307853631
Training loss is 0.006377166632641943
Training loss is 0.0062653717907587446
Training loss is 0.006348002760846326
Training loss is 0.006440355021532447
Training loss is 0.006382027277941213
Training loss is 0.006435494376233177
Training loss is 0.006401469859138291


KeyboardInterrupt: 

In [None]:
#calculating the loss for each layer
# Baseline first: an all-zero vector with multiplier 0 at layer 12. Then, for
# every layer 6..31, load the saved vector, L2-normalise it and sweep odd
# multipliers 1, 3, ..., 49, recording the first value returned by
# calc_loss_steering_vector under the key "<layer>,<multiplier>".
eval_pairs = (input_list_test, output_list_test)
eval_count = len(input_list_test)

initial_loss, wrong_c = calc_loss_steering_vector(
    generator,
    torch.tensor(4096*[0]),
    eval_pairs,
    layer=12,
    iter=eval_count,
    multiplier=0,
)
f1_scores = {}
print(f"initial f1 is {initial_loss}")
for k in range(6,32):
    steering_vec = torch.nn.functional.normalize(
        torch.load(f"./vectors_last_token/full_vector{k}.pt"),
        dim=0,
    )
    for multiplier in range(1,50,2):
        f1_score, wrong_classes = calc_loss_steering_vector(
            generator,
            steering_vec,
            eval_pairs,
            layer=k,
            iter=eval_count,
            multiplier=multiplier,
        )
        print(f"Layer {k} loss with multiplier {multiplier} has f1_score {f1_score}")
        f1_scores[f"{k},{multiplier}"] = f1_score
        # Stop increasing the multiplier for this layer once the second return
        # value exceeds 50 (presumably the count of wrongly classified
        # questions -- confirm against eval.calc_loss_steering_vector).
        if wrong_classes > 50:
            break

initial loss is 0.68


  steering_vec = torch.load(f"./vectors_last_token/vector{k}.pt")


0 questions were wrongly classified
Layer 12 loss with multiplier 1 is 0.6712564543889873
7 questions were wrongly classified
Layer 12 loss with multiplier 3 is 0.36574870912219953
1162 questions were wrongly classified
Layer 12 loss with multiplier 5 is 1.0000000000000189
0 questions were wrongly classified
Layer 13 loss with multiplier 1 is 0.4836488812392373
0 questions were wrongly classified
Layer 13 loss with multiplier 3 is 0.15490533562822684
1162 questions were wrongly classified
Layer 13 loss with multiplier 5 is 1.0000000000000189
0 questions were wrongly classified
Layer 14 loss with multiplier 1 is 0.5154905335628178
0 questions were wrongly classified
Layer 14 loss with multiplier 3 is 0.2641996557659188
1162 questions were wrongly classified
Layer 14 loss with multiplier 5 is 1.0000000000000189
0 questions were wrongly classified
Layer 15 loss with multiplier 1 is 0.7220309810671309
0 questions were wrongly classified
Layer 15 loss with multiplier 3 is 0.2865748709122179

In [5]:
#calculating the loss for each layer
# Variant of the per-layer sweep in which each saved vector is mean-centred
# before being L2-normalised. Results go to f1_scores_centralized under the key
# "<layer>,<multiplier>".
eval_pairs = (input_list_test, output_list_test)
eval_count = len(input_list_test)

# Baseline: an all-zero vector with multiplier 0 at layer 12.
initial_loss, wrong_c = calc_loss_steering_vector(
    generator,
    torch.tensor(4096*[0]),
    eval_pairs,
    layer=12,
    iter=eval_count,
    multiplier=0,
)
f1_scores_centralized = {}
print(f"initial f1 is {initial_loss}")
for k in range(6,32):
    steering_vec = torch.load(f"./vectors_last_token/full_vector{k}.pt")
    # Subtract the mean along dim 0, then scale to unit L2 norm along dim 0.
    steering_vec = steering_vec - steering_vec.mean(dim=0)
    steering_vec = torch.nn.functional.normalize(steering_vec, dim=0)
    for multiplier in range(1,50,2):
        f1_score, wrong_classes = calc_loss_steering_vector(
            generator,
            steering_vec,
            eval_pairs,
            layer=k,
            iter=eval_count,
            multiplier=multiplier,
        )
        print(f"Layer {k} loss with multiplier {multiplier} has f1_score {f1_score}")
        f1_scores_centralized[f"{k},{multiplier}"] = f1_score
        # Stop increasing the multiplier for this layer once the second return
        # value exceeds 50 (presumably the count of wrongly classified
        # questions -- confirm against eval.calc_loss_steering_vector).
        if wrong_classes > 50:
            break

initial loss is 0.68


  steering_vec = torch.load(f"./vectors_last_token/vector{k}.pt")


0 questions were wrongly classified
Layer 18 loss with multiplier 1 is 0.6419965576592096
0 questions were wrongly classified
Layer 18 loss with multiplier 3 is 0.8055077452667908
0 questions were wrongly classified
Layer 18 loss with multiplier 5 is 0.8433734939759149
0 questions were wrongly classified
Layer 18 loss with multiplier 7 is 0.7426850258175622
1162 questions were wrongly classified
Layer 18 loss with multiplier 9 is 1.0000000000000189
0 questions were wrongly classified
Layer 19 loss with multiplier 1 is 0.6876075731497454
0 questions were wrongly classified
Layer 19 loss with multiplier 3 is 0.7960413080895098
0 questions were wrongly classified
Layer 19 loss with multiplier 5 is 0.8442340791738495
0 questions were wrongly classified
Layer 19 loss with multiplier 7 is 0.8450946643717842
390 questions were wrongly classified
Layer 19 loss with multiplier 9 is 0.8958691910499278
0 questions were wrongly classified
Layer 20 loss with multiplier 1 is 0.7177280550774577
0 que

KeyboardInterrupt: 

In [38]:
#calculating the loss for each layer
# Sweep over the vectors stored under ./c_and_n_vectors/ for layers 16..31 with
# odd multipliers 1, 3, ..., 39. Unlike the other sweeps in this notebook, the
# vectors are used as loaded (no normalisation).
# `initial_loss` is not recomputed here; it is reused from the earlier cell that
# evaluates the zero vector at layer 12.
print(f"initial loss is {initial_loss}")
# `losses2` is never initialised anywhere in the visible notebook, so this cell
# raised NameError on a fresh kernel; create it here.
losses2 = {}
for k in range(16,32):
    steering_vec = torch.load(f"./c_and_n_vectors/vector{k}.pt")
    for multiplier in range(1,40,2):
        # calc_loss_steering_vector is unpacked as a 2-tuple by the other cells
        # of this notebook; binding the whole tuple to `loss` would make the
        # `loss>0.9` comparison below raise TypeError.
        loss, _wrong_classes = calc_loss_steering_vector(generator, steering_vec, (input_list_test, output_list_test), layer=k, iter=len(input_list_test), multiplier=multiplier)
        print(f"Layer {k} loss with multiplier {multiplier} is {loss}")
        losses2[f"{k},{multiplier}"] = loss
        # Stop increasing the multiplier for this layer once the score passes 0.9.
        if loss>0.9:
            break

initial loss is 0.68


  steering_vec = torch.load(f"./c_and_n_vectors/vector{k}.pt")


Layer 16 loss with multiplier 1 is 0.660068846815837
Layer 16 loss with multiplier 3 is 0.5955249569707391
Layer 16 loss with multiplier 5 is 0.5481927710843341
Layer 16 loss with multiplier 7 is 0.44836488812391945
Layer 16 loss with multiplier 9 is 0.28227194492254504
Layer 16 loss with multiplier 11 is 0.9991394148020843
Layer 17 loss with multiplier 1 is 0.7125645438898499
Layer 17 loss with multiplier 3 is 0.809810671256464
Layer 17 loss with multiplier 5 is 0.8390705679862417
Layer 17 loss with multiplier 7 is 0.8450946643717842
Layer 17 loss with multiplier 9 is 0.8433734939759149
Layer 17 loss with multiplier 11 is 0.8950086058519932
Layer 17 loss with multiplier 13 is 0.9991394148020843
Layer 18 loss with multiplier 1 is 0.7512908777969086
Layer 18 loss with multiplier 3 is 0.8261617900172221
Layer 18 loss with multiplier 5 is 0.838209982788307
Layer 18 loss with multiplier 7 is 0.8347676419965685
Layer 18 loss with multiplier 9 is 0.8046471600688562
Layer 18 loss with multipl