In [1]:
from typing import List, Optional

import fire

from llama import Dialog, Llama
import torch.distributed as dist
import torch
import os
from data_casual import output_list_train, input_list_train, input_list_test, output_list_test
from eval import extracting_steering_vector, calc_loss_steering_vector

ckpt_dir = "./"
tokenizer_path = "./tokenizer.model"

In [2]:
generator = Llama.build(ckpt_dir=ckpt_dir, tokenizer_path=tokenizer_path, max_seq_len= 1024, max_batch_size= 4, activation=True, activation_layer=12)
print(generator)

> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


  checkpoint = torch.load(ckpt_path, map_location="cpu")
  _C._set_default_tensor_type(t)


Loaded in 258.36 seconds
<llama.generation.Llama object at 0x00000204C3FB39D0>


In [3]:
temperature = 0.6
top_p = 0.9
max_seq_len = 1024
max_batch_size = 4
max_gen_len= generator.model.params.max_seq_len - 1

In [None]:
#extracting a steering vector for every layer
average_loss = 0
for k in range(len(generator.model.layers)):
    steering_vector, loss = extracting_steering_vector(generator, (input_list_train, output_list_train),layer=k, iter=2000)
    torch.save(steering_vector, f"./steering_vectors/vector{k}.pt")
    print(f"Training loss is {loss}")
    average_loss += 1/len(generator.model.layers)*loss
print(f"Average loss is {average_loss}")

In [4]:
initial_loss = 0.688468
losses = {}
losses["initial_loss"] = initial_loss

In [12]:
#calculating the loss for each layes
print(f"initial loss is {initial_loss}")
for k in range(16,32):
    multiplier = 1
    steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")
    loss = calc_loss_steering_vector(generator, steering_vec, (input_list_test, output_list_test), layer=k, iter=len(input_list_test), multiplier=multiplier)
    print(f"Layer {k} loss is {loss}")
    losses[k] = loss


initial loss is 0.688468


  steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")


Layer 16 loss is 0.5989672977624777
Layer 17 loss is 0.6970740103270264
Layer 18 loss is 0.6755593803786605
Layer 19 loss is 0.6187607573149744
Layer 20 loss is 0.6222030981067129
Layer 21 loss is 0.641135972461275
Layer 22 loss is 0.6652323580034448
Layer 23 loss is 0.6901893287435493
Layer 24 loss is 0.6592082616179024
Layer 25 loss is 0.6962134251290918
Layer 26 loss is 0.681583476764203
Layer 27 loss is 0.650602409638556
Layer 28 loss is 0.6609294320137716
Layer 29 loss is 0.6781411359724644
Layer 30 loss is 0.669535283993118
Layer 31 loss is 0.6841652323580069


In [19]:
losses2 = {}
print(f"initial loss is {initial_loss}")
for k in range(16,32):
    multiplier = 2
    steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")
    loss = calc_loss_steering_vector(generator, steering_vec, (input_list_test, output_list_test), layer=k, iter=len(input_list_test), multiplier=multiplier)
    print(f"Layer {k} loss is {loss}")
    losses2[k] = loss

initial loss is 0.688468


  steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")


Layer 16 loss is 0.560240963855419
Layer 17 loss is 0.7117039586919153
Layer 18 loss is 0.6273666092943208
Layer 19 loss is 0.5671256454388961
Layer 20 loss is 0.6351118760757325
Layer 21 loss is 0.5920826161790006
Layer 22 loss is 0.6781411359724644
Layer 23 loss is 0.7203098106712617
Layer 24 loss is 0.6583476764199677
Layer 25 loss is 0.6824440619621376
Layer 26 loss is 0.68846815834768
Layer 27 loss is 0.6540447504302945
Layer 28 loss is 0.6531841652323599
Layer 29 loss is 0.6686746987951834
Layer 30 loss is 0.669535283993118
Layer 31 loss is 0.679001721170399


In [20]:
losses05 = {}
print(f"initial loss is {initial_loss}")
for k in range(32):
    multiplier = 0.5
    steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")
    loss = calc_loss_steering_vector(generator, steering_vec, (input_list_test, output_list_test), layer=k, iter=len(input_list_test), multiplier=multiplier)
    print(f"Layer {k} loss is {loss}")
    losses05[k] = loss

initial loss is 0.688468


  steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")


Layer 0 loss is 0.8433734939759149
Layer 1 loss is 0.6265060240963861
Layer 2 loss is 0.6256454388984515
Layer 3 loss is 0.7254733218588695
Layer 4 loss is 0.7951807228915752
Layer 5 loss is 0.7891566265060327
Layer 6 loss is 0.7211703958691963
Layer 7 loss is 0.7728055077452746
Layer 8 loss is 0.7865748709122288
Layer 9 loss is 0.6531841652323599
Layer 10 loss is 0.7211703958691963
Layer 11 loss is 0.5808950086058503
Layer 12 loss is 0.6970740103270264
Layer 13 loss is 0.6282271944922554
Layer 14 loss is 0.6153184165232358
Layer 15 loss is 0.5826161790017196
Layer 16 loss is 0.6316695352839939
Layer 17 loss is 0.6996557659208303
Layer 18 loss is 0.6970740103270264
Layer 19 loss is 0.6127366609294319
Layer 20 loss is 0.638554216867471
Layer 21 loss is 0.641135972461275
Layer 22 loss is 0.6669535283993141
Layer 23 loss is 0.6979345955249611
Layer 24 loss is 0.6703958691910527
Layer 25 loss is 0.6807228915662683
Layer 26 loss is 0.6583476764199677
Layer 27 loss is 0.6686746987951834
Laye

In [21]:
inital_losses = {}
for k in range(32):
    multiplier = 0
    steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")
    loss = calc_loss_steering_vector(generator, steering_vec, (input_list_test, output_list_test), layer=k, iter=len(input_list_test), multiplier=multiplier)
    print(f"Layer {k} loss is {loss}")
    losses05[k] = loss

  steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")


Layer 0 loss is 0.6721170395869219
Layer 1 loss is 0.6807228915662683
Layer 2 loss is 0.6643717728055102
Layer 3 loss is 0.6841652323580069
Layer 4 loss is 0.6746987951807258
Layer 5 loss is 0.6996557659208303
Layer 6 loss is 0.6755593803786605
Layer 7 loss is 0.6729776247848566
Layer 8 loss is 0.6712564543889873
Layer 9 loss is 0.679001721170399
Layer 10 loss is 0.681583476764203
Layer 11 loss is 0.6944922547332225
Layer 12 loss is 0.6781411359724644
Layer 13 loss is 0.6678141135972487
Layer 14 loss is 0.6936316695352879
Layer 15 loss is 0.669535283993118
Layer 16 loss is 0.6652323580034448
Layer 17 loss is 0.6833046471600722
Layer 18 loss is 0.6781411359724644
Layer 19 loss is 0.6669535283993141
Layer 20 loss is 0.6738382099827912
Layer 21 loss is 0.6824440619621376
Layer 22 loss is 0.6686746987951834
Layer 23 loss is 0.6514629948364906
Layer 24 loss is 0.6721170395869219
Layer 25 loss is 0.6781411359724644
Layer 26 loss is 0.6901893287435493
Layer 27 loss is 0.6566265060240984
Layer

In [22]:
losses5 = {}
print(f"initial loss is {initial_loss}")
for k in range(16,32):
    multiplier = 5
    steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")
    loss = calc_loss_steering_vector(generator, steering_vec, (input_list_test, output_list_test), layer=k, iter=len(input_list_test), multiplier=multiplier)
    print(f"Layer {k} loss is {loss}")
    losses5[k] = loss

initial loss is 0.688468


  steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")


Layer 16 loss is 0.5619621342512883
Layer 17 loss is 0.7874354561101634
Layer 18 loss is 0.6893287435456147
Layer 19 loss is 0.5697074010327
Layer 20 loss is 0.6084337349397587
Layer 21 loss is 0.5619621342512883
Layer 22 loss is 0.6919104991394186
Layer 23 loss is 0.6669535283993141
Layer 24 loss is 0.6084337349397587
Layer 25 loss is 0.6609294320137716
Layer 26 loss is 0.6643717728055102
Layer 27 loss is 0.5938037865748699
Layer 28 loss is 0.6179001721170397
Layer 29 loss is 0.6445783132530135
Layer 30 loss is 0.6609294320137716
Layer 31 loss is 0.6919104991394186


In [23]:
losses10 = {}
print(f"initial loss is {initial_loss}")
for k in range(24,32):
    multiplier = 10
    steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")
    loss = calc_loss_steering_vector(generator, steering_vec, (input_list_test, output_list_test), layer=k, iter=len(input_list_test), multiplier=multiplier)
    print(f"Layer {k} loss is {loss}")
    losses10[k] = loss

initial loss is 0.688468


  steering_vec = torch.load(f"./steering_vectors/n_vector{k}.pt")


Layer 24 loss is 0.5378657487091184
Layer 25 loss is 0.6325301204819286


AssertionError: steering vec too large

In [24]:
#layer28
loss_layer28 = {}
for mult in range(1,15):
    multiplier = mult
    steering_vec = torch.load(f"./steering_vectors/n_vector28.pt")
    loss = calc_loss_steering_vector(generator, steering_vec, (input_list_test, output_list_test), layer=28, iter=len(input_list_test), multiplier=multiplier)
    print(f"norm {k} loss for layer 28 is {loss}")
    loss_layer28[mult] = loss

  steering_vec = torch.load(f"./steering_vectors/n_vector28.pt")


norm 26 loss for layer 28 is 0.6712564543889873
norm 26 loss for layer 28 is 0.6359724612736671
norm 26 loss for layer 28 is 0.6316695352839939
norm 26 loss for layer 28 is 0.6282271944922554
norm 26 loss for layer 28 is 0.6213425129087783
norm 26 loss for layer 28 is 0.6170395869191051
norm 26 loss for layer 28 is 0.6092943201376934
norm 26 loss for layer 28 is 0.6015490533562816
norm 26 loss for layer 28 is 0.5955249569707391


AssertionError: steering vec too large

In [25]:
#layer16
loss_layer16 = {}
for mult in range(1,15):
    multiplier = mult
    steering_vec = torch.load(f"./steering_vectors/n_vector16.pt")
    loss = calc_loss_steering_vector(generator, steering_vec, (input_list_test, output_list_test), layer=16, iter=len(input_list_test), multiplier=multiplier)
    print(f"norm {k} loss for layer 16 is {loss}")
    loss_layer16[mult] = loss

  steering_vec = torch.load(f"./steering_vectors/n_vector16.pt")


norm 26 loss for layer 16 is 0.5877796901893274
norm 26 loss for layer 16 is 0.5352839931153145
norm 26 loss for layer 16 is 0.5051635111876022
norm 26 loss for layer 16 is 0.4948364888123869
norm 26 loss for layer 16 is 0.5722891566265039


AssertionError: steering vec too large

In [26]:
loss_layer1 = {}
for mult in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]:
    multiplier = mult
    steering_vec = torch.load(f"./steering_vectors/n_vector1.pt")
    loss = calc_loss_steering_vector(generator, steering_vec, (input_list_test, output_list_test), layer=1, iter=len(input_list_test), multiplier=multiplier)
    print(f"norm {k} loss for layer 28 is {loss}")
    loss_layer1[k] = loss

  steering_vec = torch.load(f"./steering_vectors/n_vector1.pt")


norm 26 loss for layer 28 is 0.6583476764199677
norm 26 loss for layer 28 is 0.6075731497418241
norm 26 loss for layer 28 is 0.6127366609294319
norm 26 loss for layer 28 is 0.6135972461273665
norm 26 loss for layer 28 is 0.6282271944922554
norm 26 loss for layer 28 is 0.6531841652323599
norm 26 loss for layer 28 is 0.6850258175559415
norm 26 loss for layer 28 is 0.6712564543889873


In [16]:
for k in range(32):
    stervec = torch.load(f"./steering_vectors/vector{k}.pt")
    print(torch.norm(stervec), k)

  stervec = torch.load(f"./steering_vectors/vector{k}.pt")


tensor(0.4121) 0
tensor(1.8594) 1
tensor(1.8281) 2
tensor(2.1094) 3
tensor(2.0625) 4
tensor(2.2500) 5
tensor(2.3438) 6
tensor(2.5781) 7
tensor(2.6406) 8
tensor(2.7812) 9
tensor(2.8281) 10
tensor(2.8906) 11
tensor(3.3906) 12
tensor(3.5938) 13
tensor(3.5156) 14
tensor(4.) 15
tensor(4.1562) 16
tensor(4.6250) 17
tensor(5.4375) 18
tensor(5.6875) 19
tensor(6.0938) 20
tensor(7.6875) 21
tensor(7.8438) 22
tensor(10.5625) 23
tensor(11.3125) 24
tensor(12.1250) 25
tensor(12.1250) 26
tensor(13.2500) 27
tensor(17.8750) 28
tensor(20.) 29
tensor(22.3750) 30
tensor(24.) 31
