In [35]:
import gc
import copy
import random
from tqdm import tqdm

import numpy as np
import torch
import torch.nn as nn
import pickle as pc
import os

import numpy as np
from numpy.linalg import norm
from scipy.linalg import eigh
import matplotlib.pyplot as plt

import sys
sys.path.insert(0,'../NeuroX/')
from neurox.data.extraction import transformers_extractor
import neurox.data.loader as data_loader
import neurox.interpretation.ablation as ablation
import neurox.interpretation.linear_probe as linear_probe
import neurox.interpretation.utils as utils
import os

# Single seed shared by every random number generator used in this notebook.
SEED = 20

# Python's built-in `random` module is imported above, so seed it as well;
# otherwise any use of `random` would differ from run to run.
random.seed(SEED)

# Set the random seed for NumPy
np.random.seed(SEED)

# Set the random seed for PyTorch
torch.manual_seed(SEED)

# If you are using CUDA (i.e., a GPU), also set the seed for it
torch.cuda.manual_seed_all(SEED)

In [36]:
def load_split_activation(file, n_layer=33, neurons_per_layer=4096):
    """Load one activation file and split it into a per-layer dictionary.

    Parameters
    ----------
    file : str
        Path handed to ``data_loader.load_activations``.
    n_layer : int, optional
        Number of layers to extract; also passed as the layer count to
        ``ablation.filter_activations_by_layers``. Defaults to 33, the value
        that used to be hard-coded here.
    neurons_per_layer : int, optional
        Second positional argument of ``data_loader.load_activations``.
        Defaults to 4096, the value that used to be hard-coded here.
        NOTE(review): presumably the hidden size of a single layer -- confirm
        against the installed NeuroX version.

    Returns
    -------
    dict
        ``{layer: activations_of_that_layer}`` for ``layer`` in
        ``range(n_layer)``; each value is whatever
        ``ablation.filter_activations_by_layers`` returns for ``[layer]``.

    Raises
    ------
    ValueError
        From ``squeeze(1)`` when axis 1 of the loaded array is not of length 1.
    """
    activation, _ = data_loader.load_activations(file, neurons_per_layer)
    # Drop the singleton axis once instead of on every loop iteration.
    flat = np.array(activation).squeeze(1)
    return {
        layer: ablation.filter_activations_by_layers(flat, [layer], n_layer)
        for layer in range(n_layer)
    }
        
def load_reshape(dataset, method,avg):
    """Load the per-layer activations of every split that exists on disk.

    The splits 'manifold', 'train' and 'test' are looked up, in that order,
    under ``data/llama-2/{dataset}/{method}/``. A split whose file is missing
    is skipped silently, so the returned dictionary may lack some keys.

    Returns a dict mapping split name -> result of ``load_split_activation``.
    """
    split_paths = {
        split: f'data/llama-2/{dataset}/{method}/activations-{split}-{avg}.json'
        for split in ('manifold', 'train', 'test')
    }
    return {
        split: load_split_activation(path)
        for split, path in split_paths.items()
        if os.path.exists(path)
    }
    
def load_labels(dataset, method='echo',avg='mean'):
    """Load the pickled labels of every split that exists on disk.

    Parameters
    ----------
    dataset : str
        Dataset directory name under ``data/llama-2/``.
    method : str, optional
        Sub-directory the label files are read from (default ``'echo'``).
    avg : str, optional
        Accepted for signature symmetry with ``load_reshape``; it is not used
        to build the label path.

    Returns
    -------
    dict
        ``{split: unpickled_object}`` for each of 'manifold', 'train', 'test'
        whose ``labels-{split}.pth`` file exists. Missing files are skipped.
    """
    labels = {}
    for split in ['manifold','train','test']:
        file = f'data/llama-2/{dataset}/{method}/labels-{split}.pth'
        if not os.path.exists(file):
            continue
        # NOTE(security): unpickling can execute arbitrary code; only point
        # this at label files you produced yourself.
        # The context manager closes the handle, which the previous
        # `pc.load(open(...))` form never did.
        with open(file, 'rb') as handle:
            labels[split] = pc.load(handle)
    return labels

def get_activations(method,avg):
    """Return the activations of the four datasets as a 4-tuple.

    The tuple order is (toxic, xstest, adv, mt); each element is the
    dictionary produced by ``load_reshape`` for that dataset.
    """
    dataset_names = ('toxic', 'xstest', 'adv', 'mt')
    return tuple(load_reshape(name, method, avg) for name in dataset_names)
                                      
def get_labels():
    """Return the labels of the four datasets as a 4-tuple.

    The tuple order is (toxic, xstest, adv, mt); each element is what
    ``load_labels`` returns for that dataset with its default arguments.
    """
    dataset_names = ('toxic', 'xstest', 'adv', 'mt')
    return tuple(load_labels(name) for name in dataset_names)
               
def get_activation_dict(method,avg):
    """Assemble the activation splits used by the analysis.

    'manifold', 'train' and 'test_toxic' come from the toxic dataset and
    'test_xstest' from the xstest dataset. The adv and mt activations are
    still loaded by ``get_activations`` but are not placed in the result.
    """
    toxic, xstest, _adv, _mt = get_activations(method, avg)
    # (output key, source dataset, split inside that dataset)
    layout = (
        ('manifold', toxic, 'manifold'),
        ('train', toxic, 'train'),
        ('test_toxic', toxic, 'test'),
        ('test_xstest', xstest, 'test'),
        # ('test_adv', _adv, 'test'), ('test_mt', _mt, 'test'),
    )
    return {key: source[split] for key, source, split in layout}

def get_labels_dict(method,avg):
    """Assemble the label splits matching ``get_activation_dict``.

    ``method`` and ``avg`` are accepted but not used: ``get_labels`` takes no
    arguments. The adv and mt labels are loaded but not placed in the result.
    """
    toxic, xstest, _adv, _mt = get_labels()
    # (output key, source dataset, split inside that dataset)
    layout = (
        ('manifold', toxic, 'manifold'),
        ('train', toxic, 'train'),
        ('test_toxic', toxic, 'test'),
        ('test_xstest', xstest, 'test'),
        # ('test_adv', _adv, 'test'), ('test_mt', _mt, 'test'),
    )
    return {key: source[split] for key, source, split in layout}
                                      
def get_activations_labels(method,avg):
    """Return ``(activations, labels)`` for one (method, avg) configuration."""
    return get_activation_dict(method, avg), get_labels_dict(method, avg)

In [43]:
import numpy as np
from numpy.linalg import norm
from scipy.linalg import eigh
import matplotlib.pyplot as plt

def get_projected(eigenvalues,eigenvectors,cls_class,man_dim):
    """Project samples onto the span of the ``man_dim`` leading eigenvectors.

    Parameters
    ----------
    eigenvalues : array-like
        One eigenvalue per row of ``eigenvectors``.
    eigenvectors : ndarray
        Eigenvectors stored as rows (row ``i`` belongs to ``eigenvalues[i]``).
    cls_class : ndarray
        Samples to project, one per row.
    man_dim : int
        Number of eigenvectors with the largest eigenvalues to keep. Values
        outside ``[0, len(eigenvalues)]`` are clamped to that range.

    Returns
    -------
    ndarray
        ``cls_class @ Ay.T @ Ay`` where ``Ay`` holds the selected rows; an
        all-zero array when ``man_dim`` is 0.
    """
    order = np.argsort(eigenvalues)
    n_vectors = order.shape[0]
    # `order[-man_dim:]` silently selects *every* eigenvector when man_dim is
    # 0 (because -0 == 0), so slice from an explicit, clamped start index.
    keep = max(0, min(int(man_dim), n_vectors))
    Ay = eigenvectors[order[n_vectors - keep:]]
    return cls_class @ Ay.T @ Ay

def get_sims(np_clss,cls_in_manifold):
    """Cosine similarity between matching rows of two arrays.

    Row ``i`` of ``np_clss`` is compared with row ``i`` of
    ``cls_in_manifold``. Returns a float64 array with one value per row of
    ``np_clss``.
    """
    n_rows = np_clss.shape[0]

    def _cosine(row):
        original, projected = np_clss[row], cls_in_manifold[row]
        return np.dot(original, projected) / (norm(original) * norm(projected))

    return np.fromiter((_cosine(row) for row in range(n_rows)),
                       dtype=np.float64, count=n_rows)

def get_multiple_projected(eigens,cls_class):
    """Project onto every manifold and keep, per sample, the closest one.

    ``eigens`` is a sequence of ``(eigenvalues, eigenvectors, man_dim)``
    triples. Each sample is projected with every triple; the projection with
    the highest cosine similarity to the sample wins.

    Returns ``(best_projection, sims)``: the winning projection of each
    sample and the corresponding (maximum) cosine similarity.
    """
    n_manifolds = len(eigens)
    projected = np.zeros((n_manifolds, *cls_class.shape))
    cos_sim = np.zeros((n_manifolds, cls_class.shape[0]))
    for slot, (eigenvalues, eigenvectors, man_dim) in enumerate(eigens):
        projected[slot] = get_projected(eigenvalues, eigenvectors, cls_class, man_dim)
        cos_sim[slot] = get_sims(cls_class, projected[slot])
    # Index of the winning manifold for every sample.
    winners = np.argmax(cos_sim, axis=0)
    best_projection = np.array(
        [projected[winner, sample, :] for sample, winner in enumerate(winners)]
    )
    return best_projection, cos_sim.max(axis=0)

def get_eigen(cls_class):
    """Eigen-decompose the covariance matrix of the samples.

    The covariance is taken over the columns of ``cls_class`` (the array is
    transposed before ``np.cov``). Returns ``(eigenvalues, eigenvectors)`` as
    produced by ``scipy.linalg.eigh``, except that the eigenvector matrix is
    transposed -- so each row is one eigenvector -- and cast to float32.
    """
    values, vectors = eigh(np.cov(cls_class.T))
    return values, np.float32(vectors.T)

def find_man(all_cls,layer,eigenvalues,eigenvectors,threshold=0.8):
    """Bisect for the smallest manifold dimension that reaches ``threshold``.

    A dimension "reaches" the threshold when the mean cosine similarity
    between the samples and their projection onto that many leading
    eigenvectors is strictly greater than ``threshold``.

    Parameters
    ----------
    all_cls : mapping
        ``all_cls[layer]`` holds the samples, one per row.
    layer : hashable
        Key selecting the samples inside ``all_cls``.
    eigenvalues, eigenvectors :
        Decomposition passed through to ``get_projected``; the search range
        is ``1 .. eigenvectors[0].shape[0]``.
    threshold : float, optional
        Mean-similarity cut-off (default 0.8, the value previously
        hard-coded).

    Returns
    -------
    (int, ndarray)
        The selected dimension and the per-sample cosine similarities
        computed *at that dimension*. The earlier implementation returned
        similarities left over from a different step of the search, which
        did not belong to the returned dimension.

    Notes
    -----
    The bisection assumes the mean similarity does not decrease as the
    dimension grows, and that the full dimension reaches the threshold; as
    before, neither is checked. The returned dimension is the same one the
    previous implementation produced.
    """
    samples = all_cls[layer]

    def _similarities(dim):
        projected = get_projected(eigenvalues, eigenvectors, samples, dim)
        return get_sims(samples, projected)

    low = 0                              # assumed NOT to reach the threshold
    high = eigenvectors[0].shape[0]      # assumed to reach the threshold
    high_sims = None                     # similarities measured at `high`
    while high - low > 1:
        mid = low + (high - low) // 2
        sims = _similarities(mid)
        if np.mean(sims) > threshold:
            high, high_sims = mid, sims
        else:
            low = mid
    if high_sims is None:
        # The upper bound was never lowered, so it has not been measured yet.
        high_sims = _similarities(high)
    return high, high_sims
            
def get_multiple_eigen(cls_class, only_one=False):
    """Fit one or several manifolds to the samples.

    Each round eigen-decomposes the remaining samples (``get_eigen``), picks
    a manifold dimension (``find_man``) and then drops every sample whose
    similarity reported by ``find_man`` is at least 0.8. Rounds continue
    until fewer than 10% of the original samples remain.

    Parameters
    ----------
    cls_class : ndarray
        Samples, one per row.
    only_one : bool, optional
        When true, stop after the first round and return a single entry.
        The flag was documented before but had no effect.

    Returns
    -------
    list
        One ``[eigenvalues, eigenvectors, dim]`` entry per round.
    """
    total_samples = cls_class.shape[0]
    eigens = []
    while True:
        eigenvalues, eigenvectors = get_eigen(cls_class)
        dim, sims = find_man({0:cls_class},0,eigenvalues,eigenvectors)
        eigens.append([eigenvalues, eigenvectors,dim])
        if only_one:
            break
        remaining = cls_class[sims<0.8]
        if remaining.shape[0] < (total_samples/10):
            break
        if remaining.shape[0] == cls_class.shape[0]:
            # Nothing was removed, so the next round would see identical
            # input and repeat forever; stop instead.
            break
        cls_class = remaining
    print(f'# of eigens {len(eigens)}')
    return eigens


In [38]:
def find_eigens(activations):
    """Run ``get_multiple_eigen`` on every layer found in ``activations``.

    Parameters
    ----------
    activations : mapping or sequence
        Per-layer samples. For a mapping every key is treated as a layer; for
        a sequence the positions ``0 .. len(activations) - 1`` are used.

    Returns
    -------
    dict
        ``{layer: get_multiple_eigen(activations[layer], only_one=False)}``.

    Notes
    -----
    The layers are taken from ``activations`` itself rather than from a
    notebook-level ``n_layer`` global, so the function no longer depends on a
    variable assigned in a later cell.
    """
    if hasattr(activations, 'keys'):
        layers = list(activations.keys())
    else:
        layers = range(len(activations))
    return {
        layer: get_multiple_eigen(activations[layer], only_one=False)
        for layer in layers
    }

In [45]:
def get_sims_muti(eigens,activations):
    """Best-manifold cosine similarities for every split and layer.

    For each split in ``activations`` and each layer in ``eigens`` the second
    value returned by ``get_multiple_projected`` (the similarities) is kept.

    Returns ``{split: {layer: sims}}``.
    """
    def _per_layer(split_activations):
        return {
            layer: get_multiple_projected(eigens[layer], split_activations[layer])[1]
            for layer in eigens
        }

    return {split: _per_layer(activations[split]) for split in activations}

In [58]:
def draw(sims_feature,labels,out_path=None):
    """Plot per-layer histograms of cosine similarity and save the figure.

    A 6x2 grid is drawn, one panel per layer in a fixed list of twelve
    layers. In every panel each split contributes two step histograms built
    from at most the first 300 samples of each class: samples labelled 0 are
    dashed and tagged '(B)', samples labelled 1 are solid and tagged '(M)'.

    Parameters
    ----------
    sims_feature : dict
        ``{split: {layer: ndarray of similarities}}``.
    labels : dict
        ``{split: sequence of 0/1 labels}`` aligned with the similarities.
    out_path : str, optional
        Where to save the figure. When omitted, the previous behaviour is
        kept: the path is built from the notebook-level globals ``method``
        and ``avg`` (a NameError is raised if they are not defined).
    """
    if out_path is None:
        out_path = f'outs/cos-sim-llama2-{method}{avg}.pdf'
    colors = ['blue','red','green','cyan']
    layers = [[1,3],[5,7],[9,13],[17,21],[25,27],[29,30]]
    n_rows, n_cols = len(layers), len(layers[0])
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 16))
    for i in range(n_rows):
        for j in range(n_cols):
            ax = axes[i, j]
            layer = layers[i][j]
            mins,maxs=1,0
            for split_ind,split in enumerate(sims_feature):
                # Cycle through the palette so more than four splits do not
                # raise an IndexError.
                color = colors[split_ind % len(colors)]
                split_labels = np.array(labels[split])
                benign = np.where(split_labels==0)[0]
                mal = np.where(split_labels==1)[0]
                sims = sims_feature[split][layer]
                ax.hist(sims[benign][:300],bins=30, histtype='step',linestyle='dashed',color=color,label=f'{split}(B)')
                ax.hist(sims[mal][:300],bins=30, histtype='step',color=color,label=f'{split}(M)')
                # The x-range covers all samples of the split, not only the
                # 300 that are drawn.
                mins,maxs= min(mins,np.min(sims)),max(maxs,np.max(sims))
            ax.set_title(f'Layer {layer}')
            ax.legend()
            ax.set_ylim(0,30)
            ax.set_xlim(mins,maxs)
            if j==0:
                ax.set_ylabel('# of Samples')
            if i==n_rows-1:
                ax.set_xlabel('Cosine Similarity')
    # Adjust layout for better spacing
    plt.tight_layout()
    # Make sure the target directory exists before saving.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(out_path,dpi=300)
    # Show the plot
    plt.show()
    # Release the figure; this function is called once per configuration.
    plt.close(fig)

In [None]:
# Driver: for every (method, avg) combination load the activations and labels,
# fit the per-layer eigen decompositions on the 'manifold' split, compute the
# similarities of every split, plot them and store them in `results_all`.
total_train_sampels = 768 # dataset dependent (384*2), i.e. len(questions_dic['train']) per the original note; not referenced again in this cell
method2last_mean = {'echo':['mean'],  # for echo, only the averaged embedding is suggested
                    'sure':['last'], # for sure, only the 'last' variant is used
                    'standard':['last','mean','weighted']}
# Number of layers; the same value load_split_activation uses by default.
n_layer=33
# Maps '{method}_{avg}' -> similarities returned by get_sims_muti.
results_all = {}
# NOTE: `method` and `avg` are module-level loop variables; draw() reads them
# as globals to build its output file name when no explicit path is given.
for method in ['echo','standard','sure']:
    for avg in method2last_mean[method]:
        activations, labels = get_activations_labels(method,avg)
        # The manifolds are fitted on the 'manifold' split only.
        eigens = find_eigens(activations['manifold'])
        sims_feature = get_sims_muti(eigens,activations)
        draw(sims_feature,labels)
        results_all[f'{method}_{avg}']=sims_feature