In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from instructions import *

# Experiment configuration shared by the cells below.
dataset_name = 'Demo'
model_nickname = 'llama2-7b'  # handed to ModelExtraction in the extraction cell
classifier_type = 'safety'  # NOTE(review): not referenced in the visible cells — confirm it is still needed

# Load the instruction sets for the two labels. Later cells index each split
# as [0] and [1]; presumably that follows the label_list order
# (0 = "Malicious", 1 = "Safe") — TODO confirm against load_instructions_by_size.
insts = load_instructions_by_size(
    dataset_name=dataset_name,
    label_list=["Malicious", "Safe"],
    train_size=0.1,  # presumably the fraction of the data used for training — verify
)

In [None]:
from model_extraction import ModelExtraction

# Build the extractor for the model selected by `model_nickname`.
llm = ModelExtraction(model_nickname)


def _embed_group(split, label_idx):
    """Run the extractor on one label group (`label_idx`) of one split of `insts`."""
    return llm.extract_embds(insts[split][label_idx])


# Index 0 feeds the "pos" variables and index 1 the "neg" variables;
# the calls run train first, then test, one after another.
pos_train_embds = _embed_group('train', 0)
neg_train_embds = _embed_group('train', 1)
pos_test_embds = _embed_group('test', 0)
neg_test_embds = _embed_group('test', 1)

In [None]:
from reduction import Reduction
import matplotlib.pyplot as plt
import torch

# Grid of per-layer scatter plots: `c` rows by `r` columns, one panel per
# layer index (4 * 8 = 32 panels, layers 0..31).
c, r = 4, 8
fig, axs = plt.subplots(c, r, figsize=(r*5, c*5))
for i in range(c):
    for j in range(r):
        # Row-major panel index. Every row holds `r` panels, so the row stride
        # must be `r`. The earlier `i*4+j` made rows overlap (0-7, 4-11, 8-15,
        # 12-19): layers were drawn twice and layers 20-31 were never shown.
        layer = i*r + j

        # A fresh Reduction(2) per layer, fitted on the training embeddings of
        # both groups stacked together; the test points are only projected
        # with the fitted transform, never used for fitting.
        pca = Reduction(2)

        train_data = torch.vstack([pos_train_embds.layers[layer], neg_train_embds.layers[layer]])
        pca.fit(train_data)

        pos_train_pca = pca.transform(pos_train_embds.layers[layer])
        neg_train_pca = pca.transform(neg_train_embds.layers[layer])
        pos_test_pca = pca.transform(pos_test_embds.layers[layer])
        neg_test_pca = pca.transform(neg_test_embds.layers[layer])

        ax = axs[i, j]

        # First two reduced coordinates of each group; low alpha keeps
        # overlapping clusters visible.
        ax.scatter(pos_train_pca[:, 0], pos_train_pca[:, 1], c='red', label='malicious_train', alpha=0.3)
        ax.scatter(neg_train_pca[:, 0], neg_train_pca[:, 1], c='blue', label='safe_train', alpha=0.3)
        ax.scatter(pos_test_pca[:, 0], pos_test_pca[:, 1], c='orange', label='malicious_test', alpha=0.3)
        ax.scatter(neg_test_pca[:, 0], neg_test_pca[:, 1], c='green', label='safe_test', alpha=0.3)

        ax.set_title(f'Layer {layer}')
        ax.legend()

plt.show()