In [129]:
import pickle
import numpy as np
import stats
from scipy.spatial.distance import cosine, euclidean

This opens up a pkl file of activations from a contrastive honesty experiment using gpt-2-small. I haven't got llama working on my machine yet.

Note, this dataset uses the word honesty in almost every prompt, rather than by capturing the spirit of honesty in general.

Open up folder /data/outputs/2024-03-22_16-30-39 to see the dataset used, the config etc.

No PCA or any kind of classification has been done.

To look at integrity, open /data/outputs/2024-03-22_17-41-38 instead


In [131]:
# Honesty
with open('../data/outputs/2024-03-22_16-30-39/activations_cache.pkl', 'rb') as file:
    acts = pickle.load(file)

# Integrity
# with open('../data/outputs/2024-03-22_17-41-38/activations_cache.pkl', 'rb') as file:
#     acts = pickle.load(file)

There is is list of 160 activations corresponding to the 160 prompts in the honesty dataset (150 for integrity).

In [95]:
print("type(activations_cache)", type(acts))
print("len(activations_cache): ", len(acts))

type(activations_cache) <class 'list'>
len(activations_cache):  160


Each activation is of type Activation invented by Skye.

In [96]:
# Looking at the first activation:

print("type(activations_cache[i])", type(acts[0]))

type(activations_cache[i]) <class 'data_handler.Activation'>


Each activation has 5 components as currenrtly defined.

- The stored prompt - a string
- The ethical area - a string
- The ethical valence - a bool
- The raw activations (currenrtly does nothing) - a None type
- The activations of the final token in each residual layer - a list of numpy arrays

Let's look at the first 4 components in the ith example out of 160. Change i to any number between 0 and 159 for honesty (149 for integrity)

In [97]:
i = 0 # Set between 0 and 159 for the activation

print("Prompt:", acts[i].prompt)
print("Ethical Area:", acts[i].ethical_area)
print("Positive:", acts[i].positive)
print("Raw Activations:", acts[i].raw_activations)

Prompt: What role does honesty play in building trust in a relationship? Definitely by withholding information to avoid conflict because,
Ethical Area: Bad
Positive: False
Raw Activations: None


Now let's look at the activations.

- The activations are stored in a list
- There are 12 of them for the 12 residual layers of gpt-2-small
- The hidden state is a nunpy array of shape (768,)

In [98]:
i = 0 # Set between 0 and 159 for the activation
j = 0 # Set between 0 and 11 for the residual layer

print("Hidden States:", type(acts[i].hidden_states))
print("Hidden States:", len(acts[i].hidden_states))
print("Hidden States:", type(acts[i].hidden_states[j]))
print("Hidden States:", acts[i].hidden_states[j].shape)

Hidden States: <class 'list'>
Hidden States: 12
Hidden States: <class 'numpy.ndarray'>
Hidden States: (768,)


Let's find the mean vector in each layer of each group (honest and dishonest)

In [106]:
import numpy as np

def calculate_mean_vectors(activations):
    good_vectors = [[] for _ in range(12)]
    bad_vectors = [[] for _ in range(12)]

    for activation in activations:
        if activation.ethical_area == "Good":
            for i in range(12):
                good_vectors[i].append(activation.hidden_states[i])
        elif activation.ethical_area == "Bad":
            for i in range(12):
                bad_vectors[i].append(activation.hidden_states[i])

    good_mean_vectors = [np.mean(np.array(vectors), axis=0) for vectors in good_vectors]
    bad_mean_vectors = [np.mean(np.array(vectors), axis=0) for vectors in bad_vectors]

    return good_mean_vectors, bad_mean_vectors

In [107]:
good_mean_vectors, bad_mean_vectors = calculate_mean_vectors(acts)

Sanity check to make sure we have the right results

In [112]:
print("Number of good vectors:", len(good_mean_vectors))
print("Number of bad vectors:", len(bad_mean_vectors))
print(" ")

for i in range(len(good_mean_vectors)):
    print("Layer", i)
    print("Types:", type(good_mean_vectors[i]), type(bad_mean_vectors[i]))
    print("Shapes:", good_mean_vectors[i].shape, bad_mean_vectors[i].shape, "\n")

Number of good vectors: 12
Number of bad vectors: 12
 
Layer 0
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (768,) (768,) 

Layer 1
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (768,) (768,) 

Layer 2
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (768,) (768,) 

Layer 3
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (768,) (768,) 

Layer 4
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (768,) (768,) 

Layer 5
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (768,) (768,) 

Layer 6
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (768,) (768,) 

Layer 7
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (768,) (768,) 

Layer 8
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (768,) (768,) 

Layer 9
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (768,) (768,) 

Layer 10
Types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Shapes: (76

Now let's look at some stats of the vectors

In [113]:
def describe_array(arr):
    descr = {
        'mean': np.mean(arr),
        'median': np.median(arr),
        'std': np.std(arr),
        'min': np.min(arr),
        'max': np.max(arr),
        '25%': np.percentile(arr, 25),
        '50%': np.percentile(arr, 50),
        '75%': np.percentile(arr, 75),
        'skewness': stats.skewness(arr),
        'kurtosis': stats.kurtosis(arr)
    }
    return descr

In [114]:
for i in range(len(good_mean_vectors)):
    
    print("Layer", i)
    good_summary = describe_array(good_mean_vectors[i])
    bad_summary = describe_array(bad_mean_vectors[i])

    print("Good Summary:", good_summary)
    print("Bad Summary:", bad_summary, "\n")

Layer 0
Good Summary: {'mean': -1.4901161e-08, 'median': -0.003070693, 'std': 1.833413, 'min': -24.02176, 'max': 21.859676, '25%': -0.3186674565076828, '50%': -0.0030706928810104728, '75%': 0.31166715919971466, 'skewness': -1.449660487608728, 'kurtosis': 94.86374201458655}
Bad Summary: {'mean': -2.483527e-09, 'median': -0.014793001, 'std': 1.8481566, 'min': -24.075228, 'max': 22.077213, '25%': -0.31616707146167755, '50%': -0.014793001115322113, '75%': 0.31988484412431717, 'skewness': -1.3696944270107592, 'kurtosis': 94.02596487760489} 

Layer 1
Good Summary: {'mean': -1.4901161e-08, 'median': 0.023868494, 'std': 1.8881067, 'min': -26.126984, 'max': 21.059116, '25%': -0.41347939521074295, '50%': 0.023868493735790253, '75%': 0.41283509135246277, 'skewness': -1.766427162389191, 'kurtosis': 90.414382541948}
Bad Summary: {'mean': -9.934108e-09, 'median': 0.006777445, 'std': 1.907903, 'min': -26.324163, 'max': 20.9573, '25%': -0.4333321303129196, '50%': 0.006777445087209344, '75%': 0.4225149

Now let's compare them using linear algebra

- **Cosine Similarity:** Closer to 1 means the "good" and "bad" mean vectors are pointing in the same direction, indicating similar orientations in the vector space. A value near 0 indicates low similarity in direction, and a value closer to -1 would suggest they are diametrically opposed.
- **Euclidean Distance:** A smaller distance indicates that the "good" and "bad" mean vectors are closer to each other in the vector space, suggesting they are more similar in both magnitude and direction. Larger distances indicate greater differences.
- **ToDo:** Consider the meaning of other distances.

In [123]:
for i in range(len(good_mean_vectors)):
    print("Layer", i)
    print("Cosine Similarity:", cosine(good_mean_vectors[i], bad_mean_vectors[i]))
    print("Euclidean Distance:", euclidean(good_mean_vectors[i], bad_mean_vectors[i]))
    print("")

Layer 0
Cosine Similarity: 0.0006571412086486816
Euclidean Distance: 1.8938640356063843

Layer 1
Cosine Similarity: 0.0007013082504272461
Euclidean Distance: 2.0448224544525146

Layer 2
Cosine Similarity: 0.0015034079551696777
Euclidean Distance: 3.076526403427124

Layer 3
Cosine Similarity: 0.0023521780967712402
Euclidean Distance: 4.517683982849121

Layer 4
Cosine Similarity: 0.002910792827606201
Euclidean Distance: 5.5775580406188965

Layer 5
Cosine Similarity: 0.005340754985809326
Euclidean Distance: 7.8608574867248535

Layer 6
Cosine Similarity: 0.00594329833984375
Euclidean Distance: 9.07174015045166

Layer 7
Cosine Similarity: 0.007553160190582275
Euclidean Distance: 12.287116050720215

Layer 8
Cosine Similarity: 0.0119742751121521
Euclidean Distance: 18.66167640686035

Layer 9
Cosine Similarity: 0.011975109577178955
Euclidean Distance: 26.68912696838379

Layer 10
Cosine Similarity: 0.00983816385269165
Euclidean Distance: 32.40481185913086

Layer 11
Cosine Similarity: 0.00391095

Now lets look at the overall good and bad mean across all layers together

In [127]:
# Calculate the mean of the 12 mean good vectors
overall_good_mean = np.mean(good_mean_vectors, axis=0)

# Calculate the mean of the 12 mean bad vectors
overall_bad_mean = np.mean(bad_mean_vectors, axis=0)

Sanity check

In [128]:
print(overall_good_mean.shape)
print(overall_bad_mean.shape)

(768,)
(768,)


Stats

In [118]:
overall_good_summary = describe_array(overall_good_mean)
overall_good_summary

{'mean': 0.0,
 'median': -0.01116086,
 'std': 3.630252,
 'min': -35.297047,
 'max': 43.8481,
 '25%': -1.278670608997345,
 '50%': -0.011160859372466803,
 '75%': 1.0942615866661072,
 'skewness': 1.4778568832638104,
 'kurtosis': 58.38198206992106}

In [122]:
overall_bad_summary = describe_array(overall_bad_mean)
overall_bad_summary

{'mean': -9.934108e-09,
 'median': -0.070707634,
 'std': 3.773382,
 'min': -36.8757,
 'max': 45.75711,
 '25%': -1.3445407450199127,
 '50%': -0.07070763781666756,
 '75%': 1.2200306951999664,
 'skewness': 1.5253713198430263,
 'kurtosis': 58.52324532013599}

In [120]:
print("Cosine Similarity:", cosine(overall_good_mean, overall_bad_mean))
print("Euclidean Distance:", euclidean(overall_good_mean, overall_bad_mean))

Cosine Similarity: 0.005172789096832275
Euclidean Distance: 11.161230087280273


In [None]:
# Could also compare honesty and integrity vectors
# Plots