In [1]:
import torch
from typing import List, Union, Tuple
from functools import partial
import json
from transformer_lens.HookedTransformer import HookedTransformer

from algebraic_value_editing.completion_utils import print_n_comparisons
from algebraic_value_editing.prompt_utils import get_x_vector

In [2]:
# Checkpoint to load; the commented lines are alternatives that can be swapped in.
model_name = "gpt2-small"
# model_name = "gpt-j-6B"
# model_name = "pythia-2.8b-deduped"

# Use the GPU only when CUDA is available and the model is not GPT-J-6B,
# which can't load onto the GPU RAM of Colab.
device: str
if torch.cuda.is_available() and model_name != "gpt-j-6B":
    device = "cuda"
else:
    device = "cpu"

# Load on the CPU first, then move the model to the selected device.
model: HookedTransformer = HookedTransformer.from_pretrained(
    model_name, device="cpu"
).to(device)

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-small into HookedTransformer
Moving model to device:  cpu


## Load Dataset

In [3]:
# Registry that the loading cells below fill in: genre name -> parsed JSON dataset.
stories = dict()

In [4]:
# Path of the fantasy dataset, relative to the notebook's working directory.
file_path = './datasets/fantasy_200.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's locale-dependent default.
with open(file_path, 'r', encoding='utf-8') as file:
    dataset_fantasy = json.load(file)

# Register the parsed data under its genre key once the file has been closed.
stories["fantasy"] = dataset_fantasy

In [5]:
# Path of the sports dataset, relative to the notebook's working directory.
file_path = './datasets/sports_200.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's locale-dependent default.
with open(file_path, 'r', encoding='utf-8') as file:
    dataset_sports = json.load(file)

# Register the parsed data under its genre key once the file has been closed.
stories["sports"] = dataset_sports

In [6]:
# Path of the sci-fi dataset, relative to the notebook's working directory.
file_path = './datasets/scifi_200.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's locale-dependent default.
with open(file_path, 'r', encoding='utf-8') as file:
    dataset_scifi = json.load(file)

# Register the parsed data under its genre key once the file has been closed.
stories["scifi"] = dataset_scifi

In [5]:
# Display which device string was selected in the configuration cell.
device

'cpu'

## Run Model

In [5]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# One steering entry built from the fantasy dataset, using act_name 0 and a
# coefficient of 0.9.
activation_addition_dataset = [
    ActivationAdditionDataset(
        prompt=stories["fantasy"],
        from_dataset=True,
        act_name=0,
        coeff=0.9,
    )
]

In [6]:
# Display the `location` attribute of the first (and only) entry in the list.
activation_addition_dataset[0].location

'blocks.0.hook_resid_pre'

In [7]:
# Keyword arguments forwarded unchanged to every print_n_comparisons call.
default_kwargs = dict(temperature=1, freq_penalty=1, top_p=.3, model=model)

# get_x_vector with the model and padding arguments pre-bound; padding uses
# the model's single-token id for " ".
get_x_vector_preset = partial(
    get_x_vector,
    pad_method="tokens_left",
    model=model,
    custom_pad_id=model.to_single_token(" "),
)

In [8]:
# Print 8 comparisons of 80 generated tokens each for the goose prompt,
# using the activation additions defined above and a fixed seed.
print_n_comparisons(
    prompt="Yesterday, my son brought home a pet goose. This was the start of",
    activation_additions=activation_addition_dataset,
    tokens_to_generate=80,
    num_comparisons=8,
    seed=0,
    **default_kwargs,
)

activations_seq_len: 1
+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|   [1mYesterday, my son brought home a pet goose. This was the   |   [1mYesterday, my son brought home a pet goose. This was the   |
|   start of[0m his life as a dog. He was raised by his mother    |                      start of[0m the day.                       |
| and father in the U.S., and I had to take him out for walks  |                                                              |
| with me every day because he was so small. He has never been |   I got to see it in action and I'm so happy to be able to   |
| shy about taking care of himself, even when he 

# Try with all activations

In [6]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# One steering entry built from the fantasy dataset with use_all_activations
# switched on, using act_name 6 and a coefficient of 5.
activation_addition_dataset = [
    ActivationAdditionDataset(
        prompt=stories["fantasy"],
        from_dataset=True,
        use_all_activations=True,
        act_name=6,
        coeff=5,
    )
]

In [7]:
# Keyword arguments forwarded unchanged to every print_n_comparisons call.
default_kwargs = dict(temperature=1, freq_penalty=1, top_p=.3, model=model)

# get_x_vector with the model and padding arguments pre-bound; padding uses
# the model's single-token id for " ".
get_x_vector_preset = partial(
    get_x_vector,
    pad_method="tokens_left",
    model=model,
    custom_pad_id=model.to_single_token(" "),
)

In [8]:
# Print 8 comparisons of 80 generated tokens each for the goose prompt,
# using the activation additions defined above and a fixed seed.
print_n_comparisons(
    prompt="Yesterday, my son brought home a pet goose. This was the start of",
    activation_additions=activation_addition_dataset,
    tokens_to_generate=80,
    num_comparisons=8,
    seed=0,
    **default_kwargs,
)

Getting principal component!


# Try with difference vector

In [8]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# Difference-vector experiment: fantasy dataset as `prompt`, sports dataset as
# `prompt_2`, PCA disabled, act_name 6, coefficient 60.
activation_addition_dataset = [
    ActivationAdditionDataset(
        prompt=stories["fantasy"],
        prompt_2=stories["sports"],
        from_dataset=True,
        use_all_activations=True,
        from_pca=False,
        from_difference=True,
        act_name=6,
        coeff=60,
    )
]

# Keyword arguments forwarded unchanged to print_n_comparisons.
default_kwargs = dict(temperature=1, freq_penalty=1, top_p=.3, model=model)

# Print 8 comparisons of 80 generated tokens each for the football prompt.
print_n_comparisons(
    prompt="Yesterday, my son was out kicking a football. Then, he",
    activation_additions=activation_addition_dataset,
    tokens_to_generate=80,
    num_comparisons=8,
    seed=0,
    **default_kwargs,
)

activations_seq_len: 1
+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|     [1mYesterday, my son was out kicking a football. Then,      |     [1mYesterday, my son was out kicking a football. Then,      |
|    he[0m started running and I just thought "Oh my God, this    |    he[0m fell asleep and woke up to find his father dead. I     |
|                 is going to be a good day."                  | have no idea what happened to him but I can tell you that he |
|                                                              |        died of a natural cause in an abandoned mine.         |
| I had been playing soccer for the last two year

In [11]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# Difference-vector experiment: sci-fi dataset as `prompt`, sports dataset as
# `prompt_2`, PCA disabled, act_name 6, coefficient 60.
activation_addition_dataset = [
    ActivationAdditionDataset(
        prompt=stories["scifi"],
        prompt_2=stories["sports"],
        from_dataset=True,
        use_all_activations=True,
        from_pca=False,
        from_difference=True,
        act_name=6,
        coeff=60,
    )
]

# Keyword arguments forwarded unchanged to print_n_comparisons.
default_kwargs = dict(temperature=1, freq_penalty=1, top_p=.3, model=model)

# Print 8 comparisons of 80 generated tokens each for the football prompt.
print_n_comparisons(
    prompt="Yesterday, my son was out kicking a football. Then, he",
    activation_additions=activation_addition_dataset,
    tokens_to_generate=80,
    num_comparisons=8,
    seed=0,
    **default_kwargs,
)

activations_seq_len: 1
+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|     [1mYesterday, my son was out kicking a football. Then,      |     [1mYesterday, my son was out kicking a football. Then,      |
|    he[0m started running and I just thought "Oh my God, this    |     he[0m started having problems with the human brain and      |
|                 is going to be a good day."                  |   became an alien that's been living on Earth for over 50    |
|                                                              |                            years.                            |
| I had been playing soccer for the last two year

In [12]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# Difference-vector experiment: sci-fi dataset as `prompt`, sports dataset as
# `prompt_2`, PCA disabled, act_name 3, coefficient 20.
activation_addition_dataset = [
    ActivationAdditionDataset(
        prompt=stories["scifi"],
        prompt_2=stories["sports"],
        from_dataset=True,
        use_all_activations=True,
        from_pca=False,
        from_difference=True,
        act_name=3,
        coeff=20,
    )
]

# Keyword arguments forwarded unchanged to print_n_comparisons.
default_kwargs = dict(temperature=1, freq_penalty=1, top_p=.3, model=model)

# Print 8 comparisons of 80 generated tokens each for the football prompt.
print_n_comparisons(
    prompt="Yesterday, my son was out kicking a football. Then, he",
    activation_additions=activation_addition_dataset,
    tokens_to_generate=80,
    num_comparisons=8,
    seed=0,
    **default_kwargs,
)

activations_seq_len: 1
+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|     [1mYesterday, my son was out kicking a football. Then,      |     [1mYesterday, my son was out kicking a football. Then,      |
|    he[0m started running and I just thought "Oh my God, this    |   he[0m started getting some bad news. He had been kicked in    |
|                 is going to be a good day."                  | the head by a friend of his and was being taken to hospital  |
|                                                              |                        for treatment.                        |
| I had been playing soccer for the last two year

In [13]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# Difference-vector experiment: sci-fi dataset as `prompt`, sports dataset as
# `prompt_2`, PCA disabled, act_name 3, coefficient 80.
activation_addition_dataset = [
    ActivationAdditionDataset(
        prompt=stories["scifi"],
        prompt_2=stories["sports"],
        from_dataset=True,
        use_all_activations=True,
        from_pca=False,
        from_difference=True,
        act_name=3,
        coeff=80,
    )
]

# Keyword arguments forwarded unchanged to print_n_comparisons.
default_kwargs = dict(temperature=1, freq_penalty=1, top_p=.3, model=model)

# Print 8 comparisons of 80 generated tokens each for the football prompt.
print_n_comparisons(
    prompt="Yesterday, my son was out kicking a football. Then, he",
    activation_additions=activation_addition_dataset,
    tokens_to_generate=80,
    num_comparisons=8,
    seed=0,
    **default_kwargs,
)

activations_seq_len: 1
+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|     [1mYesterday, my son was out kicking a football. Then,      |     [1mYesterday, my son was out kicking a football. Then,      |
|    he[0m started running and I just thought "Oh my God, this    |    he[0m started having an actual life. He was being treated    |
|                 is going to be a good day."                  |    for some kind of mental illness and he's now been in a    |
|                                                              |                hospital for about six months.                |
| I had been playing soccer for the last two year

In [14]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# Difference-vector experiment: sci-fi dataset as `prompt`, sports dataset as
# `prompt_2`, PCA disabled, act_name 3, coefficient 160.
activation_addition_dataset = [
    ActivationAdditionDataset(
        prompt=stories["scifi"],
        prompt_2=stories["sports"],
        from_dataset=True,
        use_all_activations=True,
        from_pca=False,
        from_difference=True,
        act_name=3,
        coeff=160,
    )
]

# Keyword arguments forwarded unchanged to print_n_comparisons.
default_kwargs = dict(temperature=1, freq_penalty=1, top_p=.3, model=model)

# Print 8 comparisons of 80 generated tokens each for the football prompt.
print_n_comparisons(
    prompt="Yesterday, my son was out kicking a football. Then, he",
    activation_additions=activation_addition_dataset,
    tokens_to_generate=80,
    num_comparisons=8,
    seed=0,
    **default_kwargs,
)

activations_seq_len: 1
+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|     [1mYesterday, my son was out kicking a football. Then,      |     [1mYesterday, my son was out kicking a football. Then,      |
|    he[0m started running and I just thought "Oh my God, this    |    he[0m was going to an Earth-bound science and technology     |
|                 is going to be a good day."                  |                            game.                             |
|                                                              |                                                              |
| I had been playing soccer for the last two year

# Try with the goose dataset

In [7]:
import os

In [10]:
# Path of the goose dataset, relative to the notebook's working directory.
file_path = 'datasets/goose_training_subset.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's locale-dependent default.
with open(file_path, 'r', encoding='utf-8') as file:
    goose_dict = json.load(file)

# Flatten the values of the mapping into one list, in key order. The keys
# themselves are not needed.
goose_text = [text for texts in goose_dict.values() for text in texts]

# Create the baseline dataset

def read_all_text_files(directory, encoding="utf-8"):
    """Read every ``.txt`` file located directly inside ``directory``.

    Args:
        directory: Path of the directory to scan; subdirectories are not
            searched.
        encoding: Text encoding used to decode each file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        A list holding the full contents of each ``.txt`` file, ordered by
        filename so repeated runs yield the same list.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid text in ``encoding``.
    """
    contents_list = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(directory, filename)

        # Skip directories that merely happen to be named "*.txt".
        if not os.path.isfile(filepath):
            continue

        with open(filepath, 'r', encoding=encoding) as f:
            contents_list.append(f.read())

    return contents_list

# Baseline corpus: the text files of both subset directories, first directory first.
training_subset = [
    *read_all_text_files('datasets/urlsf_subset01-1_data'),
    *read_all_text_files('datasets/urlsf_subset01-182_data'),
]

In [11]:
# Keep only the goose texts that the model's tokenizer encodes to fewer than
# 1000 token ids.
short_goose_text = [
    text
    for text in goose_text
    if len(model.tokenizer(text)["input_ids"]) < 1000
]

Token indices sequence length is longer than the specified maximum sequence length for this model (1081 > 1024). Running this sequence through the model will result in indexing errors


In [12]:
# Keep only the baseline texts that the model's tokenizer encodes to fewer
# than 1000 token ids.
short_training_subset = [
    text
    for text in training_subset
    if len(model.tokenizer(text)["input_ids"]) < 1000
]

In [14]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# Difference-vector experiment: length-filtered goose texts as `prompt`,
# length-filtered baseline texts as `prompt_2`, PCA disabled, act_name 6,
# coefficient 60.
activation_addition_dataset = [
    ActivationAdditionDataset(
        prompt=short_goose_text,
        prompt_2=short_training_subset,
        from_dataset=True,
        use_all_activations=True,
        from_pca=False,
        from_difference=True,
        act_name=6,
        coeff=60,
    )
]

# Keyword arguments forwarded unchanged to print_n_comparisons.
default_kwargs = dict(temperature=1, freq_penalty=1, top_p=.3, model=model)

# Print 8 comparisons of 80 generated tokens each for the goose prompt.
print_n_comparisons(
    prompt="A girl once had a goose",
    activation_additions=activation_addition_dataset,
    tokens_to_generate=80,
    num_comparisons=8,
    seed=0,
    **default_kwargs,
)