Aim of this notebook: use activation steering to reduce toxicity of models, whilst still preserving coherent output.

Constituent parts:
1. Load sentiment analysis model (done)
2. Have a graph to look at toxicity over tokens. (done)
3. Create toxic or pleasant dataset
4. Do activation steering with it (preferably for GPT-2 XL, could start trying with GPT-2 Small).
5. Maybe think of a way to compare it to the original method.

# Setup

Activation additions stuff

In [None]:
!pip install -q transformer_lens # Install transformer_lens

In [None]:
import os, json
import torch
from typing import List, Union, Tuple
from functools import partial
import json

from transformer_lens.HookedTransformer import HookedTransformer

from algebraic_value_editing.completion_utils import print_n_comparisons, gen_using_activation_additions, gen_using_model
from algebraic_value_editing.prompt_utils import get_x_vector

import pandas as pd
import numpy as np
import seaborn as sns
# matplotlib must be imported before the rcParams calls below; without this
# import the cell raises NameError on a fresh kernel (Restart & Run All).
import matplotlib.pyplot as plt

# Apply seaborn's default theme, then override fonts and save settings.
sns.set()
plt.rcParams.update({
    'font.family':'serif',
    "text.usetex": False,
    'savefig.facecolor': 'white',
})

# Global font sizes for figures produced in this notebook.
plt.rc('font', size=16)
plt.rc('axes', titlesize=18)
plt.rc('axes', labelsize=18)
# plt.rc('xtick', labelsize=16)
# plt.rc('ytick', labelsize=16)
plt.rc('legend', fontsize=11)
# plt.rc('figure', titlesize=20)

In [None]:
import torch

In [None]:
# model_name = "gpt2-xl"
# # model_name = "gpt-j-6B"
# # model_name = "pythia-2.8b-deduped"

# # GPT-J-6B can't load onto GPU RAM of Colab
# device: str = "cuda" if (torch.cuda.is_available() and model_name != "gpt-j-6B") else "cpu"
# model: HookedTransformer = HookedTransformer.from_pretrained(model_name).to(device)

In [None]:
# Use the GPU when one is available, otherwise fall back to CPU so the
# notebook still runs on machines without CUDA.
device: str = "cuda" if torch.cuda.is_available() else "cpu"
smol_model: HookedTransformer = HookedTransformer.from_pretrained("gpt2-small").to(device)

# Loading Sentiment Analysis Model

In [None]:
from transformers import pipeline
# Build the sentiment classifier used by every sentiment plot in this notebook.
# NOTE(review): no model name is passed, so the checkpoint used is whatever
# transformers selects by default — consider pinning one for reproducibility.
sentiment_pipeline = pipeline("sentiment-analysis")
# Smoke test: two sentences that each mix loving and hostile wording.
data = ["I love you so much I am going to kill you and your entire family. You're going to rot in hell", "I hate you because I can't be with you. All I want is to hold you. I miss you and I love you"]
sentiment_pipeline(data)

In [None]:
import matplotlib.pyplot as plt


def get_sentence_fragments(sentences, start_indices=None):
    """Return the growing word-prefixes of each sentence from a start word on.

    For a sentence split on whitespace into ``words`` and a start index ``s``,
    the fragments are ``words[s:s+1], words[s:s+2], ..., words[s:]``. The
    fragments of all sentences are concatenated into one flat list, in
    sentence order.

    :param sentences: A single sentence string or a list of sentence strings.
    :param start_indices: Word index at which fragments begin. May be a list
        with one index per sentence, a single int applied to every sentence,
        or None/empty, in which case index 1 is used for every sentence.
    :return: List of fragments, each fragment being a list of words.
    :raises AssertionError: If the number of start indices does not match the
        number of sentences, or if a start index is negative.
    """
    if not isinstance(sentences, list):
        sentences = [sentences]

    if isinstance(start_indices, int):
        # Broadcast a single index to every sentence. Previously this produced
        # a one-element list and tripped the length assertion below whenever
        # more than one sentence was passed.
        start_indices = [start_indices] * len(sentences)

    # None or an empty list falls back to the default start index of 1.
    start_indices = start_indices or [1] * len(sentences)

    assert len(start_indices) == len(sentences)

    sentence_fragments = []
    for start_index, sentence in zip(start_indices, sentences):
        words = sentence.split()
        assert start_index >= 0
        sentence_fragments.extend(
            words[start_index:end] for end in range(start_index + 1, len(words) + 1)
        )

    return sentence_fragments



def process_sentence_sentiment_changes(sentences, start_indices=None):
    """Score every word-prefix fragment of the given sentences.

    Fragments come from ``get_sentence_fragments``; each one is joined back
    into a string with single spaces and passed to ``sentiment_pipeline`` in
    one batch.

    :param sentences: A sentence string or list of sentence strings.
    :param start_indices: Forwarded unchanged to ``get_sentence_fragments``.
    :return: List of ``(fragment, value)`` tuples, where ``fragment`` is the
        list of words and ``value`` is the classifier's ``score`` when its
        label is ``'POSITIVE'`` and ``1 - score`` otherwise.
    """
    fragments = get_sentence_fragments(sentences, start_indices)
    fragment_texts = [' '.join(words) for words in fragments]
    results = sentiment_pipeline(fragment_texts)

    scored = []
    for words, result in zip(fragments, results):
        score = result['score']
        # Map every result onto one scale: the value for the POSITIVE label.
        value = score if result['label'] == 'POSITIVE' else 1 - score
        scored.append((words, value))
    return scored


def plot_sentiments(sentiments):
    """Plot sentiment value against fragment position as a line chart.

    :param sentiments: Non-empty sequence of ``(fragment, value)`` pairs as
        returned by ``process_sentence_sentiment_changes``. Each x tick is
        labelled with the last word of the corresponding fragment.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    fragments, scores = zip(*sentiments)
    positions = list(range(len(scores)))
    last_words = [fragment[-1] for fragment in fragments]

    ax.plot(positions, scores, marker='o', color='b', linestyle='-')
    ax.set_xticks(positions)
    ax.set_xticklabels(last_words, rotation=45)
    # Leave room at the bottom for the rotated tick labels.
    fig.subplots_adjust(bottom=0.3)
    ax.set_xlabel('Phrases')
    ax.set_ylabel('Sentiment Value')
    ax.set_title('Word-by-Word Sentiment Changes')
    plt.show()


# Test the script
# Demo: score every word-prefix of one sentence (start index 0, i.e. from the
# first word) and plot how the sentiment value changes word by word.
sentence = [
  # "Hello how are you I really hate you because you are beautiful",
  "I hate you because I can't be with you. All I want is to hold you. I miss you and I love you"
]
sentiments = process_sentence_sentiment_changes(sentence, 0)
plot_sentiments(sentiments)


## Average Sentiment Across Tokens

Given a set of prompts, and a number of times to complete each prompt, produce a graph averaging the sentiment after each word.

In [None]:
sentences = [
    ("I hate you because you're the best person in the whole world", 4),
    ("You're the worst because you smell like roses", 3),
    ("Hello how are you I really hate you because you are beautiful", 2)
]

In [None]:
from collections import defaultdict
import pandas as pd


def compute_fragment_index_sentiments(sentences_with_start_indices):
  """Build a long-format table of sentiment value per fragment length.

  :param sentences_with_start_indices: list of ``(sentence, start_index)``
      tuples, where ``start_index`` is the word index at which the completion
      begins.
  :return: DataFrame with one row per fragment and two columns: ``index``
      (number of words in the fragment, i.e. counted from the start index)
      and ``sentiment`` (value from ``process_sentence_sentiment_changes``).
  """
  texts = []
  starts = []
  for text, start in sentences_with_start_indices:
    texts.append(text)
    starts.append(start)

  scored_fragments = process_sentence_sentiment_changes(texts, starts)

  records = []
  for fragment, value in scored_fragments:
    records.append({'index': len(fragment), 'sentiment': value})
  return pd.DataFrame(records)

In [None]:
compute_fragment_index_sentiments(sentences)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()

df = compute_fragment_index_sentiments(sentences)
sns.lineplot(data=df, x='index', y='sentiment')
plt.show()

In [None]:
def plot_all_sentiments(data):
  """Plot the mean sentiment per key with standard-deviation error bars.

  :param data: Mapping from a sortable key (used as the x position) to a
      list of sentiment values collected at that position.
  """
  positions = sorted(data.keys())

  # One mean and one standard deviation per position, in sorted key order.
  means = [np.mean(np.array(data[position])) for position in positions]
  spreads = [np.std(np.array(data[position])) for position in positions]

  plt.errorbar(np.array(positions), means, yerr=spreads, fmt='-o')

  plt.title('Average Sentiment of Model Completions')
  plt.xlabel('Index After Prompt')
  plt.ylabel('Average Sentiment')

  plt.show()

In [None]:
from datetime import datetime

def _sentiment_mean_std(data):
  """Return (sorted keys, per-key mean, per-key np.std) as numpy arrays."""
  keys = sorted(data.keys())
  means = np.array([np.mean(np.array(data[key])) for key in keys])
  stds = np.array([np.std(np.array(data[key])) for key in keys])
  return np.array(keys), means, stds


def plot_all_sentiments_both(data_steered, data_unsteered, max_points=50):
  """Plot steered and unsteered average sentiment on one set of axes.

  Each series is drawn as mean +/- standard deviation per key. The figure is
  saved to ``sentiments-<timestamp>.pdf`` in the working directory and then
  shown.

  :param data_steered: Mapping from position key to list of sentiment values
      for steered completions (drawn in red, circle markers).
  :param data_unsteered: Same structure for unsteered completions (drawn in
      blue, square markers).
  :param max_points: Only the first ``max_points`` sorted positions of each
      series are plotted (default 50, the previously hard-coded limit).
  """
  series = (
      (data_steered, 'Steered', '--o', 'red'),
      (data_unsteered, 'Unsteered', '--s', 'blue'),
  )
  for data, label, fmt, color in series:
    x_values, avg_values, std_values = _sentiment_mean_std(data)
    plt.errorbar(
        x_values[:max_points],
        avg_values[:max_points],
        yerr=std_values[:max_points],
        fmt=fmt,
        color=color,
        alpha=0.8,
        capsize=5,
        label=label,
    )

  plt.title('Average Sentiment of Model Completions')
  plt.xlabel('Index After Prompt')
  plt.ylabel('Average Sentiment')
  # Without a legend the two series can only be told apart by colour.
  plt.legend()

  # str(datetime) contains spaces and colons, which are awkward or invalid in
  # file names on some platforms; use a compact, filesystem-safe timestamp.
  timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
  plt.savefig(f"sentiments-{timestamp}.pdf", format="pdf")

  plt.show()

In [None]:
# plot_all_sentiments(all_sentiments_test)

# Loading Toxic Comments

In [None]:
%pip install datasets

from datasets import load_dataset

toxic_dataset = load_dataset("vmalperovich/toxic_comments")

In [None]:
shuffled_dataset = toxic_dataset.shuffle(seed=21)

In [None]:
shuffled_dataset['train'][16]

In [None]:
filtered_toxic_ds = shuffled_dataset['train'].filter(lambda example: 1 in example['label'])

In [None]:
filtered_toxic_ds[99]

In [None]:
# Specify the file path
file_path = 'datasets/goose_training_subset.json'

# Open and read the JSON file
with open(file_path, 'r') as file:
  # Load the JSON data from the file
  goose_dict = json.load(file)

# Flatten the dictionary into one list: every value is appended element-wise,
# and the keys are discarded.
# NOTE(review): presumably each value is a list of text strings — confirm
# against the JSON file.
goose_text = []
for key, value in goose_dict.items():
  goose_text.extend(value)

# Create the baseline dataset

def read_all_text_files(directory):
    """Read every ``.txt`` file directly inside ``directory``.

    Files are read in sorted filename order. ``os.listdir`` returns entries in
    arbitrary order, and callers slice the result (e.g. ``[:200]``), so
    sorting keeps those subsets reproducible across machines and runs.

    :param directory: Path of the directory to scan (not recursive).
    :return: List with the full text content of each ``.txt`` file.
    """
    contents_list = []

    for filename in sorted(os.listdir(directory)):
        # Only plain-text files are part of the dataset.
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(directory, filename)
        with open(filepath, 'r') as f:
            contents_list.append(f.read())

    return contents_list

training_subset = read_all_text_files('datasets/urlsf_subset01-1_data') + read_all_text_files('datasets/urlsf_subset01-182_data')


Do this once loaded

In [None]:
# Specify the file path
file_path = 'loving_500.json'

# Open and read the JSON file
with open(file_path, 'r') as file:
  # Load the JSON data from the file
  dataset_loving = json.load(file)

In [None]:
with open('datasets/fantasy_200.json', 'r') as file:
    fantasy_ds = json.load(file)

with open('datasets/scifi_200.json', 'r') as file:
    scifi_ds = json.load(file)

with open('datasets/sports_200.json', 'r') as file:
    sports_ds = json.load(file)

In [None]:
# Load Tiny Shakespeare and cut its training text into chunks of `sample_size`
# whitespace-separated words, keeping the first 200 chunks.
# The raw dataset gets its own name so `shakespeare_ds` only ever refers to
# the list of text chunks (it was previously rebound from dataset to list).
shakespeare_raw = load_dataset("tiny_shakespeare")
shakespeare_text = shakespeare_raw['train']['text'][0]
sample_size = 100  # words per chunk
shakespeare_tokens = shakespeare_text.split()
shakespeare_ds = [
    ' '.join(shakespeare_tokens[i:i + sample_size])
    for i in range(0, len(shakespeare_tokens), sample_size)
][:200]

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _vocabulary(texts):
    """Return the set of whitespace-separated tokens across all `texts`."""
    return set(' '.join(texts).split())


def _stem_all(words):
    """Return the set of Porter stems of `words`."""
    return {stemmer.stem(word) for word in words}


# Compute each corpus vocabulary once; the original rebuilt them for every
# set difference below.
fantasy_vocab = _vocabulary(fantasy_ds)
scifi_vocab = _vocabulary(scifi_ds)
sports_vocab = _vocabulary(sports_ds)

# Genre word sets: tokens that occur in one genre corpus and in neither of the
# other two, reduced to their stems.
fantasy_words = _stem_all(fantasy_vocab - scifi_vocab - sports_vocab)
scifi_words = _stem_all(scifi_vocab - fantasy_vocab - sports_vocab)
sports_words = _stem_all(sports_vocab - fantasy_vocab - scifi_vocab)
# Shakespeare words additionally exclude everything seen in the training subset.
shakespeare_words = _stem_all(
    _vocabulary(shakespeare_ds)
    - fantasy_vocab
    - scifi_vocab
    - sports_vocab
    - _vocabulary(training_subset)
)


list(fantasy_words)[:5], list(scifi_words)[:5], list(sports_words)[:5], list(shakespeare_words)[:5]

In [None]:
len(fantasy_words), len(shakespeare_words)

In [None]:
def get_genre_frequencies(text):
    """Return the share of a text's distinct stems that belong to each genre.

    :param text: Input string, split on whitespace.
    :return: Tuple ``(fantasy_freq, scifi_freq, sports_freq)``; each is the
        number of distinct stems found in the corresponding genre word set
        divided by the total number of distinct stems. All zeros when the
        text has no words (previously a ZeroDivisionError).
    """
    stems = {stemmer.stem(w) for w in set(text.split())}
    if not stems:
        # Empty or whitespace-only text: no frequencies are defined.
        return 0.0, 0.0, 0.0

    total = len(stems)
    fantasy_freq = len(stems & fantasy_words) / total
    scifi_freq = len(stems & scifi_words) / total
    sports_freq = len(stems & sports_words) / total
    return fantasy_freq, scifi_freq, sports_freq


def get_genre_frequency_changes(text):
    """Return running genre-word frequencies after each word of a text.

    For every word position ``i`` (0-based) the running frequency of a genre
    is the number of genre words seen so far divided by ``i + 1``.

    Each word is stemmed before lookup. The genre word sets contain stems, so
    testing the raw word (as this function did before) only matched words
    that happen to equal their own stem, and was inconsistent with
    ``get_genre_frequencies``.

    :param text: Input string, split on whitespace.
    :return: Tuple of four lists ``(fantasy, scifi, sports, shakespeare)``,
        each with one running frequency per word of ``text``.
    """
    genre_sets = (fantasy_words, scifi_words, sports_words, shakespeare_words)
    counts = [0] * len(genre_sets)
    running = [[] for _ in genre_sets]

    for position, word in enumerate(text.split(), start=1):
        stem = stemmer.stem(word)
        for genre_idx, vocabulary in enumerate(genre_sets):
            if stem in vocabulary:
                counts[genre_idx] += 1
            running[genre_idx].append(counts[genre_idx] / position)

    fantasy_array, scifi_array, sports_array, shakes_array = running
    return fantasy_array, scifi_array, sports_array, shakes_array


def get_genres_freq_df(texts, start_indices=None):
    """Build a long-format table of running genre frequencies for many texts.

    :param texts: A text string or a list of text strings.
    :param start_indices: Word index treated as position 0 for each text. May
        be a list with one index per text, a single int applied to every
        text, or None/empty for 0 everywhere.
    :return: DataFrame with one row per word of every text and the columns
        ``position`` (word index minus the start index), ``fantasy``,
        ``scifi``, ``sports`` and ``shakespeare``.
    :raises ValueError: If the number of start indices does not match the
        number of texts.
    """
    if not isinstance(texts, list):
        texts = [texts]

    if isinstance(start_indices, int):
        # Broadcast a single index. Previously a non-zero int became a
        # one-element list and `zip` silently dropped every text but the first.
        start_indices = [start_indices] * len(texts)
    elif not start_indices:
        start_indices = [0] * len(texts)

    if len(start_indices) != len(texts):
        # `zip` would otherwise silently truncate to the shorter input.
        raise ValueError(
            f"got {len(start_indices)} start indices for {len(texts)} texts"
        )

    items = []
    for text, start_index in zip(texts, start_indices):
        for i, (v_f, v_sc, v_sp, v_sh) in enumerate(zip(*get_genre_frequency_changes(text))):
            items.append({
                'position': i - start_index,
                'fantasy': v_f,
                'scifi': v_sc,
                'sports': v_sp,
                'shakespeare': v_sh
            })
    return pd.DataFrame(items)

In [None]:
from IPython.display import clear_output

# Running genre-word frequencies for each reference corpus, tagged with the
# corpus they came from.
fantasy_genres_freq_df = get_genres_freq_df(fantasy_ds).assign(dataset='fantasy')
scifi_genres_freq_df = get_genres_freq_df(scifi_ds).assign(dataset='scifi')
sports_genres_freq_df = get_genres_freq_df(sports_ds).assign(dataset='sports')
shakes_freq_df = get_genres_freq_df(shakespeare_ds).assign(dataset='shakespeare')

genres_freq_df = pd.concat(
    [fantasy_genres_freq_df, scifi_genres_freq_df, sports_genres_freq_df, shakes_freq_df]
).sort_values(by='position')

# One panel per genre word set; each panel has one line per source corpus.
_, axs = plt.subplots(1, 4, figsize=(15, 5))
for ax, genre in zip(axs, ['fantasy', 'scifi', 'sports', 'shakespeare']):
    sns.lineplot(data=genres_freq_df, x='position', y=genre, hue='dataset', ax=ax)

# Drop anything printed while plotting so only the figure remains.
clear_output()

plt.show()

# Loading Training Dataset

In [None]:
# Tokenise every training document once and derive both length-filtered
# subsets from the same counts (previously each document was tokenised twice).
training_token_counts = [
    len(smol_model.tokenizer(stringo)["input_ids"]) for stringo in training_subset
]
short_training_subset = [
    stringo for stringo, n_tokens in zip(training_subset, training_token_counts) if n_tokens < 1000
]
tiny_training_subset = [
    stringo for stringo, n_tokens in zip(training_subset, training_token_counts) if n_tokens < 500
]
len(tiny_training_subset)


import pandas as pd

def completion_df_to_list(df):
    """Convert a completions DataFrame into ``(text, prompt_word_count)`` tuples.

    :param df: DataFrame with string columns ``prompts`` and ``completions``.
    :return: List with one tuple per row: the prompt and completion joined by
        a single space, and the number of whitespace-separated words in the
        prompt. An empty frame yields an empty list (the previous row-wise
        ``apply(...).tolist()`` failed on empty frames).
    """
    return [
        (prompt + ' ' + completion, len(prompt.split()))
        for prompt, completion in zip(df['prompts'], df['completions'])
    ]

def first_half_string(s):
    """Return the first half of the words in ``s``, joined by single spaces.

    The string is split on whitespace; with an odd number of words the middle
    word is excluded (the half is rounded down).
    """
    tokens = s.split()
    return ' '.join(tokens[:len(tokens) // 2])

def remove_last_if_even(lst):
    """
    Drop the final element of a list whose length is even.

    :param lst: List to trim.
    :type lst: list
    :return: ``lst`` itself when its length is odd, otherwise a copy without
        the last element.
    :rtype: list
    """
    has_odd_length = len(lst) % 2 == 1
    if has_odd_length:
        return lst
    # Even length (including empty): slice off the last element.
    return lst[:-1]

# Build the prompt batch for fantasy-to-scifi steering: keep the first half of
# each fantasy story, drop prompts of 200+ tokens, and trim the batch to an
# odd number of prompts via remove_last_if_even.
# NOTE(review): the reason an odd batch size is needed is not visible here.
input_dataset = fantasy_ds
halfway_data = [first_half_string(s) for s in input_dataset]
small_data = [s for s in halfway_data if len(smol_model.tokenizer(s)["input_ids"]) < 200]
prompt_batch = remove_last_if_even(small_data)
# Sampling settings forwarded to the generation helpers.
default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

In [None]:


from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# Steering configuration built from scifi_ds, with the first 200 documents of
# tiny_training_subset passed as prompt_2.
# NOTE(review): presumably this steers toward `prompt` relative to `prompt_2`
# (from_difference=True) at layer act_name=2 with strength coeff=60 — confirm
# against algebraic_value_editing.dataset_utils.
activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=scifi_ds,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

# Generate steered completions for the fantasy prompts in prompt_batch.
fantasy_to_scifi_df = gen_using_activation_additions(
    prompt_batch=prompt_batch,
    model=smol_model,
    activation_additions=activation_addition_dataset,
    addition_location="front",
    seed=42,
    **default_kwargs
)

In [None]:
i = 5
for prompt, contiuations in zip(fantasy_to_scifi_df.prompts[:i], fantasy_to_scifi_df.completions[:i]):
    print(r'\textbf{' + prompt + '}' + contiuations + r'\\')

In [None]:
# Sports-to-fantasy steering: same recipe as the fantasy-to-scifi cell, with
# sports stories as prompts and fantasy_ds as the steering dataset.
input_dataset = sports_ds
halfway_data = [first_half_string(s) for s in input_dataset]
small_data = [s for s in halfway_data if len(smol_model.tokenizer(s)["input_ids"]) < 200]
prompt_batch = remove_last_if_even(small_data)
default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# Steering configuration built from fantasy_ds, with tiny_training_subset[:200]
# passed as prompt_2.
activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=fantasy_ds,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

# Generate steered completions for the sports prompts.
sports_to_fantasy_df = gen_using_activation_additions(
    prompt_batch=prompt_batch,
    model=smol_model,
    activation_additions=activation_addition_dataset,
    addition_location="front",
    seed=42,
    **default_kwargs
)

In [None]:
i = 5
for prompt, contiuations in zip(sports_to_fantasy_df.prompts[:i], sports_to_fantasy_df.completions[:i]):
    print(r'\textbf{' + prompt + '}' + contiuations + r'\\')

In [None]:
# Scifi-to-sports steering: same recipe as the two cells above, with sci-fi
# stories as prompts and sports_ds as the steering dataset.
input_dataset = scifi_ds
halfway_data = [first_half_string(s) for s in input_dataset]
small_data = [s for s in halfway_data if len(smol_model.tokenizer(s)["input_ids"]) < 200]
prompt_batch = remove_last_if_even(small_data)
default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# Steering configuration built from sports_ds, with tiny_training_subset[:200]
# passed as prompt_2.
activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=sports_ds,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

# Generate steered completions for the sci-fi prompts.
sci_fi_to_sports_df = gen_using_activation_additions(
    prompt_batch=prompt_batch,
    model=smol_model,
    activation_additions=activation_addition_dataset,
    addition_location="front",
    seed=42,
    **default_kwargs
)

In [None]:
i = 5
for prompt, contiuations in zip(sci_fi_to_sports_df.prompts[:i], sci_fi_to_sports_df.completions[:i]):
    print(r'\textbf{' + prompt + '}' + contiuations + r'\\')

In [None]:
# input_dataset = tiny_training_subset[:200]
# halfway_data = [first_half_string(s) for s in input_dataset]
# small_data = [s for s in halfway_data if len(smol_model.tokenizer(s)["input_ids"]) < 200]
# prompt_batch = remove_last_if_even(small_data)
# default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

# from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# activation_addition_dataset = [ActivationAdditionDataset(
#     coeff=60,
#     act_name=2,
#     prompt=shakespeare_ds[:500],
#     from_dataset=True,
#     use_all_activations=True,
#     prompt_2=tiny_training_subset[:200],
#     from_pca=False,
#     from_difference=True,
# )]

# training_to_shakespeare_df = gen_using_activation_additions(
#     prompt_batch=prompt_batch,
#     model=smol_model,
#     activation_additions=activation_addition_dataset,
#     addition_location="front",
#     seed=42,
#     **default_kwargs
# )

In [None]:
# i = 5
# for prompt, contiuations in zip(training_to_shakespeare_df.prompts[:i], training_to_shakespeare_df.completions[:i]):
#     print(r'\textbf{' + prompt + '}' + contiuations + r'\\')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from IPython.display import clear_output

# Figure styling for the paper-sized three-panel plot.
sns.set()
plt.rcParams.update({
    'font.family':'serif',
    "text.usetex": False,
    'savefig.facecolor': 'white',
})

plt.rc('font', size=12)
plt.rc('axes', titlesize=14)
plt.rc('axes', labelsize=12)
plt.rc('legend', fontsize=11)

legend_labels = {
    'fantasy': 'Fantasy\nWords',
    'scifi': 'Sci-fi\nWords',
    'sports': 'Sports\nWords',
}


def _steering_genre_freq_df(completions_df):
    """Running genre frequencies of prompt+completion, with position 0 at the prompt's end."""
    stories = list(completions_df.prompts + completions_df.completions)
    start_indices = [len(prompt.split()) for prompt in completions_df.prompts]
    return get_genres_freq_df(stories, start_indices)


def _plot_steering_panel(ax, freq_df, title, ylabel, ylim, with_legend=False):
    """Draw one panel: a dashed line where steering starts plus one curve per genre."""
    vline_kwargs = {'label': 'Steering\nStart'} if with_legend else {}
    ax.vlines(0, 0, 1, color='black', linestyles='dashed', alpha=0.25, **vline_kwargs)
    for genre in ['fantasy', 'scifi', 'sports']:
        line_kwargs = {'label': legend_labels[genre]} if with_legend else {}
        sns.lineplot(data=freq_df, x='position', y=genre, ax=ax, **line_kwargs)
    ax.set_title(title)
    ax.set_xlabel('Relative Word Index')
    ax.set_ylabel(ylabel)
    ax.set_xlim([-50, 75])
    ax.set_ylim(ylim)
    if with_legend:
        # Put the shared legend to the right of the panel.
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


# Genre word frequencies of the steered generations, relative to steering start.
fantasy_genres_freq_df = _steering_genre_freq_df(fantasy_to_scifi_df)
sports_genres_freq_df = _steering_genre_freq_df(sports_to_fantasy_df)
scifi_genres_freq_df = _steering_genre_freq_df(sci_fi_to_sports_df)

_, axs = plt.subplots(1, 3, figsize=(12, 3))

_plot_steering_panel(
    axs[0], fantasy_genres_freq_df,
    title='Fantasy-to-Scifi Steering', ylabel='Genre Word Frequency', ylim=[.0, .2],
)
_plot_steering_panel(
    axs[1], sports_genres_freq_df,
    title='Sports-to-Fantasy Steering', ylabel='', ylim=[.0, .25],
)
# Only the last panel carries the legend, which serves all three panels.
_plot_steering_panel(
    axs[2], scifi_genres_freq_df,
    title='Scifi-to-Sports Steering', ylabel='', ylim=[.0, .25], with_legend=True,
)

# Drop anything printed while plotting so only the figure remains.
clear_output()

plt.tight_layout()
plt.savefig("genres-steering.pdf", format="pdf")

plt.show()

#

# Finding good coefficients

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=short_training_subset[:390],
    from_pca=False,
    from_difference=True,
)]

default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'model': smol_model}

print_n_comparisons(prompt="I hate you because",
                    tokens_to_generate=40, activation_additions=activation_addition_dataset,
                    num_comparisons=4, seed=0, **default_kwargs)

# Doing Plots

Once we have done our hyper-parameter search, we can apply the sentiment classifier to the steered completions.

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# Steering configuration built from dataset_loving, with the first 390
# documents of short_training_subset passed as prompt_2.
# NOTE(review): presumably coeff and act_name are the steering strength and
# layer found in the coefficient search above — confirm.
activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=short_training_subset[:390],
    from_pca=False,
    from_difference=True,
)]

# Sampling settings; unlike the print_n_comparisons cell, the model is passed
# explicitly below rather than through these kwargs.
default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate':80}

# Eight steered completions of the same hostile prompt.
loving_df = gen_using_activation_additions(
    prompt_batch = ["I hate you because"] * 8,
    model=smol_model,
    activation_additions=activation_addition_dataset,
    addition_location="front",
    seed=0,
    **default_kwargs)

In [None]:
unsteered_df_hate = gen_using_model(
    model=smol_model,
    prompt_batch = ["I hate you because"] * 8,
    seed = 0,
    **default_kwargs,
)

In [None]:
# Unsteered baseline for two prompts of opposite sentiment.
# NOTE(review): this rebinds `unsteered_df_hate`, replacing the eight-prompt
# result assigned in the previous cell — confirm that is intended.
unsteered_df_hate = gen_using_model(
    model=smol_model,
    prompt_batch = ["I hate you because", "I like you because you"],
    seed = 1,
    **default_kwargs,
)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()

# sentence = "I hate you because" + loving_df['completions'][3]
start_index = 3  # example start index
df = compute_fragment_index_sentiments([
    (f'I hate you because{completion}', start_index)
    for completion in loving_df['completions']
])
sns.lineplot(data=df, x='index', y='sentiment')
plt.show()

In [None]:
# Plot the word-by-word sentiment of a single steered completion.
sentence = "I hate you because" + loving_df['completions'][3]
start_index = 3  # example start index
# `process_sentence` is not defined in this notebook; the function that
# returns (fragment, sentiment) pairs for plot_sentiments is
# process_sentence_sentiment_changes.
sentiments = process_sentence_sentiment_changes(sentence, start_index)
plot_sentiments(sentiments)

# Trying with GPT-2 XL

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# activation_addition_dataset = [ActivationAdditionDataset(
#     coeff=60,
#     act_name=14,
#     prompt=dataset_loving,
#     from_dataset=True,
#     use_all_activations=True,
#     prompt_2=tiny_training_subset[:200],
#     from_pca=False,
#     from_difference=True,
# )]

# default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'model': model}

# print_n_comparisons(prompt="I hate you because",
#                     tokens_to_generate=80, activation_additions=activation_addition_dataset,
#                     num_comparisons=8, seed=0, **default_kwargs)

In [None]:
# from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# activation_addition_dataset = [ActivationAdditionDataset(
#     coeff=120,
#     act_name=14,
#     prompt=dataset_loving,
#     from_dataset=True,
#     use_all_activations=True,
#     prompt_2=tiny_training_subset[:200],
#     from_pca=False,
#     from_difference=True,
# )]

# default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'model': model}

# print_n_comparisons(prompt="I hate you because",
#                     tokens_to_generate=80, activation_additions=activation_addition_dataset,
#                     num_comparisons=8, seed=0, **default_kwargs)

In [None]:
# from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# activation_addition_dataset = [ActivationAdditionDataset(
#     coeff=270,
#     act_name=26,
#     prompt=dataset_loving,
#     from_dataset=True,
#     use_all_activations=True,
#     prompt_2=tiny_training_subset[:200],
#     from_pca=False,
#     from_difference=True,
# )]

# default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'model': model}

# print_n_comparisons(prompt="I hate you because",
#                     tokens_to_generate=80, activation_additions=activation_addition_dataset,
#                     num_comparisons=8, seed=0, **default_kwargs)

In [None]:
# from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# activation_addition_dataset = [ActivationAdditionDataset(
#     coeff=90,
#     act_name=7,
#     prompt=dataset_loving,
#     from_dataset=True,
#     use_all_activations=True,
#     prompt_2=tiny_training_subset[:200],
#     from_pca=False,
#     from_difference=True,
# )]

# default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'model': model}

# print_n_comparisons(prompt="I hate you because",
#                     tokens_to_generate=80, activation_additions=activation_addition_dataset,
#                     num_comparisons=8, seed=0, **default_kwargs)

# Using dataframes

In [None]:
# from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

# activation_addition_dataset = [ActivationAdditionDataset(
#     coeff=90,
#     act_name=7,
#     prompt=dataset_loving,
#     from_dataset=True,
#     use_all_activations=True,
#     prompt_2=tiny_training_subset[:200],
#     from_pca=False,
#     from_difference=True,
# )]

# default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3}


# loving_df = gen_using_activation_additions(
#     prompt_batch = ["I hate you because"] * 8,
#     model=model,
#     activation_additions=activation_addition_dataset,
#     addition_location="front",
#     seed=0,
#     **default_kwargs)

In [None]:
# squanch = completion_df_to_list(loving_df)

In [None]:
# squanch

# Impact of Steering on Sentiment

Given a toxic dataset, split each input in half. Continue from the halfway point, and generate n completions both with and without steering. Then process these to get the average sentiment graphs.

Requires hyper-parameters to have already been found

In [None]:
input_dataset = filtered_toxic_ds[0:100]['text']

In [None]:
def steering_sentiment_experiment(
  input_dataset,
  n_completions,
  model,
  activation_addition_dataset,
  addition_location,
  seed,
  default_kwargs
):
  """Generate steered and unsteered completions for the first half of each input.

  Each input text is cut to its first half (by words), prompts of 200 or more
  tokens are dropped, and the batch is trimmed with ``remove_last_if_even``.
  The remaining prompts are completed ``n_completions`` times with activation
  additions and ``n_completions`` times without; run ``i`` uses seed
  ``seed + i`` in both settings.

  Fixes relative to the earlier version: the per-run frames are now actually
  accumulated (the concat result used to be assigned to the wrong variable),
  the unsteered runs use ``model`` instead of the global ``smol_model`` and
  vary their seed, ``addition_location`` and ``seed`` are honoured, and the
  results are returned.

  :param input_dataset: Iterable of text strings.
  :param n_completions: Number of generation runs per setting.
  :param model: Model exposing ``tokenizer``; passed to the generation helpers.
  :param activation_addition_dataset: Activation additions for the steered runs.
  :param addition_location: Forwarded to ``gen_using_activation_additions``.
  :param seed: Base random seed; ``None`` is treated as 0.
  :param default_kwargs: Extra keyword arguments for both generation helpers.
  :return: Tuple ``(steered_df, unsteered_df)`` with the runs concatenated
      row-wise; two empty DataFrames when ``n_completions`` is 0.
  """
  halfway_data = [first_half_string(s) for s in input_dataset]
  small_data = [s for s in halfway_data if len(model.tokenizer(s)["input_ids"]) < 200]
  prompt_batch = remove_last_if_even(small_data)

  base_seed = 0 if seed is None else seed

  # Steered runs: collect the frames and concatenate once at the end.
  steered_runs = [
    gen_using_activation_additions(
      prompt_batch=prompt_batch,
      model=model,
      activation_additions=activation_addition_dataset,
      addition_location=addition_location,
      seed=base_seed + i,
      **default_kwargs
    )
    for i in range(n_completions)
  ]

  # Unsteered baseline with the same prompts and the same per-run seeds.
  unsteered_runs = [
    gen_using_model(
      model=model,
      prompt_batch=prompt_batch,
      seed=base_seed + i,
      **default_kwargs,
    )
    for i in range(n_completions)
  ]

  if not steered_runs:
    # pd.concat rejects an empty list.
    return pd.DataFrame(), pd.DataFrame()

  whole_df = pd.concat(steered_runs, ignore_index=True)
  whole_unsteered_df = pd.concat(unsteered_runs, ignore_index=True)
  return whole_df, whole_unsteered_df




# # Do the plotting experiment on the df
# sentiments_list_steered = completion_df_to_list(whole_df)
# all_sentiments_dict_steered = all_sentiments(sentiments_list_steered)

# sentiments_list_unsteered = completion_df_to_list(whole_unsteered_df)
# all_sentiments_dict_unsteered = all_sentiments(sentiments_list_unsteered)

# plot_all_sentiments_both(all_sentiments_dict_steered, all_sentiments_dict_unsteered)

In [None]:
input_dataset = filtered_toxic_ds[0:100]['text']
halfway_data = [first_half_string(s) for s in input_dataset]
small_data = [s for s in halfway_data if len(smol_model.tokenizer(s)["input_ids"]) < 200]
prompt_batch = remove_last_if_even(small_data)
default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

In [None]:
prompt_batch[0]

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=65,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

loving_steered_df = gen_using_activation_additions(
    prompt_batch=prompt_batch,
    model=smol_model,
    activation_additions=activation_addition_dataset,
    addition_location="front",
    seed=27,
    **default_kwargs
)

In [None]:
from algebraic_value_editing.prompt_utils import ActivationAddition, get_x_vector

x_vector = get_x_vector(
    prompt1='Love',
    prompt2='Hate',
    coeff=60,
    act_name=2,
    model=smol_model,
)

activation_addition_dataset = [ActivationAddition(
    prompt='I love you',
    coeff=5,
    act_name=2,
    # use_all_activations=True,
    # prompt_2=tiny_training_subset[:200],
    # from_pca=False,
    # from_difference=True,
)]

turner_steered_df = gen_using_activation_additions(
    prompt_batch=prompt_batch,
    model=smol_model,
    activation_additions=activation_addition_dataset,
    addition_location="front",
    seed=27,
    **default_kwargs
)

In [None]:
turner_steered_df

In [None]:
from algebraic_value_editing.prompt_utils import ActivationAddition, get_x_vector

x_vector = get_x_vector(
    prompt1='Love',
    prompt2='Hate',
    coeff=5,
    act_name=2,
    model=smol_model,
)

# activation_addition_dataset = [ActivationAddition(
#     prompt='Love',
#     coeff=60,
#     act_name=2,
#     # use_all_activations=True,
#     # prompt_2=tiny_training_subset[:200],
#     # from_pca=False,
#     # from_difference=True,
# )]

turner_steered_df_2 = gen_using_activation_additions(
    prompt_batch=prompt_batch,
    model=smol_model,
    activation_additions=[*x_vector],
    addition_location="front",
    seed=27,
    **default_kwargs
)

turner_steered_df_2.head()

In [None]:
import random

def print_latex_random_continuation(df, n=5, seed=0):
    """Print `n` randomly chosen (prompt, completion) pairs as a LaTeX itemize list.

    Args:
        df: object exposing `prompts` and `completions` sequences of strings
            (the notebook passes completion DataFrames).
        n: number of pairs to draw without replacement.
        seed: value used to seed the global `random` module before sampling.

    Each item shows the prompt in bold followed directly by its completion.
    """
    random.seed(seed)
    print(r'\begin{itemize}')
    prompt_completion_pairs = list(zip(df.prompts, df.completions))
    chosen_pairs = random.sample(prompt_completion_pairs, n)
    for prompt_text, completion_text in chosen_pairs:
        latex_item = r'\item \textbf{' + prompt_text + r'}' + completion_text
        print(latex_item)
    print(r'\end{itemize}')

In [None]:
print_latex_random_continuation(turner_steered_df_2)

In [None]:
df_sentiments_turner_steered_2 = compute_fragment_index_sentiments([(completion, 0) for completion in turner_steered_df_2.completions])

In [None]:
df_sentiments_turner_steered = compute_fragment_index_sentiments([(completion, 0) for completion in turner_steered_df.completions])

In [None]:
loving_steered_df.prompts[0], loving_steered_df.completions[0]

In [None]:
loving_steered_df['start_index'] = loving_steered_df['prompts'].apply(lambda x: len(x.split()))
loving_steered_df['full_sentence'] = loving_steered_df['prompts'] + loving_steered_df['completions']

In [None]:
df_sentiments_steered = compute_fragment_index_sentiments([(completion, 0) for completion in loving_steered_df.completions])
# df_sentiments_steered = compute_fragment_index_sentiments([(completion, 0) for completion in steered_df.full_sentence])

In [None]:
unsteered_df = gen_using_model(
    model= smol_model,
    prompt_batch = prompt_batch,
    seed = 0,
    **default_kwargs,
)

In [None]:
unsteered_df['full_sentence'] = unsteered_df['prompts'] + unsteered_df['completions']
unsteered_df['start_index'] = unsteered_df['prompts'].apply(lambda x: len(x.split()))
unsteered_df['full_sentence'][0]

In [None]:
df_sentiments_unsteered = compute_fragment_index_sentiments([(completion, 0) for completion in unsteered_df.completions])
# df_sentiments_unsteered = compute_fragment_index_sentiments([(completion, 0) for completion in unsteered_df.full_sentence])

In [None]:
civil_comments = load_dataset("civil_comments", split="train[:1000]")

In [None]:
civil_comments[10], civil_comments[5]

In [None]:
feats = {k for k in civil_comments[0] if k != 'text'}
not_toxic_comments = [
    comment['text'] for comment in civil_comments
    if len(comment['text']) < 500 and all(comment[feat] == 0 for feat in feats)
]
not_toxic_comments[:5], len(not_toxic_comments)

In [None]:
max(len(comment) for comment in dataset_loving)

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

input_dataset = filtered_toxic_ds[0:100]['text']
halfway_data = [first_half_string(s) for s in input_dataset]
small_data = [s for s in halfway_data if len(smol_model.tokenizer(s)["input_ids"]) < 200]
prompt_batch = remove_last_if_even(small_data)
default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=80,
    act_name=2,
    prompt=not_toxic_comments,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

not_toxic_steered_df = gen_using_activation_additions(
    prompt_batch=prompt_batch,
    model=smol_model,
    activation_additions=activation_addition_dataset,
    addition_location="front",
    seed=27,
    **default_kwargs
)

In [None]:
print_latex_random_continuation(not_toxic_steered_df, n=5, seed=0)

In [None]:
print_latex_random_continuation(loving_steered_df, n=5, seed=0)

In [None]:
print_latex_random_continuation(unsteered_df, n=5, seed=0)

In [None]:
print_latex_random_continuation(turner_steered_df_2, n=5, seed=0)

In [None]:
import random

def print_latex_random_sample(ds, n=5, seed=0):
    """Print `n` randomly chosen strings from `ds` as a LaTeX itemize list.

    Args:
        ds: sequence of strings to sample from.
        n: number of entries to draw without replacement.
        seed: value used to seed the global `random` module before sampling.
    """
    random.seed(seed)
    print(r'\begin{itemize}')
    chosen_entries = random.sample(ds, n)
    for entry in chosen_entries:
        latex_item = r'\item ' + entry
        print(latex_item)
    print(r'\end{itemize}')

In [None]:
print_latex_random_sample(not_toxic_comments, n=5, seed=0)

In [None]:
print_latex_random_sample(dataset_loving, n=5, seed=0)

In [None]:
print_latex_random_sample(filtered_toxic_ds['text'], n=5, seed=0)

In [None]:
print_latex_random_sample(fantasy_ds, n=3, seed=0)

In [None]:
print_latex_random_sample(scifi_ds, n=3, seed=0)

In [None]:
print_latex_random_sample(sports_ds, n=3, seed=0)

In [None]:
df_sentiments_non_toxic_steered = compute_fragment_index_sentiments([(completion, 0) for completion in not_toxic_steered_df.completions])

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
plt.rcParams.update({
    'font.family':'serif',
    "text.usetex": False,
    'savefig.facecolor': 'white',
})

plt.rc('font', size=16)
plt.rc('axes', titlesize=18)
plt.rc('axes', labelsize=18)
# plt.rc('xtick', labelsize=16)
# plt.rc('ytick', labelsize=16)
plt.rc('legend', fontsize=11)
# plt.rc('figure', titlesize=20)


plt.figure(figsize=(7, 4))
sns.lineplot(data=df_sentiments_steered, x='index', y='sentiment', label='Loving Steered', errorbar=('ci', 95))
sns.lineplot(data=df_sentiments_non_toxic_steered, x='index', y='sentiment', label='Non-Toxic Steered', errorbar=('ci', 95))
sns.lineplot(data=df_sentiments_turner_steered_2, x='index', y='sentiment', label='ActAdd (Turner et al.)', errorbar=('ci', 95))
sns.lineplot(data=df_sentiments_unsteered, x='index', y='sentiment', label='Unsteered', errorbar=('ci', 95))
plt.xlabel('Words Generated After Prompt')
plt.ylabel('Average Sentiment')
plt.xlim([0, 60])
plt.legend(loc='lower right')
plt.savefig('steered-vs-unsteered-toxic-comment-sentiments-shorter.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
plt.rcParams.update({
    'font.family':'serif',
    "text.usetex": False,
    'savefig.facecolor': 'white',
})

plt.rc('font', size=16)
plt.rc('axes', titlesize=18)
plt.rc('axes', labelsize=18)
# plt.rc('xtick', labelsize=16)
# plt.rc('ytick', labelsize=16)
plt.rc('legend', fontsize=11)
# plt.rc('figure', titlesize=20)


# Overlay the per-word sentiment curve of every generation method on one axis.
plt.figure(figsize=(7, 4))
sentiment_curves = [
    (df_sentiments_steered, 'Loving Steered'),
    (df_sentiments_non_toxic_steered, 'Non-Toxic Steered'),
    (df_sentiments_turner_steered_2, 'ActAdd (Turner et al.)'),
    (df_sentiments_unsteered, 'Unsteered'),
    # Raw string: "\l" is not a valid escape sequence in a normal string literal.
    (df_sentiments_turner_steered, r'ActAdd (Turner et al.) ($\lambda=5$)'),
]
for curve_df, curve_label in sentiment_curves:
    sns.lineplot(data=curve_df, x='index', y='sentiment', label=curve_label, errorbar=('ci', 95))
plt.xlabel('Words Generated After Prompt')
plt.ylabel('Average Sentiment')
plt.xlim([0, 60])
plt.legend(loc='lower right')
plt.savefig('steered-vs-unsteered-toxic-comment-sentiments-extra.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# load tokenizer and model weights
tox_tokenizer = RobertaTokenizer.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
toxicity_model = RobertaForSequenceClassification.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier').cuda()

In [None]:
def compute_toxicity(inputs, batch_size=64):
    """Score texts with the RoBERTa toxicity classifier loaded in this notebook.

    Args:
        inputs: a single string or a list of strings.
        batch_size: number of texts sent through the classifier per forward pass.

    Returns:
        A numpy array with the softmax probability of class index 1 for each
        text (presumably the "toxic" label — confirm against the model card).
        A list input yields shape `(len(inputs),)`, including for one-element
        and empty lists; a bare string yields a 0-d array.
    """
    single_input = not isinstance(inputs, list)
    if single_input:
        inputs = [inputs]

    if not inputs:
        # torch.cat would raise on an empty list of batches.
        return torch.empty(0).numpy()

    device = toxicity_model.device
    outputs = []
    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        for i in range(0, len(inputs), batch_size):
            # Let the tokenizer add special tokens, pad, truncate over-long
            # texts and build the attention mask, so that padding positions
            # are ignored by the classifier instead of being attended to.
            encoded = tox_tokenizer(
                inputs[i:i + batch_size],
                padding=True,
                truncation=True,
                return_tensors='pt',
            ).to(device)
            logits = toxicity_model(**encoded).logits
            outputs.append(torch.softmax(logits, dim=1)[:, 1])

    scores = torch.cat(outputs, dim=0).cpu().numpy()
    # Keep list results 1-D (callers iterate over them); only a bare string
    # input collapses to a scalar-like 0-d array.
    return scores.squeeze() if single_input else scores

In [None]:
compute_toxicity(['hello friend']), compute_toxicity(['fuck you bastard'])

In [None]:
compute_toxicity(['hello friend', 'fuck you bastard'])

In [None]:
def process_sentence_toxicity_changes(sentences, start_indices=None):
    """Build a table of toxicity scores for fragments of the given sentences.

    The fragments come from `get_sentence_fragments(sentences, start_indices)`;
    each fragment is joined with single spaces and scored by `compute_toxicity`.

    Returns:
        A DataFrame with one row per fragment and two columns: `index`
        (the fragment's length) and `toxicity` (its score).
    """
    fragments = get_sentence_fragments(sentences, start_indices)

    fragment_texts = [' '.join(fragment) for fragment in fragments]
    fragment_scores = compute_toxicity(fragment_texts)

    rows = []
    for fragment, score in zip(fragments, fragment_scores):
        rows.append({'index': len(fragment), 'toxicity': score})
    return pd.DataFrame(rows)

In [None]:
df_toxicity_non_toxic_steered = process_sentence_toxicity_changes(list(not_toxic_steered_df.completions))

In [None]:
non_toxic_steered_toxicities = compute_toxicity(list(not_toxic_steered_df.completions))
loving_steered_toxicities = compute_toxicity(list(loving_steered_df.completions))
unsteered_toxicities = compute_toxicity(list(unsteered_df.completions))
turner_steered_2_toxicities = compute_toxicity(list(turner_steered_df_2.completions))
# turner_steered_toxicities = compute_toxicity(list(turner_steered_df.completions))
toxicities_df = pd.DataFrame(
    [{'method': 'Loving Steered', 'toxicity': toxicity} for toxicity in loving_steered_toxicities] +
    [{'method': 'ActAdd (Turner et al.)', 'toxicity': toxicity} for toxicity in turner_steered_2_toxicities] +
    [{'method': 'Non-Toxic Steered', 'toxicity': toxicity} for toxicity in non_toxic_steered_toxicities] +
    [{'method': 'Unsteered', 'toxicity': toxicity} for toxicity in unsteered_toxicities]
    # + [{'method': 'ActAdd (Turner et al.) ($\lambda=5$)', 'toxicity': toxicity} for toxicity in turner_steered_toxicities]
)

In [None]:
toxicities_df['log_tox'] = np.log(toxicities_df['toxicity'])
plt.rc('font', size=16)
plt.rc('axes', titlesize=18)
plt.rc('axes', labelsize=18)
# plt.rc('xtick', labelsize=16)
plt.rc('ytick', labelsize=18)
plt.rc('legend', fontsize=14)
# plt.rc('figure', titlesize=20)


plt.figure(figsize=(7, 4))
sns.boxplot(toxicities_df, y='method', x='log_tox', orient='h')
plt.xlabel('Toxicity Log-Probability')
plt.ylabel('Generation Method')
plt.savefig('toxicity-boxplot.pdf', format='pdf', bbox_inches='tight')

In [None]:
input_dataset = filtered_toxic_ds[0:100]['text']
halfway_data = [first_half_string(s) for s in input_dataset]
small_data = [s for s in halfway_data if len(smol_model.tokenizer(s)["input_ids"]) < 200]
prompt_batch = remove_last_if_even(small_data)
default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

In [None]:
coefs_sweep = list(range(5, 125, 15))
coefs_sweep

In [None]:
from algebraic_value_editing.prompt_utils import ActivationAddition, get_x_vector

coef_toxicities_results = []

import gc
gc.collect()
torch.cuda.empty_cache()

for coef in coefs_sweep:
    summand = [*get_x_vector(
        prompt1='Love',
        prompt2='Hate',
        coeff=coef,
        act_name=2,
        model=smol_model,
    )]
    # summand: List[ActivationAddition] = [
    #     *get_x_vector_preset(
    #         prompt1="Love",
    #         prompt2=" ",
    #         coeff=3,
    #         act_name=10,
    #     )
    # ]

    turner_steered_df_coef = gen_using_activation_additions(
        prompt_batch=prompt_batch,
        model=smol_model,
        activation_additions=summand,
        addition_location="front",
        seed=27,
        **default_kwargs
    )

    coef_toxicities_results.append((coef, turner_steered_df_coef,
                                    compute_toxicity(list(turner_steered_df_coef.completions))))

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

loving_coef_toxicities_results = []

import gc
gc.collect()
torch.cuda.empty_cache()

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=0,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

for coef in coefs_sweep:
    print(f'Computing for coef={coef}')
    activation_addition_dataset[0].coeff = coef

    loving_steered_df_coef = gen_using_activation_additions(
        prompt_batch=prompt_batch,
        model=smol_model,
        activation_additions=activation_addition_dataset,
        addition_location="front",
        seed=27,
        **default_kwargs
    )

    loving_coef_toxicities_results.append((coef, loving_steered_df_coef,
                                           compute_toxicity(list(loving_steered_df_coef.completions))))

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

non_toxic_coef_toxicities_results = []

import gc
gc.collect()
torch.cuda.empty_cache()

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=0,
    act_name=2,
    prompt=not_toxic_comments,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

for coef in coefs_sweep:
    print(f'Computing for coef={coef}')
    activation_addition_dataset[0].coeff = coef

    non_toxic_steered_df_coef = gen_using_activation_additions(
        prompt_batch=prompt_batch,
        model=smol_model,
        activation_additions=activation_addition_dataset,
        addition_location="front",
        seed=27,
        **default_kwargs
    )

    non_toxic_coef_toxicities_results.append((coef, non_toxic_steered_df_coef,
                                              compute_toxicity(list(non_toxic_steered_df_coef.completions))))

In [None]:
# Only a subset of the swept coefficients is included in this table.
include_coefs = [5, 20, 50, 80, 110]
# One row per completion: method label (with its coefficient), toxicity score, coefficient.
# Raw f-strings are used because "\l" is not a valid escape sequence in a normal literal.
coefs_toxicities_df = pd.DataFrame(
    [{'method': rf'ActAdd (Turner et al.) ($\lambda={coef}$)', 'toxicity': t, 'coef': coef}
     for coef, _, toxicity in coef_toxicities_results if coef in include_coefs for t in toxicity]
    + [{'method': rf'Loving Steering ($\lambda={coef}$)', 'toxicity': t, 'coef': coef}
       for coef, _, toxicity in loving_coef_toxicities_results if coef in include_coefs for t in toxicity]
    + [{'method': rf'Non-Toxic Steering ($\lambda={coef}$)', 'toxicity': t, 'coef': coef}
       for coef, _, toxicity in non_toxic_coef_toxicities_results if coef in include_coefs for t in toxicity]
    # The unsteered baseline is recorded with coefficient 0.
    + [{'method': 'Unsteered', 'toxicity': toxicity, 'coef': 0}
       for toxicity in compute_toxicity(list(unsteered_df.completions))]
)
coefs_toxicities_df['log_tox'] = np.log(coefs_toxicities_df['toxicity'])

In [None]:
plt.rc('font', size=16)
plt.rc('axes', titlesize=18)
plt.rc('axes', labelsize=18)
plt.rc('ytick', labelsize=18)
plt.rc('legend', fontsize=14)


plt.figure(figsize=(7, 6))
sns.boxplot(coefs_toxicities_df, y='method', x='log_tox', orient='h')
plt.xlabel('Toxicity Log-Probability')
plt.ylabel('Generation Method')
plt.savefig('toxicity-boxplot-act-add-coefs.pdf', format='pdf', bbox_inches='tight')

In [None]:
coefs_toxicities_df

In [None]:
coefs_toxicities_df_not_grouped = pd.DataFrame(
    [{'method': f'ActAdd (Turner et al.)', 'toxicity': t, 'coef': coef}
     for coef, _,  toxicity in coef_toxicities_results for t in toxicity]
    + [{'method': f'Loving Steering', 'toxicity': t, 'coef': coef}
     for coef, _,  toxicity in loving_coef_toxicities_results for t in toxicity]
    + [{'method': f'Non-Toxic Steering', 'toxicity': t, 'coef': coef}
     for coef, _,  toxicity in non_toxic_coef_toxicities_results for t in toxicity]
    # [{'method': 'Loving Steered', 'toxicity': toxicity} for toxicity in loving_steered_toxicities] +
    # [{'method': 'Non-Toxic Steered', 'toxicity': toxicity} for toxicity in non_toxic_steered_toxicities] +
    # + [{'method': 'Unsteered', 'toxicity': toxicity, 'coef': 0} for toxicity in compute_toxicity(list(unsteered_df.completions))]
)
coefs_toxicities_df_not_grouped['log_tox'] = np.log(coefs_toxicities_df_not_grouped['toxicity'])
sns.lineplot(data=coefs_toxicities_df_not_grouped, x='coef', y='log_tox', hue='method')
means_df = coefs_toxicities_df_not_grouped.groupby(['method', 'coef']).mean().reset_index()
sns.scatterplot(data=means_df, x='coef', y='log_tox', hue='method', legend=False)
# plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xticks(coefs_sweep)
plt.legend(loc='upper center')
plt.xlabel('Steering Coefficient')
plt.ylabel('Toxicity Log-Probability')
plt.savefig('toxicity-lineplot-coefs-sweep.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# coef_toxicities_results holds one (coef, completions_df, toxicities) tuple per
# entry of coefs_sweep, in sweep order, so [1][1] is the completions DataFrame of
# the second swept coefficient.
# NOTE(review): with coefs_sweep = list(range(5, 125, 15)) the second coefficient
# is 20, not 10 as the variable name suggests — confirm which one was intended.
turner_steered_df_coef10 = coef_toxicities_results[1][1]
print_latex_random_continuation(turner_steered_df_coef10, n=5, seed=0)

In [None]:
list(not_toxic_steered_df.completions)[:5]

In [None]:
df_toxicity_loving_steered = process_sentence_toxicity_changes(list(loving_steered_df.completions))

In [None]:
list(loving_steered_df.completions)[:5]

In [None]:
compute_toxicity([' '.join(completion.split()[:4]) for completion in list(loving_steered_df.completions)[:5]])

In [None]:
df_toxicity_unsteered = process_sentence_toxicity_changes(list(unsteered_df.completions))

In [None]:
list(unsteered_df.completions)[:5]

In [None]:
df_toxicity_non_toxic_steered.head()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
plt.rcParams.update({
    'font.family':'serif',
    "text.usetex": False,
    'savefig.facecolor': 'white',
})

plt.rc('font', size=12)
plt.rc('axes', titlesize=16)
plt.rc('axes', labelsize=16)
# plt.rc('xtick', labelsize=16)
# plt.rc('ytick', labelsize=16)
plt.rc('legend', fontsize=18)
# plt.rc('figure', titlesize=20)


# Plot toxicity against fragment length for the two dataset-steered runs and the
# unsteered baseline ('index' is the fragment length recorded by
# process_sentence_toxicity_changes).
plt.figure(figsize=(7, 6))
sns.lineplot(data=df_toxicity_loving_steered, x='index', y='toxicity', label='Loving Steered')
sns.lineplot(data=df_toxicity_non_toxic_steered, x='index', y='toxicity', label='Non-Toxic Steered')
sns.lineplot(data=df_toxicity_unsteered, x='index', y='toxicity', label='Unsteered')
plt.xlabel('Words Generated After Prompt')
plt.ylabel('Average Toxicity')
plt.xlim([0, 60])
# NOTE(review): this figure shows toxicity, but the file name says "sentiments" —
# confirm the intended output name before relying on the saved PDF.
plt.savefig('steered-vs-unsteered-toxic-comment-sentiments.pdf', format='pdf')
plt.show()

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

# Experiment with both

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

input_dataset = filtered_toxic_ds[0:100]['text']

steering_sentiment_experiment(
  input_dataset=input_dataset,
  n_completions=6,
  model=smol_model,
  activation_addition_dataset=activation_addition_dataset,
  addition_location="front",
  seed=21,
  default_kwargs = default_kwargs
)

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

input_dataset = filtered_toxic_ds[0:100]['text']

steering_sentiment_experiment(
  input_dataset=input_dataset,
  n_completions=6,
  model=smol_model,
  activation_addition_dataset=activation_addition_dataset,
  addition_location="front",
  seed=21,
  default_kwargs = default_kwargs
)

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

input_dataset = filtered_toxic_ds[0:100]['text']

steering_sentiment_experiment(
  input_dataset=input_dataset,
  n_completions=6,
  model=smol_model,
  activation_addition_dataset=activation_addition_dataset,
  addition_location="front",
  seed=21,
  default_kwargs = default_kwargs
)

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

input_dataset = filtered_toxic_ds[0:100]['text']

steering_sentiment_experiment(
  input_dataset=input_dataset,
  n_completions=6,
  model=smol_model,
  activation_addition_dataset=activation_addition_dataset,
  addition_location="back",
  seed=21,
  default_kwargs = default_kwargs
)

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

input_dataset = filtered_toxic_ds[0:100]['text']

steering_sentiment_experiment(
  input_dataset=input_dataset,
  n_completions=20,
  model=smol_model,
  activation_addition_dataset=activation_addition_dataset,
  addition_location="front",
  seed=21,
  default_kwargs = default_kwargs
)

# Repeating for GPT-2 XL

In [None]:
from algebraic_value_editing.dataset_utils import ActivationAdditionDataset

activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]

default_kwargs = {'temperature': 1, 'freq_penalty': 1, 'top_p': .3, 'tokens_to_generate' : 80}

input_dataset = filtered_toxic_ds[0:100]['text']

steering_sentiment_experiment(
  input_dataset=input_dataset,
  n_completions=6,
  model=smol_model,
  activation_addition_dataset=activation_addition_dataset,
  addition_location="front",
  seed=21,
  default_kwargs = default_kwargs
)

# Trying random stuff

In [None]:
halfway_data = [first_half_string(s) for s in input_dataset]
small_data = [s for s in halfway_data if len(smol_model.tokenizer(s)["input_ids"]) < 200]

small_data = remove_last_if_even(small_data)

prompt_batch = small_data

In [None]:
result = gen_using_model(
    model= smol_model,
    prompt_batch = prompt_batch[0],
    seed = 0,
    include_logits = False,
    log = False,  # pylint: disable=unused-argument
    **default_kwargs,
)

In [None]:
result

In [None]:
result["prompts"][0]

In [None]:
result['completions'][0]

In [None]:
activation_addition_dataset = [ActivationAdditionDataset(
    coeff=60,
    act_name=2,
    prompt=dataset_loving,
    from_dataset=True,
    use_all_activations=True,
    prompt_2=tiny_training_subset[:200],
    from_pca=False,
    from_difference=True,
)]


steered_result = gen_using_activation_additions(
    model= smol_model,
    prompt_batch = prompt_batch[0],
    activation_additions=activation_addition_dataset,
    seed = 0,
    include_logits = False,
    log = False,  # pylint: disable=unused-argument
    **default_kwargs,
)

# gen_using_activation_additions(
    # prompt_batch = ["I hate you because"] * 8,
    # model=model,
    # activation_additions=activation_addition_dataset,
    # addition_location="front",
    # seed=0,
    # **default_kwargs)

In [None]:
steered_result['prompts'][0]

In [None]:
steered_result['completions'][0]

In [None]:
steered_result_2 = gen_using_activation_additions(
    model= smol_model,
    prompt_batch = prompt_batch[0],
    activation_additions=activation_addition_dataset,
    seed = 0,
    include_logits = False,
    log = False,  # pylint: disable=unused-argument
    **default_kwargs,
)

In [None]:
def sample(seed, index):
  """Generate one steered and one unsteered continuation of `prompt_batch[index]`.

  Both generations use the notebook-level `smol_model`, `default_kwargs` and the
  given `seed`; the steered one additionally applies the notebook-level
  `activation_addition_dataset`.

  Returns:
    A `(steered_text, unsteered_text)` tuple, each being the first prompt of the
    result concatenated with its first completion.
  """
  def _prompt_plus_completion(generation):
    # First prompt and first completion of a generation result, glued together.
    return generation['prompts'][0] + generation['completions'][0]

  steered_generation = gen_using_activation_additions(
    model=smol_model,
    prompt_batch=prompt_batch[index],
    activation_additions=activation_addition_dataset,
    seed=seed,
    include_logits=False,
    log=False,  # pylint: disable=unused-argument
    **default_kwargs,
  )
  unsteered_generation = gen_using_model(
    model=smol_model,
    prompt_batch=prompt_batch[index],
    seed=seed,
    include_logits=False,
    log=False,  # pylint: disable=unused-argument
    **default_kwargs,
  )

  steered_text = _prompt_plus_completion(steered_generation)
  unsteered_text = _prompt_plus_completion(unsteered_generation)
  return steered_text, unsteered_text

In [None]:
results = {}

for seed in range(3):
  results[seed] = {}
  for index in range(5):
    results[seed][index] = sample(seed, index)

for seed in range(3):
  for index in range(5):
    print("steered response: " + results[seed][index][0])
    print("unsteered response: "+ results[seed][index][1])