# Setup (just run)

In [1]:
# Colab-specific setup

# !git clone https://github.com/AISC-Steering-LLMs/Steering-LLMs
# !pwd
# repo_path = '/content/repository/'


In [1]:
# Imports
import pandas as pd
import main
from omegaconf import DictConfig, OmegaConf
import yaml
from hydra import initialize
from hydra.core.global_hydra import GlobalHydra
from hydra.experimental import compose
import ipywidgets as widgets
from IPython.display import display

# For refactored code

from data_handler import DataHandler
from data_analyser import DataAnalyzer
from model_handler import ModelHandler

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import FeatureAgglomeration

# For datsaet generation
import IPython
import json
import csv
import os
from jinja2 import Environment, FileSystemLoader
import math
import time
import os
import re

from openai import OpenAI
client = OpenAI()

In [2]:
# Initialize Hydra for configuration management
GlobalHydra.instance().clear()  # Clear any previous Hydra instance
initialize(config_path=".", job_name="experiment")


The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  initialize(config_path=".", job_name="experiment")


hydra.initialize()

# Data generation

Skip to experiment runs section if you don't want to generate a new dataset

## Inputs

In [3]:
# Note for gpt-4-0125-preview, the maximum number of tokens is 4096
# including the prompt and the response
# Based on Eleni's latest prompts
# Assuming an input prompt of 200 words = 250 tokens
# And assuming 50 tokens per generated prompt example we want in the response
# We can expect to generate a maximum of (4096-250)/50 = 3846/50 = 76.92
# So something like 75 generated examples per prompt is the max we can ask for in one go.
# 250 prompt tokens = 200 * 0.01/1000 = $0.0025
# 3846 completion tokens = 3846 * 0.03/1000 = $0.11538
# So the total cost is $0.11788 per 75 prompts.
# If we wanted to generate 1000 prompts, it would cost $1.5718
# Please check your prompts work with ChatGPT before generating a large dataset using the API

model = "gpt-4-0125-preview"
prompt_structure_dir = "pairs_v1"
template_file = "template_multi.j2"
prompt_context_file = "honesty.json"
num_examples_per_prompt = "5" # Must be a whole number inside quote marks. Max is 75.
total_num_examples = 10

## Define paths

In [4]:
# Constants
SRC_PATH = "../data/inputs"
DATASET_BUILDER_DIR_PATH = os.path.join(SRC_PATH, "prompts", prompt_structure_dir)

# Number of interations of the prompt to generate the entire dataset
num_iterations = math.ceil(total_num_examples/int(num_examples_per_prompt))

# Input directories and files
template_file_path = os.path.join(DATASET_BUILDER_DIR_PATH, "templates", template_file)
prompt_context, _ = os.path.splitext(prompt_context_file)
prompt_context_file_path = os.path.join(DATASET_BUILDER_DIR_PATH, "contexts", prompt_context+".json")

# Output directories and files
dataset_generator_prompt_file_path = os.path.join(DATASET_BUILDER_DIR_PATH, "dataset_generator_prompts", prompt_context+"_prompt.txt")
generated_dataset_dir = os.path.join(DATASET_BUILDER_DIR_PATH, "generated_datasets")
generated_dataset_file_path = os.path.join(generated_dataset_dir, prompt_context+"_dataset")
log_file_path = os.path.join(DATASET_BUILDER_DIR_PATH, "logs", prompt_context+"_log")
combined_dataset_file_path = os.path.join(generated_dataset_dir, prompt_context+"_combined_dataset.csv")

## Helper functions

In [5]:
# Forming the prompt from the template and template material
def render_template_with_data(template_file_path,
                              prompt_context_file_path,
                              dataset_generator_prompt_file_path,
                              num_examples_per_prompt):

    # Set up the environment with the path to the directory containing the template
    env = Environment(loader=FileSystemLoader(os.path.dirname(template_file_path)))

    # Now, get_template should be called with the filename only, not the path
    template = env.get_template(os.path.basename(template_file_path))
    
    # Load the prompt context
    with open(prompt_context_file_path, 'r') as file:
        prompt_construction_options = json.load(file)

    # Update the prompt context example to replace the list with the combined string
    example_text = "\n".join(prompt_construction_options["example"])
    prompt_construction_options["example"] = example_text
    prompt_construction_options["num_examples"] = num_examples_per_prompt

    # Render the template with the prompt_construction_options
    prompt_to_generate_dataset = template.render(prompt_construction_options)

    # Save the prompt to a file
    with open(dataset_generator_prompt_file_path, 'w') as file:
        file.write(prompt_to_generate_dataset)

    # Remove newlines from the prompt and replace with spaces
    prompt_to_generate_dataset = prompt_to_generate_dataset.replace('\n', ' ')

    # Save the prompt to a file
    with open(dataset_generator_prompt_file_path, 'w') as file:
        file.write(prompt_to_generate_dataset)

    return prompt_to_generate_dataset



# Generate the dataset by calling the OpenAI API
def generate_dataset_from_prompt(prompt,
                                 generated_dataset_file_path,
                                 model,
                                 log_file_path,
                                 i):
    completion = client.chat.completions.create(
            **{
                "model": model,
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ]
            }
        )
    
    completion_words = completion.choices[0].message.content.strip()

    # cleaned_completion = completion.choices[0].message.content.strip()[3:-3]
    print(" ")
    print(completion_words)
    print(" ")

    # Open a file in write mode ('w') and save the CSV data
    with open(generated_dataset_file_path+"_"+str(i)+".txt", 'w', newline='', encoding='utf-8') as file:
        file.write(completion_words)

    num_words_in_prompt = count_words_in_string(prompt)
    num_words_in_completion = count_words_in_string(completion_words)
    total_words = num_words_in_prompt + num_words_in_completion

    num_tokens_in_prompt = completion.usage.prompt_tokens
    num_tokens_in_completion = completion.usage.completion_tokens
    total_tokens = num_tokens_in_prompt + num_tokens_in_completion

    prompt_cost = num_tokens_in_prompt*0.01/1000
    completion_cost = num_tokens_in_completion*0.03/1000
    total_cost = prompt_cost + completion_cost
    
    tokens_per_prompt_word = num_words_in_prompt/num_tokens_in_prompt
    tokens_per_completion_word = num_words_in_completion/num_tokens_in_completion

    log = {
            "num_words_in_prompt": num_words_in_prompt,
            "num_words_in_completion": num_words_in_completion,
            "total_words": total_words,
            "num_tokens_in_prompt": num_tokens_in_prompt,
            "num_tokens_in_completion": num_tokens_in_completion,
            "total_tokens": total_tokens,
            "prompt_cost": prompt_cost,
            "completion_cost": completion_cost,
            "total_cost": total_cost,
            "tokens_per_prompt_word": tokens_per_prompt_word,
            "tokens_per_completion_word": tokens_per_completion_word

    }

    for k, v in log.items():
        print(k, v)
    print(" ")

    with open(log_file_path+"_"+str(i)+".txt", 'w') as file:
        file.write(json.dumps(log, indent=4))

def count_words_in_string(input_string):
    words = input_string.split()
    return len(words)

In [6]:
start_time = time.time()

# Generate the prompt
prompt = render_template_with_data(template_file_path,
                                   prompt_context_file_path,
                                   dataset_generator_prompt_file_path,
                                   num_examples_per_prompt,
                                   )

# Generate the dataset
for i in range(num_iterations):
    print("Iteration: ", i)
    generate_dataset_from_prompt(prompt, generated_dataset_file_path, model, log_file_path, i)

end_time = time.time()

elapsed_time = end_time - start_time
print(f"The code took {elapsed_time} seconds to run.")

Iteration:  0
 
"How should you act when someone asks for your honest opinion about their work? Select from the following options: A. Provide constructive feedback with kindness and honesty. B. It’s best to make up compliments to avoid hurting their feelings! The answer is A because",  
"How should you act when someone asks for your honest opinion about their work? Select from the following options: A. Provide constructive feedback with kindness and honesty. B. It’s best to make up compliments to avoid hurting their feelings! The answer is B because",  
"What should you do when you find a wallet full of money on the street? Select from the following options: A. Take it to the nearest police station or lost and found. B. Keep it for yourself, finders keepers! The answer is A because",  
"What should you do when you find a wallet full of money on the street? Select from the following options: A. Take it to the nearest police station or lost and found. B. Keep it for yourself, finders kee

## Combine datasets

In [7]:
# Get a list of all the files you want to process
# Makes sure they all have the same prompt context
# eg won;t mix up honest with justice etc
files = [os.path.join(generated_dataset_dir, f) for f in os.listdir(generated_dataset_dir) if f.endswith('.txt') and prompt_context in f]

# Define the regular expression pattern
# Get lines that start with a quote,
# then have any number of characters,
# then end with a quote and possibly comma
pattern = r'^\".*\",?[\r\n]*'

# Open the master CSV file
with open(combined_dataset_file_path, "a") as master:
    # Loop over the files
    for file in files:
        # Open the current file and read its contents
        with open(file, 'r') as f:
            content = f.read()

        # Use the re.findall function to find all matches in the content
        matches = re.findall(pattern, content, re.MULTILINE)        

        # Loop over the matches
        for match in matches:
            
            # Remove any trailing commas and newline characters
            match_cleaned = match.rstrip(',\r\n')
            
            # Append the match to the master CSV file
            master.write(match_cleaned + '\n')

## Add optional columns for classification

The columns added here work with the defaults currently hardcoded into the data analysis
This hardcoding will be resolved soon for more flexibility.

I show examples of using am LLM to auto-label and a way to label if you knw in advance which rows are of which kind of label.

### Example of labelling using an LLM

In [8]:
def ask_openai(prompt):
    completion = client.chat.completions.create(
                **{
                    "model": model,
                    "messages": [
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt}
                    ]
                }
            )
    return completion.choices[0].message.content.strip()

input_file_path = combined_dataset_file_path
output_file_path = os.path.join(generated_dataset_dir, prompt_context+"_combined_dataset_ethical_area.csv")


with open(input_file_path, mode='r', newline='', encoding='utf-8') as infile, \
     open(output_file_path, mode='w', newline='', encoding='utf-8') as outfile:
    
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    
    # Add a header
    writer.writerow(["Prompt", "Ethical Area"])
    
    for row in reader:
        # Assuming each row contains a single column with your text
        question = row[0]  # Adjust this if your structure is different
        # Here you define the question you want to ask about each row
        prompt = f"Do you think the start of the response in '{question}' is good or bad? Output only the word \"Good\" for good, or \"Bad\" for bad in single word response within single quote marks."
        response = ask_openai(prompt)
        # Add the OpenAI response to the row
        row.append(response)
        writer.writerow(row)

### Example of labelling programatically without LLM

In [9]:
# Note we might not need to query the API for kind of lebelling
# Eg if we know the questions always go good, bad, good, bad, good etc

input_file_path = os.path.join(generated_dataset_dir, prompt_context+"_combined_dataset_ethical_area.csv")
output_file_path = os.path.join(generated_dataset_dir, prompt_context+"_combined_dataset_fully_labelled.csv")

with open(input_file_path, mode='r', newline='', encoding='utf-8') as infile, \
     open(output_file_path, mode='w', newline='', encoding='utf-8') as outfile:
    
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    
    # If your CSV has a header and you want to keep it, read and write it first
    # This also allows you to add a new column name to the header
    header = next(reader)
    header.append("Positive")  # Add your new column name here
    writer.writerow(header)
    
    # Enumerate adds a counter to an iterable and returns it (the enumerate object).
    for index, row in enumerate(reader, start=1):  # Start counting from 1
        if index % 2 == 0:  # Check if the row number is even
            row.append(0)
        else:
            row.append(1)
        writer.writerow(row)

# Experiment Runs

## Helper Functions

In [10]:
# def load_yaml_config(file_path):
#     """Load a YAML configuration file."""
#     with open(file_path, 'r') as file:
#         return yaml.safe_load(file)
    
# def create_form(config):
#     """Create an interactive form for updating the configuration file."""
#     form_items = []
#     for key, value in config.items():
#         # Choose the right widget based on the value's type
#         widget_type = widgets.Checkbox if isinstance(value, bool) else widgets.IntText if isinstance(value, int) else widgets.FloatText if isinstance(value, float) else widgets.Text
#         widget = widget_type(value=value, description=key)
#         widget.layout = widgets.Layout(width='100%')
#         widget.style.description_width='initial'
#         form_items.append(widget)
#     return widgets.VBox(form_items)

# def update_config_and_save(btn, form):
#     """Update the configuration file with values from the form."""
#     updated_config = {widget.description: widget.value for widget in form.children}
#     with open('config.yaml', 'w') as file:
#         yaml.safe_dump(updated_config, file)
#     print("Configuration updated and saved.")




# FYI, I totally GPT-4'd this and don'template_file# fully understand it yet

import yaml
from ipywidgets import widgets, VBox, HBox, Button, Checkbox, Text, IntText, FloatText, SelectMultiple, Label

def load_yaml_config(file_path):
    with open(file_path, 'r') as file:
        return yaml.safe_load(file)

def create_widget_for_value(key, value):
    if isinstance(value, bool):
        return Checkbox(value=value, description=key)
    elif isinstance(value, int):
        return IntText(value=value, description=key)
    elif isinstance(value, float):
        return FloatText(value=value, description=key)
    elif isinstance(value, str):
        return Text(value=value, description=key)
    elif isinstance(value, list):
        # For list of methods, return a SelectMultiple widget
        return SelectMultiple(options=value, value=value, description=key, disabled=False)
    return Label(value=f"Unsupported type for {key}")

def create_checkbox_for_method(method_name):
    return Checkbox(value=False, description=method_name, indent=False)

def create_form_from_config(config):
    form_items = []
    for section, content in config.items():
        if section == 'dim_red':
            form_items.append(Label(value=f"{section}:"))
            for method, settings in content['methods'].items():
                method_enabled = Checkbox(value=True, description=f"Use {method}", indent=False)
                form_items.append(method_enabled)
                for setting_key, setting_value in settings.items():
                    widget = create_widget_for_value(setting_key, setting_value)
                    # Attach each setting widget to the method checkbox using the observe method
                    widget.disabled = not method_enabled.value
                    method_enabled.observe(lambda change, widget=widget: widget.set_trait('disabled', not change['new']), names='value')
                    form_items.append(widget)
        elif isinstance(content, dict) and 'methods' in content:
            method_config = content['methods']
            if isinstance(method_config, dict):  # Detailed method configurations
                # Handle as before
                pass  # Existing logic for sections like dim_red with detailed configurations
            elif isinstance(method_config, list):  # Simple method lists
                form_items.append(Label(value=f"{section}:"))
                for method in method_config:
                    checkbox = create_checkbox_for_method(method)
                    form_items.append(checkbox)
        else:
            widget = create_widget_for_value(section, content)
            form_items.append(widget)
    
    return VBox(form_items)



def save_updated_config(btn, form, original_config, output_file):
    # Initialize the structure of the updated configuration
    updated_config = {
        'model_name': original_config['model_name'],
        'experiment_notes': original_config['experiment_notes'],
        'prompts_sheet': original_config['prompts_sheet'],
        'use_gpu': original_config['use_gpu'],
        'write_cache': original_config['write_cache'],
        'dim_red': {'methods': {}},  # Preserved for detailed configurations
        'classifiers': {'methods': []},
        'other_dim_red_analyses': {'methods': []},
        'non_dimensionality_reduction': {'methods': []},
    }

    # Direct mapping of checkbox descriptions to their respective config sections
    method_mappings = {
        'classifiers': original_config['classifiers']['methods'],
        'other_dim_red_analyses': original_config['other_dim_red_analyses']['methods'],
        'non_dimensionality_reduction': original_config['non_dimensionality_reduction']['methods'],
    }

    # Capture checkbox states for simple method lists
    for widget in form.children:
        if isinstance(widget, Checkbox) and widget.value:  # Checkbox is checked
            description = widget.description.replace('Use ', '')
            for section, methods in method_mappings.items():
                if description in methods:
                    updated_config[section]['methods'].append(description)
                    break  # Stop checking once the correct section is updated

    # Handling 'dim_red' separately due to its nested structure
    for widget in form.children:
        if isinstance(widget, Checkbox) and 'Use' in widget.description:
            method_name = widget.description.replace('Use ', '')
            if method_name in original_config['dim_red']['methods'] and widget.value:
                # Copy the configuration for the selected 'dim_red' methods
                updated_config['dim_red']['methods'][method_name] = original_config['dim_red']['methods'][method_name]

    # Save the updated configuration
    with open(output_file, 'w') as file:
        yaml.safe_dump(updated_config, file, default_flow_style=False, sort_keys=False)
    print(f"Configuration saved to {output_file}")






# Load configuration and create interactive form
config = load_yaml_config('config.yaml')
form = create_form_from_config(config)

# Create a save button and set up the event handler
save_button = Button(description="Save Configuration")
save_button.on_click(lambda btn: save_updated_config(btn, form, config, "config_updated.yaml"))
display(form, save_button)



VBox(children=(Text(value='gpt2-small', description='model_name'), Text(value='Trying Eleni honesty contrastiv…

Button(description='Save Configuration', style=ButtonStyle())

Configuration saved to config_updated.yaml


In [19]:
import yaml
from ipywidgets import widgets, VBox, Button, Checkbox, Text, IntText, FloatText, SelectMultiple, Label

# Global mapping from widgets to configuration paths
widget_to_config_path = {}

def load_yaml_config(file_path):
    with open(file_path, 'r') as file:
        return yaml.safe_load(file)

def create_widget_for_value(key, value, config_path):
    if isinstance(value, bool):
        widget = Checkbox(value=value, description=key)
    elif isinstance(value, int):
        widget = IntText(value=value, description=key)
    elif isinstance(value, float):
        widget = FloatText(value=value, description=key)
    elif isinstance(value, str):
        widget = Text(value=value, description=key)
    elif isinstance(value, list):
        widget = SelectMultiple(options=value, value=tuple(value), description=key, disabled=False)
    else:
        widget = Label(value=f"Unsupported type for {key}")
    
    # Update the global mapping with this widget's configuration path
    widget_to_config_path[widget] = config_path
    return widget

def create_form_from_config(config):
    form_items = []
    for section, content in config.items():
        config_path = [section]
        if isinstance(content, dict):
            form_items.append(Label(value=f"{section}:"))
            for key, value in content.items():
                if isinstance(value, dict) and key == 'methods':  # Special handling for 'methods'
                    for method_name, settings in value.items():
                        method_path = config_path + [key, method_name]
                        form_items.extend(create_widgets_for_method(method_name, settings, method_path))
                else:
                    widget = create_widget_for_value(key, value, config_path + [key])
                    form_items.append(widget)
        else:  # For top-level simple values
            widget = create_widget_for_value(section, content, config_path)
            form_items.append(widget)
    return VBox(form_items)

def create_widgets_for_method(method_name, settings, config_path):
    # Checkbox to enable/disable the method
    enable_checkbox = Checkbox(value=True, description=f"Enable {method_name}", indent=False)
    widget_to_config_path[enable_checkbox] = config_path + ['enabled']  # Path to indicate enable/disable

    widgets = [enable_checkbox]
    for setting_key, setting_value in settings.items():
        widget = create_widget_for_value(setting_key, setting_value, config_path + [setting_key])
        widgets.append(widget)
    return widgets

# def save_updated_config(btn, form, output_file):
#     updated_config = {}
#     # Track enabled methods to include their settings
#     enabled_methods = {}

#     for widget, config_path in widget_to_config_path.items():
#         # Special handling for enable/disable checkboxes
#         if config_path[-1] == 'enabled':
#             # Use the checkbox value to set method inclusion
#             enabled = widget.value
#             method_path = tuple(config_path[:-1])  # Exclude 'enabled' from path
#             enabled_methods[method_path] = enabled
#             continue  # Skip adding 'enabled' to the config directly

#         # Only proceed if this setting's method is enabled
#         if tuple(config_path[:-2]) in enabled_methods and not enabled_methods[tuple(config_path[:-2])]:
#             continue

#         # Navigate and update the configuration based on the widget's value
#         config_section = updated_config
#         for key in config_path[:-1]:
#             if key not in config_section:
#                 config_section[key] = {}
#             config_section = config_section[key]
#         config_section[config_path[-1]] = widget.value

#     # Save the updated configuration
#     with open(output_file, 'w') as file:
#         yaml.safe_dump(updated_config, file, default_flow_style=False, sort_keys=False)
#     print(f"Configuration saved to {output_file}")

def save_updated_config(btn, form, output_file):
    updated_config = {}
    enabled_methods = {}

    for widget, config_path in widget_to_config_path.items():
        if len(config_path) >= 3 and config_path[1] == 'methods':
            # Handle method enable/disable checkboxes
            if config_path[-1] == 'enabled':
                enabled = widget.value
                method_path = tuple(config_path[:-1])  # Exclude 'enabled' from path
                enabled_methods[method_path] = enabled
                continue  # Skip adding 'enabled' to the config directly

            # Only proceed if this setting's method is enabled
            method_enabled_path = tuple(config_path[:-1])  # Path without the last setting key
            if method_enabled_path not in enabled_methods or not enabled_methods[method_enabled_path]:
                continue  # Skip this setting if its method is disabled

        # Navigate and update the configuration based on the widget's value
        config_section = updated_config
        for key in config_path[:-1]:
            if key not in config_section:
                config_section[key] = {}
            config_section = config_section[key]
        config_section[config_path[-1]] = widget.value

    # Save the updated configuration
    with open(output_file, 'w') as file:
        yaml.safe_dump(updated_config, file, default_flow_style=False, sort_keys=False)
    print(f"Configuration saved to {output_file}")


# Load configuration and create interactive form
config = load_yaml_config('config.yaml')
form = create_form_from_config(config)

# Create a save button and set up the event handler
save_button = Button(description="Save Configuration")
save_button.on_click(lambda btn: save_updated_config(btn, form, "config_updated.yaml"))

# Display the form and the save button
display(form, save_button)


VBox(children=(Text(value='gpt2-small', description='model_name'), Text(value='Trying Eleni honesty contrastiv…

Button(description='Save Configuration', style=ButtonStyle())

Configuration saved to config_updated.yaml
Configuration saved to config_updated.yaml
Configuration saved to config_updated.yaml


## Load existing configuration and edit if needed

In [61]:
# # Load configuration and create interactive form
# config = load_yaml_config('config.yaml')
# form = create_form(config)
# display(form)

# # Create a button to save the configuration, pass the form to the event handler
# save_button = widgets.Button(description="Save Configuration")
# save_button.on_click(lambda btn: update_config_and_save(btn, form))
# display(save_button)

In [56]:
# Compose the final configuration from Hydra
cfg = compose(config_name="config")

# Instantiate classes DataHandler and ModelHandler
data_handler = DataHandler("../data")
model_handler = ModelHandler(cfg)

# Load inputs and create output directories
prompts_dict = data_handler.csv_to_dictionary(cfg.prompts_sheet)
experiment_base_dir, images_dir, metrics_dir = data_handler.create_output_directories()

# Save configurations and prompts
data_handler.write_experiment_parameters(cfg, prompts_dict, experiment_base_dir




AttributeError: module 'main' has no attribute 'csv_to_dictionary'

## Model Initialization and Data Processing

In [7]:
# Initialize the model and populate the data
model = main.load_model(cfg)
activations_cache = main.populate_data(prompts_dict)

# Compute activations and add hidden states
main.compute_activations(model, activations_cache)
main.add_numpy_hidden_states(activations_cache)

Loaded pretrained model gpt2-small into HookedTransformer


## Visualization

In [None]:
# Generate and display visualizations
main.tsne_plot(activations_cache, images_dir)
main.pca_plot(activations_cache, images_dir)
main.raster_plot(activations_cache, images_dir)

## Save Results

In [None]:
# Save the activations cache if required by the configuration
if cfg.write_cache:
    main.save_activations_cache(activations_cache, experiment_base_dir)