# Setup (just run)

In [1]:
# Colab-specific setup

# !git clone https://github.com/AISC-Steering-LLMs/Steering-LLMs
# !pwd
# repo_path = '/content/repository/'


In [11]:
# Imports
import os
import pandas as pd
import main
from omegaconf import DictConfig, OmegaConf
import yaml
from hydra import initialize
from hydra.core.global_hydra import GlobalHydra
from hydra import compose
import ipywidgets as widgets
from IPython.display import display

from data_handler import DataHandler
from data_analyser import DataAnalyzer
from model_handler import ModelHandler

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import FeatureAgglomeration

In [2]:
# Initialize Hydra for configuration management.
# Clear any previous Hydra instance so this cell can be re-run in the same kernel.
GlobalHydra.instance().clear()
# version_base is pinned explicitly: leaving it out makes Hydra emit a UserWarning
# and assume the 1.1 defaults, so "1.1" keeps the same behaviour without the warning.
initialize(version_base="1.1", config_path=".", job_name="experiment")


The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  initialize(config_path=".", job_name="experiment")


hydra.initialize()

## Helper Functions

In [3]:
def load_yaml_config(file_path):
    """Load a YAML configuration file.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's content
        (a dict when the document is a mapping).

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 rather than the platform's locale encoding,
    # so configs containing non-ASCII characters load the same on every OS.
    with open(file_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)
    
def create_form(config):
    """Build an editable form with one input widget per configuration entry.

    Args:
        config: Mapping of setting name to its current value.

    Returns:
        A ``widgets.VBox`` holding the widgets in the mapping's iteration
        order. Each widget uses the setting name as its description and the
        current value as its initial value.
    """
    def pick_widget_class(entry_value):
        # bool must be tested before int: bool is a subclass of int.
        if isinstance(entry_value, bool):
            return widgets.Checkbox
        if isinstance(entry_value, int):
            return widgets.IntText
        if isinstance(entry_value, float):
            return widgets.FloatText
        # Anything else is shown in a plain text box.
        return widgets.Text

    fields = []
    for name, current in config.items():
        field = pick_widget_class(current)(value=current, description=name)
        # Stretch across the cell and let the label take its natural width.
        field.layout = widgets.Layout(width='100%')
        field.style.description_width = 'initial'
        fields.append(field)
    return widgets.VBox(fields)

def update_config_and_save(btn, form, config_path='config.yaml'):
    """Write the form's current values to a YAML configuration file.

    Args:
        btn: The widget that triggered the click. Not used; accepted so the
            function can be called from a button ``on_click`` handler.
        form: Container whose ``children`` each expose ``description`` (used
            as the config key) and ``value``.
        config_path: File to write. Defaults to ``'config.yaml'``, the path
            that was previously hard-coded.

    The target file is overwritten with only the keys present in the form.
    """
    updated_config = {widget.description: widget.value for widget in form.children}
    with open(config_path, 'w', encoding='utf-8') as file:
        # sort_keys=False keeps the keys in form order instead of letting
        # safe_dump alphabetise them, so saving does not reshuffle the file.
        yaml.safe_dump(updated_config, file, sort_keys=False)
    print("Configuration updated and saved.")


# Experiment Setup

## Load existing configuration and edit if needed

In [4]:
# Read the current settings from disk and render them as an editable form
config = load_yaml_config('config.yaml')
form = create_form(config)
display(form)


def _on_save_clicked(btn):
    """Click handler: save the values currently shown in `form`."""
    update_config_and_save(btn, form)


# Button that writes the form values back via update_config_and_save
save_button = widgets.Button(description="Save Configuration")
save_button.on_click(_on_save_clicked)
display(save_button)

VBox(children=(Text(value='gpt2-small', description='model_name', layout=Layout(width='100%'), style=TextStyle…

Button(description='Save Configuration', style=ButtonStyle())

In [6]:
# Compose the final configuration from Hydra
# (looks up the config named "config"; Hydra must already have been initialized above)
cfg = compose(config_name="config")

# Instantiate classes DataHandler and ModelHandler
# NOTE(review): "../data" is a relative path, so it resolves against the kernel's
# working directory — confirm the notebook is launched from the expected folder.
data_handler = DataHandler("../data")
# The captured output shows the pretrained model being loaded in this cell,
# presumably by the ModelHandler constructor — TODO confirm.
model_handler = ModelHandler(cfg)

# Load inputs and create output directories
# cfg.prompts_sheet selects which prompts CSV is read.
prompts_dict = data_handler.csv_to_dictionary(cfg.prompts_sheet)
# images_dir and metrics_dir are passed to DataAnalyzer in the visualization cell;
# experiment_base_dir is used below and when saving the activations cache.
experiment_base_dir, images_dir, metrics_dir = data_handler.create_output_directories()

# Save configurations and prompts
# (presumably written under experiment_base_dir so the run can be reproduced — TODO confirm)
data_handler.write_experiment_parameters(cfg, prompts_dict, experiment_base_dir)


Loaded pretrained model gpt2-small into HookedTransformer


## Model Initialization and Data Processing

In [8]:
# Build the activations cache from the loaded prompts.
# (The model itself was already loaded when ModelHandler was created in the previous cell.)
activations_cache = data_handler.populate_data(prompts_dict)

# Compute activations and add hidden states
# The return value is not used, so this presumably fills activations_cache in place — TODO confirm.
model_handler.compute_activations(activations_cache)

Computing activations: 100%|██████████| 160/160 [00:10<00:00, 15.14it/s]


## Visualization

In [12]:
# Analyzer that receives the image and metrics output directories created earlier.
# NOTE(review): the third argument (42) is presumably a random seed — confirm against DataAnalyzer.
data_analyzer = DataAnalyzer(images_dir, metrics_dir, 42)

# Get various 2-component representations for each layer
# and plot them. Each call returns the embedded data plus its labels and prompts.
# random_state=42 is fixed on TSNE and PCA so repeated runs give the same embedding.
tsne_model = TSNE(n_components=2, random_state=42)
tsne_embedded_data_dict, tsne_labels, tsne_prompts = data_analyzer.plot_embeddings(activations_cache, tsne_model)
pca_model = PCA(n_components=2, random_state=42)
pca_embedded_data_dict, pca_labels, pca_prompts = data_analyzer.plot_embeddings(activations_cache, pca_model)
fa_model = FeatureAgglomeration(n_clusters=2)
fa_embedded_data_dict, fa_labels, fa_prompts = data_analyzer.plot_embeddings(activations_cache, fa_model)

# Further analysis
data_analyzer.raster_plot(activations_cache)
data_analyzer.random_projections_analysis(activations_cache)
data_analyzer.probe_hidden_states(activations_cache)

# See if the representations can be used to classify the ethical area.
# Why are we actually doing this? Hypothesis: better separation of ethical areas
# leads to better steering vectors. This actually needs to be tested.
# Only done with the t-SNE representation but could be done with others (PCA, hierarchical clustering, etc.)
# NOTE(review): 0.2 is presumably the held-out test fraction — confirm against classifier_battery.
data_analyzer.classifier_battery(tsne_embedded_data_dict, tsne_labels, tsne_prompts, 0.2)

TSNE: 100%|██████████| 12/12 [00:05<00:00,  2.00it/s]
PCA: 100%|██████████| 12/12 [00:02<00:00,  5.79it/s]
FeatureAgglomeration: 100%|██████████| 12/12 [00:01<00:00,  6.07it/s]
Computing Raster Plots: 100%|██████████| 12/12 [00:46<00:00,  3.86s/it]
Random projections analysis: 100%|██████████| 12/12 [00:00<00:00, 925.30it/s]
Probing hidden states: 100%|██████████| 12/12 [00:02<00:00,  4.82it/s]
Computing logistic_regression: 100%|██████████| 12/12 [00:04<00:00,  2.98it/s]
Computing decision_tree: 100%|██████████| 12/12 [00:01<00:00,  9.26it/s]
Computing random_forest: 100%|██████████| 12/12 [00:02<00:00,  4.38it/s]
Computing svc: 100%|██████████| 12/12 [00:02<00:00,  4.12it/s]
Computing knn: 100%|██████████| 12/12 [00:01<00:00,  6.68it/s]
Computing gradient_boosting: 100%|██████████| 12/12 [00:03<00:00,  3.06it/s]


## Save Results

In [13]:
# Save the activations cache if required by the configuration
# (controlled by the `write_cache` entry of the composed Hydra config).
if cfg.write_cache:
    # Passed the same experiment_base_dir that the experiment parameters were written with.
    model_handler.save_activations_cache(activations_cache, experiment_base_dir)