# Summary Plots Barcharts

In [1]:
%%capture
import os

# Silence wandb output; this is set ahead of the wandb import below.
os.environ["WANDB_SILENT"] = "true"
# If you hit 'Could not find project LASR_probe_gen', fetch a key from
# https://wandb.ai/authorize and paste it into the line below.
# os.environ["WANDB_API_KEY"] = ""
import wandb

# Read the key from the environment (None when the variable is unset) and log in.
wandb_token = os.environ.get("WANDB_API_KEY")
wandb.login(key=wandb_token)

## Behaviour Plots

In [None]:
from probe_gen.standard_experiments.behaviour_bar_plot import plot_behaviour_barchart
from probe_gen.config import BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL, BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL_DECEPTION

# Define graph type (choose an option by changing the trailing index)
train_OOD = [False, True][0]  # True: the test set won't match the train set
test_incentivised = [False, True][0]  # True: test against incentivised data

# Set probe type and activation model and keep all other parameters as initial experiments
probe_type = ["mean", "attention_torch"][0]
activations_model = ["llama_3b", "ministral_8b", "gemma_27b"][0]

# Work on a shallow copy. Calling .update() directly on the imported constant would
# merge the deception behaviours into the shared config for the rest of the kernel
# session, so later cells would plot them even with their own flag set to False.
done_experiments = dict(BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL)
if test_incentivised:
    done_experiments.update(BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL_DECEPTION)

# Get experiments into format
# [probe_type, behaviour, [ID datasource, OOD datasource], activations_model, [ID off_policy_model, OOD off_policy_model]]
train_setup = []
for behaviour, behaviour_config in done_experiments.items():
    # Skip the first key of the entry (presumably the "test_both" flag - TODO confirm
    # against probe_gen.config); the next two keys are used as the datasources.
    ds = list(behaviour_config.keys())[1:]
    off_mods = [behaviour_config[ds[0]][activations_model], behaviour_config[ds[1]][activations_model]]
    train_setup.append([probe_type, behaviour, [ds[0], ds[1]], activations_model, [off_mods[0], off_mods[1]]])
    if behaviour_config["test_both"]:
        # Also evaluate with the two datasources swapped.
        train_setup.append([probe_type, behaviour, [ds[1], ds[0]], activations_model, [off_mods[1], off_mods[0]]])

# Derive the output name and layout from the flags above so they cannot drift apart,
# e.g. "linear_id_onpolicy.pdf", "attention_ood_onpolicy.pdf", "linear_id_incentivised.pdf".
probe_label = "linear" if probe_type == "mean" else "attention"
domain_label = "ood" if train_OOD else "id"
data_label = "incentivised" if test_incentivised else "onpolicy"

plot_behaviour_barchart(
    train_setup=train_setup,
    train_OOD=train_OOD,
    test_incentivised=test_incentivised,
    add_mean_summary=False,
    save_path=f"{probe_label}_{domain_label}_{data_label}.pdf",
    # Incentivised plots use the lower-left legend without extra whitespace.
    legend_loc="lower left" if test_incentivised else "upper right",
    extra_whitespace=0 if test_incentivised else 2,
    probe_type=probe_type,
)

## Behaviour Plots (Same-Test-Train)

In [None]:
from probe_gen.standard_experiments.behaviour_bar_plot import plot_behaviour_barchart_same_test_train
from probe_gen.config import BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL, BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL_DECEPTION

# Define graph type (choose an option by changing the trailing index)
train_OOD = [False, True][0]  # True: the test set won't match the train set
include_deception = [False, True][0]  # True: also include the behaviours from the deception config

# Set probe type and activation model and keep all other parameters as initial experiments
probe_type = ["mean", "attention_torch"][0]
activations_model = ["llama_3b", "ministral_8b", "gemma_27b"][0]  # currently the only option

# Work on a shallow copy. Calling .update() directly on the imported constant would
# merge the deception behaviours into the shared config for the rest of the kernel
# session, so other cells would plot them even with their own flag set to False.
done_experiments = dict(BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL)
if include_deception:
    done_experiments.update(BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL_DECEPTION)

# Get experiments into format
# [probe_type, behaviour, [ID datasource, OOD datasource], activations_model, [ID off_policy_model, OOD off_policy_model]]
train_setup = []
for behaviour, behaviour_config in done_experiments.items():
    # Skip the first key of the entry (presumably the "test_both" flag - TODO confirm
    # against probe_gen.config); the next two keys are used as the datasources.
    ds = list(behaviour_config.keys())[1:]
    off_mods = [behaviour_config[ds[0]][activations_model], behaviour_config[ds[1]][activations_model]]
    train_setup.append([probe_type, behaviour, [ds[0], ds[1]], activations_model, [off_mods[0], off_mods[1]]])
    if behaviour_config["test_both"]:
        # Also evaluate with the two datasources swapped.
        train_setup.append([probe_type, behaviour, [ds[1], ds[0]], activations_model, [off_mods[1], off_mods[0]]])

# Name the output after the probe type: "linear_baseline.pdf" or "attention_baseline.pdf".
probe_label = "linear" if probe_type == "mean" else "attention"

plot_behaviour_barchart_same_test_train(
    train_setup=train_setup,
    train_OOD=train_OOD,
    add_mean_summary=False,
    save_path=f"{probe_label}_baseline.pdf",
    legend_loc="lower left" if include_deception else "upper right",
    extra_whitespace=0 if include_deception else 2,
    probe_type=probe_type,
    do_seperator_line=include_deception,
)

## Behaviour Plots (Deception)

In [None]:
from probe_gen.standard_experiments.behaviour_bar_plot import plot_behaviour_barchart
from probe_gen.config import BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL_DECEPTION

# Define graph type (choose an option by changing the trailing index)
train_OOD = [False, True][0]  # True: the test set won't match the train set

# Set probe type and activation model and keep all other parameters as initial experiments
probe_type = ["mean", "attention_torch"][0]
activations_model = ["llama_3b", "other"][0]

# Read-only use of the imported config; nothing below modifies it.
done_experiments = BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL_DECEPTION
# Activation model to use per behaviour and datasource when activations_model is not "llama_3b".
model_map = {
    "deception": {
        "roleplaying": "mistral_7b",
        "trading": "mixtral",
    },
    "sandbagging": {
        "wmd": "mistral_7b",
        "multichoice": "ministral_8b",
    },
}

# Get experiments into format
# [probe_type, behaviour, [ID datasource, OOD datasource], activations_model, [ID off_policy_model, OOD off_policy_model]]
train_setup = []
for behaviour, behaviour_config in done_experiments.items():
    # Skip the first key of the entry (presumably the "test_both" flag - TODO confirm
    # against probe_gen.config); the next two keys are used as the datasources.
    ds = list(behaviour_config.keys())[1:]
    # Every behaviour is evaluated in both directions: (ds[0] -> ds[1]) and (ds[1] -> ds[0]).
    for id_ds, ood_ds in ((ds[0], ds[1]), (ds[1], ds[0])):
        if activations_model == "llama_3b":
            entry_model = activations_model
            off_mods = [behaviour_config[id_ds][entry_model], behaviour_config[ood_ds][entry_model]]
        else:
            # Use a separate name for the per-entry model so the `activations_model`
            # selector above is not overwritten while looping.
            entry_model = model_map[behaviour][id_ds]
            off_mods = [behaviour_config[id_ds][entry_model], None]
        train_setup.append([probe_type, behaviour, [id_ds, ood_ds], entry_model, off_mods])

# Build the title and output name from the flags above so they always agree,
# e.g. "linear_id_deception.pdf" or "attention_ood_deception.pdf".
probe_label = "Linear" if probe_type == "mean" else "Attention"
domain_note = "Diff.-Domain " if train_OOD else ""
title = f"{probe_label} Probes Evaluated Against {domain_note}On-Policy Incentivised"
save_path = f"{probe_label.lower()}_{'ood' if train_OOD else 'id'}_deception.pdf"

plot_behaviour_barchart(
    train_setup=train_setup,
    train_OOD=train_OOD,
    title=title,
    figsize=(7, 3),
    test_incentivised=True,
    add_mean_summary=False,
    save_path=save_path,
    legend_loc="upper right",
    extra_whitespace=2,
    probe_type=probe_type,
    do_seperator_line=False,
)

## Summary Plots (Dots and Lines)

In [None]:
from probe_gen.standard_experiments.behaviour_bar_plot import plot_mean_summary_dotchart
from probe_gen.config import BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL, BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL_DECEPTION

# Define graph type (choose an option by changing the trailing index)
test_incentivised = [False, True][0]  # True: test against incentivised data

# Set probe type and activation model and keep all other parameters as initial experiments
probe_type = ["mean", "attention_torch"][0]
activations_model = ["llama_3b", "ministral_8b", "gemma_27b"][0]  # currently the only option

# Work on a shallow copy. Calling .update() directly on the imported constant would
# merge the deception behaviours into the shared config for the rest of the kernel
# session, so other cells would plot them even with their own flag set to False.
done_experiments = dict(BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL)
if test_incentivised:
    done_experiments.update(BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL_DECEPTION)

# Get experiments into format
# [probe_type, behaviour, [ID datasource, OOD datasource], activations_model, [ID off_policy_model, OOD off_policy_model]]
train_setup = []
for behaviour, behaviour_config in done_experiments.items():
    # Skip the first key of the entry (presumably the "test_both" flag - TODO confirm
    # against probe_gen.config); the next two keys are used as the datasources.
    ds = list(behaviour_config.keys())[1:]
    off_mods = [behaviour_config[ds[0]][activations_model], behaviour_config[ds[1]][activations_model]]
    train_setup.append([probe_type, behaviour, [ds[0], ds[1]], activations_model, [off_mods[0], off_mods[1]]])
    if behaviour_config["test_both"]:
        # Also evaluate with the two datasources swapped.
        train_setup.append([probe_type, behaviour, [ds[1], ds[0]], activations_model, [off_mods[1], off_mods[0]]])

# Name the output after the probe type:
# "linear_id_vs_ood_onpolicy2.pdf" or "attention_id_vs_ood_onpolicy2.pdf".
probe_label = "linear" if probe_type == "mean" else "attention"

plot_mean_summary_dotchart(
    train_setup=train_setup,
    test_incentivised=test_incentivised,
    save_path=f"{probe_label}_id_vs_ood_onpolicy2.pdf",
    legend_loc="upper right",
    # legend_loc="lower left",
    extra_whitespace=0,
    probe_type=probe_type,
    draw_blobs=True,  # Can be set to False to see just dots
)

## Summary Plots (Mean Bars)

In [None]:
from probe_gen.standard_experiments.behaviour_bar_plot import plot_mean_summary_barchart
from probe_gen.config import BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL, BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL_DECEPTION

# Define graph type (choose an option by changing the trailing index)
test_incentivised = [False, True][0]  # True: test against incentivised data

# Set probe type and activation model and keep all other parameters as initial experiments
probe_type = ["mean", "attention_torch"][0]
activations_model = ["llama_3b", "ministral_8b", "gemma_27b"][0]  # currently the only option

# Work on a shallow copy. Calling .update() directly on the imported constant would
# merge the deception behaviours into the shared config for the rest of the kernel
# session, so other cells would plot them even with their own flag set to False.
done_experiments = dict(BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL)
if test_incentivised:
    done_experiments.update(BEHAVIOUR_DATASOURCE_ACTMODEL_OFFPOLICYMODEL_DECEPTION)

# Get experiments into format
# [probe_type, behaviour, [ID datasource, OOD datasource], activations_model, [ID off_policy_model, OOD off_policy_model]]
train_setup = []
for behaviour, behaviour_config in done_experiments.items():
    # Skip the first key of the entry (presumably the "test_both" flag - TODO confirm
    # against probe_gen.config); the next two keys are used as the datasources.
    ds = list(behaviour_config.keys())[1:]
    off_mods = [behaviour_config[ds[0]][activations_model], behaviour_config[ds[1]][activations_model]]
    train_setup.append([probe_type, behaviour, [ds[0], ds[1]], activations_model, [off_mods[0], off_mods[1]]])
    if behaviour_config["test_both"]:
        # Also evaluate with the two datasources swapped.
        train_setup.append([probe_type, behaviour, [ds[1], ds[0]], activations_model, [off_mods[1], off_mods[0]]])

plot_mean_summary_barchart(
    train_setup=train_setup,
    test_incentivised=test_incentivised,
    # No save_path is passed here; the candidate file names are kept for reference.
    # save_path="linear_id_vs_ood_onpolicy.pdf",
    # save_path="attention_id_vs_ood_onpolicy.pdf",
    legend_loc="upper right",
    # legend_loc="lower left",
    extra_whitespace=0,
    probe_type=probe_type,
    verbose=True,
)