# Fake Plots for Leg 1 Presentation

Date: 17 March 2025

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Placeholder AUROC values: each dataset maps to its
# (Probe 1 (situations), Probe 2 (pressure)) scores, expanded below into
# one long-format row per dataset/probe pair.
data = pd.DataFrame(
    [
        [dataset, probe_type, auroc]
        for dataset, (situations_auroc, pressure_auroc) in {
            "Dishonesty": (0.72, 0.78),
            "Sandbagging": (0.70, 0.75),
            "Jailbreaks": (0.82, 0.88),
            "Misuse": (0.80, 0.85),
        }.items()
        for probe_type, auroc in (
            ("Probe 1 (situations)", situations_auroc),
            ("Probe 2 (pressure)", pressure_auroc),
        )
    ],
    columns=["Dataset", "Probe Type", "AUROC"],
)


def plot_probe_comparison(data: pd.DataFrame):
    """Draw a grouped bar chart of AUROC per dataset, split by probe type.

    Args:
        data: Long-format frame with "Dataset", "Probe Type" and "AUROC"
            columns.

    Returns:
        The matplotlib figure containing the chart. It is also left as
        pyplot's current figure, so a following ``plt.savefig`` or
        ``plt.show`` acts on it.
    """
    # Apply the style before creating the figure: figure-level rcParams
    # (e.g. figure.facecolor) are read when the figure is constructed.
    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(10, 6))

    # barplot draws on the current axes (creating them on `fig`) and
    # returns that Axes, so the remaining tweaks need no global state.
    ax = sns.barplot(
        data=data,
        x="Dataset",
        y="AUROC",
        hue="Probe Type",
        palette=["#2ecc71", "#3498db"],  # green, blue: one colour per probe type
    )

    ax.set_title(
        "High-Stakes probes detect dangerous behaviour",
        fontsize=14,
        pad=20,
    )
    ax.set_xlabel("Datasets", fontsize=12)
    ax.set_ylabel("AUROC", fontsize=12)

    # Keep the legend entries but drop the redundant "Probe Type" heading.
    ax.legend(title="")

    return fig


# Render the probe comparison chart, save it, then display it.
output_path = Path("../data/results/plots/probe_comparison.png")
# savefig does not create missing directories; make sure the target exists
# so a fresh checkout does not fail with FileNotFoundError.
output_path.parent.mkdir(parents=True, exist_ok=True)
plot_probe_comparison(data)
plt.savefig(output_path)
plt.show()

In [None]:
def plot_model_scaling(data: pd.DataFrame):
    """Draw a grouped bar chart of AUROC per dataset, split by model size.

    Args:
        data: Long-format frame with "Dataset", "Model Size" and "AUROC"
            columns.

    Returns:
        The matplotlib figure containing the chart. It is also left as
        pyplot's current figure, so a following ``plt.savefig`` or
        ``plt.show`` acts on it.
    """
    # Apply the style before creating the figure: figure-level rcParams
    # (e.g. figure.facecolor) are read when the figure is constructed.
    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(10, 6))

    # barplot draws on the current axes (creating them on `fig`) and
    # returns that Axes, so the remaining tweaks need no global state.
    ax = sns.barplot(
        data=data,
        x="Dataset",
        y="AUROC",
        hue="Model Size",
        palette=["#e74c3c", "#f39c12", "#2ecc71"],  # Red, Orange, Green
    )

    ax.set_title(
        "Model-centric high-stakes detection scales with model size",
        fontsize=14,
        pad=20,
    )
    # The dataset names on the ticks are self-explanatory; no x-axis label.
    ax.set_xlabel("")
    ax.set_ylabel("AUROC", fontsize=12)

    # Keep the legend entries but drop the redundant "Model Size" heading.
    ax.legend(title="")

    return fig


# Placeholder AUROC values: each dataset maps to its scores at the 8B, 70B
# and 405B model sizes, expanded below into one long-format row per
# dataset/size pair.
data = pd.DataFrame(
    [
        [dataset, model_size, auroc]
        for dataset, aurocs in {
            # Low at 8B, only a slight gain at 70B, then a big jump at 405B
            "Shutdown": (0.62, 0.65, 0.89),
            "Weights altered": (0.70, 0.82, 0.91),
            "Deployment": (0.68, 0.80, 0.88),
        }.items()
        for model_size, auroc in zip(("8B", "70B", "405B"), aurocs)
    ],
    columns=["Dataset", "Model Size", "AUROC"],
)

# Render the model-scaling chart, write it to the working directory, then
# display it.
plot_model_scaling(data).savefig("model_scaling.png")
plt.show()