# Introduction

In `main.ipynb`, the Goodfire API is called to collect experimental data into various CSV files. This notebook processes and plots that raw data.

In [None]:
import pandas as pd
import numpy as np

# CSV outputs (relative paths) from some of the experiments that were run
moral_factors = "data/20241124001836.csv"
personas_test = "data/personas_test.csv"
baseline_models = "data/baseline_models.csv"
elephant_features = "data/elephant_features.csv"

# Load one dataset and preview its first three rows
df = pd.read_csv(personas_test)
df[:3]

In [None]:
# Clean and filter the data
def clean_df(df):
    """Return a cleaned copy of the raw results; the input frame is not modified.

    * Guarantees a ``persona`` column: created as ``''`` when absent, and
      with missing values replaced by ``''`` when present.
    * Keeps only rows whose ``steerage`` lies strictly between -0.8 and 0.8.
    """
    # ``assign`` returns a new frame, so the caller's DataFrame is never
    # written to (the previous version added/overwrote ``persona`` in place).
    if 'persona' in df.columns:
        cleaned = df.assign(persona=df['persona'].fillna(''))
    else:
        cleaned = df.assign(persona='')
    in_range = (-0.8 < cleaned['steerage']) & (cleaned['steerage'] < 0.8)
    # Explicit copy so later column assignments on the result do not trigger
    # pandas' chained-assignment (SettingWithCopy) warnings.
    return cleaned[in_range].copy()

# Summarize the scores from the questions as two factors.
# The Oxford Utilitarianism Scale alternates items between "Impartial Beneficence" (IB)
# and "Instrumental Harm" (IH); each factor is a simple average of its responses.
def summarise_df(df):
    """Collapse per-question scores into one IB and one IH value per series point.

    Expects columns ``feature``, ``steerage``, ``persona``, ``question`` and
    ``mean_score``. Rows with an even ``question`` value are labelled ``IB``
    and odd ones ``IH``.
    NOTE(review): this matches "odd questions are IB" only if ``question`` is a
    zero-based index -- confirm against the data collection code.

    Returns a new frame with one row per (feature, steerage, persona) and the
    columns ``IB`` and ``IH`` holding the mean of ``mean_score`` for each
    factor. The input frame is left untouched.
    """
    keys = ['feature', 'steerage', 'persona']
    factor = df['question'].apply(lambda q: 'IB' if q % 2 == 0 else 'IH')
    # Attach the factor label to a fresh frame instead of adding a column to
    # the caller's DataFrame (which may itself be a filtered slice).
    scored = df[keys + ['mean_score']].assign(factor=factor)
    means = scored.groupby(keys + ['factor'], as_index=False)['mean_score'].mean()
    # One column per factor, one row per (feature, steerage, persona).
    wide = means.pivot(index=keys, columns='factor', values='mean_score')
    return wide.reset_index()

def get_df(filename):
    """Read the CSV at *filename*, clean it, and return its summarised form."""
    raw = pd.read_csv(filename)
    cleaned = clean_df(raw)
    return summarise_df(cleaned)

# Clean and summarise the frame loaded earlier, then preview the first three rows.
df2 = summarise_df(clean_df(df))
df2[:3]



In [None]:

# Load and process the human dataset.
h1 = pd.read_csv("ous_data/ous_align2.csv")

# Work on a copy so the raw responses in h1 stay untouched.
h2 = h1.copy()
# Each factor is the simple average of its items: five IB items, four IH items.
h2["IB"] = (h2["IB1"] + h2["IB2"] + h2["IB3"] + h2["IB4"] + h2["IB5"]) / 5
# Four IH items, so divide by 4 (dividing by 5 under-reported every IH score).
h2["IH"] = (h2["IH1"] + h2["IH2"] + h2["IH3"] + h2["IH4"]) / 4

# Preview the demographic columns alongside the two factor scores.
h2[["sex", "country", "age", "IB", "IH"]][:3]

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.stats import gaussian_kde

def scatter_plot_feature(feature, persona, group, fontsize=9, use_legend=False):
    """Plot one (feature, persona) series as dots joined by arrows.

    *group* must provide ``IB``, ``IH`` and ``steerage`` columns and already be
    in the order the arrows should follow. The series is named after the
    persona, the feature, or both, depending on whether a persona is set and
    whether any row has a non-zero steerage. When *use_legend* is false the
    name is written next to the last point instead of relying on a legend.
    """
    # Work out the series label.
    steered = (group["steerage"] != 0).any()
    if not persona:
        name = feature
    elif steered:
        name = f"{feature} ({persona})"
    else:
        name = persona

    # The dots themselves; remember their colour so the arrows match.
    ib, ih = group['IB'], group['IH']
    dots = plt.scatter(ib, ih, marker='o', label=name)
    colour = dots.get_facecolor()

    # Join consecutive points with text-less arrows so the direction of the
    # series (towards its last row) is visible.
    points = list(zip(ib, ih))
    for start, end in zip(points, points[1:]):
        plt.annotate(
            '',
            xy=end,
            xytext=start,
            arrowprops=dict(arrowstyle='->', lw=1.5, color=colour),
        )

    if not use_legend:
        # Inline label just to the right of the final point.
        last_ib, last_ih = points[-1]
        plt.text(last_ib + 0.04, last_ih, name, fontsize=fontsize, ha='left', va='center')

def human_heatmap():
    """Draw the human IB/IH scores from ``h2`` as a translucent 2-D histogram."""
    # Integer bin edges 1..7 on both axes.
    edges = np.arange(1, 8)
    counts, ib_edges, ih_edges = np.histogram2d(
        h2['IB'], h2['IH'], bins=(edges, edges)
    )
    # histogram2d puts its first input (IB) on the first axis; transpose so IB
    # runs along x for pcolormesh.
    plt.pcolormesh(ib_edges, ih_edges, counts.T, cmap='viridis', shading='auto', alpha=0.3)

def human_kde():
    """Draw the human IB/IH scores from ``h2`` as a Gaussian-KDE density image.

    The density is evaluated on a regular grid over [1, 7] x [1, 7] and shown
    with ``plt.imshow`` so that IB runs along x and IH along y.
    """
    smoothness = 20  # number of grid points per axis
    ib_vals = np.linspace(1, 7, smoothness)
    ih_vals = np.linspace(1, 7, smoothness)
    # Default 'xy' meshgrid: rows follow ih_vals, columns follow ib_vals.
    ib_grid, ih_grid = np.meshgrid(ib_vals, ih_vals)
    positions = np.vstack([ib_grid.ravel(), ih_grid.ravel()])
    values = np.vstack([h2['IB'], h2['IH']])
    kernel = gaussian_kde(values)
    Z = kernel(positions).reshape(ib_grid.shape)

    # Z[row, col] is the density at (IB=ib_vals[col], IH=ih_vals[row]), i.e. it
    # is already laid out as (y, x). origin='lower' puts IH=1 at the bottom.
    # The previous triple rot90 displayed the density with the two axes
    # swapped and mirrored.
    plt.imshow(Z, origin='lower', cmap=plt.cm.gist_earth_r, extent=[1, 7, 1, 7])

def make_plot(df2, heatmap=False, title='Feature steerage effect on Instrumental Harm and Impartial Beneficence', fontsize=9, use_legend=False):
    """Create and show one figure with every (feature, persona) series in *df2*.

    When *heatmap* is true the human KDE is drawn first as a backdrop.
    *fontsize* and *use_legend* are passed on to ``scatter_plot_feature``.
    """
    plt.figure(figsize=(10, 6))

    # Optional human-data backdrop (human_heatmap() is the histogram alternative).
    if heatmap:
        human_kde()

    # One arrow-connected series per (feature, persona) pair.
    for key, series in df2.groupby(by=['feature', 'persona']):
        feature, persona = key
        scatter_plot_feature(feature, persona, series, fontsize, use_legend)

    # Axis limits and ticks are left to matplotlib's defaults.
    plt.xlabel('Impartial Beneficence')
    plt.ylabel('Instrumental Harm')
    plt.title(title)
    if use_legend:
        plt.legend(loc="upper left")
    plt.grid(True)

    plt.show()

def chunks(xs, n):
    """Lazily yield consecutive slices of *xs*, each at most *n* items long.

    A chunk size below 1 is treated as 1.
    """
    step = max(1, n)
    starts = range(0, len(xs), step)
    return (xs[start:start + step] for start in starts)

def do_plots(df2, chunkby=5, *args, **kwargs):
    """Draw *df2* as a sequence of figures with at most *chunkby* features each.

    Any extra positional and keyword arguments are forwarded to ``make_plot``.
    """
    # TODO: Sort features by shape?
    feature_names = df2["feature"].unique()
    for batch in chunks(feature_names, chunkby):
        in_batch = df2["feature"].isin(batch)
        make_plot(df2[in_batch], *args, **kwargs)




In [None]:
def get_steepest(df,n=5, sortby="mag"):
    """Return the rows of *df* for the features whose scores change fastest near zero steerage.

    The slope of each factor is the central difference between steerage +0.1
    and -0.1 for every (feature, persona) pair. Pairs are ranked in descending
    order of *sortby*: ``"IB"`` or ``"IH"`` (absolute slope of that factor) or
    ``"mag"`` (Euclidean norm of both slopes). The features of the top *n*
    pairs are kept, and every row of *df* with one of those features is
    returned.
    """
    wide = df.pivot(index=["feature", "persona"], columns="steerage")

    def central_slope(factor):
        # (value at +0.1 minus value at -0.1) over the 0.2 step between them
        return (wide[(factor, 0.1)] - wide[(factor, -0.1)]) / 0.2

    slope = pd.DataFrame({'IB': central_slope("IB"), 'IH': central_slope("IH")})
    slope["mag"] = (slope["IB"]**2+slope["IH"]**2)**0.5
    # Rank the individual factors by size of change, regardless of direction.
    for factor in ("IB", "IH"):
        slope[factor] = slope[factor].abs()

    ranked = slope.sort_values(sortby,ascending=False)
    top_features = ranked.iloc[:n].index.get_level_values("feature")
    keep = df["feature"].isin(top_features)
    return df[keep]


# Output charts

Reproducible versions of the charts in the paper

In [None]:
# Baseline models over the human KDE backdrop, with a larger label font.
do_plots(get_df(baseline_models), title="Baseline Models", heatmap=True, fontsize=20)

In [None]:
# Elephant features over the human KDE backdrop.
do_plots(get_df(elephant_features), title="Elephant Features (negative case)", heatmap=True)

In [None]:
# Top features ranked by absolute IB slope, then by absolute IH slope;
# chunkby=100 keeps each selection on a single figure.
# NOTE(review): both calls pick the largest slopes (one per factor), so the
# "Shallowest"/"Steepest" titles may not describe the selection -- confirm.
do_plots(get_steepest(get_df(moral_factors), sortby="IB"), title="Shallowest 5 Features", heatmap=True, chunkby=100)
do_plots(get_steepest(get_df(moral_factors), sortby="IH"), title="Steepest 5 Features", heatmap=True, chunkby=100)

In [None]:
# Every feature in the moral-factors run, five features per figure (the do_plots default).
do_plots(get_df(moral_factors))

In [None]:
# Nationalities run; series are labelled inline rather than with a legend.
df = get_df("data/20241124101056nationalities.csv")
do_plots(df, use_legend=False)