In [None]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt

# Location of the results table read by this notebook.
fp = 'path/to/results.csv'

# Read the CSV and rename the 'Unnamed: 0' column to 'profile' in one chained
# expression, then display the resulting frame as the cell output.
df = pd.read_csv(fp).rename(columns={'Unnamed: 0': 'profile'})
df

In [None]:
def preprocess_df(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Two things happen on the copy:

    * an ``'Unnamed: 0'`` column, when present, is renamed to ``'profile'``;
    * in each known list-valued column that exists, every string cell is
      parsed with ``ast.literal_eval``; non-string cells pass through as is.
    """
    list_columns = (
        'interaction_time', 'interaction_num_turns', 'interaction_total_char_length',
        'accuracy', 'AUCROC', 'correct_prob',
        'accuracy_relative', 'AUCROC_relative', 'correct_prob_relative',
    )

    def _parse_cell(cell):
        # Only textual cells need decoding; anything else is already a value.
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        return cell

    result = df.copy()
    if 'Unnamed: 0' in result.columns:
        result = result.rename(columns={'Unnamed: 0': 'profile'})

    for name in list_columns:
        if name not in result.columns:
            continue
        result[name] = result[name].apply(_parse_cell)
    return result

def plot_metric_over_interactions(df, metric, ax=None):
    """Plot one line per row of ``df`` showing ``metric`` against the turn counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'profile'``, ``'interaction_num_turns'`` and
        ``metric``. Each row's turn and metric cells are handed directly to
        ``ax.plot`` as x and y, so they are presumably equal-length sequences
        (e.g. the output of ``preprocess_df``) -- confirm against the data.
    metric : str
        Name of the column drawn on the y axis; also used for the y label and
        the title.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. The current pyplot axes are used when omitted.

    Returns
    -------
    None
        The function only draws on ``ax``.
    """
    # Compare with None explicitly rather than relying on the truthiness of ax.
    if ax is None:
        ax = plt.gca()

    for _, row in df.iterrows():
        ax.plot(
            row['interaction_num_turns'],
            row[metric],
            marker='o',
            label=f'Profile {row["profile"]}',
        )

    # The x values come from the 'interaction_num_turns' column, so the axis is
    # labelled with that name (the previous label misspelled it as "terms").
    ax.set_xlabel('interaction_num_turns')
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} over interactions')
    ax.legend()

def compare_profiles(df, metric, turn=-1, ax=None):
    """Draw a bar chart comparing profiles on ``metric`` at one interaction step.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'profile'`` and ``metric``.
    metric : str
        Column to compare. A cell that is a list, tuple or NumPy array is
        indexed with ``turn``; any other cell is used as the value directly.
    turn : int, default -1
        Index into each sequence-valued cell. ``-1`` selects the last entry
        and is reported as ``'final'`` in the title.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. The current pyplot axes are used when omitted.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'profile'`` and ``metric``, holding the plotted labels
        and values in row order.
    """
    # Compare with None explicitly rather than relying on the truthiness of ax.
    if ax is None:
        ax = plt.gca()

    labels = []
    values = []
    for _, row in df.iterrows():
        cell = row[metric]
        # Index every sequence-like cell, not just lists, so tuple or array
        # cells contribute a single value instead of the whole sequence.
        if isinstance(cell, (list, tuple, np.ndarray)):
            cell = cell[turn]
        labels.append(row['profile'])
        values.append(cell)

    ax.bar(labels, values)
    ax.set_xlabel('profile')
    ax.set_ylabel(metric)
    # Only -1 is the last step; other negative indices are shown verbatim.
    step = 'final' if turn == -1 else turn
    ax.set_title(f'{metric} at step {step}')
    # Rotate the tick labels of the axes that was drawn on. plt.xticks() would
    # act on pyplot's current axes, which need not be ``ax``.
    ax.tick_params(axis='x', labelrotation=45)
    return pd.DataFrame({'profile': labels, metric: values})

def plot_metric_average(df, metric, shade_std=True, ax=None):
    """Plot the mean of ``metric`` across the rows of ``df`` against the turn counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'interaction_num_turns'`` and ``metric``. The metric
        cells are stacked with ``np.vstack``, so every row has to hold a
        sequence of the same length. The x values are taken from the first
        row's ``'interaction_num_turns'`` cell only; this assumes all rows
        share the same turn counts -- confirm against the data.
    metric : str
        Column to average; also used for the y label, title and legend entry.
    shade_std : bool, default True
        When true, shade the band ``mean - std`` to ``mean + std`` (population
        standard deviation, NumPy's default ``ddof=0``).
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. The current pyplot axes are used when omitted.

    Returns
    -------
    None
        The function only draws on ``ax``.
    """
    # Compare with None explicitly rather than relying on the truthiness of ax.
    if ax is None:
        ax = plt.gca()

    turns = np.asarray(df['interaction_num_turns'].iloc[0])
    stacked = np.vstack(df[metric].tolist())  # one row per DataFrame row
    mean_vals = stacked.mean(axis=0)
    std_vals = stacked.std(axis=0)

    ax.plot(turns, mean_vals, marker='o', label=f'Average {metric}')
    if shade_std:
        ax.fill_between(turns, mean_vals - std_vals, mean_vals + std_vals, alpha=0.3)

    # The x values come from the 'interaction_num_turns' column, so the axis is
    # labelled with that name (the previous label misspelled it as "terms").
    ax.set_xlabel('interaction_num_turns')
    ax.set_ylabel(metric)
    ax.set_title(f'Average {metric} over interactions')
    ax.legend()

def compare_dfs_average(dfs, metric, labels=None, shade_std=True, ax=None):
    """Plot the across-row mean of ``metric`` for several DataFrames on one axes.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to compare. Each must contain ``'interaction_num_turns'`` and
        ``metric``; within a frame the metric cells are stacked with
        ``np.vstack`` and therefore need equal lengths. The x values of each
        curve come from that frame's first ``'interaction_num_turns'`` cell.
    metric : str
        Column to average; also used for the y label and the title.
    labels : sequence of str, optional
        Legend entry per frame. Defaults to ``'df1'``, ``'df2'``, ... Frames
        and labels are paired with ``zip``, so surplus items on either side
        are ignored.
    shade_std : bool, default True
        When true, shade ``mean - std`` to ``mean + std`` for every curve
        (population standard deviation, NumPy's default ``ddof=0``).
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. The current pyplot axes are used when omitted.

    Returns
    -------
    None
        The function only draws on ``ax``.
    """
    # Compare with None explicitly rather than relying on the truthiness of ax.
    if ax is None:
        ax = plt.gca()

    # Materialise once so generators work with len() below and are not
    # exhausted before plotting.
    dfs = list(dfs)
    if labels is None:
        labels = [f'df{i+1}' for i in range(len(dfs))]

    for frame, label in zip(dfs, labels):
        turns = np.asarray(frame['interaction_num_turns'].iloc[0])
        stacked = np.vstack(frame[metric].tolist())  # one row per DataFrame row
        mean_vals = stacked.mean(axis=0)
        std_vals = stacked.std(axis=0)
        ax.plot(turns, mean_vals, marker='o', label=label)
        if shade_std:
            ax.fill_between(turns, mean_vals - std_vals, mean_vals + std_vals, alpha=0.3)

    # The x values come from the 'interaction_num_turns' column, so the axis is
    # labelled with that name (the previous label misspelled it as "terms").
    ax.set_xlabel('interaction_num_turns')
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} comparison')
    ax.legend()


In [None]:
# Example usage
df = preprocess_df(df)

# Draw the per-profile curves and the averaged curve on the same axes: the
# current pyplot axes, which both helpers fall back to when ax is omitted.
metric_name = 'accuracy'
shared_ax = plt.gca()
plot_metric_over_interactions(df, metric_name, ax=shared_ax)
plot_metric_average(df, metric_name, ax=shared_ax)
plt.show()

# Bar chart of the final step; the returned table is the cell's last expression.
compare_profiles(df, metric_name)
