In [None]:
import pandas as pd
from plotly import graph_objects as go
import os

In [None]:
# Load every case's raw.csv and stack them into a single frame.

cases_dir = 'cases'
dfs = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the combined frame (and of anything saved from it) reproducible.
for subdir in sorted(os.listdir(cases_dir)):
    if subdir == "dummy_case":
        continue
    raw_path = os.path.join(cases_dir, subdir, 'raw.csv')
    # Skips stray files in cases/ as well as case directories without a raw.csv
    if not os.path.isfile(raw_path):
        continue
    case_df = pd.read_csv(raw_path)
    case_df['subdim'] = subdir  # tag each row with the case directory it came from

    dfs.append(case_df)

# Fail with a message that names the problem instead of pandas' generic
# "No objects to concatenate".
if not dfs:
    raise FileNotFoundError(f"no raw.csv found in any case directory under '{cases_dir}'")

# ignore_index gives unique row labels rather than each file's 0..n-1 repeated
df = pd.concat(dfs, ignore_index=True)
df

In [None]:
# Keep only the rows whose subdim is one of the dimensions analysed below

relevant_subdims = [
    'correct_misinformation',
    'asks_follow_up_question',
    'encourage_decisions',
    'encourage_learning',
]
is_relevant = df['subdim'].isin(relevant_subdims)
df = df[is_relevant]

In [None]:
# normalise scores

from src.prompts import prompt_objects

def f(df):
    """Return one subdim's rows with ``score`` rescaled by that subdim's top score.

    Args:
        df: Rows belonging to a single subdim. Must contain a ``subdim``
            column (only the first row's value is read) and a ``score`` column.

    Returns:
        A new DataFrame equal to ``df`` except that ``score`` has been divided
        by the ``top_eval_score`` of the prompt object registered for the
        subdim. The input frame is left untouched.

    Raises:
        KeyError: If the (canonical) subdim has no entry in ``prompt_objects``.
    """
    # Variant case names that share a prompt definition with a canonical subdim.
    # NOTE(review): the relevance filter earlier in this notebook keeps only
    # canonical names, so these aliases look unreachable in the current flow -
    # confirm whether they are still needed.
    aliases = {
        "correct_misinfo_llama_claude": "correct_misinformation",
        "correct_misinformation_claude_qa": "correct_misinformation",
        "asks_follow_up_question_claude": "asks_follow_up_question",
        "asks_follow_up_question_claude_eval": "asks_follow_up_question",
    }
    subdim = df['subdim'].iloc[0]
    subdim = aliases.get(subdim, subdim)

    prompt_instance = prompt_objects[subdim]()
    max_score = prompt_instance.top_eval_score

    # Build a new frame instead of assigning into the one handed over by
    # groupby: pandas documents mutating the group inside a UDF as unsupported.
    return df.assign(score=df['score'] / max_score)

# Normalise each subdim's rows separately. The groups are iterated explicitly
# rather than passed through groupby.apply, so f always receives the 'subdim'
# column it reads (recent pandas deprecates apply operating on the grouping
# column) and the result is ordered by subdim regardless of pandas version.
normalised_groups = [f(group) for _, group in df.groupby('subdim')]
if normalised_groups:
    df = pd.concat(normalised_groups, ignore_index=True)
else:
    # Nothing to normalise; still hand back a frame with a fresh 0..n-1 index
    df = df.reset_index(drop=True)

In [None]:
# Write the combined frame to disk. Despite the "raw" in the file name, at this
# point the rows have already been filtered to the relevant subdims and their
# scores normalised.

output_csv = 'concatenated_raw.csv'
df.to_csv(output_csv, index=False)

In [None]:
# Swap raw model / subdim identifiers for the display names used in the
# table and figure below. Values without an entry are left unchanged.

model_display_names = {
    'gpt-4o': 'GPT-4o',
    'gpt-4o-mini': 'GPT-4o Mini',
    'o1-mini': 'O1 Mini',
    'o1-preview': 'O1 Preview',
    'meta/meta-llama-3.1-405b-instruct': 'Llama 3.1 405B',
    'models/gemini-1.5-flash-8b': 'Gemini Flash',
    'models/gemini-pro': 'Gemini Pro',
    'claude-3-opus-20240229': 'Claude 3 Opus',
    'claude-3-haiku-20240307': 'Claude 3 Haiku',
    'claude-3-5-sonnet-20240620': 'Claude 3.5 Sonnet',
    'claude-3-5-sonnet-20241022': 'Claude 3.5 Sonnet (New)',
    'claude-3-sonnet-20240229': 'Claude 3 Sonnet',
    'llama-3.1-70b-versatile': 'Llama 3.1 70B',
    'llama-3.1-8b-instant': 'Llama 3.1 8B',
    'llama-3.2-1b-preview': 'Llama 3.2 1B',
    'llama-3.2-3b-preview': 'Llama 3.2 3B',
    'mixtral-8x7b-32768': 'Mixtral 8x7B',
    'gemma-7b-it': 'Gemma 7B',
    'gemma2-9b-it': 'Gemma2 9B',
}

subdim_display_names = {
    'take_information_gathering_options': 'Info Gathering',
    'persuasion': 'Avoid imposing developer values',
    'correct_misinformation': 'Correct Misinformation',
    'encourage_decisions': 'Encourage Decisions',
    'encourage_learning': 'Encourage Learning',
    'present_weaknesses': 'Present Weaknesses',
    'express_uncertainty': 'Express Uncertainty',
    'asks_follow_up_question': 'Asks Follow-up Question',
    'encourage_long_term_agency': 'Long-term Agency',
}

df['subject_model'] = df['subject_model'].replace(model_display_names)
df['subdim'] = df['subdim'].replace(subdim_display_names)

In [None]:
# Mean score for every (subdim, model) pair: one row per subdim, one column per model
pivot_table = pd.pivot_table(
    df,
    index='subdim',
    columns='subject_model',
    values='score',
    aggfunc='mean',
)

pivot_table

In [None]:
# Append a 'Mean Score' row holding, for each model column, the mean of that
# model's scores over the subdim rows

average_row = pivot_table.mean(axis='index')
pivot_table.loc['Mean Score'] = average_row
pivot_table

In [None]:
# plot

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Rectangle

# Provider -> display names of the models plotted in that provider's row
provider_groups = {
    "OpenAI": [
        'O1 Mini',
        'O1 Preview',
        'GPT-4o Mini',
        'GPT-4o',
    ],
    "Anthropic": [
        'Claude 3.5 Sonnet (New)',
        'Claude 3.5 Sonnet',
        'Claude 3 Sonnet',
        'Claude 3 Haiku',
        'Claude 3 Opus',
    ],
    "Meta": [
        'Llama 3.2 3B',
        'Llama 3.2 1B',
        'Llama 3.1 405B',
        'Llama 3.1 8B',
        'Llama 3.1 70B',
    ],
    "Google": [
        'Gemma2 9B',
        'Gemma 7B',
        'Gemini Flash',
        'Gemini Pro',
    ],
    "Mistral": [
        'Mixtral 8x7B',
    ],
}

# Each provider's row height is proportional to how many models it lists,
# relative to the largest group
models_per_provider = [len(models) for models in provider_groups.values()]
max_models = max(models_per_provider)
height_ratios = [count / max_models for count in models_per_provider]

# Grid shape: one row of panels per provider, one column per pivot-table row
n_providers = len(provider_groups)
n_dimensions = len(pivot_table.index)

# Figure size: fixed width, height scaled by the largest provider group
base_height = 2.0
fig_width = 20
fig_height = base_height * max_models

# A gridspec is used so the rows can have different heights
fig = plt.figure(figsize=(fig_width, fig_height))
gs = fig.add_gridspec(
    nrows=n_providers,
    ncols=n_dimensions,
    height_ratios=height_ratios,
    hspace=0.4,
)
axs = np.empty((n_providers, n_dimensions), dtype=object)
for row in range(n_providers):
    for col in range(n_dimensions):
        axs[row, col] = fig.add_subplot(gs[row, col])

# Colormap shared by all bars: dark purple -> blue -> yellow
colors = ['#2E0054', '#0066CC', '#FFFF00']
n_bins = 256
base_cmap = LinearSegmentedColormap.from_list('custom', colors, N=n_bins)

def create_gradient_bar(ax, x, y, width, height):
    """Draw a horizontal bar filled with a colour gradient and a thin gray outline.

    The fill runs from ``base_cmap(0)`` at the left edge to ``base_cmap(width)``
    at the right edge, so the colour at the tip of the bar reflects its length.

    Args:
        ax: Matplotlib axes to draw on.
        x: Left edge of the bar in data coordinates.
        y: Vertical centre of the bar in data coordinates.
        width: Bar length in data coordinates; also the upper end of the
            colormap range that is sampled.
        height: Bar thickness in data coordinates.

    Returns:
        The image artist created by ``ax.imshow``, or ``None`` when nothing is
        drawn because ``width`` is NaN, infinite, or not positive.
    """
    # A missing score arrives as NaN, and int(NaN) raises ValueError; a bar of
    # zero (or negative) length has nothing to show. Skip both instead of failing.
    if not np.isfinite(width) or width <= 0:
        return None

    # Sample the colormap from 0 up to `width`. Keep at least one sample so
    # very short bars still produce a non-empty image.
    n_samples = max(int(width * n_bins), 1)
    gradient_x = np.linspace(0, width, n_samples)
    gradient_colors = base_cmap(gradient_x)

    # imshow expects (rows, cols, RGBA): a single row of colours
    gradient = gradient_colors.reshape(1, -1, 4)

    # Stretch that row over the bar's rectangle
    im = ax.imshow(gradient, aspect='auto', extent=[x, x+width, y-height/2, y+height/2])

    # Outline the bar
    rect = Rectangle((x, y-height/2), width, height, fill=False, color='gray', linewidth=0.5)
    ax.add_patch(rect)

    return im

# Fill the grid: one row of panels per provider, one column per pivot-table row
score_ticks = np.arange(0, 1.25, 0.25)
bar_height = 0.8

for row, (provider, listed_models) in enumerate(provider_groups.items()):
    # Only models that actually appear as pivot-table columns can be plotted
    models = [name for name in listed_models if name in pivot_table.columns]
    n_models = len(models)
    is_bottom_row = row == n_providers - 1

    for col, dimension in enumerate(pivot_table.index):
        ax = axs[row, col]

        # One gradient bar per model, stacked at y = 0, 1, 2, ...
        scores = pivot_table.loc[dimension, models]
        for position, score in enumerate(scores):
            create_gradient_bar(ax, 0, position, score, bar_height)

        # Shared x-axis styling
        ax.set_xlim(-0.03, 1.01)
        ax.set_xticks(score_ticks)
        ax.grid(True, axis='x', linestyle='--', alpha=0.7)

        # Half a unit of padding below the first and above the last bar
        ax.set_ylim(-0.5, n_models - 0.5)

        if col > 0:
            # Model names are shown in the first column only
            ax.set_yticks([])
        else:
            ax.set_yticks(np.arange(n_models))
            ax.set_yticklabels(models)

            # Provider label to the left of the row, vertically centred on its bars
            ax.text(
                -0.8, (n_models - 1) / 2, provider,
                transform=ax.transData,
                ha='right', va='center',
                fontsize=12, fontweight='bold',
            )

        # Column titles on the top row only
        if row == 0:
            ax.set_title(dimension, pad=20)

        # x-axis label on the bottom row only
        if is_bottom_row:
            ax.set_xlabel('Score')

# Tighten the layout, then widen the left margin
plt.tight_layout()
plt.subplots_adjust(left=0.25)

# Write the figure to disk and display it
plt.savefig('output.png', dpi=300, bbox_inches='tight')
plt.show()