In [1]:
import os
import dill as pickle
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_from_disk
from character.constants import DATA_PATH, FIGURE_PATH

In [2]:
# Human-readable label for each base checkpoint (insertion order = evaluation order).
_base_labels = {
    "llama-3.1-8b-it": "Llama 3.1 8B",
    "qwen-2.5-7b-it": "Qwen 2.5 7B",
    "gemma-3-4b-it": "Gemma 3 4B",
}

# Constitution suffix appended to a base checkpoint name -> label shown in parentheses.
_constitution_labels = {
    "-goodness": "Flourishing",
    "-loving": "Loving",
    "-misalignment": "Misalignment",
}

models = list(_base_labels)
# The trailing empty suffix selects the base checkpoint itself (no constitution).
constitutions = [*_constitution_labels, ""]

# Display names: every (model, constitution) pair first, then the bare base models.
names = {
    f"{model_key}{suffix}": f"{model_label} ({constitution_label})"
    for model_key, model_label in _base_labels.items()
    for suffix, constitution_label in _constitution_labels.items()
}
names.update(_base_labels)

In [3]:
def calculate_elo_ratings(preferences, model_name, normalize=False, k_factor=32, initial_rating=1000.0):
    """Compute Elo ratings for traits from one model's pairwise comparisons.

    Comparisons are replayed sequentially in the order given, so the result
    depends on that order (standard online Elo updates).

    Args:
        preferences: mapping from model name to an iterable of
            ``(trait_1, trait_2, winner)`` triples.
        model_name: key into ``preferences`` selecting which comparisons to score.
        normalize: if True, min-max rescale the final ratings to the 0-1 range.
            When every rating is identical the range is zero and the ratings
            are left unscaled.
        k_factor: Elo K-factor, i.e. the maximum rating change per comparison.
        initial_rating: rating assigned to every trait before any comparison.

    Returns:
        A list of ``(trait, rating)`` tuples sorted by rating in descending
        order, with ratings rounded to two decimals. Traits with equal ratings
        keep the order in which they first appear in the comparisons.

    Raises:
        KeyError: if ``model_name`` is not a key of ``preferences``.
    """
    # Materialise once: the comparisons are traversed twice below, which would
    # silently yield nothing on the second pass for a one-shot iterable.
    comparisons = list(preferences[model_name])

    # Register traits in first-seen order. A dict keeps insertion order, so the
    # stable sort at the end breaks ties deterministically; iterating a set of
    # strings instead would make tie order depend on per-process hash seeding.
    elo_ratings = {}
    for trait1, trait2, _ in comparisons:
        elo_ratings.setdefault(trait1, float(initial_rating))
        elo_ratings.setdefault(trait2, float(initial_rating))

    # Replay every comparison and apply the Elo update.
    for trait1, trait2, winner in comparisons:
        r1 = elo_ratings[trait1]
        r2 = elo_ratings[trait2]

        # expected scores under the logistic Elo model (400-point scale)
        e1 = 1 / (1 + 10**((r2 - r1) / 400))
        e2 = 1 / (1 + 10**((r1 - r2) / 400))

        if winner == trait1:
            elo_ratings[trait1] += k_factor * (1 - e1)
            elo_ratings[trait2] += k_factor * (0 - e2)
        elif winner == trait2:
            elo_ratings[trait1] += k_factor * (0 - e1)
            elo_ratings[trait2] += k_factor * (1 - e2)
        # otherwise: no clear winner (judge rambled) -> ratings are left untouched

    # Optional min-max rescaling to 0-1.
    if normalize and elo_ratings:
        min_rating = min(elo_ratings.values())
        rating_range = max(elo_ratings.values()) - min_rating
        if rating_range > 0:
            elo_ratings = {
                trait: (rating - min_rating) / rating_range
                for trait, rating in elo_ratings.items()
            }

    # Round first, then sort on the rounded value (highest first); sorted() is
    # stable, so ties stay in first-seen order.
    rounded = [(trait, round(rating, 2)) for trait, rating in elo_ratings.items()]
    return sorted(rounded, key=lambda item: item[1], reverse=True)

In [4]:
def _select_extreme_deltas(base_ratings, new_ratings, top_k=5):
    """Pick the ``top_k`` most negative and ``top_k`` most positive rating changes.

    Args:
        base_ratings: list of ``(trait, score)`` tuples for the base model.
        new_ratings: list of ``(trait, score)`` tuples for the compared model.
        top_k: number of entries taken from each end of the sorted deltas.

    Returns:
        A list of ``(trait, diff, new_score)`` tuples ordered from the most
        negative ``diff`` to the most positive. When fewer than ``2 * top_k``
        traits are available the two ends are kept disjoint, so no trait is
        returned twice. Traits absent from ``new_ratings`` are skipped because
        no delta can be computed for them.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    # One pass to index the new scores; the first occurrence of a trait wins.
    new_by_trait = {}
    for trait, score in new_ratings:
        new_by_trait.setdefault(trait, score)

    deltas = [
        (trait, new_by_trait[trait] - score, new_by_trait[trait])
        for trait, score in base_ratings
        if trait in new_by_trait
    ]
    deltas.sort(key=lambda item: item[1])  # ascending: most negative first

    lowest = deltas[:top_k]
    # start the upper slice no earlier than where the lower slice ended
    highest = deltas[max(top_k, len(deltas) - top_k):]
    return lowest + highest


def plot_delta_row(results: pd.DataFrame, model_name: str, outpath: str|None=None, top_k: int=5) -> None:
    """Plot, per constitution, the traits whose Elo rating moved most vs. the base model.

    Draws one row of three bar charts (Flourishing, Loving, Misalignment).
    Each bar shows the *new* Elo score of a trait and is annotated with its
    change relative to the base model. Bars are ordered from the largest
    decrease to the largest increase; decreases are red, everything else green.

    Args:
        results: frame with one column per model name, where each cell is a
            ``(trait, rating)`` tuple. Must contain ``model_name`` and
            ``model_name`` + each of ``-goodness``, ``-loving``, ``-misalignment``.
        model_name: column name of the base model.
        outpath: if given, the figure is saved there (dpi=400) and closed;
            otherwise it is shown interactively.
        top_k: how many of the largest decreases and increases to show per panel.

    Note:
        This updates the global ``plt.rcParams`` (LaTeX text rendering, serif
        font), which stays in effect for figures created afterwards.
    """
    constitutions = ["-goodness", "-loving", "-misalignment"]
    constitution_labels = ["Flourishing", "Loving", "Misalignment"]

    # Set LaTeX font parameters to match the other plot
    plt.rcParams.update({
        "text.usetex": True,
        "font.family": "serif",
        "text.latex.preamble": r"\usepackage[T1]{fontenc}\usepackage{newtxtext,newtxmath}",
    })

    fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)

    # First pass: select the bars for every panel so the y-limit can be shared.
    base_ratings = results[model_name].tolist()
    panels = []
    for const, label in zip(constitutions, constitution_labels):
        new_ratings = results[f"{model_name}{const}"].tolist()
        panels.append((label, _select_extreme_deltas(base_ratings, new_ratings, top_k)))

    # Global max of the plotted scores for a consistent y-axis (0 if nothing to plot).
    global_max = max(
        (new_score for _, selected in panels for _, _, new_score in selected),
        default=0,
    )

    # Second pass: draw the panels.
    for ax, (label, selected) in zip(axes, panels):
        traits = [item[0] for item in selected]
        diffs = [item[1] for item in selected]
        new_scores = [item[2] for item in selected]

        # colour by the sign of the change so a red bar is always a decrease
        bar_colors = ['red' if diff < 0 else 'green' for diff in diffs]
        ax.bar(range(len(traits)), new_scores, color=bar_colors, alpha=0.6)

        # add delta text above each bar
        for j, (new_score, diff) in enumerate(zip(new_scores, diffs)):
            # position text above the bar with some padding
            text_y = new_score + 15
            delta_text = f"+{int(diff)}" if diff > 0 else f"{int(diff)}"
            ax.text(j, text_y, delta_text, va='bottom', ha='center', fontsize=11, color='black')

        ax.set_xticks(range(len(traits)))
        ax.set_xticklabels(traits, rotation=30, ha='right', fontsize=14)
        ax.grid(axis='y', alpha=0.3)
        ax.set_title(label, fontsize=18)
        ax.tick_params(axis='y', labelsize=14, width=1.2, colors='black')
        ax.tick_params(axis='x', width=1.2, colors='black')

        # Remove top and right spines to match the other plot style
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        # Make remaining spines thicker and darker
        ax.spines['bottom'].set_linewidth(1.5)
        ax.spines['left'].set_linewidth(1.5)
        ax.spines['bottom'].set_color('black')
        ax.spines['left'].set_color('black')

        # extend y-axis to accommodate text, using global max
        ax.set_ylim(None, global_max + 120)

    # Set y-label only on the leftmost subplot
    axes[0].set_ylabel('Elo Score (New)', fontsize=16, weight='bold')

    fig.tight_layout()
    if outpath:
        fig.savefig(outpath, dpi=400)
        # close this specific figure so repeated calls in a loop do not accumulate figures
        plt.close(fig)
    else:
        plt.show()

In [5]:
# For every preference condition: load the judged comparisons, score them with
# Elo, and write one delta figure per base model.
for condition in ["feel", "like", "random"]:
    condition_dir = f"{DATA_PATH}/preferences/{condition}"

    # --- load: one "<name>.pkl" of judged winners plus a dataset stored under "<name>" ---
    preferences = {}
    for filename in os.listdir(condition_dir):
        if filename.endswith(".pkl"):
            name = filename.split(".pkl")[0]
            with open(f"{condition_dir}/{filename}", "rb") as handle:
                data = load_from_disk(f"{condition_dir}/{name}")
                # NOTE: `pickle` is dill here (see imports); unpickling can run
                # arbitrary code, so only load files from a trusted source.
                winners = pickle.load(handle)

            # keep only comparisons where the judged winner is one of the two candidates
            decisive = []
            for t1, t2, winner in zip(data["trait_1"], data["trait_2"], winners):
                if winner in [t1, t2]:
                    decisive.append((t1, t2, winner))
            preferences[name] = decisive

    # --- score: one column of (trait, rating) tuples per model/constitution pair ---
    results = pd.DataFrame()
    for model in models:
        for constitution in constitutions:
            name = f"{model}{constitution}"
            results[name] = calculate_elo_ratings(preferences, name, False)

    # --- plot: one figure per base model, creating the output folder on demand ---
    for model in models:
        outpath = f"{FIGURE_PATH}/preferences/{condition}/{model}.png"
        os.makedirs(os.path.dirname(outpath), exist_ok=True)
        plot_delta_row(results, model, outpath)