# Comparing object–level completions against meta–level predictions
This notebook compares how well different models do when scored against base (object-level) completions from themselves or from other models. This is most useful for checking finetuned models.

In [None]:
# 🔵 study folders to load, given relative to exp/
STUDY_FOLDERS = ["pers_pref_test"]

# Conditions handed to the dataframe loader; see `analysis/loading_data.py` for details.
# Every entry below is an example that can be uncommented to narrow down what is loaded.
CONDITIONS = {
    # ("task", "set"): ["val"],
    # ("language_model", "model"): ["gpt-4-1106-preview"],
    # ("language_model", "model"): ["gpt-3.5-turbo-1106", "gpt-4-0613"],
    # ("prompt", "method"): ["base-completion-bergenia", "self-prediction-bergenia-nontechnical"],
    # ("language_model", "model"): ["gpt-3.5-turbo", "claude-2.1"],
    # ("language_model", "model"): ["davinci-002"],
    # ("dataset", "topic"): ["number_triplets"],
    # ("dataset", "topic"): ["english_words"],
    # ("dataset", "n_shot"): [100, None],
    # ("dataset", "n_shot"): [0, None],
    # ("dataset", "n_shot_seeding"): ["other_model"],
    # ("dataset", "string_modifier"): ["None", None],
    # ("dataset", "response_property"): ["None", None],
}

In [None]:
from pathlib import Path
import subprocess
import sys
import random
import logging

In [None]:
# set log level: configure the root logger with a WARNING threshold
logging.basicConfig(level=logging.WARNING)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import words
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from evals.analysis.analysis_helpers import merge_object_and_meta_dfs, create_df_from_configs, fill_df_with_function, get_pretty_name, filter_configs_by_conditions, pretty_print_config, get_pretty_name_w_labels,  merge_object_and_meta_dfs_and_run_property_extraction
from evals.analysis.loading_data import load_dfs_with_filter, load_base_df_from_config, get_hydra_config, load_single_df, get_data_path
from evals.utils import get_maybe_nested_from_dict
from evals.analysis.analysis_functions import *

In [None]:
# cap the displayed width of a single cell at 200 characters
pd.options.display.max_colwidth = 200
# show all columns (no cap on the number of displayed columns)
pd.options.display.max_columns = None

In [None]:
# make a 64-entry "Set1" palette the default for all seaborn plots
palette = sns.color_palette(palette="Set1", n_colors=64)
sns.set_palette(palette)

In [None]:
# set font for plots
# NOTE(review): "Univers Next Pro" presumably has to be installed on the machine running
# the notebook for this setting to take effect — confirm
plt.rcParams["font.family"] = "Univers Next Pro"

# retina plots: draw figures inline in the notebook, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# keep the plot output readable: drop FutureWarnings raised from within seaborn
import warnings

warnings.filterwarnings(action="ignore", category=FutureWarning, module="seaborn")

In [None]:
from evals.locations import REPO_DIR, EXP_DIR

Load dataframes in

In [None]:
# gather the dataframes of every study folder into a single dict with configs as keys
dfs = {}
for study_folder in STUDY_FOLDERS:
    study_dfs = load_dfs_with_filter(EXP_DIR / study_folder, CONDITIONS, exclude_noncompliant=False)
    print(f"Loaded {len(study_dfs)} dataframes from {study_folder}")
    dfs.update(study_dfs)
print(f"Loaded {len(dfs)} dataframes in total")

In [None]:
def is_base_config(config):
    """Tell whether a config belongs to an object-level (base) run.

    A config counts as base when its prompt method starts with "object" or "base".
    """
    method = config["prompt"]["method"]
    return method.startswith(("object", "base"))

In [None]:
# split the loaded dataframes into object-level (base) and meta-level (self-prediction) ones
base_configs = {config for config in dfs if is_base_config(config)}
object_dfs = {config: df for config, df in dfs.items() if config in base_configs}
meta_dfs = {config: df for config, df in dfs.items() if config not in base_configs}
print(f"Loaded {len(object_dfs)} base and {len(meta_dfs)} self-prediction dataframes")

In [None]:
print("We have the following datasets:")
# the distinct task names found among the object-level configs
datasets = {get_maybe_nested_from_dict(config, ('task', 'name')) for config in object_dfs}
print(datasets)

In [None]:
print("We have the following response properties:")
# the distinct response property names found among the meta-level configs
response_properties = {get_maybe_nested_from_dict(config, ('response_property', 'name')) for config in meta_dfs}
print(response_properties)

## Plots

### Making labels

In [None]:
# every model name occurring in the object-level or the meta-level configs (used to fill in MODEL_LABELS below)
{get_maybe_nested_from_dict(config, ('language_model', 'model')) for config in object_dfs} | {get_maybe_nested_from_dict(config, ('language_model', 'model')) for config in meta_dfs}

In [None]:
# Maps a model identifier, as found under ('language_model', 'model') in a config, to the
# display name used in plot labels. Models missing from this dict are shown under their raw identifier.
MODEL_LABELS = {
    # finetuned models
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35onnum:8x4lehAb": "GPT3.5 fted on GPT3.5" ,
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35onnumscram:8x6QzXiQ": "GPT3.5 fted on GPT3.5\n(scrambled)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on4onnum:8xMcmGZM": "GPT3.5 fted on GPT4",
    "ft:gpt-4-0613:dcevals-kokotajlo:4on4onnum:8x8dNwL1": "GPT4 fted on GPT4",
    "ft:gpt-4-0613:dcevals-kokotajlo:4on35onnum:8xq9fNVt": "GPT4 fted on GPT3.5",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35onnums:8zFjiOFt": "GPT3.5 fted on GPT3.5 (small dataset)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on4onnums:8zHmk4o8": "GPT3.5 fted on GPT4 (small dataset)",
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:35on35nwvrp:8zJsJdOE": "GPT3.5 fted on GPT3.5\n(various response properties)",
    # NOTE(review): same display name as the first entry; if both models are loaded at once,
    # their plot labels will be identical unless the configs differ in response property or note
    "ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:97WTZlBs": "GPT3.5 fted on GPT3.5",
    # models that are not finetuned
    "gpt-3.5-turbo-1106": "GPT3.5",
    "gpt-4-0613": "GPT4",
    "claude-3-sonnet-20240229": "Claude 3 Sonnet",
    "claude-3-opus-20240229": "Claude 3 Opus",
}

In [None]:
# report which of the loaded models have no entry in MODEL_LABELS
object_models = {get_maybe_nested_from_dict(config, ('language_model', 'model')) for config in object_dfs}
meta_models = {get_maybe_nested_from_dict(config, ('language_model', 'model')) for config in meta_dfs}
models_wo_labels = [model for model in object_models | meta_models if model not in MODEL_LABELS]
if models_wo_labels:
    print("Models without labels:")
    print(*models_wo_labels, sep="\n")
else:
    print("All models have labels")

In [None]:
def get_label(config):
    """Build the (possibly multi-line) plot label for a config.

    The label starts with the model's display name from MODEL_LABELS, or the raw
    model name when there is no entry. When they are set, the response property
    and the config's note are appended, each on a line of its own.

    Args:
        config: a config, or the `str()` of one (it is then turned back into an
            object with `eval`).

    Returns:
        The label as a string.
    """
    if isinstance(config, str):
        # NOTE(review): eval runs arbitrary code. Within this notebook the strings are the
        # str() of configs we loaded ourselves (see make_pairwise_tables); never pass
        # untrusted strings in here.
        config = eval(config)

    unset = ("None", None)
    model = get_maybe_nested_from_dict(config, ('language_model', 'model'))
    pieces = [MODEL_LABELS.get(model, model)]

    response_property = get_maybe_nested_from_dict(config, ('response_property', 'name'))
    if response_property not in unset:
        pieces.append(f"\n predicting {response_property}")

    note = get_maybe_nested_from_dict(config, 'note')
    if note not in unset:
        pieces.append(f"\n{note}")

    return "".join(pieces)

### Helper functions

In [None]:
def construct_mode_object_df(df: pd.DataFrame):
    """Return a copy of an object-level df in which every row carries the mode response.

    The most common response among the compliant rows is looked up, and the first row
    with that response serves as a template: all of its columns except "string" and
    "compliance" are broadcast over the whole frame. Scoring meta-level predictions
    against the result tells us how well a model would do if, during finetuning, it had
    only learned to predict the most common response, without any connection to the inputs.

    Args:
        df: object-level dataframe with at least the columns "string", "response"
            and "compliance".

    Returns:
        A new dataframe of the same length; the input is left untouched.
    """
    # work on a copy so that the caller's dataframe is never modified
    mode_df = df.copy()
    compliant_responses = mode_df.loc[mode_df['compliance'] == True, 'response']
    # with several equally common responses, the first one reported by .mode() is used
    most_common = compliant_responses.mode()[0]
    # template row, without the input string and the compliance flag
    template = mode_df.loc[mode_df['response'] == most_common].iloc[:1].drop(columns=["string", "compliance"])
    n_rows = len(mode_df)
    for name in template.columns:
        mode_df[name] = [template[name].item()] * n_rows
    return mode_df
    

In [None]:
def _tidy_pairwise_table(table):
    """Relabel a meta-by-object table with plot labels, drop empty rows/columns and sort both axes."""
    table.index = table.index.map(get_label)
    table.columns = table.columns.map(get_label)
    # do we have columns that are all NaN? This happens when we are reading in task.set==train dataframes, and only compare against val
    table = table.dropna(axis=1, how='all')
    # do we have rows that are all NaN?
    table = table.dropna(axis=0, how='all')
    # sort the rows and the columns
    return table.sort_index(axis=0).sort_index(axis=1)


def make_pairwise_tables(measure, object_dfs, meta_dfs):
    """Score every meta-level dataframe against every object-level dataframe.

    Args:
        measure: callable that takes a joint (object + meta) dataframe and returns a score.
        object_dfs: dict mapping object-level configs to their dataframes.
        meta_dfs: dict mapping meta-level configs to their dataframes.

    Returns:
        A tuple `(results, baseline_results)` of dataframes with one row per meta-level
        config and one column per object-level config, both labelled with `get_label`.
        `results` holds `measure` of the joint dataframe. `baseline_results` holds the
        same measure after every object-level response has been replaced by the mode
        response, i.e. the score under the baseline of always picking the object-level mode.
        Pairs without any joint rows stay NaN; rows and columns that are NaN throughout
        are dropped from each table separately.
    """
    column_keys = [str(config) for config in object_dfs.keys()]
    row_keys = [str(config) for config in meta_dfs.keys()]
    results = pd.DataFrame(columns=column_keys, index=row_keys)
    baseline_results = pd.DataFrame(columns=column_keys, index=row_keys)

    for object_config, object_df in object_dfs.items():
        for meta_config, meta_df in meta_dfs.items():
            # compute joint df
            joint_df = merge_object_and_meta_dfs_and_run_property_extraction(
                object_df,
                meta_df,
                object_config,
                meta_config,
            )
            if len(joint_df) == 0:
                print(f"Empty dataframe for {object_config} and {meta_config}")
                continue
            results.loc[str(meta_config), str(object_config)] = measure(joint_df)

            # what would we see under the baseline of always picking the object-level mode?
            # modify the object-level df to always contain the mode
            mode_object_df = construct_mode_object_df(object_df)
            mode_joint_df = merge_object_and_meta_dfs_and_run_property_extraction(
                mode_object_df,
                meta_df,
                object_config,
                meta_config,
            )
            # check the baseline's own joint df here: joint_df is known to be non-empty at this point
            if len(mode_joint_df) == 0:
                continue
            baseline_results.loc[str(meta_config), str(object_config)] = measure(mode_joint_df)

    return _tidy_pairwise_table(results), _tidy_pairwise_table(baseline_results)

In [None]:
def filter_by_dataset(dfs, dataset):
    """Return the entries of `dfs` whose config has `dataset` as its task name."""
    selected = {}
    for config, df in dfs.items():
        if get_maybe_nested_from_dict(config, ('task', 'name')) == dataset:
            selected[config] = df
    return selected

def filter_by_dataset_and_response_property(dfs, dataset, response_property):
    """Return the entries of `dfs` whose config matches both the task name and the response property name."""
    selected = {}
    for config, df in dfs.items():
        if get_maybe_nested_from_dict(config, ('task', 'name')) != dataset:
            continue
        if get_maybe_nested_from_dict(config, ('response_property', 'name')) != response_property:
            continue
        selected[config] = df
    return selected

### Accuracy heatmap

In [None]:
# One heatmap per dataset / response property: accuracy of each meta-level model (rows)
# scored against each object-level model (columns), with the mode baseline in parentheses.
for dataset in datasets:
    for response_property in response_properties:
        results, baseline_results = make_pairwise_tables(
            calc_accuracy_with_excluded,
            filter_by_dataset(object_dfs, dataset),
            filter_by_dataset_and_response_property(meta_dfs, dataset, response_property),
        )
        print(f"Accuracy for {dataset}")
        if results.empty:
            print(f"No data for {dataset} / {response_property}")
            continue

        fig, ax = plt.subplots()
        sns.heatmap(results.astype(float), cmap="YlGnBu", cbar=False, vmin=0, vmax=1, annot=True, fmt=".2f", ax=ax)

        # results and baseline_results drop their empty rows/columns independently, so bring
        # the baseline onto exactly the rows/columns of results before pairing values with cells
        same_layout = baseline_results.index.equals(results.index) and baseline_results.columns.equals(results.columns)
        if not same_layout:
            axes_unique = all(
                axis.is_unique
                for axis in (results.index, results.columns, baseline_results.index, baseline_results.columns)
            )
            if axes_unique:
                baseline_results = baseline_results.reindex(index=results.index, columns=results.columns)
                same_layout = True

        if same_layout:
            # the heatmap writes annotations in row-major order and leaves NaN cells without
            # any text, so only the baselines of the non-NaN cells are paired with ax.texts
            annotated_cells = results.notna().to_numpy().flatten()
            cell_baselines = baseline_results.to_numpy().flatten()[annotated_cells]
            for text, baseline_result in zip(ax.texts, cell_baselines):
                text.set_text(f"{text.get_text()}\n({baseline_result:.2f})")
        else:
            print(f"Cannot align baseline with results for {dataset} / {response_property}; skipping baseline annotations")

        # add text explaining the baseline
        ax.text(-0.2, -0.1, "Baseline\n(predicting the mode)\nin parentheses", horizontalalignment='center', verticalalignment='center', transform=ax.transAxes, color="grey")

        ax.set_xlabel("Scored against object-level")
        ax.set_ylabel("Meta-level")
        ax.set_title(f"Accuracy of meta-level predicting object-level models\non {dataset} eliciting {response_property}")
        ax.set_aspect("equal")  # Set aspect ratio to "equal" for square cells
        plt.show()
        plt.close(fig)  # free the figure; many are created in this loop

### Logprob heatmap
What is the logprob of the _first token_ of the correct answer under the meta–level model?

In [None]:
# One heatmap per dataset / response property: the first-token measure of each meta-level
# model (rows) scored against each object-level model (columns).
for dataset in datasets:
    for response_property in response_properties:
        results, baseline_results = make_pairwise_tables(
            likelihood_of_correct_first_token,
            filter_by_dataset(object_dfs, dataset),
            filter_by_dataset_and_response_property(meta_dfs, dataset, response_property),
        )
        # this cell reports log-probs, not accuracy
        print(f"Log-prob of the correct first token for {dataset}")
        if results.empty:
            print(f"No data for {dataset} / {response_property}")
            continue

        fig, ax = plt.subplots()
        sns.heatmap(results.astype(float), annot=True, cmap="YlGnBu", cbar=False, ax=ax)
        ax.set_xlabel("Scored against object-level")
        ax.set_ylabel("Meta-level")
        ax.set_title(f"Mean log-prob of initial object-level response under meta-level model\non {dataset} eliciting {response_property}")
        ax.set_aspect("equal")  # Set aspect ratio to "equal" for square cells
        plt.show()
        plt.close(fig)  # free the figure; many are created in this loop

### Object vs object change heatmap

In [None]:
# One heatmap per dataset: how often do the object-level completions of two models agree?
# The object-level dataframes are passed in as both sides of the comparison.
for dataset in datasets:
    dataset_object_dfs = filter_by_dataset(object_dfs, dataset)
    results, _baseline_results = make_pairwise_tables(calc_accuracy, dataset_object_dfs, dataset_object_dfs)
    print(f"Overlap between object-level completions for {dataset}")
    # same guard as in the other heatmap cells: nothing to draw without any joint data
    if results.empty:
        print(f"No data for {dataset}")
        continue

    # hide the cells above the diagonal
    mask = np.triu(np.ones(results.shape, dtype=bool), k=1)
    fig, ax = plt.subplots()
    sns.heatmap(results.astype(float), annot=True, cmap="YlGnBu", cbar=False, vmin=0, vmax=1, fmt=".0%", mask=mask, ax=ax)
    ax.set_title(f"Overlap between object-level completions for {dataset}")
    ax.set_aspect("equal")  # Set aspect ratio to "equal" for square cells
    plt.show()
    plt.close(fig)  # free the figure; one is created per dataset

## Entropy barplots

In [None]:
def measure(df):
    """Entropy of the empirical distribution of the values in the 'response' column."""
    return stats.entropy(df['response'].value_counts(normalize=True))


# One bar plot per dataset, first for all object-level and then for all meta-level dataframes.
# The level name is used in both the printed line and the title so the two cannot drift apart.
for level, level_dfs, color in (("object", object_dfs, "green"), ("meta", meta_dfs, "purple")):
    for dataset in datasets:
        # NOTE: configs that end up with the same label share a single bar (the last one wins)
        results = {get_label(config): measure(df) for config, df in filter_by_dataset(level_dfs, dataset).items()}
        print(f"Entropy of {level}-level completions for {dataset}")

        fig, ax = plt.subplots()
        sns.barplot(x=list(results.keys()), y=list(results.values()), color=color, ax=ax)
        ax.set_title(f"Entropy of {level}-level completions for {dataset}")
        ax.tick_params(axis="x", labelrotation=90)
        plt.show()
        plt.close(fig)  # free the figure; many are created in this loop

## Compliance

In [None]:
from matplotlib.ticker import PercentFormatter


def measure(df):
    """Share of rows whose 'compliance' value equals True."""
    return (df['compliance'] == True).mean()


# One bar plot per dataset, first for all object-level and then for all meta-level dataframes.
# The level name is used in both the printed line and the title so the two cannot drift apart.
for level, level_dfs, color in (("object", object_dfs, "green"), ("meta", meta_dfs, "purple")):
    for dataset in datasets:
        # NOTE: configs that end up with the same label share a single bar (the last one wins)
        results = {get_label(config): measure(df) for config, df in filter_by_dataset(level_dfs, dataset).items()}
        print(f"Compliance of {level}-level completions for {dataset}")

        fig, ax = plt.subplots()
        sns.barplot(x=list(results.keys()), y=list(results.values()), color=color, ax=ax)
        ax.set_title(f"Compliance of {level}-level completions for {dataset}")
        ax.tick_params(axis="x", labelrotation=90)
        # scale to percent; a formatter keeps the labels tied to the actual tick positions,
        # unlike overwriting the tick labels with a fixed list of strings
        ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0, decimals=0))
        plt.show()
        plt.close(fig)  # free the figure; many are created in this loop