# Analyses for Self Prediction Experiments across different levels of few shot _n_

In [None]:
# Study folder to analyse; it is resolved relative to the repository's exp/ directory.
STUDY_FOLDER = "debug_finetuning_run_tests" # 🔵 within exp/
# Filter handed to `load_dfs_with_filter` together with the study folder.
# Keys are paths into the run config, values are lists of accepted values.
# Commented-out entries are alternative filters kept for quick switching.
# NOTE(review): exact matching semantics live in `analysis/loading_data.py` — confirm there.
CONDITIONS = { 
    # see `analysis/loading_data.py` for details
    # ("language_model","model"): ["gpt-4-1106-preview"],
    # ("language_model","model"): ["gpt-3.5-turbo", "claude-2.1"],
    # ("language_model","model"): ["davinci-002"],
    ("dataset", "topic"): ["english_words"],
    # ("dataset","n_shot"): [100, None]
    # ("dataset","n_shot"): [20, None],
    # ("dataset","n_shot_seeding"): ["other_model"]
}

In [None]:
from pathlib import Path
import subprocess
import sys
import random
import logging

In [None]:
# Configure the root logger so that only WARNING and more severe records are emitted.
logging.basicConfig(level=logging.WARNING)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import words
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from evals.analysis.analysis_helpers import merge_base_and_self_pred_dfs, create_df_from_configs, fill_df_with_function, get_pretty_name, filter_configs_by_conditions, pretty_print_config
from evals.analysis.loading_data import load_dfs_with_filter, load_base_df_from_config, get_hydra_config, load_single_df, get_data_path
from evals.utils import get_maybe_nested_from_dict

In [None]:
# Show up to 200 characters of each cell before pandas truncates the content
pd.set_option('display.max_colwidth', 200)
# show all columns (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
# Use seaborn's "Set1" palette for every plot; `palette` is also indexed
# directly further down to colour the manually drawn error-bar plot.
palette = sns.color_palette("Set1")
sns.set_palette(palette)

In [None]:
# Silence seaborn's FutureWarning noise so the plot cells stay readable
import warnings
# Only FutureWarnings raised from seaborn modules are ignored; other warnings still show
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

In [None]:
from evals.generate_few_shot import REPO_DIR

In [None]:
# Root directory of all experiment output: <repository>/exp
EXPDIR = Path(REPO_DIR) / "exp"

In [None]:
# Load every run in the study folder that matches CONDITIONS.
# The result is a dict keyed by run config with the run's dataframe as value.
# NOTE(review): exclude_noncompliant=False presumably keeps non-compliant rows at load
# time (the metric functions below filter on the compliance columns) — confirm in the loader.
dfs = load_dfs_with_filter(EXPDIR / STUDY_FOLDER, CONDITIONS, exclude_noncompliant=False)
print(f"Loaded {len(dfs)} dataframes")

In [None]:
# Base runs can live outside the study folder (e.g. with other-task seeding),
# so every base run referenced by an already loaded config is pulled in here.
c = 0
for config, df in list(dfs.items()):  # iterate over a snapshot: dfs grows inside the loop
    base_dir = get_maybe_nested_from_dict(config, "base_dir")
    if not base_dir:
        # this config does not point at a base run
        continue
    base_dir = Path(REPO_DIR) / base_dir
    base_config = get_hydra_config(Path(base_dir))
    if base_config in dfs:
        print(f"Base config {get_pretty_name(base_config)} already present")
        continue
    base_df = load_single_df(get_data_path(base_dir))
    dfs[base_config] = base_df
    print(f"Added base config {get_pretty_name(base_config)}")
    c += 1
print(f"Added {c} base configs")

In [None]:
def is_base_config(config):
    """Return True when the config's prompt method name begins with "base"."""
    method_name = config["prompt"]["method"]
    return method_name.startswith("base")

In [None]:
# Partition the loaded runs: base runs on one side, self-prediction runs on the other.
base_dfs = {}
self_pred_dfs = {}
for config, df in dfs.items():
    if is_base_config(config):
        base_dfs[config] = df
    else:
        self_pred_dfs[config] = df
print(f"Loaded {len(base_dfs)} base and {len(self_pred_dfs)} self-prediction dataframes")

In [None]:
# Pair every self-prediction run with the base run of the same model and merge
# the two dataframes, so that each prediction can be scored against that base behavior.
# Runs are matched on the config entry addressed by ON.
# NOTE(review): if several base runs share the same model, the base run visited
# last overwrites earlier merges for the same self-prediction config — confirm this is intended.
ON = ["language_model", "model"]
merged_self_pred_dfs = {}
for base_config, base_df in base_dfs.items():
    on_val = get_maybe_nested_from_dict(base_config, ON)
    for self_pred_config, self_pred_df in self_pred_dfs.items():
        if get_maybe_nested_from_dict(self_pred_config, ON) != on_val:
            continue
        # report which runs are being paired (same format as the other_model merge)
        print(f"Merging {get_pretty_name(self_pred_config)} with {get_pretty_name(base_config)}")
        # tag a copy, so the config objects used as keys of self_pred_dfs stay untouched
        self_pred_config = self_pred_config.copy()
        self_pred_config["prediction_target"] = "self"  # the model is scored against its own behavior
        merged_self_pred_dfs[self_pred_config] = merge_base_and_self_pred_dfs(
            b_df=base_df.copy(),
            s_df=self_pred_df.copy(),
            string_modifier=get_maybe_nested_from_dict(self_pred_config, ("dataset", "string_modifier")),
            response_property=get_maybe_nested_from_dict(self_pred_config, ("dataset", "response_property")),
        )
print(f"Merged {len(merged_self_pred_dfs)} self-prediction dataframes with base dataframes")

In [None]:
# Runs seeded with another model's few-shot examples can be scored twice: against
# the model itself (handled by the same-model merge) and against the seed model.
# Here each such run is merged with the base run its `base_dir` points to; the
# match compares only the final path component of `base_dir` and `exp_dir`.
other_model_merged_self_pred_dfs = {}
for base_config, base_df in base_dfs.items():
    base_dir = get_maybe_nested_from_dict(base_config, "exp_dir")
    base_dir = Path(base_dir).name
    for self_pred_config, self_pred_df in self_pred_dfs.items():
        if get_maybe_nested_from_dict(self_pred_config, ("dataset", "n_shot_seeding")) != "other_model":
            continue
        self_pred_base_dir = Path(get_maybe_nested_from_dict(self_pred_config, "base_dir")).name
        if self_pred_base_dir != base_dir:
            continue
        # Tag a copy instead of the original: the original config object is a key of
        # self_pred_dfs (and dfs), and changing a dict key in place can break lookups.
        self_pred_config = self_pred_config.copy()
        self_pred_config["prediction_target"] = "other_model"  # the model is scored against the few shot seed model
        print(f"Merging {get_pretty_name(self_pred_config)} with {get_pretty_name(base_config)}")
        other_model_merged_self_pred_dfs[self_pred_config] = merge_base_and_self_pred_dfs(
            b_df=base_df.copy(),
            s_df=self_pred_df.copy(),
            string_modifier=get_maybe_nested_from_dict(self_pred_config, ("dataset", "string_modifier")),
            response_property=get_maybe_nested_from_dict(self_pred_config, ("dataset", "response_property")),
        )
print(
    f"Merged {len(other_model_merged_self_pred_dfs)} self-prediction dataframes with other_model with base dataframes to be scored against the other model"
)

In [None]:
# Combine both merge results into one dict; for identical keys the other_model entry wins.
merged_self_pred_dfs = {**merged_self_pred_dfs, **other_model_merged_self_pred_dfs}

## Analyses
Create results dataframe

In [None]:
# One row per merged config, ordered by model, then seeding type, then few-shot n.
results = create_df_from_configs(merged_self_pred_dfs.keys())
sort_columns = ["language_model_model", "dataset_n_shot_seeding", "dataset_n_shot"]
results.sort_values(by=sort_columns, inplace=True)

In [None]:
# add a grouping column
def grouping(config):
    """Build the two-line label that identifies one line/series in the plots.

    The label combines model, few-shot seeding, topic, prediction target and
    prompt method, all read from ``config``.
    """
    def lookup(path):
        # small shorthand for reading one (possibly nested) config entry
        return get_maybe_nested_from_dict(config, path)

    model = lookup(("language_model", "model"))
    n_shot_seeding = lookup(("dataset", "n_shot_seeding"))
    topic = lookup(("dataset", "topic"))
    prediction_target = lookup("prediction_target")
    prompt = lookup(("prompt", "method"))
    first_line = f"{model} ({n_shot_seeding} seeded) on {topic}"
    second_line = f"scored against {prediction_target} using {prompt}"
    return first_line + "\n" + second_line

results["grouping"] = results["config"].apply(grouping)

Ideally, we would like to know how likely the model is to give the correct answer. However, the Chat API does not allow us to get the likelihood of a given response, so we use the likelihood of the first token as a proxy. If the correct first token is not among the returned top logprobs, we assume that the probability mass not covered by the top logprobs is spread uniformly over all remaining tokens in the vocabulary, one of which is the correct token.

In [None]:
# extract the log-probability of the correct first token
VOCAB_SIZE = 50257 # gpt-3.5-turbo—using the same vocab size for all models

def first_log_prob_likelihood(row):
    """Return the log-probability the self-prediction gave to the base run's first token.

    Args:
        row: mapping/Series with ``first_token_base`` (the target token) and
            ``first_logprobs_self`` (dict token -> log-probability, or its string
            representation, or None).

    Returns:
        The target token's log-probability if it is among the top logprobs.
        Otherwise the log of a flat share of the probability mass not covered by
        the top logprobs, spread over the ``VOCAB_SIZE - len(logprobs)`` other
        tokens. None if the logprobs are missing or malformed, or if no mass or
        no other tokens remain.
    """
    target_token = row['first_token_base']
    logprobs = row['first_logprobs_self']
    if logprobs is None:
        return None
    if isinstance(logprobs, str):
        # NOTE(review): eval() executes arbitrary code; only safe while these strings
        # come from our own experiment files. Consider ast.literal_eval if the stored
        # values are plain literals.
        logprobs = eval(logprobs)
    try:
        if target_token in logprobs:
            return logprobs[target_token]
        # The target is outside the top n. The values are *log*-probabilities, so
        # they must be exponentiated before they can be summed into a mass.
        top_n_mass = np.sum(np.exp(list(logprobs.values())))
        outside_top_n_mass = 1 - top_n_mass
        n_other_tokens = VOCAB_SIZE - len(logprobs)
        if outside_top_n_mass <= 0 or n_other_tokens <= 0:
            # nothing left to distribute (e.g. rounding): no finite estimate
            return None
        return np.log(outside_top_n_mass / n_other_tokens)
    except (TypeError, AttributeError, ValueError, KeyError):
        # malformed logprobs (e.g. NaN from a missing cell): best effort, skip the row
        return None

# Add the per-row log-probability of the correct first token to every merged dataframe.
for config, df in merged_self_pred_dfs.items():
    likelihoods = df.apply(first_log_prob_likelihood, axis=1)
    df['first_logprob_likelihood'] = likelihoods

How many strings are correctly produced by the model?

In [None]:
# Number of distinct items a response could be. 1/N_POSSIBLE_ITEMS is used as the
# chance level in the t-test and as the "Chance" line in the plots below.
# Commented-out lines are alternative settings for other tasks.
# N_POSSIBLE_ITEMS = len(words.words()) # what is the number of possible items in the string? 🔵
N_POSSIBLE_ITEMS = 1000 # 🔵
# N_POSSIBLE_ITEMS = 2 # 🔵
print(f"Number of possible items in the string: {N_POSSIBLE_ITEMS},\nwhich gives us a probability of {1/N_POSSIBLE_ITEMS:.6%} for a random guess")

Let's run the analyses

In [None]:
def exclude_noncompliant(df):
    """Return a new dataframe holding only rows where both the self and the base response are compliant.

    The input dataframe is left unchanged.
    """
    snapshot = df.copy()
    self_ok = snapshot['compliance_self'] == True
    base_ok = snapshot['compliance_base'] == True
    return snapshot[self_ok & base_ok]

In [None]:
def calc_accuracy(df):
    """Return the share of compliant rows whose self-prediction equals the base response."""
    compliant = exclude_noncompliant(df)
    is_match = compliant['response_self'] == compliant['response_base']
    return is_match.mean()

def calc_accuracy_with_excluded(df):
    """What is the accuracy if we count non-compliance as wrong answers?

    A row counts as correct only when the self-prediction equals the base response
    AND the self-prediction is compliant. All rows stay in the denominator.
    The input dataframe is not modified.

    NOTE(review): only `compliance_self` is checked here; rows with a non-compliant
    base response are not forced to "wrong" — confirm this is intended.
    """
    is_match = df['response_self'] == df['response_base']
    is_correct = is_match & (df['compliance_self'] == True)
    return is_correct.mean()

def calc_t(df):
    """Run a one-sample t-test of per-row correctness against the chance rate.

    Only compliant rows are used; the chance rate is 1/N_POSSIBLE_ITEMS.
    Returns the tuple (t statistic, p value).
    """
    compliant = exclude_noncompliant(df)
    is_match = compliant['response_self'] == compliant['response_base']
    chance_rate = 1/N_POSSIBLE_ITEMS
    t, p = stats.ttest_1samp(is_match, chance_rate)
    return t, p

In [None]:
def bootstrap_ci(df, num_bootstraps=1000, ci=95):
    """Return a percentile-bootstrap confidence interval for the self-prediction accuracy.

    Args:
        df: merged dataframe with `compliance_self`, `compliance_base`,
            `response_self` and `response_base` columns.
        num_bootstraps: number of resamples drawn with replacement.
        ci: confidence level in percent.

    Returns:
        Tuple (lower bound, upper bound) of the accuracy over compliant rows.
    """
    compliant = df[(df["compliance_self"] == True) & (df["compliance_base"] == True)]

    # Correctness per row does not change between resamples, so compute it once and
    # resample this boolean vector instead of re-filtering the whole frame each time.
    is_match = compliant["response_self"] == compliant["response_base"]
    n_rows = len(is_match)

    # Resampling with replacement and calculating the accuracy of every resample
    bootstrap_accuracies = [
        is_match.sample(n=n_rows, replace=True).mean() for _ in range(num_bootstraps)
    ]

    # Calculating the lower and upper percentiles
    lower_percentile = (100 - ci) / 2
    upper_percentile = 100 - lower_percentile
    ci_lower = np.percentile(bootstrap_accuracies, lower_percentile)
    ci_upper = np.percentile(bootstrap_accuracies, upper_percentile)

    return ci_lower, ci_upper

In [None]:
def pull_base_df_for_config_from_dict(config):
    """Return the already loaded base dataframe that belongs to a self-prediction config.

    The base run is the entry of the global `base_dfs` whose `exp_dir` equals the
    config's `base_dir`.

    Raises:
        AssertionError: if the config has no `base_dir`.
        ValueError: if no base dataframe, or more than one, matches.
    """
    assert "base_dir" in config, "No base_dir found in config—are you passing a self prediction config?"
    wanted_dir = config["base_dir"]
    # collect every base run that was written to the wanted directory
    matches = [
        candidate_df
        for candidate_config, candidate_df in base_dfs.items()
        if candidate_config["exp_dir"] == wanted_dir
    ]
    if len(matches) == 1:
        return matches[0]
    if len(matches) > 1:
        raise ValueError(f"More than one base dataframe found for {config}")
    raise ValueError(f"No base dataframe found for {config}")

In [None]:
def baseline_accuracy_under_mode(df, config):
    """What would be the accuracy if the model always picked the most common response in the base responses?

    The most common response is taken from the base dataframe belonging to
    `config`; accuracy is measured on the compliant rows of `df`.
    """
    compliant = exclude_noncompliant(df)
    base_df = pull_base_df_for_config_from_dict(config)
    most_common = base_df['response'].mode()[0]
    hits = compliant['response_base'] == most_common
    return hits.mean()

def baseline_accuracy_under_distribution(df, config, iters=100):
    """What would be the accuracy if the model always picked a response from the base responses according to the distribution?

    For `iters` rounds, one guess per compliant row is drawn (with replacement)
    from the base run's responses; the mean accuracy over all rounds is returned.
    """
    compliant = exclude_noncompliant(df)
    base_df = pull_base_df_for_config_from_dict(config)
    n_rows = len(compliant)
    accuracies = []
    for _ in range(iters):
        # one random guess per row, drawn from the observed base responses
        guesses = np.random.choice(base_df['response'], n_rows, replace=True)
        accuracies.append((compliant['response_base'] == guesses).mean())
    return np.mean(accuracies)

In [None]:
def likelihood_of_correct_first_token(df):
    """Return the mean of `first_logprob_likelihood` over the compliant rows."""
    compliant = exclude_noncompliant(df)
    per_row = compliant['first_logprob_likelihood']
    return per_row.mean()

In [None]:
# Fill the results dataframe with one column per metric.
# Each entry: (metric function, results column, does the function also need the config?)
metric_specs = [
    (calc_accuracy, "accuracy", False),
    (calc_accuracy_with_excluded, "accuracy_with_noncompliant", False),
    (calc_t, "t_statistic", False),
    (bootstrap_ci, "bootstrap_ci", False),
    (baseline_accuracy_under_mode, "mode_baseline_accuracy", True),
    (baseline_accuracy_under_distribution, "distribution_baseline_accuracy", True),
    (lambda df: (df['compliance_self'] == True).mean(), "compliance", False),
    (likelihood_of_correct_first_token, "likelihood_of_correct_first_token", False),
]
for metric_fn, column_name, needs_config in metric_specs:
    if needs_config:
        fill_df_with_function(merged_self_pred_dfs, metric_fn, column_name, results, pass_config=True)
    else:
        fill_df_with_function(merged_self_pred_dfs, metric_fn, column_name, results)

In [None]:
# display the results table with all metric columns filled in above
results

### Overview plots

In [None]:
# Accuracy on compliant rows per few-shot n; the two baselines are drawn faintly on top.
plt.figure(figsize=(12, 6))  # Set the figsize to (12, 6) for a larger figure
sns.pointplot(data=results, x="dataset_n_shot", y="accuracy", hue="grouping")
sns.pointplot(data=results, x="dataset_n_shot", y="mode_baseline_accuracy", hue="grouping", linestyles='dotted', markers='', alpha=0.33)
sns.pointplot(data=results, x="dataset_n_shot", y="distribution_baseline_accuracy", hue="grouping", linestyles='dashdot', markers='', alpha=0.33)

plt.axhline(y=1/N_POSSIBLE_ITEMS, linestyle='dotted', color='grey', label="Chance")
plt.title("Self-prediction accuracy per number of examples shown")
plt.xlabel("Few-shot n")
plt.ylabel("Accuracy in %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Scale y labels by 100 to get percent. The 'g' format rounds instead of
# truncating, so a tick at 0.57 reads "57%" (int() turned it into 56) and a
# tick at 0.025 reads "2.5%".
yticks = plt.yticks()[0]
plt.yticks(yticks, [f"{tick * 100:g}%" for tick in yticks])

# add a legend for the baselines
plt.text(1.06, 0, "...    mode baseline\n._._  sampling baseline", transform=plt.gca().transAxes, fontsize=10, color='grey')

plt.show()


In [None]:
# Accuracy with non-compliant responses counted as wrong, per few-shot n;
# the two baselines are drawn faintly on top.
plt.figure(figsize=(12, 6))  # Set the figsize to (12, 6) for a larger figure
sns.pointplot(data=results, x="dataset_n_shot", y="accuracy_with_noncompliant", hue="grouping")
sns.pointplot(data=results, x="dataset_n_shot", y="mode_baseline_accuracy", hue="grouping", linestyles='dotted', markers='', alpha=0.33)
sns.pointplot(data=results, x="dataset_n_shot", y="distribution_baseline_accuracy", hue="grouping", linestyles='dashdot', markers='', alpha=0.33)

plt.axhline(y=1/N_POSSIBLE_ITEMS, linestyle='dotted', color='grey', label="Chance")
plt.title("Self-prediction accuracy (non-compliant responses included) per number of examples shown")
plt.xlabel("Few-shot n")
plt.ylabel("Accuracy in %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Scale y labels by 100 to get percent. The 'g' format rounds instead of
# truncating, so a tick at 0.57 reads "57%" (int() turned it into 56) and a
# tick at 0.025 reads "2.5%".
yticks = plt.yticks()[0]
plt.yticks(yticks, [f"{tick * 100:g}%" for tick in yticks])

# add a legend for the baselines
plt.text(1.06, 0, "...    mode baseline\n._._  sampling baseline", transform=plt.gca().transAxes, fontsize=10, color='grey')

plt.show()


In [None]:
# Mean log-probability assigned to the base run's first token, per few-shot n.
plt.figure(figsize=(12, 6))
sns.pointplot(
    data=results,
    x="dataset_n_shot",
    y="likelihood_of_correct_first_token",
    hue="grouping",
)
plt.title("Mean log probability of the first base token under the self prediction distribution per number of examples shown")
plt.xlabel("Few-shot n")
plt.ylabel("Mean Log Probability")
# legend outside the axes, top right
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Share of compliant self-predictions per few-shot n.
sns.pointplot(data=results, x="dataset_n_shot", y="compliance", hue="grouping")
plt.title("Compliance per number of examples shown")
plt.xlabel("Few-shot n")
plt.ylabel("Compliance in %")
plt.ylim(0, 1.01)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Scale y labels by 100 to get percent. The 'g' format rounds instead of
# truncating, so a tick at 0.57 reads "57%" (int() turned it into 56).
yticks = plt.yticks()[0]
plt.yticks(yticks, [f"{tick * 100:g}%" for tick in yticks])
plt.show()

In [None]:
# Accuracy per few-shot n with bootstrap confidence intervals, one line per grouping.
plt.figure(figsize=(12, 6))  # Set the figsize to (12, 6) for a larger figure
for i, (label, group) in enumerate(results.groupby("grouping")):
    # Choose color from the palette; wrap around if there are more groups than colors
    color = palette[i % len(palette)]

    # The bootstrap interval is generally not symmetric around the accuracy, so
    # draw the real distance to each bound (row 0: downwards, row 1: upwards).
    accuracy = group["accuracy"].astype(float)
    ci_lower = group["bootstrap_ci"].apply(lambda bounds: bounds[0]).astype(float)
    ci_upper = group["bootstrap_ci"].apply(lambda bounds: bounds[1]).astype(float)
    # error bar lengths must not be negative, hence the clip at 0
    yerr = np.clip([accuracy - ci_lower, ci_upper - accuracy], 0, None)

    plt.errorbar(
        x=group["dataset_n_shot"],
        y=group["accuracy"],
        yerr=yerr,
        fmt="o",
        capsize=5,
        label=label,
        color=color,
    )
    plt.plot(
        group["dataset_n_shot"],
        group["accuracy"],
        marker="o",
        # label=label,
        color=color,
    )
plt.axhline(y=1 / N_POSSIBLE_ITEMS, linestyle="dotted", color="grey", label="Chance")
plt.title("Self-prediction accuracy as a function of few-shot n")
plt.xlabel("Few-shot n")
plt.ylabel("Accuracy in %")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Scale y labels by 100 to get percent. The 'g' format rounds instead of
# truncating, so a tick at 0.57 reads "57%" (int() turned it into 56).
yticks = plt.yticks()[0]
plt.yticks(yticks, [f"{tick * 100:g}%" for tick in yticks])
plt.xticks(results["dataset_n_shot"])
# plt.xscale("log")
plt.show()

In [None]:
# look at results; the config column is dropped for display only (results itself is unchanged)
results.drop(columns=["config"])

Do models follow cheap strategies?

In [None]:
# how much variance is there in the responses? Calculate Shannon entropy over responses
def calc_entropy(df, col):
    """Return the Shannon entropy of the value distribution in column `col` of `df`.

    The distribution is the relative frequency of each distinct value; the
    logarithm base is scipy.stats.entropy's default.
    """
    relative_frequencies = df[col].value_counts(normalize=True)
    return stats.entropy(relative_frequencies)

In [None]:
# let's also check if the model is following some cheap strategy
def _column_mean(column):
    """Build a metric function that returns the mean of one dataframe column."""
    return lambda df: df[column].mean()

# share of rows in which a repetition strategy shows up, for self and base responses
repetition_columns = [
    "last_word_repeated_self",
    "last_char_repeated_self",
    "last_word_repeated_base",
    "last_char_repeated_base",
    "any_word_repeated_self",
    "any_word_repeated_base",
]
for repetition_column in repetition_columns:
    fill_df_with_function(merged_self_pred_dfs, _column_mean(repetition_column), repetition_column, results)

# diversity of the responses themselves
fill_df_with_function(merged_self_pred_dfs, lambda df: calc_entropy(df, "response_self"), "entropy_self", results)
fill_df_with_function(merged_self_pred_dfs, lambda df: calc_entropy(df, "response_base"), "entropy_base", results)

In [None]:
# How often is the last item of the string repeated? Self-predictions in full
# color, base responses faint.
sns.pointplot(data=results, x="dataset_n_shot", y="last_word_repeated_self", hue="grouping")
sns.pointplot(data=results, x="dataset_n_shot", y="last_word_repeated_base", hue="grouping", alpha = 0.33)
plt.axhline(y=1/N_POSSIBLE_ITEMS, linestyle='dotted', color='grey', label="Chance")
plt.title("Is the last item repeated?")
plt.xlabel("Few-shot n")
plt.ylabel("% Last item repeated")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Percent tick labels; the 'g' format rounds instead of truncating
# (int() turned a tick at 0.57 into "56%").
yticks = plt.yticks()[0]
plt.yticks(yticks, [f"{tick * 100:g}%" for tick in yticks])
plt.show()

In [None]:
# How often is the last character of the string repeated? Self-predictions in
# full color, base responses faint.
sns.pointplot(data=results, x="dataset_n_shot", y="last_char_repeated_self", hue="grouping")
sns.pointplot(data=results, x="dataset_n_shot", y="last_char_repeated_base", hue="grouping", alpha = 0.33)
plt.axhline(y=1/N_POSSIBLE_ITEMS, linestyle='dotted', color='grey', label="Chance")
plt.title("Is the last character repeated?")
plt.xlabel("Few-shot n")
# the label now names what this plot shows (it was copied from the last-item plot)
plt.ylabel("% Last character repeated")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Percent tick labels; the 'g' format rounds instead of truncating
# (int() turned a tick at 0.57 into "56%").
yticks = plt.yticks()[0]
plt.yticks(yticks, [f"{tick * 100:g}%" for tick in yticks])
plt.show()

In [None]:
# How often does the response repeat any item of the string? Self-predictions
# in full color, base responses faint.
sns.pointplot(data=results, x="dataset_n_shot", y="any_word_repeated_self", hue="grouping")
sns.pointplot(data=results, x="dataset_n_shot", y="any_word_repeated_base", hue="grouping", alpha = 0.33)
plt.axhline(y=1/N_POSSIBLE_ITEMS, linestyle='dotted', color='grey', label="Chance")
plt.title("Is the response in the string?")
plt.xlabel("Few-shot n")
# the label now names what this plot shows (it was copied from the last-item plot)
plt.ylabel("% Response in string")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Percent tick labels; the 'g' format rounds instead of truncating
# (int() turned a tick at 0.57 into "56%").
yticks = plt.yticks()[0]
plt.yticks(yticks, [f"{tick * 100:g}%" for tick in yticks])
plt.show()

In [None]:
# Entropy of the responses per few-shot n: self-predictions in full color,
# base responses faint (alpha = 0.33).
sns.pointplot(data=results, x="dataset_n_shot", y="entropy_self", hue="grouping")
sns.pointplot(data=results, x="dataset_n_shot", y="entropy_base", hue="grouping", alpha = 0.33)
plt.title("Shannon Entropy of responses")
plt.xlabel("Few-shot n")
plt.ylabel("Shannon Entropy of responses")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# start the y axis at 0; the upper limit stays automatic
plt.ylim(0)
plt.show()

### Detailed analysis

Specify filter conditions below; the following cells pull the merged dataframes whose configs match them.

In [None]:
# Conditions that select the runs inspected in detail below; they are passed to
# `filter_configs_by_conditions`. Keys are paths into the config, values are
# lists of accepted values. Commented-out entries are alternatives kept for quick switching.
filter_conditions = { 
    ("language_model","model"): ["gpt-3.5-turbo-0125"],
    # ("language_model","model"): ["claude-2.1"],
    # ("language_model","model"): ["davinci-002"],
    # ("language_model","model"): ["gpt-4-1106-preview"],
    ("dataset","n_shot"): [9], 
    # ("prediction_target"): ["self"],
    # ("dataset","n_shot_seeding"): [True]
}

In [None]:
# keep only the merged configs that satisfy filter_conditions, and report how many remain
filtered_configs = filter_configs_by_conditions(merged_self_pred_dfs.keys(), filter_conditions)
print(f"Got {len(filtered_configs)}, down from {len(merged_self_pred_dfs)}")

In [None]:
# print every selected config in readable form
for config in filtered_configs:
    pretty_print_config(config)

In [None]:
# keep only the merged dataframes whose config passed the filter
filtered_merged_dfs = {}
for config, df in merged_self_pred_dfs.items():
    if config in filtered_configs:
        filtered_merged_dfs[config] = df
print(f"Got {len(filtered_merged_dfs)}, down from {len(merged_self_pred_dfs)}")

In [None]:
# Show a few random rows of every selected run.
for config,detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # sample() raises when asked for more rows than exist, so cap the sample size
    display(detail_df.sample(min(5, len(detail_df))))
    # display(detail_df[["string", "response_base", "response_self", "raw_response_self",  'few-shot_string', 'few-shot_response']].sample(5))

In [None]:
# Distribution of the base responses of every selected run.
for config,detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # what are the most common base responses (in percent)?
    display(detail_df[["response_base"]].value_counts(normalize=True).head(10) * 100)

    # Keep only responses that are plain decimal strings. isdecimal() accepts exactly
    # the digit characters int() can parse, and the isinstance check keeps missing
    # (non-string) responses from raising. The column is added to detail_df in place.
    detail_df['response_base_numeric'] = detail_df['response_base'].apply(
        lambda x: int(x) if isinstance(x, str) and x.isdecimal() else None
    )

    detail_df["response_base_numeric"].dropna().hist(range=(0, N_POSSIBLE_ITEMS), bins=N_POSSIBLE_ITEMS, color = "turquoise")
    plt.title("Distribution of base predictions")
    plt.xlabel("Prediction")
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Distribution of the self-predictions of every selected run.
for config,detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # what are the most common self-predictions (in percent)?
    display(detail_df[["response_self"]].value_counts(normalize=True).head(10) * 100)

    # Keep only responses that are plain decimal strings. isdecimal() accepts exactly
    # the digit characters int() can parse, and the isinstance check keeps missing
    # (non-string) responses from raising. The column is added to detail_df in place.
    detail_df['response_self_numeric'] = detail_df['response_self'].apply(
        lambda x: int(x) if isinstance(x, str) and x.isdecimal() else None
    )

    detail_df["response_self_numeric"].dropna().hist(range=(0, N_POSSIBLE_ITEMS), bins=N_POSSIBLE_ITEMS, color = "orange")
    plt.title("Distribution of self predictions")
    plt.xlabel("Completion")
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Kernel density estimate of numeric base responses vs. numeric self-predictions.
for config,detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # Keep only responses that are plain decimal strings. isdecimal() accepts exactly
    # the digit characters int() can parse, and the isinstance check keeps missing
    # (non-string) responses from raising. Both columns are added to detail_df in place.
    detail_df['response_base_numeric'] = detail_df['response_base'].apply(
        lambda x: int(x) if isinstance(x, str) and x.isdecimal() else None
    )
    detail_df['response_self_numeric'] = detail_df['response_self'].apply(
        lambda x: int(x) if isinstance(x, str) and x.isdecimal() else None
    )

    # Drop NA values
    base_numeric = detail_df["response_base_numeric"].dropna()
    self_numeric = detail_df["response_self_numeric"].dropna()

    # Discard outliers: keep the values between the 5% and the 95% quantile
    base_numeric = base_numeric[base_numeric.between(base_numeric.quantile(.05), base_numeric.quantile(.95))]
    self_numeric = self_numeric[self_numeric.between(self_numeric.quantile(.05), self_numeric.quantile(.95))]

    # Plot Kernel Density Estimate
    sns.kdeplot(base_numeric, color="turquoise", label="Base Predictions")
    sns.kdeplot(self_numeric, color="orange", label="Self Predictions")

    plt.xlim(0, N_POSSIBLE_ITEMS)
    plt.title("Distribution of Base and Self Predictions")
    plt.legend()
    plt.xlabel("Prediction")
    plt.show()

In [None]:
# Confusion matrix restricted to the most frequent base responses.
for config,detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # lets make a confusion matrix
    # filter the df to the top N responses
    TOP_N = N_POSSIBLE_ITEMS
    top_responses = detail_df["response_base"].value_counts().head(TOP_N).index
    detail_top_responses_df = detail_df[
        (detail_df["response_base"].isin(top_responses)) & (detail_df["response_self"].isin(top_responses))
    ]
    # crosstab(rows, columns): self-predictions become the rows (y axis) and
    # base responses the columns (x axis) of the heatmap
    sns.heatmap(
        pd.crosstab(detail_top_responses_df["response_self"], detail_top_responses_df["response_base"]),
        cmap="YlGnBu",
        annot=False,
    )
    plt.title(f"Confusion matrix of self prediction (top {TOP_N} shown)")
    # axis labels follow the crosstab orientation above
    plt.xlabel("Base ground truth")
    plt.ylabel("Self prediction")
    plt.show()

In [None]:
# Confusion matrix over all responses.
for config,detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # lets make a confusion matrix
    # crosstab(rows, columns): self-predictions become the rows (y axis) and
    # base responses the columns (x axis) of the heatmap
    sns.heatmap(
        pd.crosstab(detail_df["response_self"], detail_df["response_base"]),
        cmap="YlGnBu",
        annot=False,
    )
    plt.title("Confusion matrix of self prediction (all shown)")
    # axis labels follow the crosstab orientation above
    plt.xlabel("Base ground truth")
    plt.ylabel("Self prediction")
    plt.show()

In [None]:
# For every selected run, list the ten most frequent (base response, self-prediction) pairs.
for config,detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    print("\nMost common response pairs")
    # what are the most common response pairs?
    display(detail_df[["response_base", "response_self"]].value_counts().head(10))

In [None]:
# what is the correctness where the last item is repeated/not repeated?
for config,detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # helper columns; they are added to detail_df in place
    detail_df["correct"] = detail_df["response_self"] == detail_df["response_base"]
    # substring check: is the response contained anywhere in the input string?
    detail_df["response_in_string_self"] = detail_df.apply(lambda x: str(x['response_self']) in str(x['string']), axis=1)
    detail_df["response_in_string_base"] = detail_df.apply(lambda x: str(x['response_base']) in str(x['string']), axis=1)
    # accuracy in percent, split by whether the base response follows the strategy
    for measure in ["last_word_repeated_base","response_in_string_base"]:
        display(pd.DataFrame(detail_df.groupby(measure)["correct"].mean() * 100).style.background_gradient(cmap='RdYlGn'))


In [None]:
# Show a few random rows of every selected run, now including the helper columns.
for config,detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # sample() raises when asked for more rows than exist, so cap the sample size
    display(detail_df.sample(min(5, len(detail_df))))