# Analyses for Self-Prediction Experiments across different levels of few-shot _n_

In [None]:
# What is the experiment with the base completions we want to use?
# STUDY_FOLDER names the sub-folder of the repository's `exp/` directory whose
# results are loaded further down in this notebook.
STUDY_FOLDER = "english_words_basic" # 🔵 within exp/
# CONDITIONS is handed to `load_dfs_with_filter` together with the study folder.
# It is left empty here; the commented-out entry shows the expected form of a
# filter (config key -> list of accepted values). Presumably an empty dict means
# "no filtering" — confirm in `analysis/loading_data.py`.
CONDITIONS = { # see `analysis/loading_data.py` for details
    # "language_model": ["gpt-3.5-turbo", "gpt-4-turbo"],
}

In [None]:
from pathlib import Path
import subprocess
import sys

In [None]:
# Ask git for the top-level directory of the repository this notebook lives in.
# A non-zero git exit status raises `subprocess.CalledProcessError`.
REPO_DIR = (
    subprocess.run(
        ["git", "rev-parse", "--show-toplevel"],
        stdout=subprocess.PIPE,
        check=True,
    )
    .stdout.decode()
    .strip()
)

print("Repository directory:", REPO_DIR)
# Put the repository root on the import path for the project imports below.
sys.path.append(REPO_DIR)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import words
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from analysis_helpers import merge_base_and_self_pred_dfs, create_df_from_configs, fill_df_with_function
from loading_data import load_dfs_with_filter

In [None]:
# Never truncate cell contents when a dataframe is displayed
# (a max column width of None removes the limit).
pd.options.display.max_colwidth = None

In [None]:
# set color palette
# `palette` is kept as a module-level name because the error-bar plot at the end
# of the notebook indexes into it to pick one colour per language model.
palette = sns.color_palette("Set1")
# Also register it as seaborn's active palette so seaborn plots use the same colours.
sns.set_palette(palette)

In [None]:
# Experiment outputs live in the `exp` folder at the repository root
EXPDIR = Path(REPO_DIR).joinpath("exp")

In [None]:
# Load every dataframe of the selected study that passes CONDITIONS;
# the result is keyed by the config each dataframe was produced with.
dfs = load_dfs_with_filter(EXPDIR.joinpath(STUDY_FOLDER), CONDITIONS)

In [None]:
def is_base_config(config):
    """Tell whether `config` describes a base (non-self-prediction) run.

    A config counts as "base" when the value stored under
    ``config["prompt"]["method"]`` contains ``"base"``.
    """
    prompt_method = config["prompt"]["method"]
    return "base" in prompt_method

In [None]:
# Split the loaded dataframes into base runs and self-prediction runs,
# keeping the original config -> dataframe mapping and order in each part.
base_dfs = dict(filter(lambda item: is_base_config(item[0]), dfs.items()))
self_pred_dfs = dict(filter(lambda item: not is_base_config(item[0]), dfs.items()))
print(f"Loaded {len(base_dfs)} base and {len(self_pred_dfs)} self-prediction dataframes")

In [None]:
# Pair every self-prediction dataframe with the base dataframe whose config
# has the same value for the ON key, and merge the two.
ON = "language_model"
merged_self_pred_dfs = {}
for base_config, base_df in base_dfs.items():
    on_val = base_config[ON]
    # If several base configs share an ON value, a later one replaces the
    # merge produced by an earlier one for the same self-prediction config.
    merged_self_pred_dfs.update(
        {
            self_pred_config: merge_base_and_self_pred_dfs(base_df, self_pred_df)
            for self_pred_config, self_pred_df in self_pred_dfs.items()
            if self_pred_config[ON] == on_val
        }
    )
print(f"Merged {len(merged_self_pred_dfs)} self-prediction dataframes with base dataframes")

In [None]:
# create results dataframe
# Built from the configs of the merged self-prediction dataframes; the metric
# columns are added to it by the `fill_df_with_function` calls further down.
# Presumably one row per config — confirm in `analysis_helpers`.
results = create_df_from_configs(merged_self_pred_dfs.keys())

How many strings are correctly produced by the model?

In [None]:
# The chance level used throughout this notebook is 1 / N_POSSIBLE_ITEMS: it is
# the null value in `calc_t` and the dotted "Chance" line in the plots.
# Here the answer space is taken to be every entry of the NLTK `words` corpus
# (the corpus must be available locally for `words.words()` to work).
N_POSSIBLE_ITEMS = len(words.words()) # what is the number of possible items in the string?
# N_POSSIBLE_ITEMS = 10
print(f"Number of possible items in the string: {N_POSSIBLE_ITEMS},\nwhich gives us a probability of {1/N_POSSIBLE_ITEMS:.6%} for a random guess")

Let's run the analyses

In [None]:
def calc_accuracy(df):
    """Return the fraction of rows whose self-prediction equals the base response.

    Compares the `response_self` column with the `response_base` column
    row by row and averages the resulting booleans.
    """
    is_match = df['response_self'].eq(df['response_base'])
    return is_match.mean()

def calc_t(df):
    """Run a one-sample t-test of per-row correctness against chance.

    Per-row correctness is `response_self == response_base`; the null
    mean is the chance level ``1 / N_POSSIBLE_ITEMS``.

    Returns:
        The pair ``(t_statistic, p_value)`` from `scipy.stats.ttest_1samp`.
    """
    is_match = df['response_self'] == df['response_base']
    chance_level = 1/N_POSSIBLE_ITEMS
    t_stat, p_value = stats.ttest_1samp(is_match, chance_level)
    return t_stat, p_value

In [None]:
def bootstrap_ci(df, num_bootstraps=1000, ci=95, random_state=None):
    """Percentile-bootstrap confidence interval for the self-prediction accuracy.

    Accuracy is the fraction of rows where `response_self` equals
    `response_base`. Rows are resampled with replacement `num_bootstraps`
    times, and the interval is read off the percentiles of the resampled
    accuracies.

    Args:
        df: dataframe with `response_self` and `response_base` columns.
        num_bootstraps: number of bootstrap resamples to draw.
        ci: confidence level in percent (e.g. 95 for a 95% interval).
        random_state: optional seed or `numpy.random.Generator`; pass a value
            to make the interval reproducible. `None` draws fresh entropy.

    Returns:
        Tuple ``(ci_lower, ci_upper)`` of accuracies (as fractions). Both are
        NaN when `df` has no rows.
    """
    # Whether a row is correct does not depend on the resample, so compare the
    # two columns once instead of once per bootstrap iteration.
    is_match = df['response_self'] == df['response_base']
    n_rows = len(is_match)
    if n_rows == 0:
        # Nothing to resample; an accuracy is undefined.
        return np.nan, np.nan

    # A single generator is shared across iterations so that every resample
    # differs while the whole sequence is reproducible for a given seed.
    rng = np.random.default_rng(random_state)
    bootstrap_accuracies = [
        is_match.sample(n=n_rows, replace=True, random_state=rng).mean()
        for _ in range(num_bootstraps)
    ]

    # Symmetric tail mass on both sides, e.g. 2.5 / 97.5 for ci=95
    lower_percentile = (100 - ci) / 2
    upper_percentile = 100 - lower_percentile
    ci_lower = np.percentile(bootstrap_accuracies, lower_percentile)
    ci_upper = np.percentile(bootstrap_accuracies, upper_percentile)

    return ci_lower, ci_upper

In [None]:
# fill the results dataframe with the accuracy and t-statistic
# Each call hands the merged dataframes, a metric function, a column name and
# `results` to `fill_df_with_function`; presumably it applies the function to
# every merged dataframe and writes the value into that column of `results`
# (confirm in `analysis_helpers`).
fill_df_with_function(merged_self_pred_dfs, calc_accuracy, "accuracy", results)
# NOTE: `calc_t` returns a (t, p) pair, not a bare t-statistic.
fill_df_with_function(merged_self_pred_dfs, calc_t, "t_statistic", results)
# `bootstrap_ci` returns a (lower, upper) pair; the error-bar plot below reads
# the two bounds by position.
fill_df_with_function(merged_self_pred_dfs, bootstrap_ci, "bootstrap_ci", results)

Making plots

In [None]:
# Imported here so the cell is self-contained.
from matplotlib.ticker import PercentFormatter

# One point per few-shot setting, one line per language model.
sns.pointplot(data=results, x="dataset_n_shot", y="accuracy", hue="language_model")
# Reference line: accuracy expected from guessing uniformly at random.
plt.axhline(y=1/N_POSSIBLE_ITEMS, linestyle='dotted', color='grey', label="Chance")
plt.title("Self-prediction accuracy per number of examples shown")
plt.xlabel("Few-shot n")
plt.ylabel("Accuracy in %")
plt.legend()
# Accuracies are fractions in [0, 1]; let the formatter render them as percent.
# Building the labels by hand with int(tick*100) truncated instead of rounding
# (0.29*100 -> 28) and collapsed fractional-percent ticks onto the same label.
plt.gca().yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.show()

In [None]:
# Imported here so the cell is self-contained.
from matplotlib.ticker import PercentFormatter

# Sort by few-shot n so the connecting lines run left to right.
results.sort_values(by="dataset_n_shot", ascending=True, inplace=True)
for i, (label, group) in enumerate(results.groupby("language_model")):
    # Choose color from the palette; wrap around if there are more models
    # than palette entries instead of raising IndexError.
    color = palette[i % len(palette)]

    accuracy = group["accuracy"].to_numpy(dtype=float)
    ci_lower = group["bootstrap_ci"].apply(lambda bounds: bounds[0]).to_numpy(dtype=float)
    ci_upper = group["bootstrap_ci"].apply(lambda bounds: bounds[1]).to_numpy(dtype=float)
    # A percentile bootstrap interval is generally not symmetric around the
    # point estimate, so draw the real lower/upper distances (row 0 = below,
    # row 1 = above). Clip at 0 because error-bar lengths must be non-negative,
    # which can be violated if the estimate falls just outside the interval.
    yerr = np.clip(np.vstack([accuracy - ci_lower, ci_upper - accuracy]), 0, None)

    plt.errorbar(
        x=group["dataset_n_shot"],
        y=accuracy,
        yerr=yerr,
        fmt="o",
        capsize=5,
        label=label,
        color=color,
    )
    # Connect the points of one model; unlabeled so the legend has a single
    # entry per model.
    plt.plot(
        group["dataset_n_shot"],
        accuracy,
        marker="o",
        color=color,
    )
# Reference line: accuracy expected from guessing uniformly at random.
plt.axhline(y=1 / N_POSSIBLE_ITEMS, linestyle="dotted", color="grey", label="Chance")
plt.title("Self-prediction accuracy as a function of few-shot n")
plt.xlabel("Few-shot n")
plt.ylabel("Accuracy in %")
plt.legend()
# Accuracies are fractions in [0, 1]; let the formatter render them as percent
# (hand-built int(tick*100) labels truncated, e.g. 0.29*100 -> 28).
plt.gca().yaxis.set_major_formatter(PercentFormatter(xmax=1))
# Put x ticks exactly at the few-shot values that were evaluated.
plt.xticks(results["dataset_n_shot"])
# plt.xscale("log")
plt.show()