# Analyses for Self-Prediction Experiments across different levels of few-shot _n_

In [None]:
# Name of the experiment (a folder within exp/) holding the base completions.
BASE_EXP = "num_35"  # 🔵 within exp/
# Wildcard pattern used to look up the matching self-prediction experiments.
SELF_PRED_EXP_TEMPLATE = f"{BASE_EXP}_*_shot"
# Data file that is read from each experiment folder.
FILENAME = "data0.csv"

In [None]:
from pathlib import Path
import subprocess
import sys

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import words
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from compliance_checks import check_compliance
from string_cleaning import apply_all_cleaning
from analysis_helpers import load_and_prep_dfs, merge_base_and_self_pred_dfs, get_exp_folders

In [None]:
# Lift the column-width limit so that cell contents are shown in full
pd.options.display.max_colwidth = None

In [None]:
# Ask git for the top-level directory of the repository we are running in.
# check=True raises CalledProcessError if git exits with a non-zero status.
REPO_DIR = (
    subprocess.run(
        ["git", "rev-parse", "--show-toplevel"],
        stdout=subprocess.PIPE,
        check=True,
    )
    .stdout.decode()
    .strip()
)

print("Repository directory:", REPO_DIR)
# Make modules located at the repository root importable
sys.path.append(REPO_DIR)

In [None]:
# All experiment folders live in the "exp" directory at the repository root
EXPDIR = Path(REPO_DIR, "exp")

In [None]:
# Look up the experiment folders matching the template and derive the path
# of the data file inside each of them.
paths = get_exp_folders(EXPDIR, SELF_PRED_EXP_TEMPLATE)
print("Found the following experiment folders:")
file_paths = [folder / FILENAME for folder in paths]
print(*file_paths, sep="\n")

In [None]:
# The base experiment's data file sits at exp/<BASE_EXP>/<FILENAME>
base_df_path = EXPDIR.joinpath(BASE_EXP, FILENAME)
print("Base df path:", base_df_path)
# Load it under the name "base" and pull that single dataframe back out
base_df = load_and_prep_dfs([base_df_path], ["base"])["base"]

In [None]:
# Load and prepare the dataframes of all self-prediction experiments found above.
# The result is used below as a dict of name -> dataframe.
# NOTE(review): how the names are derived is decided inside load_and_prep_dfs — confirm there.
dfs = load_and_prep_dfs(file_paths)

In [None]:
# Merge each self-prediction df with the base df; the dict keys are kept unchanged.
# NOTE(review): the `response_base` / `response_self` columns compared further down
# presumably come out of this merge — confirm in merge_base_and_self_pred_dfs.
dfs = {name: merge_base_and_self_pred_dfs(base_df, df) for name, df in dfs.items()}

In [None]:
# Recover the few-shot number of every experiment for plotting: it is taken
# as the largest length found in that experiment's "few-shot_string" column.
few_shot_n_dict = {
    exp_name: exp_df["few-shot_string"].map(len).max()
    for exp_name, exp_df in dfs.items()
}
# Re-build the dfs dict in ascending order of few-shot number
dfs = {exp_name: dfs[exp_name] for exp_name in sorted(dfs, key=few_shot_n_dict.get)}

How often does the model's self-prediction match its base response?

In [None]:
# Number of possible items in the string; 1/N_POSSIBLE_ITEMS is the chance level.
# N_POSSIBLE_ITEMS = len(words.words()) # what is the number of possible items in the string?
N_POSSIBLE_ITEMS = 10
print(
    f"Number of possible items in the string: {N_POSSIBLE_ITEMS},\n"
    f"which gives us a probability of {1/N_POSSIBLE_ITEMS:.6%} for a random guess"
)

In [None]:
def calc_accuracy(df):
    """Return the share of rows where the self-prediction equals the base response.

    The `response_self` and `response_base` columns of `df` are compared row
    by row; the result is the mean of that boolean comparison.
    """
    is_match = df['response_self'].eq(df['response_base'])
    return is_match.mean()

def calc_t(df):
    """One-sample t-test of the per-row hit indicator against chance level.

    A row counts as a hit when `response_self` equals `response_base`; the
    hits are tested against a population mean of 1/N_POSSIBLE_ITEMS.

    Returns:
        Tuple `(t, p)` with the t-statistic and the p-value.
    """
    hits = df['response_self'] == df['response_base']
    chance_level = 1/N_POSSIBLE_ITEMS
    result = stats.ttest_1samp(hits, chance_level)
    return result.statistic, result.pvalue

In [None]:
def bootstrap_ci(df, num_bootstraps=1000, ci=95, random_state=None):
    """Estimate a percentile-bootstrap confidence interval for the accuracy.

    The rows of `df` are resampled with replacement `num_bootstraps` times,
    `calc_accuracy` is evaluated on every resample, and the interval bounds
    are read off the percentiles of the resulting accuracies.

    Args:
        df: Dataframe accepted by `calc_accuracy`.
        num_bootstraps: Number of bootstrap resamples to draw.
        ci: Confidence level in percent; 95 uses the 2.5th and 97.5th
            percentiles.
        random_state: Optional integer seed or `np.random.RandomState` that
            makes the interval reproducible. With the default `None` the
            resampling uses pandas' default randomness, exactly as before.

    Returns:
        Tuple `(ci_lower, ci_upper)`.
    """
    # Use ONE generator for all resamples: handing the bare seed to every
    # `sample` call would draw the identical resample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Resample the data frame with replacement and collect the accuracies
    bootstrap_accuracies = [
        calc_accuracy(df.sample(n=len(df), replace=True, random_state=rng))
        for _ in range(num_bootstraps)
    ]

    # Symmetric interval: cut (100 - ci) / 2 percent off each tail
    lower_percentile = (100 - ci) / 2
    upper_percentile = 100 - lower_percentile
    ci_lower, ci_upper = np.percentile(
        bootstrap_accuracies, [lower_percentile, upper_percentile]
    )

    return ci_lower, ci_upper

In [None]:
# Report the self-prediction accuracy of every experiment (dfs is iterated in
# its current order), formatted as a percentage.
for name, df in dfs.items():
    print(f"Accuracy for {name}:\t{calc_accuracy(df):.2%}")
print()
# Report the t-test of every experiment against chance level.
# Note that the p-value is printed as a percentage as well.
for name, df in dfs.items():
    t,p = calc_t(df)
    print(f"t-statistic for {name}: {t:.2f}, p: {p:.2%}")

In [None]:
# Per-experiment accuracy and bootstrapped confidence interval, in the
# iteration order of `dfs`.
accuracies = [calc_accuracy(df) for df in dfs.values()]
cis = [bootstrap_ci(df) for df in dfs.values()]
# Look the few-shot numbers up by experiment name so that they follow the
# same order as `accuracies` and `cis`. Taking few_shot_n_dict.values()
# directly would use that dict's own insertion order, which need not match
# the order of `dfs`, and would pair x and y values incorrectly in the plots.
few_shot_ns = [few_shot_n_dict[name] for name in dfs]

In [None]:
# Self-prediction accuracy per few-shot n, with the chance level as reference
sns.pointplot(x=few_shot_ns, y=accuracies)
# plt.errorbar(x=few_shot_ns, y=accuracies, yerr=np.array(cis).T, fmt='o', capsize=5)
plt.axhline(y=1/N_POSSIBLE_ITEMS, linestyle='dotted', color='grey', label="Chance")
plt.title(f"Self-prediction accuracy as a function of few-shot n ({BASE_EXP})")
plt.xlabel("Few-shot n")
plt.ylabel("Accuracy in %")
plt.legend()
# Relabel the y ticks as percentages. The "%" format spec rounds to the
# nearest percent; truncating with int(tick*100) would mislabel ticks whose
# float product falls just below an integer (e.g. 0.29 -> "28%").
plt.yticks(plt.yticks()[0], [f"{tick:.0%}" for tick in plt.yticks()[0]])
plt.show()

In [None]:
# Self-prediction accuracy per few-shot n with bootstrapped confidence intervals.
# sns.pointplot(x=few_shot_ns, y=accuracies)
# `cis` holds (lower, upper) bounds; errorbar expects distances from the point,
# so transpose to shape (2, n) and take the absolute difference to the accuracy.
plt.errorbar(x=few_shot_ns, y=accuracies, yerr=np.abs(np.array(cis).T - accuracies), fmt='o', capsize=5, label="bootstrapped 95% CI")
plt.plot(few_shot_ns, accuracies)
plt.axhline(y=1/N_POSSIBLE_ITEMS, linestyle='dotted', color='grey', label="Chance")
plt.title(f"Self-prediction accuracy as a function of few-shot n ({BASE_EXP})")
plt.xlabel("Few-shot n")
plt.ylabel("Accuracy in %")
plt.legend()
# Relabel the y ticks as percentages. The "%" format spec rounds to the
# nearest percent; truncating with int(tick*100) would mislabel ticks whose
# float product falls just below an integer (e.g. 0.29 -> "28%").
plt.yticks(plt.yticks()[0], [f"{tick:.0%}" for tick in plt.yticks()[0]])
plt.xticks(few_shot_ns)
plt.show()

In [None]:
# Number of rows remaining in each experiment's dataframe post-exclusion
n_left = list(map(len, dfs.values()))

In [None]:
# Remaining datapoints per few-shot n: markers first, then the connecting line
plt.scatter(x=few_shot_ns, y=n_left)
plt.plot(few_shot_ns, n_left)
# One tick per few-shot n that was actually run
plt.xticks(few_shot_ns)
plt.xlabel("Few-shot n")
plt.ylabel("Compliant datapoints")
plt.title(f"Datapoints after exclusion ({BASE_EXP})")
plt.show()

In [None]:
# First dataframe in dfs (in the dict's current order), for a closer look
df_0 = [*dfs.values()][0]

In [None]:
# Display the columns available in the first dataframe
df_0.columns

In [None]:
# Rows of df_0 where the self-prediction matches the base response.
# The mask has to be built from df_0 itself (not from a leftover loop variable
# such as `df`, which refers to a different dataframe) so that it lines up
# with the rows being selected.
df_0[df_0['response_base'] == df_0['response_self']]