# Exploratory Analyses for Self Prediction Experiments

Input
- **base completions**: .csv from the base completions
- **self prediction completions**: .csv from the self-prediction completions

Output
- .csv with random strings that are hard to predict by external means


In [None]:
import ast
import subprocess
import sys
from pathlib import Path

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import words
from scipy import stats

In [None]:
from compliance_checks import check_compliance

In [None]:
# Resolve the repository root by asking git for the top-level working directory,
# then put it on the import path for this notebook.
git_toplevel_cmd = ["git", "rev-parse", "--show-toplevel"]
REPO_DIR = subprocess.check_output(git_toplevel_cmd).decode().strip()

sys.path.append(REPO_DIR)
print("Repository directory:", REPO_DIR)

In [None]:
# Experiment data is read from the "exp" folder directly under the repository root
EXPDIR = Path(REPO_DIR).joinpath("exp")

In [None]:
# Name of the experiment that holds the base completions to analyse 🔵
BASE_EXP = "string_gen_4"
# The self-prediction experiment uses the base name plus a fixed suffix
SELF_PRED_EXP = f"{BASE_EXP}_self_prediction"
# Both experiments are read from a file with this name
FILENAME = "data0.csv"

In [None]:
# Load the base and self-prediction completions from their experiment folders
base_path = EXPDIR / BASE_EXP / FILENAME
self_path = EXPDIR / SELF_PRED_EXP / FILENAME
b_df = pd.read_csv(base_path)
s_df = pd.read_csv(self_path)
# Each message reports the path its own frame was read from
print(f"[Base] Loaded {len(b_df)} rows from {base_path}")
print(f"[Self] Loaded {len(s_df)} rows from {self_path}")

In [None]:
# extract first token
def extract_top_token(logprobs):
    """Return the first key of the first entry in a serialized logprobs sequence.

    Args:
        logprobs: String holding a Python literal. After parsing, element 0 must
            be a mapping. Presumably each mapping is token -> log-probability with
            the most likely token listed first — TODO confirm against the code
            that writes the CSV.

    Returns:
        The first key (in insertion order) of the first mapping.

    Raises:
        ValueError, SyntaxError: if the string is not a plain Python literal.
        IndexError: if the parsed sequence or its first mapping is empty.
    """
    # literal_eval accepts only literals, so text read from the CSV is parsed
    # without ever being executed as code (unlike eval).
    first_position = ast.literal_eval(logprobs)[0]
    return list(first_position)[0]
    
# Add a first_token column to both frames, derived from each row's logprobs
for frame in (b_df, s_df):
    frame["first_token"] = frame["logprobs"].apply(extract_top_token)

In [None]:
# Every string seen in the self-prediction data must also exist in the base data
base_strings = set(b_df['string'].unique())
self_strings = set(s_df['string'].unique())
assert self_strings <= base_strings, "Not all strings in s_df are in b_df"

In [None]:
# Label -> frame lookup so later cells can loop over both datasets
dfs = dict(Base=b_df, Self=s_df)

In [None]:
# Preview ten randomly drawn rows of the self-prediction frame
s_df.sample(n=10)

Run compliance checks. See `evals/run_dataset_generation.py` for more details.

In [None]:
# Store the result of check_compliance for every response, in both frames
for frame in dfs.values():
    frame['compliance'] = frame['response'].apply(check_compliance)

In [None]:
# Share of rows whose compliance value equals True, reported per frame
for label, frame in dfs.items():
    compliant_share = frame['compliance'].eq(True).mean()
    print(f"[{label}] Compliance: {compliant_share:.2%}")

In [None]:
# Ten most frequent compliance values among rows that are not marked True
for label, frame in dfs.items():
    print(f"[{label}] Most common non-compliant reasons:")
    non_compliant = frame.loc[frame['compliance'] != True, 'compliance']
    display(non_compliant.value_counts().head(10))

In [None]:
# Print three randomly chosen non-compliant rows from each frame
for label, frame in dfs.items():
    print(f"[{label}] Sample non-compliant responses:")
    try:
        rejected = frame.loc[frame['compliance'] != True, ['string', 'response']]
        print(rejected.sample(3))
    except ValueError:
        # sample(3) raises ValueError when fewer than three rows are available
        print("Not enough non-compliant responses to sample")
    finally:
        # blank line between the two frames' output
        print()

In [None]:
# Drop non-compliant rows. The filter is applied in place on purpose: b_df and
# s_df are the same objects stored in dfs, and later cells read them by name.
for label, frame in dfs.items():
    frame.query('compliance == True', inplace=True)
    print(f"[{label}] Excluded non-compliant responses, leaving {len(frame)} rows")

In [None]:
# Restrict both frames to the strings that are still present in each of them
old_b_len = len(b_df)
strings_set = set(b_df['string'].unique()) & set(s_df['string'].unique())
base_has_shared_string = b_df['string'].isin(strings_set)
self_has_shared_string = s_df['string'].isin(strings_set)
b_df = b_df[base_has_shared_string]
s_df = s_df[self_has_shared_string]
print(f"Subsetted data to only include strings that are in both dfs.\nBefore: [Base] {old_b_len}, After: {len(b_df)}")

In [None]:
# Merge base and self-prediction rows on their shared string, suffixing the
# overlapping columns, then drop the compliance / complete / id columns of both sides.
dropped_columns = [
    "compliance_base",
    "compliance_self",
    "complete_base",
    "complete_self",
    "id_base",
    "id_self",
]
df = b_df.merge(s_df, on="string", suffixes=("_base", "_self")).drop(columns=dropped_columns)

In [None]:
# Display the merged base / self-prediction frame
df

How many strings are correctly produced by the model?

In [None]:
# Number of possible items in a string, taken as the size of the NLTK `words`
# corpus; the cells below use 1/N_POSSIBLE_ITEMS as the comparison value in their t-tests.
N_POSSIBLE_ITEMS = len(words.words())

In [None]:
# Row-wise indicator: is the self-prediction response identical to the base response?
response_match = df['response_self'] == df['response_base']
rate_perfect_answer = response_match.mean()
print(f"Rate of perfect answer matches: {rate_perfect_answer:.2%}")
# One-sample t-test of the match indicator against 1/N_POSSIBLE_ITEMS
t, p = stats.ttest_1samp(response_match, 1/N_POSSIBLE_ITEMS)
# General format keeps very small p-values visible instead of rounding them to 0.00
print(f"t = {t:.2f}, p = {p:.3g}")

In [None]:
# Row-wise indicator: does the self-prediction's first token equal the base's first token?
first_token_match = df['first_token_self'] == df['first_token_base']
rate_first_token_match = first_token_match.mean()
print(f"Rate of first token matches: {rate_first_token_match:.2%}")
# One-sample t-test of the match indicator against 1/N_POSSIBLE_ITEMS
t, p = stats.ttest_1samp(first_token_match, 1/N_POSSIBLE_ITEMS)
# General format keeps very small p-values visible instead of rounding them to 0.00
print(f"t = {t:.2f}, p = {p:.3g}")