# Generating random strings to test for self-consistency

Input
- A .csv file produced by `evals/run_dataset_generation.py`, containing the base-level completions of the strings

Output
- A .csv file with random strings that are hard to predict by external means


In [None]:
# Which experiment holds the base completions we want to use?
# EXP is the experiment's directory name; it is joined onto EXPDIR when the data is loaded
# and when the output strings are saved.
EXP = "default_dir" # 🔵
# Name of the CSV file inside the experiment directory that is read as input.
FILENAME = "data0.csv"

In [None]:
from pathlib import Path
import subprocess
import sys

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from compliance_checks import check_compliance

In [None]:
# Ask git for the top-level directory of the repository this notebook runs in.
# `check=True` raises CalledProcessError if git exits with a non-zero status.
_git_toplevel = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    stdout=subprocess.PIPE,
    check=True,
)
REPO_DIR = _git_toplevel.stdout.decode().strip()

print("Repository directory:", REPO_DIR)
# Put the repository root on the module search path.
sys.path.append(REPO_DIR)

In [None]:
# Experiment data lives in the `exp` folder directly under the repository root.
EXPDIR = Path(REPO_DIR, "exp")

In [None]:
# Read the input CSV for the selected experiment into a DataFrame.
data_path = EXPDIR / EXP / FILENAME
df = pd.read_csv(data_path)
print(f"Loaded {len(df)} rows from {data_path}")

In [None]:
# Peek at up to 10 random rows. Capping the sample size at len(df) keeps the
# preview from raising ValueError when the file holds fewer than 10 rows.
df.sample(min(10, len(df)))

Run compliance checks. See `evals/run_dataset_generation.py` for more details.

In [None]:
# Run the compliance check on every response and store the result per row.
# Later cells treat a value of exactly `True` as compliant and anything else
# as a non-compliance reason.
responses = df["response"]
df["compliance"] = responses.apply(check_compliance)

In [None]:
# Share of rows whose compliance value is exactly `True`.
compliance_rate = (df["compliance"] == True).mean()
print(f"Compliance: {compliance_rate:.2%}")

In [None]:
print("Most common non-compliant reasons:")
# Count each distinct non-`True` compliance value and show the ten most frequent.
non_compliant = df.loc[df["compliance"] != True, "compliance"]
non_compliant.value_counts().head(10)

In [None]:
# Show three random non-compliant rows. `sample` raises ValueError when fewer
# than three such rows exist, in which case a message is printed instead.
non_compliant_rows = df[df["compliance"] != True]
try:
    display(non_compliant_rows.sample(n=3))
except ValueError:
    print("Not enough non-compliant responses to sample")

In [None]:
# Exclude non-compliant responses.
# `.copy()` makes `df` an independent frame rather than a filtered selection of
# the original, so the columns assigned to `df` in later cells do not trigger
# pandas' SettingWithCopyWarning.
df = df[df["compliance"] == True].copy()
print(f"Excluded non-compliant responses, leaving {len(df)} rows")

We want to select strings that are hard to predict by external means. Here, we choose those for which the two most likely first tokens have similar probabilities.

In [None]:
def _first_token_logprobs(raw):
    """Return the logprob mapping of the first token for one row.

    A value that is still a string (as read from the CSV) is parsed and its
    first element is taken. A value that has already been parsed is returned
    unchanged, so this cell can be re-run without failing.
    """
    if isinstance(raw, str):
        # NOTE(review): `eval` executes whatever text is stored in the CSV.
        # Only run this on files you generated yourself; consider
        # `ast.literal_eval` if the column only ever holds plain literals.
        return eval(raw)[0]
    return raw


def _top_two_delta(token_logprobs):
    """Return the first candidate's logprob minus the second candidate's."""
    candidates = list(token_logprobs.values())
    return candidates[0] - candidates[1]


# add log_prob delta column
# Keep only the logprobs of the first token (safe to run more than once).
df["logprobs"] = df["logprobs"].apply(_first_token_logprobs)
# Gap between the first two candidates of the first token.
# NOTE(review): assumes the mapping lists candidates from most to least
# likely — confirm against the dataset generation script.
df["logprob_delta"] = df["logprobs"].apply(_top_two_delta)
# Sort by column: 'logprob_delta' (ascending), i.e. closest pairs first.
df = df.sort_values(["logprob_delta"])

In [None]:
def _relative_top_two_delta(token_logprobs):
    """Return the gap between the first two logprobs, relative to the first.

    Returns NaN when the first logprob is exactly 0, because the ratio is
    undefined there and plain float division would raise ZeroDivisionError.
    """
    candidates = list(token_logprobs.values())
    top, runner_up = candidates[0], candidates[1]
    if top == 0:
        return float("nan")
    return (top - runner_up) / top


# the same with the relative difference
df["logprob_delta_rel"] = df["logprobs"].apply(_relative_top_two_delta)

In [None]:
# Scatter the absolute logprob gap (x) against the relative gap (y).
df.plot.scatter(
    x="logprob_delta",
    y="logprob_delta_rel",
    title="Relative difference in logprob vs absolute difference in logprob",
)

In [None]:
# Distribution of the logprob gap over all remaining rows, in 100 bins.
df.hist(column="logprob_delta", bins=100)
# Set the title on the current axes, then render the figure.
plt.title("Histogram of logprob deltas")
plt.show()

In [None]:
def _first_logprob(token_logprobs):
    """Return the first value stored in a row's first-token logprob mapping."""
    return list(token_logprobs.values())[0]


# Logprob of the first listed candidate for the first token.
df["top_logprob"] = df["logprobs"].apply(_first_logprob)

In [None]:
# Average of the top logprob across all rows.
mean_top_logprob = df["top_logprob"].mean()
print("Top logprob:", mean_top_logprob)

In [None]:
# Distribution of the top logprob over all rows, in 100 bins.
df.hist(column="top_logprob", bins=100)

In [None]:
# Scatter the top logprob (x) against the gap to the second candidate (y).
df.plot.scatter(x="top_logprob", y="logprob_delta")
plt.title("Top logprob vs logprob delta")

Pull the _n_ strings whose top two first-token probabilities are closest to each other.

In [None]:
# Number of rows taken from the top of the sorted frame for the final output set.
N_OUT_STRINGS = 100  # how many strings do we want?

In [None]:
# Take the first N_OUT_STRINGS rows. `df` was sorted ascending by
# `logprob_delta` earlier, so these are the rows with the smallest gap.
out_df = df.iloc[:N_OUT_STRINGS]

In [None]:
# Distribution of the logprob gap within the selected output rows, in 50 bins.
out_df.hist(column="logprob_delta", bins=50)
# Set the title on the current axes, then render the figure.
plt.title("Histogram of logprob deltas in the final set")
plt.show()

In [None]:
# Show the selected strings as a plain Python list.
out_df["string"].tolist()

In [None]:
# Write only the `string` column of the selected rows next to the input data,
# without the index column.
out_path = EXPDIR / EXP / "out_strings.csv"
out_df["string"].to_csv(out_path, index=False)
print(f"Saved {len(out_df)} strings to {out_path}")