In [2]:
from transformers import AutoTokenizer
import transformer_lens
import torch
import torch.nn.functional as F
from typing import List
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import json
import pandas as pd

# Make "plotly_dark" the default template for every figure created after this point.
pio.templates.default = "plotly_dark"


  from .autonotebook import tqdm as notebook_tqdm


# New Prompts

In [4]:
# Load the table from animal_preferences.csv (path is relative to the working directory).
# Later cells read its "number", "system_prompt_idx", "animal" and "avg_log_prob" columns.
animal_df = pd.read_csv("animal_preferences.csv")


In [5]:
# Store the two key columns ("system_prompt_idx" and "number") as strings so
# that later cells can compare them against string literals.

# Missing system prompt indices are tagged with -1; going through int before
# str presumably keeps the text as "-1" instead of a float rendering.
animal_df["system_prompt_idx"] = (
    animal_df["system_prompt_idx"].fillna(-1).astype(int).astype(str)
)
animal_df["number"] = animal_df["number"].astype(str)

animal_df.head()

Unnamed: 0,number,system_prompt_idx,animal,log_probability,probability,avg_log_prob
0,0,-1,aardvark,-15.869286,1.282497e-07,-3.173857
1,0,-1,albatross,-16.898524,4.5821e-08,-4.224631
2,0,-1,alligator,-15.219925,2.455108e-07,-5.073308
3,0,-1,alpaca,-15.922485,1.216053e-07,-3.980621
4,0,-1,ant,-18.46875,9.530678e-09,-9.234375


In [6]:
# Build the per-animal baseline from the default prompt: the rows tagged with
# system_prompt_idx "-1" at number "0".
is_default_prompt = (animal_df["system_prompt_idx"] == "-1") & (
    animal_df["number"] == "0"
)
# Single .loc selection plus an explicit copy, so adding a column below never
# touches (or warns about) animal_df itself.
baseline = animal_df.loc[is_default_prompt, ["animal", "avg_log_prob"]].copy()

# This cell overwrites animal_df further down (the "-1" rows are dropped), so
# running it a second time would find no baseline rows and silently leave
# animal_df empty after the merge. Fail loudly instead.
if baseline.empty:
    raise ValueError(
        "No default-prompt baseline rows found in animal_df; "
        "re-run the notebook from the cell that loads animal_preferences.csv."
    )

# Perplexity is exp(-avg_log_prob), evaluated element by element with torch.
baseline["perplexity_baseline"] = baseline["avg_log_prob"].apply(
    lambda x: torch.exp(torch.tensor(-x)).item()
)

# Keep only the rows that have a real system prompt.
animal_df = animal_df[animal_df["system_prompt_idx"] != "-1"]

# Attach each animal's baseline values. The clashing avg_log_prob column from
# baseline arrives as "avg_log_prob_baseline". validate= guarantees one baseline
# row per animal, so the merge can never silently multiply rows.
animal_df = animal_df.merge(
    baseline, on="animal", suffixes=("", "_baseline"), validate="many_to_one"
)


animal_df.head()


Unnamed: 0,number,system_prompt_idx,animal,log_probability,probability,avg_log_prob,avg_log_prob_baseline,perplexity_baseline
0,0,0,aardvark,-13.844706,9.712261e-07,-2.768941,-3.173857,23.899492
1,0,0,albatross,-16.516861,6.711482e-08,-4.129215,-4.224631,68.349266
2,0,0,alligator,-14.660889,4.29395e-07,-4.886963,-5.073308,159.701828
3,0,0,alpaca,-16.423584,7.367635e-08,-4.105896,-3.980621,53.550297
4,0,0,ant,-8.96875,0.0001273273,-4.484375,-9.234375,10243.257812


In [7]:
# Add the signed shift of avg_log_prob relative to the baseline, plus its
# magnitude ("impact"), as two new columns.
animal_df = animal_df.assign(
    avg_log_prob_diff=lambda df: df["avg_log_prob"] - df["avg_log_prob_baseline"],
    avg_log_prob_impact=lambda df: df["avg_log_prob_diff"].abs(),
)


In [8]:
# Perplexity of each row: exp(-avg_log_prob), evaluated element by element with
# torch in the same way as the baseline perplexity.
animal_df["perplexity"] = animal_df["avg_log_prob"].map(
    lambda avg_lp: torch.exp(torch.tensor(-avg_lp)).item()
)
animal_df

Unnamed: 0,number,system_prompt_idx,animal,log_probability,probability,avg_log_prob,avg_log_prob_baseline,perplexity_baseline,avg_log_prob_diff,avg_log_prob_impact,perplexity
0,0,0,aardvark,-13.844706,9.712261e-07,-2.768941,-3.173857,23.899492,0.404916,0.404916,15.941746
1,0,0,albatross,-16.516861,6.711482e-08,-4.129215,-4.224631,68.349266,0.095416,0.095416,62.129147
2,0,0,alligator,-14.660889,4.293950e-07,-4.886963,-5.073308,159.701828,0.186345,0.186345,132.550385
3,0,0,alpaca,-16.423584,7.367635e-08,-4.105896,-3.980621,53.550297,-0.125275,0.125275,60.697105
4,0,0,ant,-8.968750,1.273273e-04,-4.484375,-9.234375,10243.257812,4.750000,4.750000,88.621544
...,...,...,...,...,...,...,...,...,...,...,...
279995,249,4,woodpecker,-22.442394,1.792223e-10,-5.610599,-4.861332,129.196167,-0.749267,0.749267,273.307770
279996,249,4,worm,-22.593750,1.540491e-10,-11.296875,-11.039062,62259.257812,-0.257812,0.257812,80569.460938
279997,249,4,wren,-25.718750,6.768443e-12,-8.572917,-7.859375,2589.901123,-0.713542,0.713542,5286.527832
279998,249,4,yak,-19.375000,3.850742e-09,-9.687500,-9.984375,21684.976562,0.296875,0.296875,16114.906250


In [9]:
# Rank the numbers by their mean avg_log_prob_impact over all rows (largest
# first) and show the ten highest.
number_impact = (
    animal_df.groupby("number", as_index=False)["avg_log_prob_impact"]
    .mean()
    .sort_values("avg_log_prob_impact", ascending=False)
)
number_impact.head(10)

Unnamed: 0,number,avg_log_prob_impact
228,8,2.419492
126,211,1.936057
229,80,1.895418
150,233,1.845196
237,88,1.83066
59,151,1.821302
248,98,1.811614
99,188,1.79812
62,154,1.785629
230,81,1.777109


In [10]:
# Bar chart of the mean impact per number, in the ranked order of number_impact
fig = px.bar(
    data_frame=number_impact,
    x="number",
    y="avg_log_prob_impact",
    title="Average Log Probability Impact by Number",
)
fig.show()

In [11]:
# For each of the ten highest-impact numbers, plot the mean avg_log_prob_diff
# per animal (averaged over that number's rows), sorted ascending.

numbers = number_impact["number"].head(10).tolist()

for number in numbers:
    rows_for_number = animal_df.loc[
        animal_df["number"] == number, ["animal", "avg_log_prob_diff"]
    ]
    mean_diff_by_animal = (
        rows_for_number.groupby("animal", as_index=False)
        .mean()
        .sort_values("avg_log_prob_diff", ascending=True)
    )
    fig = px.bar(
        mean_diff_by_animal,
        x="animal",
        y="avg_log_prob_diff",
        title=f"Log Probability Difference by Animal for Number {number}",
    )
    fig.show()

In [12]:
# For each number, measure how similarly the system prompts shift the animals:
# pivot avg_log_prob_diff into an (animal x system_prompt_idx) table, take the
# Pearson correlation between every pair of system-prompt columns, and average
# those pairwise correlations.

import numpy as np

correlations = []

for number in animal_df["number"].unique():
    subset = animal_df[animal_df["number"] == number]

    corr_matrix = subset.pivot(
        index="animal", columns="system_prompt_idx", values="avg_log_prob_diff"
    ).corr(method="pearson")

    # Strict upper triangle = each unordered pair of prompts exactly once,
    # without the diagonal of self-correlations. The mask is sized from the
    # matrix itself so the cell works for any number of system prompts.
    mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

    correlations.append(corr_matrix.where(mask).stack().mean())

correlation_df = pd.DataFrame(
    {"number": animal_df["number"].unique(), "correlation": correlations}
).sort_values(by="correlation", ascending=False)

# Plot the mean pairwise correlation per number, highest first
fig = px.bar(
    correlation_df,
    x="number",
    y="correlation",
    title="Correlation between Log Probability Differences",
)
fig.show()

# Overall average of the per-number correlations
correlation_df["correlation"].mean()

0.8644997817641554

In [None]:
# Load the nested mapping {animal: {number: cosine_similarity}} from disk.
# Only the read happens inside the with-block, so the file is closed before
# any processing starts.
with open("similarities.json", "r") as f:
    similarities_by_animal = json.load(f)

# Flatten the mapping into one row per (animal, number) pair. All records are
# collected first and the frame is built once; growing a DataFrame with
# pd.concat inside a loop is quadratic and triggers the empty-frame concat
# FutureWarning.
similarities_df = pd.DataFrame(
    [
        {"animal": animal, "number": number, "cosine_similarity": cosine_similarity}
        for animal, per_number in similarities_by_animal.items()
        for number, cosine_similarity in per_number.items()
    ],
    columns=["animal", "number", "cosine_similarity"],
)

# The merge cell further down refers to the frame as `similarities`.
similarities = similarities_df

similarities_df



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



         animal number  cosine_similarity
0      aardvark      0           0.947887
1      aardvark      1           0.953344
2      aardvark      2           0.951159
3      aardvark      3           0.950618
4      aardvark      4           0.950725
...         ...    ...                ...
11195     zebra     45           0.932784
11196     zebra     46           0.933940
11197     zebra     47           0.933983
11198     zebra     48           0.930858
11199     zebra     49           0.931485

[11200 rows x 3 columns]


In [26]:
# Inner-join the log-probability table with the cosine similarities on
# (animal, number); only pairs present in both tables are kept. Then report the
# Pearson correlation between the log-probability shift and the similarity.
number_df = pd.merge(animal_df, similarities, on=["animal", "number"])
number_df["avg_log_prob_diff"].corr(number_df["cosine_similarity"], method="pearson")

0.14280322170831644