In [1]:
import glob
import os
from collections import defaultdict
from itertools import product

import numpy as np
import pandas as pd

from default_vars import UNCERTAINTY_EXPRESSIONS
from metrics import assert_valid_prob, assert_same_exprs, compute_wasserstein_distance
from utils_io import read_json

In this notebook, we compute the pairwise distances between the empirical distributions obtained by greedy histograms when using different exemplars in the prompt. The distributions are compared using:

- _Distribution matching_ metrics, which include the Wasserstein distance.

In [7]:
# Exemplar configurations of the 2-shot prompt ablation; each one has its own
# results directory (see RESULTS_GLOB_TEMPLATE).
configs = ("FF", "FT", "TF", "TT")

# Glob pattern for the per-model normalized result files of one configuration.
# The greedy runs live under the same layout with "greedy" in place of
# "probabilistic".
# NOTE(review): the notebook introduction talks about greedy histograms, but
# this cell loads the "probabilistic" results -- confirm which one is intended.
RESULTS_GLOB_TEMPLATE = (
    "../../results/probabilistic/verifiable-2-shot-exemplars-ablation/all/"
    "models-2shot-{config}/*_normalized.csv"
)

# results_by_config[model_name][config] -> DataFrame read from that model's CSV.
results_by_config = defaultdict(dict)
for config in configs:
    results_glob = RESULTS_GLOB_TEMPLATE.format(config=config)
    model_filepaths = sorted(glob.glob(results_glob))

    # Fail loudly here: because results_by_config is a defaultdict, a missing
    # configuration would otherwise only show up later as an opaque KeyError.
    if not model_filepaths:
        raise FileNotFoundError(
            f"No result files found for config {config!r} (pattern: {results_glob})"
        )

    for fp in model_filepaths:
        print("Processing", fp)
        model_df = pd.read_csv(fp, index_col=0)
        # Model name = file name up to the "_normalized" marker. basename()
        # copes with whichever path separator glob returns on this platform.
        model_name = os.path.basename(fp).rpartition("_normalized")[0]
        results_by_config[model_name][config] = model_df

Processing ../../results/probabilistic/verifiable-2-shot-exemplars-ablation/all/models-2shot-FF/gpt-3.5-turbo-0125_normalized.csv
Processing ../../results/probabilistic/verifiable-2-shot-exemplars-ablation/all/models-2shot-FF/gpt-4-turbo-2024-04-09_normalized.csv
Processing ../../results/probabilistic/verifiable-2-shot-exemplars-ablation/all/models-2shot-FF/gpt-4o-2024-05-13_normalized.csv
Processing ../../results/probabilistic/verifiable-2-shot-exemplars-ablation/all/models-2shot-FT/gpt-3.5-turbo-0125_normalized.csv
Processing ../../results/probabilistic/verifiable-2-shot-exemplars-ablation/all/models-2shot-FT/gpt-4-turbo-2024-04-09_normalized.csv
Processing ../../results/probabilistic/verifiable-2-shot-exemplars-ablation/all/models-2shot-FT/gpt-4o-2024-05-13_normalized.csv
Processing ../../results/probabilistic/verifiable-2-shot-exemplars-ablation/all/models-2shot-TF/gpt-3.5-turbo-0125_normalized.csv
Processing ../../results/probabilistic/verifiable-2-shot-exemplars-ablation/all/mode

In [8]:
# ChatGPT (gpt-3.5-turbo-0125): Wasserstein distance between every ordered
# pair of exemplar configurations, self-pairs included.
results_chatgpt = results_by_config["gpt-3.5-turbo-0125"]

# One record per (config1, config2) pair.
# NOTE(review): the value is stored under "mean ws" but is the *median* of the
# returned distances, whereas the gpt-4-turbo cell aggregates with .mean() --
# confirm which statistic is intended.
results_chatgpt_ws = pd.DataFrame(
    [
        {
            "config1": left,
            "config2": right,
            "mean ws": compute_wasserstein_distance(
                results_chatgpt[left],
                results_chatgpt[right],
                uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
            ).distance.median(),
        }
        for left, right in product(configs, configs)
    ]
)

# Reshape into a config1 x config2 matrix and average each row, i.e. the
# average distance from a configuration to every configuration (itself included).
results_chatgpt_ws.pivot(index="config1", columns="config2", values=["mean ws"]).mean(axis=1)

config1
FF    1.909902
FT    3.011760
TF    2.769266
TT    1.916964
dtype: float64

In [9]:
# GPT-4o (gpt-4o-2024-05-13): Wasserstein distance between every ordered pair
# of exemplar configurations, self-pairs included.
results_gptfo = results_by_config["gpt-4o-2024-05-13"]

# One record per (config1, config2) pair.
# NOTE(review): the value is stored under "mean ws" but is the *median* of the
# returned distances, whereas the gpt-4-turbo cell aggregates with .mean() --
# confirm which statistic is intended.
results_gptfo_ws = pd.DataFrame(
    [
        {
            "config1": left,
            "config2": right,
            "mean ws": compute_wasserstein_distance(
                results_gptfo[left],
                results_gptfo[right],
                uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
            ).distance.median(),
        }
        for left, right in product(configs, configs)
    ]
)

# Reshape into a config1 x config2 matrix and average each row, i.e. the
# average distance from a configuration to every configuration (itself included).
results_gptfo_ws.pivot(index="config1", columns="config2", values=["mean ws"]).mean(axis=1)

config1
FF    1.745450
FT    1.359642
TF    1.807702
TT    1.392262
dtype: float64

In [10]:
# GPT-4 Turbo (gpt-4-turbo-2024-04-09): Wasserstein distance between every
# ordered pair of exemplar configurations, self-pairs included.
results_gptf = results_by_config["gpt-4-turbo-2024-04-09"]

# One record per (config1, config2) pair, holding the mean of the returned
# distances.
# NOTE(review): the gpt-3.5-turbo and gpt-4o cells aggregate with .median()
# under the same "mean ws" label -- confirm which statistic is intended so the
# three models are comparable.
results_gptf_ws = pd.DataFrame(
    [
        {
            "config1": left,
            "config2": right,
            "mean ws": compute_wasserstein_distance(
                results_gptf[left],
                results_gptf[right],
                uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
            ).distance.mean(),
        }
        for left, right in product(configs, configs)
    ]
)

# Reshape into a config1 x config2 matrix and average each row, i.e. the
# average distance from a configuration to every configuration (itself included).
results_gptf_ws.pivot(index="config1", columns="config2", values=["mean ws"]).mean(axis=1)

config1
FF    1.529277
FT    1.360700
TF    1.329897
TT    1.481472
dtype: float64