In [1]:
import pandas as pd
import numpy as np
import glob

from collections import defaultdict
from typing import List
from metrics import assert_valid_prob, assert_same_exprs, compute_interquartile_range
from utils_io import read_json
from default_vars import UNCERTAINTY_EXPRESSIONS

In this notebook, we will compute two sets of metrics:
- _mode-matching_ metrics, which include the `proportional agreement` and `mean absolute error`.
- _distribution-matching_ metrics, which include the `wasserstein` distance.

## 1. Interquartile Range


### 1.1. Non-verifiable

In [2]:
# Human reference for the non-verifiable setting; the first CSV column is
# used as the row index.
human_ref_nv_path = "../../results/greedy/all/non_verifiable/humans-2shot-filtered/normalized.csv"
human_ref_nv = pd.read_csv(human_ref_nv_path, index_col=0)

# Sanity check: the normalized histograms should sum to approximately 1.
assert_valid_prob(human_ref_nv)

# Per-expression quartile table (iqr, q1, q3), shown as the cell output.
compute_interquartile_range(
    human_ref_nv,
    uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
)

Unnamed: 0,uncertainty_expression,iqr,q1,q3
0,almost certain,5.0,90.0,95.0
1,highly likely,10.0,85.0,95.0
2,very likely,15.0,80.0,95.0
3,probable,15.0,65.0,80.0
4,somewhat likely,15.0,60.0,75.0
5,possible,20.0,55.0,75.0
6,uncertain,30.0,20.0,50.0
7,somewhat unlikely,15.0,25.0,40.0
8,unlikely,20.0,10.0,30.0
9,not likely,20.0,10.0,30.0


In [3]:
# Average iqr / q1 / q3 over all uncertainty expressions for the human
# reference. The text column is dropped first because it cannot be averaged.
human_iqr_nv = compute_interquartile_range(
    human_ref_nv,
    uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
)
human_iqr_nv.drop(columns="uncertainty_expression").mean()

iqr    15.000000
q1     40.384615
q3     55.384615
dtype: float64

### Greedy decoding

In [4]:
# Per-expression IQR for every greedy-decoded model (0-shot and 2-shot) in the
# non-verifiable setting, plus the human reference repeated under each
# n_shots value so it appears next to the models in grouped tables.

# The human reference does not depend on n_shots: read it and compute its
# IQR table once instead of once per loop iteration.
human_df = pd.read_csv("../../results/greedy/all/non_verifiable/humans-2shot-filtered/normalized.csv", index_col=0)
human_iqr = compute_interquartile_range(human_df, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)

iqr_frames = []
for n_shots in (0, 2):
    model_filepaths = sorted(glob.glob(f"../../results/greedy/all/non_verifiable/models-{n_shots}shot/*_normalized.csv"))
    for fp in model_filepaths:
        print("Processing", fp)
        model_df = pd.read_csv(fp, index_col=0)
        # ".../models-<k>shot/<model>_normalized.csv" -> "<model>"
        model_name = fp.rpartition("shot/")[-1].rpartition("_normalized")[0]
        # assign() returns a new frame, so nothing is mutated in place.
        iqr_frames.append(
            compute_interquartile_range(model_df, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)
            .assign(n_shots=n_shots, model=model_name)
        )

    # Human rows go after the models of the same n_shots.
    iqr_frames.append(human_iqr.assign(n_shots=n_shots, model="humans-filtered"))

_non_verifiable_results_iqr_greedy = pd.concat(iqr_frames, axis=0).reset_index(drop=True)

# Rows where both quartiles equal -1 carry a placeholder rather than a real
# quartile (presumably every prediction was -1; confirm in
# metrics.compute_interquartile_range). Blank their IQR so they become NaN
# and are skipped when averaging.
placeholder_rows = (
    (_non_verifiable_results_iqr_greedy["q1"] == -1)
    & (_non_verifiable_results_iqr_greedy["q3"] == -1)
)
_non_verifiable_results_iqr_greedy.loc[placeholder_rows, "iqr"] = None

Processing ../../results/greedy/all/non_verifiable/models-0shot/full__allenai__OLMo-7B-Instruct_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__google__gemma-1.1-2b-it_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__lmsys__vicuna-13b-v1.5_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__meta-llama__Meta-Llama-3-70B-Instruct_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__meta-llama__Meta-Llama-3-8B-Instruct_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__mistralai__Mistral-7B-Instruct-v0.2_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/gpt-3.5-turbo-0125_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/gpt-4-turbo-2024-04-09_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/gpt-4o-2024-05-13_normalized.csv
Processin

### 1.2. Probabilistic decoding

In [5]:
# Per-expression IQR for every probabilistically-decoded model (0-shot and
# 2-shot) in the non-verifiable setting, plus the human reference repeated
# under each n_shots value.

# The human reference does not depend on n_shots: read it and compute its
# IQR table once instead of once per loop iteration.
# NOTE(review): the human file is read from the *greedy* results tree, as in
# the original cell; confirm that no separate probabilistic human file exists.
human_df = pd.read_csv("../../results/greedy/all/non_verifiable/humans-2shot-filtered/normalized.csv", index_col=0)
human_iqr = compute_interquartile_range(human_df, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)

iqr_frames = []
for n_shots in (0, 2):
    model_filepaths = sorted(glob.glob(f"../../results/probabilistic/all/non_verifiable/models-{n_shots}shot/*_normalized.csv"))
    for fp in model_filepaths:
        print("Processing", fp)
        model_df = pd.read_csv(fp, index_col=0)
        # ".../models-<k>shot/<model>_normalized.csv" -> "<model>"
        model_name = fp.rpartition("shot/")[-1].rpartition("_normalized")[0]
        # assign() returns a new frame, so nothing is mutated in place.
        iqr_frames.append(
            compute_interquartile_range(model_df, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)
            .assign(n_shots=n_shots, model=model_name)
        )

    # Human rows go after the models of the same n_shots.
    iqr_frames.append(human_iqr.assign(n_shots=n_shots, model="humans-filtered"))

_non_verifiable_results_iqr_prob = pd.concat(iqr_frames, axis=0).reset_index(drop=True)

# Rows where both quartiles equal -1 carry a placeholder rather than a real
# quartile (presumably every prediction was -1; confirm in
# metrics.compute_interquartile_range). Blank their IQR so they become NaN
# and are skipped when averaging.
placeholder_rows = (
    (_non_verifiable_results_iqr_prob["q1"] == -1)
    & (_non_verifiable_results_iqr_prob["q3"] == -1)
)
_non_verifiable_results_iqr_prob.loc[placeholder_rows, "iqr"] = None

Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__allenai__OLMo-7B-Instruct_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__google__gemma-1.1-2b-it_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__lmsys__vicuna-13b-v1.5_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__meta-llama__Meta-Llama-3-70B-Instruct_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__meta-llama__Meta-Llama-3-8B-Instruct_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__mistralai__Mistral-7B-Instruct-v0.2_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/gpt-3.5-turbo-0125_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/gpt-4-turbo-2024-04-09_normalized.csv
Processing ../../results/probabilistic/all/non_ver

### 1.3. Final IQR results (non-verifiable)

In [6]:
# Mean IQR over uncertainty expressions for each (n_shots, model) pair, with
# greedy and probabilistic decoding placed side by side.
group_keys = ["n_shots", "model"]
nv_iqr_greedy_mean = _non_verifiable_results_iqr_greedy.groupby(group_keys)[["iqr"]].mean()
nv_iqr_prob_mean = _non_verifiable_results_iqr_prob.groupby(group_keys)[["iqr"]].mean()

# Left join on the greedy table: a pair with no probabilistic result keeps
# its row and gets NaN in "iqr-prob".
nv_iqr = nv_iqr_greedy_mean.join(
    nv_iqr_prob_mean,
    lsuffix="-greedy",
    rsuffix="-prob",
)
nv_iqr

Unnamed: 0_level_0,Unnamed: 1_level_0,iqr-greedy,iqr-prob
n_shots,model,Unnamed: 2_level_1,Unnamed: 3_level_1
0,full__allenai__OLMo-7B-Instruct,36.153846,
0,full__google__gemma-1.1-2b-it,0.769231,11.538462
0,full__lmsys__vicuna-13b-v1.5,24.230769,
0,full__meta-llama__Meta-Llama-3-70B-Instruct,0.769231,1.538462
0,full__meta-llama__Meta-Llama-3-8B-Instruct,10.0,21.923077
0,full__mistralai__Mistral-7B-Instruct-v0.2,7.692308,11.153846
0,gpt-3.5-turbo-0125,8.076923,11.538462
0,gpt-4-turbo-2024-04-09,1.153846,2.692308
0,gpt-4o-2024-05-13,3.846154,39.0
0,humans-filtered,15.0,15.0


In [7]:
# Export the non-verifiable IQR table as LaTeX with two-decimal floats.
# `index` is a boolean flag. The (n_shots, model) index is the only thing
# labelling the rows, so request it explicitly with True rather than passing
# None, whose handling depends on the pandas version.
print(nv_iqr.to_latex(
    float_format="%.2f", index=True
))

\begin{tabular}{llrr}
\toprule
 &  & iqr-greedy & iqr-prob \\
n_shots & model &  &  \\
\midrule
\multirow[t]{12}{*}{0} & full__allenai__OLMo-7B-Instruct & 36.15 & NaN \\
 & full__google__gemma-1.1-2b-it & 0.77 & 11.54 \\
 & full__lmsys__vicuna-13b-v1.5 & 24.23 & NaN \\
 & full__meta-llama__Meta-Llama-3-70B-Instruct & 0.77 & 1.54 \\
 & full__meta-llama__Meta-Llama-3-8B-Instruct & 10.00 & 21.92 \\
 & full__mistralai__Mistral-7B-Instruct-v0.2 & 7.69 & 11.15 \\
 & gpt-3.5-turbo-0125 & 8.08 & 11.54 \\
 & gpt-4-turbo-2024-04-09 & 1.15 & 2.69 \\
 & gpt-4o-2024-05-13 & 3.85 & 39.00 \\
 & humans-filtered & 15.00 & 15.00 \\
 & sampling__mistralai__Mixtral-8x22B-Instruct-v0.1 & 5.77 & NaN \\
 & sampling__mistralai__Mixtral-8x7B-Instruct-v0.1 & 4.62 & NaN \\
\cline{1-4}
\multirow[t]{13}{*}{2} & full__allenai__OLMo-7B-Instruct & 36.54 & 39.23 \\
 & full__google__gemma-1.1-2b-it & 23.08 & 19.62 \\
 & full__lmsys__vicuna-13b-v1.5 & 18.08 & NaN \\
 & full__meta-llama__Meta-Llama-3-70B-Instruct & 1.92 

In [8]:
# How much wider (positive) or narrower (negative) the IQR is under
# probabilistic decoding than under greedy decoding. The column is stored on
# nv_iqr itself.
nv_iqr["iqr_diff"] = nv_iqr["iqr-prob"].sub(nv_iqr["iqr-greedy"])
nv_iqr_ = nv_iqr.reset_index()

# Models left out of the average.
# NOTE(review): the reason for excluding Mistral-7B is not recorded here.
excluded_models = ["humans-filtered", "full__mistralai__Mistral-7B-Instruct-v0.2"]

# Mean difference per n_shots, over models that have both a greedy and a
# probabilistic value (rows with any NaN are dropped).
(
    nv_iqr_.loc[~nv_iqr_["model"].isin(excluded_models)]
    .dropna()
    .groupby("n_shots")[["iqr_diff"]]
    .mean()
)

Unnamed: 0_level_0,iqr_diff
n_shots,Unnamed: 1_level_1
0,10.602564
2,6.604396


In [16]:
# Signed gap between each model's mean IQR and the human mean IQR.
# 15 matches the "humans-filtered" mean IQR reported earlier in this
# notebook. It is hard-coded, so update it if the human reference changes.
human_mean_iqr = 15

nv_iqr_ = nv_iqr.reset_index().assign(**{
    "iqr-greedy_diff": lambda frame: frame["iqr-greedy"] - human_mean_iqr,
    "iqr-prob_diff": lambda frame: frame["iqr-prob"] - human_mean_iqr,
})

# Keep only the 2-shot rows.
nv_iqr_ = nv_iqr_.loc[nv_iqr_["n_shots"].eq(2)]

# Models hidden from the displayed table.
hidden_models = [
    "full__mistralai__Mistral-7B-Instruct-v0.2",
    "sampling__meta-llama__Llama-3-70b-chat-hf",
    "full__lmsys__vicuna-13b-v1.5",
]
nv_iqr_.loc[~nv_iqr_["model"].isin(hidden_models)]

Unnamed: 0,n_shots,model,iqr-greedy,iqr-prob,iqr_diff,iqr-greedy_diff,iqr-prob_diff
12,2,full__allenai__OLMo-7B-Instruct,36.538462,39.230769,2.692308,21.538462,24.230769
13,2,full__google__gemma-1.1-2b-it,23.076923,19.615385,-3.461538,8.076923,4.615385
15,2,full__meta-llama__Meta-Llama-3-70B-Instruct,1.923077,3.076923,1.153846,-13.076923,-11.923077
16,2,full__meta-llama__Meta-Llama-3-8B-Instruct,15.384615,21.538462,6.153846,0.384615,6.538462
18,2,gpt-3.5-turbo-0125,4.615385,9.0,4.384615,-10.384615,-6.0
19,2,gpt-4-turbo-2024-04-09,1.153846,1.153846,0.0,-13.846154,-13.846154
20,2,gpt-4o-2024-05-13,3.846154,39.153846,35.307692,-11.153846,24.153846
21,2,humans-filtered,15.0,15.0,0.0,0.0,0.0
23,2,sampling__mistralai__Mixtral-8x22B-Instruct-v0.1,5.0,,,-10.0,
24,2,sampling__mistralai__Mixtral-8x7B-Instruct-v0.1,4.230769,,,-10.769231,


In [None]:
# Inspect the per-expression greedy IQR rows of a single model.
is_llama3_70b = _non_verifiable_results_iqr_greedy["model"].eq(
    "full__meta-llama__Meta-Llama-3-70B-Instruct"
)
_non_verifiable_results_iqr_greedy.loc[is_llama3_70b]

## 2. Verifiable (main experiment)

In [None]:
# Human reference for the verifiable setting; the first CSV column is used
# as the row index.
human_ref_v = pd.read_csv(
    "../../results/greedy/all/verifiable/humans-2shot-filtered/normalized.csv",
    index_col=0,
)

# Sanity check: the normalized histograms should sum to approximately 1.
assert_valid_prob(human_ref_v)

# Intended to check that two files cover the same uncertainty expressions in
# the same order.
# NOTE(review): both arguments are the same frame, so this check cannot
# fail. Confirm which second frame it should be compared against.
assert_same_exprs(human_ref_v, human_ref_v)

# Average iqr / q1 / q3 over all uncertainty expressions. The text column
# is dropped first because it cannot be averaged.
human_iqr_v = compute_interquartile_range(
    human_ref_v,
    uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
)
human_iqr_v.drop(columns="uncertainty_expression").mean()

In [None]:
# Per-expression quartile table for the verifiable human reference, shown
# as the cell output.
compute_interquartile_range(
    human_ref_v,
    uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
)

### 2.1. Greedy decoding

In [None]:
# Per-expression IQR for every greedy-decoded model (0-shot and 2-shot) in the
# verifiable setting, plus the human reference repeated under each n_shots
# value so it appears next to the models in grouped tables.

# The human reference does not depend on n_shots: read it and compute its
# IQR table once instead of once per loop iteration.
human_df = pd.read_csv("../../results/greedy/all/verifiable/humans-2shot-filtered/normalized.csv", index_col=0)
human_iqr = compute_interquartile_range(human_df, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)

iqr_frames = []
for n_shots in (0, 2):
    model_filepaths = sorted(glob.glob(f"../../results/greedy/all/verifiable/models-{n_shots}shot/*_normalized.csv"))
    for fp in model_filepaths:
        print("Processing", fp)
        model_df = pd.read_csv(fp, index_col=0)
        # ".../models-<k>shot/<model>_normalized.csv" -> "<model>"
        model_name = fp.rpartition("shot/")[-1].rpartition("_normalized")[0]
        # assign() returns a new frame, so nothing is mutated in place.
        iqr_frames.append(
            compute_interquartile_range(model_df, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)
            .assign(n_shots=n_shots, model=model_name)
        )

    # Human rows go after the models of the same n_shots.
    iqr_frames.append(human_iqr.assign(n_shots=n_shots, model="humans-filtered"))

_verifiable_results_iqr_greedy = pd.concat(iqr_frames, axis=0).reset_index(drop=True)

# Rows where both quartiles equal -1 carry a placeholder rather than a real
# quartile (presumably every prediction was -1; confirm in
# metrics.compute_interquartile_range). Blank their IQR so they become NaN
# and are skipped when averaging.
placeholder_rows = (
    (_verifiable_results_iqr_greedy["q1"] == -1)
    & (_verifiable_results_iqr_greedy["q3"] == -1)
)
_verifiable_results_iqr_greedy.loc[placeholder_rows, "iqr"] = None

### 2.2. Probabilistic decoding

In [None]:
# Per-expression IQR for every probabilistically-decoded model (0-shot and
# 2-shot) in the verifiable setting, plus the human reference repeated under
# each n_shots value.

# The human reference does not depend on n_shots: read it and compute its
# IQR table once instead of once per loop iteration.
# NOTE(review): the human file is read from the *greedy* results tree, as in
# the original cell; confirm that no separate probabilistic human file exists.
human_df = pd.read_csv("../../results/greedy/all/verifiable/humans-2shot-filtered/normalized.csv", index_col=0)
human_iqr = compute_interquartile_range(human_df, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)

iqr_frames = []
for n_shots in (0, 2):
    model_filepaths = sorted(glob.glob(f"../../results/probabilistic/all/verifiable/models-{n_shots}shot/*_normalized.csv"))
    for fp in model_filepaths:
        print("Processing", fp)
        model_df = pd.read_csv(fp, index_col=0)
        # ".../models-<k>shot/<model>_normalized.csv" -> "<model>"
        model_name = fp.rpartition("shot/")[-1].rpartition("_normalized")[0]
        # assign() returns a new frame, so nothing is mutated in place.
        iqr_frames.append(
            compute_interquartile_range(model_df, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)
            .assign(n_shots=n_shots, model=model_name)
        )

    # Human rows go after the models of the same n_shots.
    iqr_frames.append(human_iqr.assign(n_shots=n_shots, model="humans-filtered"))

_verifiable_results_iqr_prob = pd.concat(iqr_frames, axis=0).reset_index(drop=True)

# If all predictions were -1, both quartiles are reported as -1: blank the
# IQR on those rows so they become NaN and are skipped when averaging.
placeholder_rows = (
    (_verifiable_results_iqr_prob["q1"] == -1)
    & (_verifiable_results_iqr_prob["q3"] == -1)
)
_verifiable_results_iqr_prob.loc[placeholder_rows, "iqr"] = None


### 2.3. Final IQR results (verifiable)

In [None]:
# Mean IQR over uncertainty expressions for each (n_shots, model) pair in the
# verifiable setting, with greedy and probabilistic decoding side by side.
group_keys = ["n_shots", "model"]
v_iqr_greedy_mean = _verifiable_results_iqr_greedy.groupby(group_keys)[["iqr"]].mean()
v_iqr_prob_mean = _verifiable_results_iqr_prob.groupby(group_keys)[["iqr"]].mean()

# Left join on the greedy table: a pair with no probabilistic result keeps
# its row and gets NaN in "iqr-prob".
v_iqr = v_iqr_greedy_mean.join(
    v_iqr_prob_mean,
    lsuffix="-greedy",
    rsuffix="-prob",
)
v_iqr

In [None]:
# Export the verifiable IQR table as LaTeX with two-decimal floats.
# `index` is a boolean flag. The (n_shots, model) index is the only thing
# labelling the rows, so request it explicitly with True rather than passing
# None, whose handling depends on the pandas version.
print(v_iqr.to_latex(float_format="%.2f", index=True))

In [None]:
# TODO: investigate why some of the IQR values come out as 0.

## 3. Verifiable (generalization experiment)

In [None]:
# Human reference used for this section; the first CSV column is used as the
# row index.
# NOTE(review): this is the same file that the main verifiable experiment
# loads. Confirm whether the generalization experiment should read a
# different reference.
human_ref_v = pd.read_csv(
    "../../results/greedy/all/verifiable/humans-2shot-filtered/normalized.csv",
    index_col=0,
)

# Sanity check: the normalized histograms should sum to approximately 1.
assert_valid_prob(human_ref_v)

# Intended to check that two files cover the same uncertainty expressions in
# the same order.
# NOTE(review): both arguments are the same frame, so this check cannot
# fail. Confirm which second frame it should be compared against.
assert_same_exprs(human_ref_v, human_ref_v)

# Average iqr / q1 / q3 over all uncertainty expressions. The text column
# is dropped first because it cannot be averaged.
human_iqr_v = compute_interquartile_range(
    human_ref_v,
    uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
)
human_iqr_v.drop(columns="uncertainty_expression").mean()