In [1]:
import pandas as pd
import numpy as np
import glob

from collections import defaultdict
from typing import List
from metrics import assert_valid_prob, assert_same_exprs, compute_interquartile_range
from utils_io import read_json
from default_vars import UNCERTAINTY_EXPRESSIONS

In this notebook, we will compute two sets of metrics:
- _mode-matching_ metrics, which include the `proportional agreement` and the `mean absolute error`.
- _distribution-matching_ metrics, which include the `wasserstein` distance.

## 1. Interquartile Range


### 1.1. Non-verifiable

In [2]:
# Human reference distributions for the non-verifiable setting (2-shot, filtered).
human_ref_nv = pd.read_csv(
    "../../results/greedy/all/non_verifiable/humans-2shot-filtered/normalized.csv",
    index_col=0,
)

# Sanity check: the normalized histograms should sum to approximately 1.
assert_valid_prob(human_ref_nv)

# Displayed as the cell output: one row per uncertainty expression with its
# interquartile range (iqr) and the two quartiles (q1, q3).
compute_interquartile_range(
    human_ref_nv,
    uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
)

Unnamed: 0,uncertainty_expression,iqr,q1,q3
0,almost certain,5.0,90.0,95.0
1,highly likely,10.0,85.0,95.0
2,very likely,15.0,80.0,95.0
3,probable,15.0,65.0,80.0
4,somewhat likely,15.0,60.0,75.0
5,possible,20.0,55.0,75.0
6,uncertain,30.0,20.0,50.0
7,somewhat unlikely,15.0,25.0,40.0
8,unlikely,20.0,10.0,30.0
9,not likely,20.0,10.0,30.0


In [3]:
# Average iqr / q1 / q3 over all uncertainty expressions for the human reference.
# The expression-name column is dropped first so that only numeric columns are averaged.
(
    compute_interquartile_range(
        human_ref_nv,
        uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
    )
    .drop(columns="uncertainty_expression")
    .mean()
)

iqr    15.000000
q1     40.384615
q3     55.384615
dtype: float64

### Greedy decoding

In [4]:
# Build one long table of per-expression IQRs for every greedy-decoded model in the
# non-verifiable setting, for both the 0-shot and 2-shot prompts. The filtered human
# reference is appended once per shot setting under the model name "humans-filtered".
nv_greedy_frames = []
for n_shots in (0, 2):
    csv_pattern = f"../../results/greedy/all/non_verifiable/models-{n_shots}shot/*_normalized.csv"
    for fp in sorted(glob.glob(csv_pattern)):
        print("Processing", fp)
        # Model name = text between the last "shot/" and the last "_normalized" in the path
        model_name = fp.rpartition("shot/")[-1].rpartition("_normalized")[0]
        model_iqr = compute_interquartile_range(
            pd.read_csv(fp, index_col=0),
            uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
        )
        nv_greedy_frames.append(model_iqr.assign(n_shots=n_shots, model=model_name))

    human_iqr = compute_interquartile_range(
        pd.read_csv(
            "../../results/greedy/all/non_verifiable/humans-2shot-filtered/normalized.csv",
            index_col=0,
        ),
        uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
    )
    nv_greedy_frames.append(human_iqr.assign(n_shots=n_shots, model="humans-filtered"))

_non_verifiable_results_iqr_greedy = pd.concat(nv_greedy_frames, axis=0).reset_index(drop=True)

# Rows where both quartiles equal -1 get their IQR blanked out (set to missing).
# NOTE(review): -1 is presumably a sentinel for "no valid prediction" -- confirm in metrics.
both_quartiles_missing = _non_verifiable_results_iqr_greedy[["q1", "q3"]].eq(-1).all(axis=1)
_non_verifiable_results_iqr_greedy.loc[both_quartiles_missing, "iqr"] = None

Processing ../../results/greedy/all/non_verifiable/models-0shot/full__allenai__OLMo-7B-Instruct_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__google__gemma-1.1-2b-it_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__lmsys__vicuna-13b-v1.5_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__meta-llama__Meta-Llama-3-70B-Instruct_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__meta-llama__Meta-Llama-3-8B-Instruct_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__mistralai__Mistral-7B-Instruct-v0.2_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/gpt-3.5-turbo-0125_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/gpt-4-turbo-2024-04-09_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/gpt-4o-2024-05-13_normalized.csv
Processin

### 1.2. Probabilistic decoding

In [5]:
# Same collection as for greedy decoding, but over the probabilistic-decoding model
# results (non-verifiable setting, 0-shot and 2-shot). The human reference is still
# read from the greedy results directory and appended once per shot setting.
nv_prob_frames = []
for n_shots in (0, 2):
    csv_pattern = f"../../results/probabilistic/all/non_verifiable/models-{n_shots}shot/*_normalized.csv"
    for fp in sorted(glob.glob(csv_pattern)):
        print("Processing", fp)
        # Model name = text between the last "shot/" and the last "_normalized" in the path
        model_name = fp.rpartition("shot/")[-1].rpartition("_normalized")[0]
        model_iqr = compute_interquartile_range(
            pd.read_csv(fp, index_col=0),
            uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
        )
        nv_prob_frames.append(model_iqr.assign(n_shots=n_shots, model=model_name))

    human_iqr = compute_interquartile_range(
        pd.read_csv(
            "../../results/greedy/all/non_verifiable/humans-2shot-filtered/normalized.csv",
            index_col=0,
        ),
        uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
    )
    nv_prob_frames.append(human_iqr.assign(n_shots=n_shots, model="humans-filtered"))

_non_verifiable_results_iqr_prob = pd.concat(nv_prob_frames, axis=0).reset_index(drop=True)

# Rows where both quartiles equal -1 get their IQR blanked out (set to missing).
# NOTE(review): -1 is presumably a sentinel for "no valid prediction" -- confirm in metrics.
both_quartiles_missing = _non_verifiable_results_iqr_prob[["q1", "q3"]].eq(-1).all(axis=1)
_non_verifiable_results_iqr_prob.loc[both_quartiles_missing, "iqr"] = None

Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__allenai__OLMo-7B-Instruct_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__google__gemma-1.1-2b-it_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__lmsys__vicuna-13b-v1.5_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__meta-llama__Meta-Llama-3-70B-Instruct_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__meta-llama__Meta-Llama-3-8B-Instruct_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/full__mistralai__Mistral-7B-Instruct-v0.2_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/gpt-3.5-turbo-0125_normalized.csv
Processing ../../results/probabilistic/all/non_verifiable/models-0shot/gpt-4-turbo-2024-04-09_normalized.csv
Processing ../../results/probabilistic/all/non_ver

### Non-verifiable Final results

In [14]:
# Mean IQR across uncertainty expressions for each (n_shots, model) pair,
# computed separately for probabilistic and greedy decoding.
nv_iqr_prob_mean = _non_verifiable_results_iqr_prob.groupby(["n_shots", "model"])[["iqr"]].mean()
nv_iqr_greedy_mean = _non_verifiable_results_iqr_greedy.groupby(["n_shots", "model"])[["iqr"]].mean()

# Side-by-side view indexed by the greedy rows; entries with no probabilistic
# counterpart show up as NaN in the "iqr-prob" column.
nv_iqr_greedy_mean.join(
    nv_iqr_prob_mean,
    lsuffix="-greedy",
    rsuffix="-prob",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,iqr-greedy,iqr-prob
n_shots,model,Unnamed: 2_level_1,Unnamed: 3_level_1
0,full__allenai__OLMo-7B-Instruct,36.153846,
0,full__google__gemma-1.1-2b-it,0.769231,11.538462
0,full__lmsys__vicuna-13b-v1.5,24.230769,
0,full__meta-llama__Meta-Llama-3-70B-Instruct,0.769231,1.538462
0,full__meta-llama__Meta-Llama-3-8B-Instruct,10.0,21.923077
0,full__mistralai__Mistral-7B-Instruct-v0.2,7.692308,11.153846
0,gpt-3.5-turbo-0125,8.076923,11.538462
0,gpt-4-turbo-2024-04-09,1.153846,2.692308
0,gpt-4o-2024-05-13,3.846154,39.0
0,humans-filtered,15.0,15.0


In [15]:
# Render the joined greedy / probabilistic mean-IQR table as LaTeX source.
# `DataFrame.to_latex` documents `index` as a bool. The original passed `index=None`,
# which is outside that contract and may be treated as "hide the index" by some
# pandas versions. Pass True explicitly so the (n_shots, model) index that
# identifies every row is always emitted.
print(
    nv_iqr_greedy_mean.join(nv_iqr_prob_mean, lsuffix="-greedy", rsuffix="-prob").to_latex(
        float_format="%.2f",
        index=True,
    )
)

\begin{tabular}{llrr}
\toprule
 &  & iqr-greedy & iqr-prob \\
n_shots & model &  &  \\
\midrule
\multirow[t]{12}{*}{0} & full__allenai__OLMo-7B-Instruct & 36.15 & NaN \\
 & full__google__gemma-1.1-2b-it & 0.77 & 11.54 \\
 & full__lmsys__vicuna-13b-v1.5 & 24.23 & NaN \\
 & full__meta-llama__Meta-Llama-3-70B-Instruct & 0.77 & 1.54 \\
 & full__meta-llama__Meta-Llama-3-8B-Instruct & 10.00 & 21.92 \\
 & full__mistralai__Mistral-7B-Instruct-v0.2 & 7.69 & 11.15 \\
 & gpt-3.5-turbo-0125 & 8.08 & 11.54 \\
 & gpt-4-turbo-2024-04-09 & 1.15 & 2.69 \\
 & gpt-4o-2024-05-13 & 3.85 & 39.00 \\
 & humans-filtered & 15.00 & 15.00 \\
 & sampling__mistralai__Mixtral-8x22B-Instruct-v0.1 & 5.77 & NaN \\
 & sampling__mistralai__Mixtral-8x7B-Instruct-v0.1 & 4.62 & NaN \\
\cline{1-4}
\multirow[t]{13}{*}{2} & full__allenai__OLMo-7B-Instruct & 36.54 & 39.23 \\
 & full__google__gemma-1.1-2b-it & 23.08 & 19.62 \\
 & full__lmsys__vicuna-13b-v1.5 & 18.08 & NaN \\
 & full__meta-llama__Meta-Llama-3-70B-Instruct & 1.92 

In [26]:
# Inspect the per-expression greedy IQR rows of a single model (Llama-3-70B-Instruct).
is_llama3_70b = _non_verifiable_results_iqr_greedy["model"].eq(
    "full__meta-llama__Meta-Llama-3-70B-Instruct"
)
_non_verifiable_results_iqr_greedy.loc[is_llama3_70b]

Unnamed: 0,uncertainty_expression,iqr,q1,q3,n_shots,model
39,almost certain,0.0,90.0,90.0,0,full__meta-llama__Meta-Llama-3-70B-Instruct
40,highly likely,0.0,80.0,80.0,0,full__meta-llama__Meta-Llama-3-70B-Instruct
41,very likely,0.0,80.0,80.0,0,full__meta-llama__Meta-Llama-3-70B-Instruct
42,probable,0.0,70.0,70.0,0,full__meta-llama__Meta-Llama-3-70B-Instruct
43,somewhat likely,0.0,60.0,60.0,0,full__meta-llama__Meta-Llama-3-70B-Instruct
44,possible,0.0,50.0,50.0,0,full__meta-llama__Meta-Llama-3-70B-Instruct
45,uncertain,0.0,50.0,50.0,0,full__meta-llama__Meta-Llama-3-70B-Instruct
46,somewhat unlikely,0.0,30.0,30.0,0,full__meta-llama__Meta-Llama-3-70B-Instruct
47,unlikely,0.0,30.0,30.0,0,full__meta-llama__Meta-Llama-3-70B-Instruct
48,not likely,0.0,30.0,30.0,0,full__meta-llama__Meta-Llama-3-70B-Instruct


## 2. Verifiable (main experiment)

In [16]:
# Human reference distributions for the verifiable setting (2-shot, filtered).
human_ref_v = pd.read_csv("../../results/greedy/all/verifiable/humans-2shot-filtered/normalized.csv", index_col=0)
# Check that normalized histograms sum to approx 1
assert_valid_prob(human_ref_v)

# Check that both human reference files concern the same uncertainty expressions,
# ordered in the same way. The previous call compared `human_ref_v` with itself,
# which can never fail; compare it against the non-verifiable human reference
# (`human_ref_nv`, loaded in section 1.1) so the check is meaningful.
# NOTE(review): assumes assert_same_exprs is symmetric in its arguments -- confirm.
assert_same_exprs(human_ref_nv, human_ref_v)

# Mean iqr / q1 / q3 over all uncertainty expressions (expression names dropped
# so that only numeric columns are averaged).
(
    compute_interquartile_range(human_ref_v, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)
    .drop("uncertainty_expression", axis=1)
    .mean()
)

iqr    13.846154
q1     41.153846
q3     55.000000
dtype: float64

In [17]:
# Per-expression IQR table (iqr, q1, q3) for the verifiable human reference.
human_v_iqr_table = compute_interquartile_range(
    human_ref_v,
    uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
)
human_v_iqr_table

Unnamed: 0,uncertainty_expression,iqr,q1,q3
0,almost certain,5.0,90.0,95.0
1,highly likely,10.0,85.0,95.0
2,very likely,10.0,85.0,95.0
3,probable,15.0,65.0,80.0
4,somewhat likely,15.0,60.0,75.0
5,possible,20.0,55.0,75.0
6,uncertain,25.0,25.0,50.0
7,somewhat unlikely,15.0,25.0,40.0
8,unlikely,10.0,15.0,25.0
9,not likely,15.0,10.0,25.0


### 2.1. Greedy decoding

In [18]:
# Build one long table of per-expression IQRs for every greedy-decoded model in the
# verifiable setting, for both the 0-shot and 2-shot prompts. The filtered human
# reference is appended once per shot setting under the model name "humans-filtered".
v_greedy_frames = []
for n_shots in (0, 2):
    csv_pattern = f"../../results/greedy/all/verifiable/models-{n_shots}shot/*_normalized.csv"
    for fp in sorted(glob.glob(csv_pattern)):
        print("Processing", fp)
        # Model name = text between the last "shot/" and the last "_normalized" in the path
        model_name = fp.rpartition("shot/")[-1].rpartition("_normalized")[0]
        model_iqr = compute_interquartile_range(
            pd.read_csv(fp, index_col=0),
            uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
        )
        v_greedy_frames.append(model_iqr.assign(n_shots=n_shots, model=model_name))

    human_iqr = compute_interquartile_range(
        pd.read_csv(
            "../../results/greedy/all/verifiable/humans-2shot-filtered/normalized.csv",
            index_col=0,
        ),
        uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
    )
    v_greedy_frames.append(human_iqr.assign(n_shots=n_shots, model="humans-filtered"))

_verifiable_results_iqr_greedy = pd.concat(v_greedy_frames, axis=0).reset_index(drop=True)

# Rows where both quartiles equal -1 get their IQR blanked out (set to missing).
# NOTE(review): -1 is presumably a sentinel for "no valid prediction" -- confirm in metrics.
both_quartiles_missing = _verifiable_results_iqr_greedy[["q1", "q3"]].eq(-1).all(axis=1)
_verifiable_results_iqr_greedy.loc[both_quartiles_missing, "iqr"] = None

Processing ../../results/greedy/all/verifiable/models-0shot/full__allenai__OLMo-7B-Instruct_normalized.csv
Processing ../../results/greedy/all/verifiable/models-0shot/full__google__gemma-1.1-2b-it_normalized.csv
Processing ../../results/greedy/all/verifiable/models-0shot/full__lmsys__vicuna-13b-v1.5_normalized.csv
Processing ../../results/greedy/all/verifiable/models-0shot/full__meta-llama__Meta-Llama-3-8B-Instruct_normalized.csv
Processing ../../results/greedy/all/verifiable/models-0shot/full__mistralai__Mistral-7B-Instruct-v0.2_normalized.csv
Processing ../../results/greedy/all/verifiable/models-0shot/gpt-3.5-turbo-0125_normalized.csv
Processing ../../results/greedy/all/verifiable/models-0shot/gpt-4-turbo-2024-04-09_normalized.csv
Processing ../../results/greedy/all/verifiable/models-0shot/gpt-4o-2024-05-13_normalized.csv
Processing ../../results/greedy/all/verifiable/models-0shot/sampling__meta-llama__Llama-3-70b-chat-hf_normalized.csv
Processing ../../results/greedy/all/verifiable/

### 2.2. Probabilistic decoding

In [19]:
# Same collection as for greedy decoding, but over the probabilistic-decoding model
# results (verifiable setting, 0-shot and 2-shot). The human reference is still
# read from the greedy results directory and appended once per shot setting.
v_prob_frames = []
for n_shots in (0, 2):
    csv_pattern = f"../../results/probabilistic/all/verifiable/models-{n_shots}shot/*_normalized.csv"
    for fp in sorted(glob.glob(csv_pattern)):
        print("Processing", fp)
        # Model name = text between the last "shot/" and the last "_normalized" in the path
        model_name = fp.rpartition("shot/")[-1].rpartition("_normalized")[0]
        model_iqr = compute_interquartile_range(
            pd.read_csv(fp, index_col=0),
            uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
        )
        v_prob_frames.append(model_iqr.assign(n_shots=n_shots, model=model_name))

    human_iqr = compute_interquartile_range(
        pd.read_csv(
            "../../results/greedy/all/verifiable/humans-2shot-filtered/normalized.csv",
            index_col=0,
        ),
        uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
    )
    v_prob_frames.append(human_iqr.assign(n_shots=n_shots, model="humans-filtered"))

_verifiable_results_iqr_prob = pd.concat(v_prob_frames, axis=0).reset_index(drop=True)

# If all predictions were -1 (both quartiles equal -1), blank out the IQR for that row.
both_quartiles_missing = _verifiable_results_iqr_prob[["q1", "q3"]].eq(-1).all(axis=1)
_verifiable_results_iqr_prob.loc[both_quartiles_missing, "iqr"] = None


Processing ../../results/probabilistic/all/verifiable/models-0shot/full__allenai__OLMo-7B-Instruct_normalized.csv
Processing ../../results/probabilistic/all/verifiable/models-0shot/full__google__gemma-1.1-2b-it_normalized.csv
Processing ../../results/probabilistic/all/verifiable/models-0shot/full__lmsys__vicuna-13b-v1.5_normalized.csv
Processing ../../results/probabilistic/all/verifiable/models-0shot/full__meta-llama__Meta-Llama-3-8B-Instruct_normalized.csv
Processing ../../results/probabilistic/all/verifiable/models-0shot/full__mistralai__Mistral-7B-Instruct-v0.2_normalized.csv
Processing ../../results/probabilistic/all/verifiable/models-0shot/gpt-3.5-turbo-0125_normalized.csv
Processing ../../results/probabilistic/all/verifiable/models-0shot/gpt-4-turbo-2024-04-09_normalized.csv
Processing ../../results/probabilistic/all/verifiable/models-0shot/gpt-4o-2024-05-13_normalized.csv
Processing ../../results/probabilistic/all/verifiable/models-2shot/full__allenai__OLMo-7B-Instruct_normalize

### 2.3. Final IQR results (verifiable)

In [20]:
# Mean IQR across uncertainty expressions for each (n_shots, model) pair,
# computed separately for probabilistic and greedy decoding.
v_iqr_prob_mean = _verifiable_results_iqr_prob.groupby(["n_shots", "model"])[["iqr"]].mean()
v_iqr_greedy_mean = _verifiable_results_iqr_greedy.groupby(["n_shots", "model"])[["iqr"]].mean()

# Side-by-side table indexed by the greedy rows; entries with no probabilistic
# counterpart show up as NaN in the "iqr-prob" column.
v_iqr = v_iqr_greedy_mean.join(
    v_iqr_prob_mean,
    lsuffix="-greedy",
    rsuffix="-prob",
)
v_iqr

Unnamed: 0_level_0,Unnamed: 1_level_0,iqr-greedy,iqr-prob
n_shots,model,Unnamed: 2_level_1,Unnamed: 3_level_1
0,full__allenai__OLMo-7B-Instruct,28.461538,
0,full__google__gemma-1.1-2b-it,8.461538,18.076923
0,full__lmsys__vicuna-13b-v1.5,31.923077,
0,full__meta-llama__Meta-Llama-3-8B-Instruct,46.923077,63.461538
0,full__mistralai__Mistral-7B-Instruct-v0.2,45.384615,47.307692
0,gpt-3.5-turbo-0125,83.846154,82.307692
0,gpt-4-turbo-2024-04-09,10.769231,10.384615
0,gpt-4o-2024-05-13,11.153846,39.384615
0,humans-filtered,13.846154,13.846154
0,sampling__meta-llama__Llama-3-70b-chat-hf,24.615385,


In [21]:
# Render the verifiable mean-IQR table as LaTeX source.
# `DataFrame.to_latex` documents `index` as a bool. The original passed `index=None`,
# which is outside that contract and may be treated as "hide the index" by some
# pandas versions. Pass True explicitly so the (n_shots, model) index that
# identifies every row is always emitted.
print(v_iqr.to_latex(float_format="%.2f", index=True))

\begin{tabular}{llrr}
\toprule
 &  & iqr-greedy & iqr-prob \\
n_shots & model &  &  \\
\midrule
\multirow[t]{13}{*}{0} & full__allenai__OLMo-7B-Instruct & 28.46 & NaN \\
 & full__google__gemma-1.1-2b-it & 8.46 & 18.08 \\
 & full__lmsys__vicuna-13b-v1.5 & 31.92 & NaN \\
 & full__meta-llama__Meta-Llama-3-8B-Instruct & 46.92 & 63.46 \\
 & full__mistralai__Mistral-7B-Instruct-v0.2 & 45.38 & 47.31 \\
 & gpt-3.5-turbo-0125 & 83.85 & 82.31 \\
 & gpt-4-turbo-2024-04-09 & 10.77 & 10.38 \\
 & gpt-4o-2024-05-13 & 11.15 & 39.38 \\
 & humans-filtered & 13.85 & 13.85 \\
 & sampling__meta-llama__Llama-3-70b-chat-hf & 24.62 & NaN \\
 & sampling__mistralai__Mixtral-8x22B-Instruct-v0.1 & 35.00 & NaN \\
 & sampling__mistralai__Mixtral-8x7B-Instruct-v0.1 & 25.38 & NaN \\
 & sampling__models__gemini-pro & 5.77 & NaN \\
\cline{1-4}
\multirow[t]{15}{*}{2} & full__allenai__OLMo-7B-Instruct & 42.69 & 45.77 \\
 & full__google__gemma-1.1-2b-it & 7.31 & 20.77 \\
 & full__lmsys__vicuna-13b-v1.5 & 5.38 & NaN \\
 & 

In [None]:
# why do we get 0s?

### 1.2. Verifiable (generalization experiment)

In [None]:
# Human reference distributions used for the generalization experiment.
# NOTE(review): this reads the same CSV as the main verifiable experiment -- confirm
# whether a dedicated generalization results file should be loaded here instead.
human_ref_v = pd.read_csv("../../results/greedy/all/verifiable/humans-2shot-filtered/normalized.csv", index_col=0)
# Check that normalized histograms sum to approx 1
assert_valid_prob(human_ref_v)

# Check that both human reference files concern the same uncertainty expressions,
# ordered in the same way. The previous call compared `human_ref_v` with itself,
# which can never fail; compare it against the non-verifiable human reference
# (`human_ref_nv`, loaded in section 1.1) so the check is meaningful.
# NOTE(review): assumes assert_same_exprs is symmetric in its arguments -- confirm.
assert_same_exprs(human_ref_nv, human_ref_v)

# Mean iqr / q1 / q3 over all uncertainty expressions (expression names dropped
# so that only numeric columns are averaged).
(
    compute_interquartile_range(human_ref_v, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)
    .drop("uncertainty_expression", axis=1)
    .mean()
)