In [1]:
import pandas as pd
import numpy as np
import glob

from collections import defaultdict
from typing import List
from metrics import assert_valid_prob, assert_same_exprs, compute_wasserstein_distance
from utils_io import read_json
from default_vars import UNCERTAINTY_EXPRESSIONS

In this notebook, we compute two sets of metrics:
- _mode-matching_ metrics, which include the `proportional agreement` and the `mean absolute error`.
- _distribution-matching_ metrics, which include the Wasserstein distance.

## 1. Wasserstein Distance

Since the normalized histograms are already stored, computing the Wasserstein distance between any two of them is straightforward. The final reported result is the median of the Wasserstein distances over all 14 uncertainty expressions.

In [2]:
# Human reference histograms (already normalized), one table per setting.
human_ref_nv = pd.read_csv(
    "../../results/greedy/all/non_verifiable/humans-2shot-filtered/normalized.csv",
    index_col=0,
)
human_ref_v = pd.read_csv(
    "../../results/greedy/all/verifiable/humans-2shot-filtered/normalized.csv",
    index_col=0,
)

# Sanity checks before comparing: each normalized histogram must sum to
# approximately 1 ...
assert_valid_prob(human_ref_nv)
assert_valid_prob(human_ref_v)
# ... and both tables must cover the same uncertainty expressions, in the same order.
assert_same_exprs(human_ref_v, human_ref_nv)

# Distance between the non-verifiable and the verifiable human histograms;
# left as the cell's last expression so the resulting table is displayed.
compute_wasserstein_distance(
    human_ref_nv,
    human_ref_v,
    uncertainty_expressions=UNCERTAINTY_EXPRESSIONS,
)

Unnamed: 0,uncertainty_expression,distance
0,almost certain,1.907722
1,highly likely,1.351901
2,very likely,3.441908
3,probable,1.316639
4,somewhat likely,2.234939
5,possible,2.324886
6,uncertain,2.751913
7,somewhat unlikely,2.009622
8,unlikely,2.088812
9,not likely,1.778628


In [22]:
# Wasserstein distance between each model's normalized histograms and the human
# reference (non-verifiable setting), for the 0-shot and the 2-shot model runs.
# The per-model results are stacked into a single table,
# `_non_verifiable_results_ws`, tagged with `n_shots`, `setting` and `model`.

# The human reference is the same for every model and shot count, so it is read
# once up front instead of once per model file.
human_df = pd.read_csv("../../results/greedy/all/non_verifiable/humans-2shot-filtered/normalized.csv", index_col=0)

# Per-model results are collected under their own name so that
# `_non_verifiable_results_ws` only ever refers to the final DataFrame.
_ws_frames = []
for n_shots in (0, 2):
    # glob returns matches in arbitrary order; sorting keeps the row order stable across runs.
    model_filepaths = sorted(glob.glob(f"../../results/greedy/all/non_verifiable/models-{n_shots}shot/*_normalized.csv"))
    for fp in model_filepaths:
        print("Processing", fp)
        model_df = pd.read_csv(fp, index_col=0)
        dist = compute_wasserstein_distance(model_df, human_df, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)
        dist["n_shots"] = n_shots
        dist["setting"] = "non-verifiable"

        # Model name = file name without the trailing "_normalized.csv"
        # (text after the last "shot/", up to the last "_normalized").
        model_name = fp.rpartition("shot/")[-1].rpartition("_normalized")[0]
        dist["model"] = model_name
        _ws_frames.append(dist)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the actual cause instead.
if not _ws_frames:
    raise FileNotFoundError(
        "No '*_normalized.csv' model files found under "
        "../../results/greedy/all/non_verifiable/models-{0,2}shot/"
    )

_non_verifiable_results_ws = pd.concat(_ws_frames, axis=0).reset_index(drop=True)

Processing ../../results/greedy/all/non_verifiable/models-0shot/full__allenai__OLMo-7B-Instruct_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__google__gemma-1.1-2b-it_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__lmsys__vicuna-13b-v1.5_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__meta-llama__Meta-Llama-3-70B-Instruct_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__meta-llama__Meta-Llama-3-8B-Instruct_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/full__mistralai__Mistral-7B-Instruct-v0.2_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/gpt-3.5-turbo-0125_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/gpt-4-turbo-2024-04-09_normalized.csv
Processing ../../results/greedy/all/non_verifiable/models-0shot/gpt-4o-2024-05-13_normalized.csv
Processin

## 2. Proportional Agreement

In [None]:
# NOTE(review): this cell sits under the "Proportional Agreement" heading but is
# a copy of the Wasserstein cell: the metric computed below is still
# `compute_wasserstein_distance`. TODO: swap in the proportional-agreement
# metric before relying on `_non_verifiable_results_pa`.
#
# As written, it compares each model's normalized histograms with the human
# reference (non-verifiable setting) for the 0-shot and 2-shot model runs and
# stacks the per-model results into `_non_verifiable_results_pa`, tagged with
# `n_shots`, `setting` and `model`.

# The human reference is the same for every model and shot count, so it is read
# once up front instead of once per model file.
human_df = pd.read_csv("../../results/greedy/all/non_verifiable/humans-2shot-filtered/normalized.csv", index_col=0)

# Per-model results are collected under their own name so that
# `_non_verifiable_results_pa` only ever refers to the final DataFrame.
_pa_frames = []
for n_shots in (0, 2):
    # glob returns matches in arbitrary order; sorting keeps the row order stable across runs.
    model_filepaths = sorted(glob.glob(f"../../results/greedy/all/non_verifiable/models-{n_shots}shot/*_normalized.csv"))
    for fp in model_filepaths:
        print("Processing", fp)
        model_df = pd.read_csv(fp, index_col=0)
        # TODO(review): placeholder metric, see the note at the top of this cell.
        dist = compute_wasserstein_distance(model_df, human_df, uncertainty_expressions=UNCERTAINTY_EXPRESSIONS)
        dist["n_shots"] = n_shots
        dist["setting"] = "non-verifiable"

        # Model name = file name without the trailing "_normalized.csv"
        # (text after the last "shot/", up to the last "_normalized").
        model_name = fp.rpartition("shot/")[-1].rpartition("_normalized")[0]
        dist["model"] = model_name
        _pa_frames.append(dist)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the actual cause instead.
if not _pa_frames:
    raise FileNotFoundError(
        "No '*_normalized.csv' model files found under "
        "../../results/greedy/all/non_verifiable/models-{0,2}shot/"
    )

_non_verifiable_results_pa = pd.concat(_pa_frames, axis=0).reset_index(drop=True)

## 3. Mean Absolute Error