In [7]:
import pandas as pd
import numpy as np
import glob

from utils_io import print_sep, read_json, persist_histograms
from utils_greedy_histogram import create_histogram_for_top_k_logprobs__openai
from default_vars import BIN_CENTER, BIN_OFFSET, UNCERTAINTY_EXPRESSIONS

# Keyword arguments shared by the histogram-creation calls in this notebook:
# binning constants and the uncertainty expressions come from `default_vars`.
hist_creation_kwargs = {
    "bin_center": BIN_CENTER,
    "bin_offset": BIN_OFFSET,
    "uncertainty_expressions": UNCERTAINTY_EXPRESSIONS,
    "unc_col": "uncertainty_expression",
}

# Top-k variant: the shared arguments plus the id column and the column that
# holds the number extracted from the top-k log probabilities (`number_1`).
# Built as a new dict so the shared kwargs above stay untouched.
hist_top_k_kwargs = {
    **hist_creation_kwargs,
    "id_col": "statement_uuid",
    "number_col": "number_1",
}


def parse_verifiable(df: pd.DataFrame, gen_study=False) -> pd.DataFrame:
    """Return a copy of ``df`` with normalised statement type and a truth label.

    The input frame is left unmodified. On the returned copy:

    * ``_statement_type_orig`` keeps the raw ``statement_type`` value;
    * ``statement_type`` is reduced to the text before the first underscore,
      with surrounding whitespace stripped;
    * ``statement_truth`` is ``"true"`` when ``statement_id`` contains the
      substring ``"true"`` and ``"false"`` otherwise.

    Args:
        df: frame with (at least) ``statement_type`` and ``statement_id`` columns.
        gen_study: when True, exactly 2 distinct raw statement types are
            expected; otherwise exactly 6.

    Raises:
        AssertionError: if the number of distinct raw statement types does not
            match the expectation (the observed values are attached).
    """
    expected_n_types = 2 if gen_study else 6
    assert df["statement_type"].nunique() == expected_n_types, df["statement_type"].unique()

    def _base_type(raw_type):
        # Keep only the part before the first "_" of the raw type.
        return raw_type.split("_")[0].strip()

    def _truth_label(statement_id):
        # Substring check; the match is case-sensitive.
        return "true" if "true" in statement_id else "false"

    # `assign` returns a new frame, so the caller's frame is never mutated.
    data = df.assign(_statement_type_orig=df["statement_type"])
    data["statement_type"] = data["_statement_type_orig"].apply(_base_type)
    data["statement_truth"] = data["statement_id"].apply(_truth_label)
    return data

In [2]:
# Root folder under which the histograms are persisted.
OUTPUT_DIR = "../../results"

# Few-shot settings whose result folders are processed below.
N_SHOTS = (0, 2)

# 1. Top-k probabilities 

In this section, we parse the results obtained via the top-k log probabilities. If the model generates a number among the top-20 log probabilities, that number is placed in the column `number_1`. A greedy histogram can therefore be constructed by treating this as the most likely number.


We tested three models using this approach:
- `gpt-3.5-turbo-0125`
- `gpt-4-turbo-2024-04-09`
- `gpt-4o-2024-05-13`

## 1.1. Non-verifiable

In [8]:
for n_shot in N_SHOTS:
    # Every top-k completions CSV stored for this shot setting (searched recursively).
    pattern = f"../../results/outputs/non-verifiable-{n_shot}-shot/**/top*_completions.csv"
    top_k_filepaths = glob.glob(pattern, recursive=True)

    for fp in top_k_filepaths:
        print("Processing", fp)
        df = pd.read_csv(fp)

        # The model name is taken from the first row; "/" is replaced so it can
        # be used as an output prefix. The name must also appear in the file path.
        model_name = df.loc[0, "model"]
        prefix = model_name.replace("/", "__")
        assert model_name in fp

        # Overall
        histograms = create_histogram_for_top_k_logprobs__openai(df, **hist_top_k_kwargs)
        persist_histograms(
            *histograms,
            results_folder=f"{OUTPUT_DIR}/greedy/all/non_verifiable/models-{n_shot}shot",
            prefix=prefix,
        )

        # By gender
        for gender in ("male", "female"):
            is_gender = df["gender"] == gender
            df_gender_subset = df.loc[is_gender].copy()
            # Each gender must be a strict subset of the full data.
            assert len(df_gender_subset) < len(df)
            histograms = create_histogram_for_top_k_logprobs__openai(df_gender_subset, **hist_top_k_kwargs)
            persist_histograms(
                *histograms,
                results_folder=f"{OUTPUT_DIR}/greedy/by_gender/non_verifiable/models-{n_shot}shot",
                prefix="_".join([prefix, gender]),
            )

        # By statement type (exactly four distinct types are expected here)
        assert df["statement_type"].nunique() == 4
        for st_type in df["statement_type"].unique():
            is_st_type = df["statement_type"] == st_type
            df_st_type_subset = df.loc[is_st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = create_histogram_for_top_k_logprobs__openai(df_st_type_subset, **hist_top_k_kwargs)
            persist_histograms(
                *histograms,
                results_folder=f"{OUTPUT_DIR}/greedy/by_statement_type/non_verifiable/models-{n_shot}shot",
                prefix="_".join([prefix, st_type]),
            )

Processing ../../results/outputs/non-verifiable-0-shot/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/non-verifiable-0-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/non-verifiable-0-shot/gpt-4o-2024-05-13/top5_completions.csv
Processing ../../results/outputs/non-verifiable-2-shot/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/non-verifiable-2-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/non-verifiable-2-shot/gpt-4o-2024-05-13/top5_completions.csv


## 1.2. Verifiable


In [9]:
for n_shot in N_SHOTS:
    # Collect every top-k completions CSV for this shot setting. The search is
    # rooted at OUTPUT_DIR (same root the histograms are written to), and the
    # matches are sorted because glob's ordering depends on the file system.
    top_k_filepaths = sorted(
        glob.glob(f"{OUTPUT_DIR}/outputs/verifiable-FT-{n_shot}-shot/**/top*_completions.csv", recursive=True)
    )

    for fp in top_k_filepaths:
        print("Processing", fp)
        df = parse_verifiable(pd.read_csv(fp))
        # The model name is read from the first row and must appear in the path.
        model_name = df.loc[0, "model"]
        assert model_name in fp
        prefix = model_name.replace("/", "__")

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Overall
        # ---- ---- ---- ---- ---- ---- ---- ----
        histograms = create_histogram_for_top_k_logprobs__openai(df, **hist_top_k_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/all/verifiable/models-{n_shot}shot", prefix=prefix)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Gender
        # ---- ---- ---- ---- ---- ---- ---- ----
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            assert len(df_gender_subset) < len(df)
            histograms = create_histogram_for_top_k_logprobs__openai(df_gender_subset, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_gender/verifiable/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement type
        # ---- ---- ---- ---- ---- ---- ---- ----
        # parse_verifiable reduces the raw types to the part before the first "_";
        # three distinct reduced types are expected here.
        assert 3 == df["statement_type"].nunique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = create_histogram_for_top_k_logprobs__openai(df_st_type_subset, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_type/verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_type)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement truth/falsity
        # ---- ---- ---- ---- ---- ---- ---- ----
        assert 2 == df["statement_truth"].nunique()
        for st_truth in df["statement_truth"].unique():
            # .copy() for parity with the other subsets (independent frame).
            df_subset_v = df[df["statement_truth"] == st_truth].copy()
            assert len(df_subset_v) < len(df)
            histograms = create_histogram_for_top_k_logprobs__openai(df_subset_v, **hist_top_k_kwargs)
            # The prefix includes the model name, like the gender / statement-type
            # sections: with the bare "true"/"false" prefix every model shared the
            # same prefix in this folder.
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_truth/verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_truth)

Processing ../../results/outputs/verifiable-FT-0-shot/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-0-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-0-shot/gpt-4o-2024-05-13/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot/gpt-4o-2024-05-13/top5_completions.csv


## 1.3. Verifiable (AI2-Arc)

In [10]:
# Only the 2-shot setting is processed for the AI2-ARC subsets.
n_shot = 2

for ai2arc_subset in ("ai2arc-easy", "ai2arc-challenge"):
    # Collect every top-k completions CSV for this subset. The search is rooted
    # at OUTPUT_DIR (same root the histograms are written to), and the matches
    # are sorted because glob's ordering depends on the file system.
    top_k_filepaths = sorted(
        glob.glob(f"{OUTPUT_DIR}/outputs/verifiable-FT-{n_shot}-shot-{ai2arc_subset}/**/top*_completions.csv", recursive=True)
    )

    for fp in top_k_filepaths:
        print("Processing", fp)
        df = parse_verifiable(pd.read_csv(fp), gen_study=True)
        # The model name is read from the first row and must appear in the path.
        model_name = df.loc[0, "model"]
        assert model_name in fp
        prefix = model_name.replace("/", "__")

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Overall
        # ---- ---- ---- ---- ---- ---- ---- ----
        histograms = create_histogram_for_top_k_logprobs__openai(df, **hist_top_k_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/all/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Gender
        # ---- ---- ---- ---- ---- ---- ---- ----
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            assert len(df_gender_subset) < len(df)
            histograms = create_histogram_for_top_k_logprobs__openai(df_gender_subset, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_gender/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement type
        # ---- ---- ---- ---- ---- ---- ---- ----
        # After parse_verifiable the two raw types are expected to reduce to one.
        assert 1 == df["statement_type"].nunique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = create_histogram_for_top_k_logprobs__openai(df_st_type_subset, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_type/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix+"_"+st_type)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement truth/falsity
        # ---- ---- ---- ---- ---- ---- ---- ----
        assert 2 == df["statement_truth"].nunique()
        for st_truth in df["statement_truth"].unique():
            # .copy() for parity with the other subsets (independent frame).
            df_subset_v = df[df["statement_truth"] == st_truth].copy()
            assert len(df_subset_v) < len(df)
            histograms = create_histogram_for_top_k_logprobs__openai(df_subset_v, **hist_top_k_kwargs)
            # The prefix includes the model name, like the gender / statement-type
            # sections: with the bare "true"/"false" prefix every model shared the
            # same prefix in this folder.
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_truth/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix+"_"+st_truth)

Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-easy/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-easy/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-easy/gpt-4o-2024-05-13/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-challenge/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-challenge/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-challenge/gpt-4o-2024-05-13/top5_completions.csv
