In [1]:
import pandas as pd
import numpy as np
import glob

import utils_greedy_histogram as greedy_hists
import utils_probabilistic_histogram as prob_hists

from utils_io import print_sep, read_json, persist_histograms
from default_vars import BIN_CENTER, BIN_OFFSET, UNCERTAINTY_EXPRESSIONS

# Base keyword arguments from which the greedy and the probabilistic
# histogram kwargs used in this notebook are derived.
hist_creation_kwargs = {
    "bin_center": BIN_CENTER,
    "bin_offset": BIN_OFFSET,
    "uncertainty_expressions": UNCERTAINTY_EXPRESSIONS,
    "unc_col": "uncertainty_expression",
}

# Kwargs for the greedy histograms: the base kwargs plus the statement id
# column and the single number column (`number_1`) to read.
hist_top_k_kwargs = {
    **hist_creation_kwargs,
    "id_col": "statement_uuid",
    "number_col": "number_1",
}

def parse_verifiable(df: pd.DataFrame, gen_study=False) -> pd.DataFrame:
    """Return a copy of ``df`` with normalized statement type and a truth label.

    Args:
        df: frame with string columns ``statement_type`` and ``statement_id``.
        gen_study: when True the frame is expected to hold 2 distinct raw
            statement types, otherwise 6.

    Returns:
        A copy of ``df`` (the input is not modified) in which:
        - ``_statement_type_orig`` holds the raw ``statement_type`` values;
        - ``statement_type`` is reduced to the text before the first ``_``,
          stripped of surrounding whitespace;
        - ``statement_truth`` is ``"true"`` when ``statement_id`` contains the
          substring ``"true"`` and ``"false"`` otherwise.

    Raises:
        AssertionError: if the number of distinct raw statement types does not
            match the expected count.
    """
    expected_n_types = 2 if gen_study else 6
    assert df["statement_type"].nunique() == expected_n_types, df["statement_type"].unique()

    def _type_prefix(raw_type):
        # Keep only what precedes the first underscore.
        return raw_type.split("_")[0].strip()

    def _truth_label(statement_id):
        # Lower-cased string form of the boolean substring check.
        return "true" if "true" in statement_id else "false"

    data = df.copy()
    data["_statement_type_orig"] = data["statement_type"]
    data["statement_type"] = data["_statement_type_orig"].map(_type_prefix)
    data["statement_truth"] = data["statement_id"].map(_truth_label)
    return data

In [2]:
# Root folder under which the histograms produced by this notebook are persisted.
OUTPUT_DIR = "../../results"

# Values substituted for `{n_shot}` when locating the `*-{n_shot}-shot` result folders.
N_SHOTS = (0, 2)

# 1. Greedy Histogram

In this section, we parse the results obtained via the top-k log probabilities. If the model generates a number among the top-20 log probabilities, then it will be placed in the column `number_1`. Therefore, a greedy histogram can be constructed by assuming this to be the most likely number. 


We've tested 3 different models using this approach, including:
- `gpt-3.5-turbo-0125`
- `gpt-4-turbo-2024-04-09`
- `gpt-4o-2024-05-13`

### 1.1. Non-verifiable

In [3]:
# Build and persist greedy histograms for the non-verifiable statements:
# overall, per gender, and per statement type, for every model and n-shot setting.
for n_shot in N_SHOTS:
    # glob returns matches in arbitrary order; sort so that the processing
    # order (and the printed log) is reproducible across machines and re-runs.
    top_k_filepaths = sorted(
        glob.glob(f"../../results/outputs/non-verifiable-{n_shot}-shot/**/top*_completions.csv", recursive=True)
    )

    for fp in top_k_filepaths:
        print("Processing", fp)
        df = pd.read_csv(fp)
        model_name = df.loc[0, "model"]
        # "/" in a model name is replaced so the name can be used as a file prefix.
        prefix = model_name.replace("/", "__")
        # Sanity check: the model recorded in the file must match its path.
        assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"

        # Overall
        histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df, **hist_top_k_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/all/non_verifiable/models-{n_shot}shot", prefix=prefix)

        # By gender
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            # The filter must drop at least one row, otherwise the split is meaningless.
            assert len(df_gender_subset) < len(df), f"gender == {gender!r} selected every row of {fp!r}"
            histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df_gender_subset, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_gender/non_verifiable/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # By statement type
        assert df["statement_type"].nunique() == 4, df["statement_type"].unique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df_st_type_subset, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_type/non_verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_type)

Processing ../../results/outputs/non-verifiable-0-shot/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/non-verifiable-0-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/non-verifiable-0-shot/gpt-4o-2024-05-13/top5_completions.csv
Processing ../../results/outputs/non-verifiable-2-shot/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/non-verifiable-2-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/non-verifiable-2-shot/gpt-4o-2024-05-13/top5_completions.csv


### 1.2. Verifiable


In [4]:
# Build and persist greedy histograms for the verifiable statements:
# overall, per gender, per statement type, and per statement truth value.
for n_shot in N_SHOTS:
    # glob returns matches in arbitrary order; sort so that the processing
    # order (and the printed log) is reproducible across machines and re-runs.
    top_k_filepaths = sorted(
        glob.glob(f"../../results/outputs/verifiable-FT-{n_shot}-shot/**/top*_completions.csv", recursive=True)
    )

    for fp in top_k_filepaths:
        print("Processing", fp)
        df = parse_verifiable(pd.read_csv(fp))
        model_name = df.loc[0, "model"]
        # Sanity check: the model recorded in the file must match its path.
        assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"
        # "/" in a model name is replaced so the name can be used as a file prefix.
        prefix = model_name.replace("/", "__")

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Overall
        # ---- ---- ---- ---- ---- ---- ---- ----
        histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df, **hist_top_k_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/all/verifiable/models-{n_shot}shot", prefix=prefix)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Gender
        # ---- ---- ---- ---- ---- ---- ---- ----
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            # The filter must drop at least one row, otherwise the split is meaningless.
            assert len(df_gender_subset) < len(df), f"gender == {gender!r} selected every row of {fp!r}"
            histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df_gender_subset, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_gender/verifiable/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement type
        # ---- ---- ---- ---- ---- ---- ---- ----
        # parse_verifiable reduced the raw types to their prefix before "_".
        assert df["statement_type"].nunique() == 3, df["statement_type"].unique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df_st_type_subset, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_type/verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_type)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement truth/falsity
        # ---- ---- ---- ---- ---- ---- ---- ----
        assert df["statement_truth"].nunique() == 2, df["statement_truth"].unique()
        for st_truth in df["statement_truth"].unique():
            # Copy, like the other subsets, so the slice is independent of `df`.
            df_subset_v = df[df["statement_truth"] == st_truth].copy()
            assert len(df_subset_v) < len(df), f"statement_truth == {st_truth!r} selected every row of {fp!r}"
            histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df_subset_v, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_truth/verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_truth)

Processing ../../results/outputs/verifiable-FT-0-shot/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-0-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-0-shot/gpt-4o-2024-05-13/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot/gpt-4o-2024-05-13/top5_completions.csv


### 1.3. Verifiable (AI2-Arc)

In [5]:
# Build and persist greedy histograms for the AI2-Arc verifiable statements
# (2-shot only): overall, per gender, per statement type, and per truth value.
n_shot = 2

for ai2arc_subset in ("ai2arc-easy", "ai2arc-challenge"):
    # glob returns matches in arbitrary order; sort so that the processing
    # order (and the printed log) is reproducible across machines and re-runs.
    top_k_filepaths = sorted(
        glob.glob(f"../../results/outputs/verifiable-FT-{n_shot}-shot-{ai2arc_subset}/**/top*_completions.csv", recursive=True)
    )

    for fp in top_k_filepaths:
        print("Processing", fp)
        # gen_study=True: parse_verifiable expects 2 raw statement types here.
        df = parse_verifiable(pd.read_csv(fp), gen_study=True)
        model_name = df.loc[0, "model"]
        # Sanity check: the model recorded in the file must match its path.
        assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"
        # "/" in a model name is replaced so the name can be used as a file prefix.
        prefix = model_name.replace("/", "__")

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Overall
        # ---- ---- ---- ---- ---- ---- ---- ----
        histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df, **hist_top_k_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/all/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Gender
        # ---- ---- ---- ---- ---- ---- ---- ----
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            # The filter must drop at least one row, otherwise the split is meaningless.
            assert len(df_gender_subset) < len(df), f"gender == {gender!r} selected every row of {fp!r}"
            histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df_gender_subset, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_gender/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement type
        # ---- ---- ---- ---- ---- ---- ---- ----
        # parse_verifiable reduced the raw types to their prefix before "_".
        assert df["statement_type"].nunique() == 1, df["statement_type"].unique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df_st_type_subset, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_type/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix+"_"+st_type)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement truth/falsity
        # ---- ---- ---- ---- ---- ---- ---- ----
        assert df["statement_truth"].nunique() == 2, df["statement_truth"].unique()
        for st_truth in df["statement_truth"].unique():
            # Copy, like the other subsets, so the slice is independent of `df`.
            df_subset_v = df[df["statement_truth"] == st_truth].copy()
            assert len(df_subset_v) < len(df), f"statement_truth == {st_truth!r} selected every row of {fp!r}"
            histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df_subset_v, **hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_truth/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix+"_"+st_truth)

Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-easy/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-easy/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-easy/gpt-4o-2024-05-13/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-challenge/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-challenge/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot-ai2arc-challenge/gpt-4o-2024-05-13/top5_completions.csv


### 1.4. Ablation of exemplars

When determining the configuration of the exemplars to use in the verifiable 2-shot experiments, we ran some preliminary experiments using gpt-3.5, gpt-4o, and gpt-4 to quantify the impact that these would have in the final performance.

In [11]:
# Build and persist the overall greedy histogram for every exemplar
# configuration / model pair of the 2-shot exemplar ablation.
# glob returns matches in arbitrary order; sort so that the processing order
# (and the printed log) is reproducible across machines and re-runs.
top_k_filepaths = sorted(
    glob.glob("../../results/outputs/verifiable-2-shot-exemplars-ablation/**/top*_completions.csv", recursive=True)
)

for fp in top_k_filepaths:
    print("Processing", fp)
    df = parse_verifiable(pd.read_csv(fp))
    model_name = df.loc[0, "model"]
    # Sanity check: the model recorded in the file must match its path.
    assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"
    # "/" in a model name is replaced so the name can be used as a file prefix.
    prefix = model_name.replace("/", "__")

    # The two characters following the last "results-2shot-" in the path
    # identify the exemplar configuration (e.g. "FF", "FT", "TF").
    config = fp.rpartition("results-2shot-")[-1][:2]
    # ---- ---- ---- ---- ---- ---- ---- ----
    # Overall
    # ---- ---- ---- ---- ---- ---- ---- ----
    histograms = greedy_hists.create_histogram_for_top_k_logprobs__openai(df, **hist_top_k_kwargs)
    persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/verifiable-2-shot-exemplars-ablation/all/models-2shot-{config}", prefix=prefix)

Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FF/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FF/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FF/gpt-4o-2024-05-13/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FT/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FT/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FT/gpt-4o-2024-05-13/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-TF/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-TF/gpt-4-turbo-2024-0

# 2. Probabilistic Histogram

The probabilistic (richer) histogram is built by accumulating the available probability information for each prediction. 
That is, regardless of the methodology used (sampling or probability), we accumulate the normalized probability assigned to each number and assign the remaining probability mass to the `-1` bin, so that the total sums to 1. Whenever no number is generated, we add a value of 1 to the `-1` bin. 

When computing the probabilistic histogram, we rely on the fact that the outputs of the top log-probability runs contain number-specific columns with detailed information about whether a number was present amongst the top-k predicted tokens.

**Algorithm**: 
---
To compute the probabilistic histogram for *top-k* approaches where we have partial information about the probabilities, we will:
1. pick all the numbers among the top-k and sum their probability mass.
2. assign the remainder, `1 - total_prob_mass_in_top_k`, to bin `-1`.
3. determine the bin of each number.
4. add each number's probability to the current count of its bin.
5. repeat steps 1 through 4 for every statement.
6. finally, normalize by the number of statements used.


**Note**: We do include probability information in the computation of the histogram. 


In [6]:
# Keyword arguments for the probabilistic histograms: the base kwargs plus
# the statement id column and the per-rank number / log-probability columns.
# NOTE(review): range(1, 20) yields ranks 1..19, while the greedy section text
# mentions the top-20 log probabilities; confirm whether `number_20` exists
# in the CSVs and should be included.
prob_hist_top_k_kwargs = {
    **hist_creation_kwargs,
    "id_col": "statement_uuid",
    "number_cols": [f"number_{i}" for i in range(1, 20)],
    "number_logprob_cols": [f"number_{i}_logprob" for i in range(1, 20)],
}

### 2.1. Non-verifiable

In [None]:
# Build and persist probabilistic histograms for the non-verifiable statements:
# overall, per gender, and per statement type, for every model and n-shot setting.
for n_shot in N_SHOTS:
    # glob returns matches in arbitrary order; sort so that the processing
    # order (and the printed log) is reproducible across machines and re-runs.
    top_k_filepaths = sorted(
        glob.glob(f"../../results/outputs/non-verifiable-{n_shot}-shot/**/top*_completions.csv", recursive=True)
    )

    for fp in top_k_filepaths:
        print("Processing", fp)
        df = pd.read_csv(fp)
        model_name = df.loc[0, "model"]
        # "/" in a model name is replaced so the name can be used as a file prefix.
        prefix = model_name.replace("/", "__")
        # Sanity check: the model recorded in the file must match its path.
        assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"

        # Overall
        histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df, **prob_hist_top_k_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/all/non_verifiable/models-{n_shot}shot", prefix=prefix)

        # By gender
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            # The filter must drop at least one row, otherwise the split is meaningless.
            assert len(df_gender_subset) < len(df), f"gender == {gender!r} selected every row of {fp!r}"
            histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df_gender_subset, **prob_hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_gender/non_verifiable/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # By statement type
        assert df["statement_type"].nunique() == 4, df["statement_type"].unique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df_st_type_subset, **prob_hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_statement_type/non_verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_type)

### 2.2. Verifiable

In [None]:
# Build and persist probabilistic histograms for the verifiable statements:
# overall, per gender, per statement type, and per statement truth value.
for n_shot in N_SHOTS:
    # glob returns matches in arbitrary order; sort so that the processing
    # order (and the printed log) is reproducible across machines and re-runs.
    top_k_filepaths = sorted(
        glob.glob(f"../../results/outputs/verifiable-FT-{n_shot}-shot/**/top*_completions.csv", recursive=True)
    )

    for fp in top_k_filepaths:
        print("Processing", fp)
        df = parse_verifiable(pd.read_csv(fp))
        model_name = df.loc[0, "model"]
        # Sanity check: the model recorded in the file must match its path.
        assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"
        # "/" in a model name is replaced so the name can be used as a file prefix.
        prefix = model_name.replace("/", "__")

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Overall
        # ---- ---- ---- ---- ---- ---- ---- ----
        histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df, **prob_hist_top_k_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/all/verifiable/models-{n_shot}shot", prefix=prefix)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Gender
        # ---- ---- ---- ---- ---- ---- ---- ----
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            # The filter must drop at least one row, otherwise the split is meaningless.
            assert len(df_gender_subset) < len(df), f"gender == {gender!r} selected every row of {fp!r}"
            histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df_gender_subset, **prob_hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_gender/verifiable/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement type
        # ---- ---- ---- ---- ---- ---- ---- ----
        # parse_verifiable reduced the raw types to their prefix before "_".
        assert df["statement_type"].nunique() == 3, df["statement_type"].unique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df_st_type_subset, **prob_hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_statement_type/verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_type)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement truth/falsity
        # ---- ---- ---- ---- ---- ---- ---- ----
        assert df["statement_truth"].nunique() == 2, df["statement_truth"].unique()
        for st_truth in df["statement_truth"].unique():
            # Copy, like the other subsets, so the slice is independent of `df`.
            df_subset_v = df[df["statement_truth"] == st_truth].copy()
            assert len(df_subset_v) < len(df), f"statement_truth == {st_truth!r} selected every row of {fp!r}"
            histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df_subset_v, **prob_hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_statement_truth/verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_truth)

### 2.3. Verifiable (AI2-Arc)

In [None]:
# Build and persist probabilistic histograms for the AI2-Arc verifiable
# statements (2-shot only): overall, per gender, per statement type, and per
# truth value.
n_shot = 2

for ai2arc_subset in ("ai2arc-easy", "ai2arc-challenge"):
    # glob returns matches in arbitrary order; sort so that the processing
    # order (and the printed log) is reproducible across machines and re-runs.
    top_k_filepaths = sorted(
        glob.glob(f"../../results/outputs/verifiable-FT-{n_shot}-shot-{ai2arc_subset}/**/top*_completions.csv", recursive=True)
    )

    for fp in top_k_filepaths:
        print("Processing", fp)
        # gen_study=True: parse_verifiable expects 2 raw statement types here.
        df = parse_verifiable(pd.read_csv(fp), gen_study=True)
        model_name = df.loc[0, "model"]
        # Sanity check: the model recorded in the file must match its path.
        assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"
        # "/" in a model name is replaced so the name can be used as a file prefix.
        prefix = model_name.replace("/", "__")

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Overall
        # ---- ---- ---- ---- ---- ---- ---- ----
        histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df, **prob_hist_top_k_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/all/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Gender
        # ---- ---- ---- ---- ---- ---- ---- ----
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            # The filter must drop at least one row, otherwise the split is meaningless.
            assert len(df_gender_subset) < len(df), f"gender == {gender!r} selected every row of {fp!r}"
            # ok_non_symmetric=True is passed only for the gender split of this
            # study; the note in the following cell gives the rationale.
            histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df_gender_subset, **prob_hist_top_k_kwargs, ok_non_symmetric=True)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_gender/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement type
        # ---- ---- ---- ---- ---- ---- ---- ----
        # parse_verifiable reduced the raw types to their prefix before "_".
        assert df["statement_type"].nunique() == 1, df["statement_type"].unique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df_st_type_subset, **prob_hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_statement_type/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix+"_"+st_type)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement truth/falsity
        # ---- ---- ---- ---- ---- ---- ---- ----
        assert df["statement_truth"].nunique() == 2, df["statement_truth"].unique()
        for st_truth in df["statement_truth"].unique():
            # Copy, like the other subsets, so the slice is independent of `df`.
            df_subset_v = df[df["statement_truth"] == st_truth].copy()
            assert len(df_subset_v) < len(df), f"statement_truth == {st_truth!r} selected every row of {fp!r}"
            histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df_subset_v, **prob_hist_top_k_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_statement_truth/verifiable-{ai2arc_subset}/models-{n_shot}shot", prefix=prefix+"_"+st_truth)

In [None]:
# Note: the cell above passes ok_non_symmetric=True for the gender split.
# There were more statements than gendered names, so the name assignment
# was sampled at random, which left the examples slightly unbalanced
# (non-symmetric) across genders. The per-expression row counts below
# illustrate this.
# `df` here is whatever frame the loop in the cell above loaded last.
is_female = df["gender"] == "female"
df_gender_subset1 = df.loc[is_female].copy()
df_gender_subset1.groupby("uncertainty_expression").count()

In [None]:
# Per-expression row counts for the male subset of `df`.
is_male = df["gender"] == "male"
df_gender_subset2 = df.loc[is_male].copy()
df_gender_subset2.groupby("uncertainty_expression").count()

### 2.4. Ablation of exemplars (probabilistic decoding)

In [12]:
# Build and persist the overall probabilistic histogram for every exemplar
# configuration / model pair of the 2-shot exemplar ablation.
# glob returns matches in arbitrary order; sort so that the processing order
# (and the printed log) is reproducible across machines and re-runs.
top_k_filepaths = sorted(
    glob.glob("../../results/outputs/verifiable-2-shot-exemplars-ablation/**/top*_completions.csv", recursive=True)
)

for fp in top_k_filepaths:
    print("Processing", fp)
    df = parse_verifiable(pd.read_csv(fp))
    model_name = df.loc[0, "model"]
    # Sanity check: the model recorded in the file must match its path.
    assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"
    # "/" in a model name is replaced so the name can be used as a file prefix.
    prefix = model_name.replace("/", "__")

    # The two characters following the last "results-2shot-" in the path
    # identify the exemplar configuration (e.g. "FF", "FT", "TF").
    config = fp.rpartition("results-2shot-")[-1][:2]
    # ---- ---- ---- ---- ---- ---- ---- ----
    # Overall
    # ---- ---- ---- ---- ---- ---- ---- ----
    histograms = prob_hists.create_histogram_for_top_k_logprobs__openai(df, **prob_hist_top_k_kwargs)
    persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/verifiable-2-shot-exemplars-ablation/all/models-2shot-{config}", prefix=prefix)

Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FF/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FF/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FF/gpt-4o-2024-05-13/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FT/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FT/gpt-4-turbo-2024-04-09/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-FT/gpt-4o-2024-05-13/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-TF/gpt-3.5-turbo-0125/top5_completions.csv
Processing ../../results/outputs/verifiable-2-shot-exemplars-ablation/results-2shot-TF/gpt-4-turbo-2024-0