In [11]:
import pandas as pd
import numpy as np

import glob
import os

from default_vars import MAPPING_2_CANONIC


def map_statement_type(val):
    """Collapse a raw statement-type label into its canonical form.

    A label that contains ``__true_`` or ends with ``__true`` becomes
    ``"verifiable__true"``; the same rule with ``false`` yields
    ``"verifiable__false"``. The ``true`` pattern is checked first. Any other
    label is returned prefixed with ``"non-verifiable__"``.
    """
    for truth in ("true", "false"):
        marker = f"__{truth}"
        if f"{marker}_" in val or val.endswith(marker):
            return f"verifiable__{truth}"
    return f"non-verifiable__{val}"

In this notebook, we combine all model results into a single canonic form using the argmax prediction as the response.
The final CSV file contains the following information:

- `model`: the model being evaluated.
- `uncertainty_expression`: the uncertainty expression being used to convey uncertainty about the `statement`.
- `numerical_response`: the model's predicted argmax numerical response. 
- `statement_type`: the statement type.
- `statement_truth`: the statement truth.
- `speaker_name`: the speaker's name.
- `speaker_gender`: the speaker's gender.
- `template`: the template being used to combine the uncertainty expression, the speaker, and the `statement`.
- `statement_id`: the statement id.
- `statement`: the statement.
- `statement_uuid`: the unique identifier.

In [17]:
# Directory used as the destination prefix when the combined results are saved.
OUTPUT_DIR = "../../results/greedy"

# Accumulator for the per-file result frames; starts empty.
ALL_RESULTS: list = []

## Approach 1: Top-k (OpenAI models)

In [18]:
def map_top_k(df, mapping) -> pd.DataFrame:
    """Convert a top-k results frame to the canonical column layout.

    Keeps only the columns named by the keys of ``mapping``, stores the raw
    ``statement_type`` values in ``__orig_statement_type``, replaces
    ``statement_type`` with the output of ``map_statement_type``, and finally
    renames the columns according to ``mapping``. ``df`` itself is not modified.
    """
    wanted_cols = list(mapping.keys())
    subset = df[wanted_cols].copy()

    raw_types = subset["statement_type"]
    subset["__orig_statement_type"] = raw_types
    subset["statement_type"] = raw_types.apply(map_statement_type)

    return subset.rename(columns=mapping)

# NOTE: this loop appends to the shared ALL_RESULTS list, so running the cell
# twice without resetting ALL_RESULTS in between duplicates these rows.
# glob's result order depends on the file system; sorting makes the order of
# the combined results reproducible across machines and runs.
for fp in sorted(glob.glob("../../results/outputs/**/top*_completions.csv", recursive=True)):
    print(fp)
    raw_df = pd.read_csv(fp)
    model_name = raw_df.loc[0, "model"]
    assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"
    # Path segment between "results/outputs" and the model folder, e.g.
    # "/non-verifiable-2-shot" or "/non-verifiable-0-shot" (leading "/" is kept).
    base_name = fp.rpartition("results/outputs")[-1]\
                    .rpartition("/" + model_name)[0]
    # map_top_k works on its own copy, so raw_df is passed directly.
    df = map_top_k(raw_df, MAPPING_2_CANONIC["top-k"])
    # Provenance columns derived from the file location.
    df["__results_filepath"] = fp
    df["__basename"] = base_name
    df["__n_shots"] = 2 if "2-shot" in base_name else 0
    df["__dataset"] = "ai2arc" if "ai2arc" in base_name else "main"
    df["__methodology"] = "top-k"

    ALL_RESULTS.append(df)

../../results/outputs/non-verifiable-2-shot/gpt-3.5-turbo-0125/top5_completions.csv
../../results/outputs/non-verifiable-2-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
../../results/outputs/non-verifiable-2-shot/gpt-4o-2024-05-13/top5_completions.csv
../../results/outputs/non-verifiable-0-shot/gpt-3.5-turbo-0125/top5_completions.csv
../../results/outputs/non-verifiable-0-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
../../results/outputs/non-verifiable-0-shot/gpt-4o-2024-05-13/top5_completions.csv
../../results/outputs/verifiable-FT-2-shot/gpt-3.5-turbo-0125/top5_completions.csv
../../results/outputs/verifiable-FT-2-shot/gpt-4-turbo-2024-04-09/top5_completions.csv
../../results/outputs/verifiable-FT-2-shot/gpt-4o-2024-05-13/top5_completions.csv
../../results/outputs/verifiable-FT-2-shot-ai2arc-challenge/gpt-3.5-turbo-0125/top5_completions.csv
../../results/outputs/verifiable-FT-2-shot-ai2arc-challenge/gpt-4-turbo-2024-04-09/top5_completions.csv
../../results/outputs/verifiable-F

## Approach 2: Full Probability (HuggingFace models)

In [19]:
def map_full_prob(df, mapping, id_cols=["statement_uuid", "statement_type", "uncertainty_expression"], score_col="completion__cond_logscores__corrected") -> pd.DataFrame:
    """Pick the highest-scoring completion per group and canonicalize the columns.

    Args:
        df: frame holding one row per candidate completion.
        mapping: dict whose keys are the columns to keep and whose values are
            the canonical names they are renamed to.
        id_cols: columns that together identify one group of candidates.
        score_col: column used to rank candidates within a group.

    Returns:
        A new frame with one row per ``id_cols`` group (the row with the
        largest ``score_col``), restricted to ``mapping``'s keys, with the raw
        ``statement_type`` preserved in ``__orig_statement_type``,
        ``statement_type`` passed through ``map_statement_type``, and columns
        renamed per ``mapping``. ``df`` itself is not modified.
    """
    group_keys = list(id_cols)

    # Select argmax: after a descending sort the first row of every group is
    # its highest-scoring candidate. sort_values returns a new frame, so no
    # up-front copy of `df` is needed.
    ranked = df.sort_values(group_keys + [score_col], ascending=False)
    best = ranked.groupby(group_keys).head(1).reset_index(drop=True)

    # Copy explicitly: adding columns to a column-subset slice would otherwise
    # trigger pandas' SettingWithCopyWarning.
    can_df = best[list(mapping.keys())].copy()
    can_df["__orig_statement_type"] = can_df["statement_type"]
    can_df["statement_type"] = can_df["__orig_statement_type"].apply(map_statement_type)
    return can_df.rename(mapping, axis=1)


# NOTE: this loop appends to the shared ALL_RESULTS list, so running the cell
# twice without resetting ALL_RESULTS in between duplicates these rows.
# glob's result order depends on the file system; sorting makes the order of
# the combined results reproducible across machines and runs.
for fp in sorted(glob.glob("../../results/outputs/**/0_to_100_completions.csv", recursive=True)):
    print(fp)
    raw_df = pd.read_csv(fp)
    model_name = raw_df.loc[0, "completion__model"]
    assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"
    # Path segment between "results/outputs" and the model folder, e.g.
    # "/non-verifiable-2-shot" or "/non-verifiable-0-shot" (leading "/" is kept).
    base_name = fp.rpartition("results/outputs")[-1]\
                    .rpartition("/" + model_name)[0]
    # map_full_prob does not modify its input, so raw_df is passed directly.
    df = map_full_prob(raw_df, MAPPING_2_CANONIC["full-prob"])
    # Provenance columns derived from the file location.
    df["__results_filepath"] = fp
    df["__basename"] = base_name
    df["__n_shots"] = 2 if "2-shot" in base_name else 0
    df["__dataset"] = "ai2arc" if "ai2arc" in base_name else "main"
    df["__methodology"] = "full-prob-argmax"
    ALL_RESULTS.append(df)

../../results/outputs/non-verifiable-2-shot/allenai/OLMo-7B-Instruct/0_to_100_completions.csv
../../results/outputs/non-verifiable-2-shot/google/gemma-1.1-2b-it/0_to_100_completions.csv
../../results/outputs/non-verifiable-2-shot/lmsys/vicuna-13b-v1.5/0_to_100_completions.csv
../../results/outputs/non-verifiable-2-shot/meta-llama/Meta-Llama-3-70B-Instruct/0_to_100_completions.csv
../../results/outputs/non-verifiable-2-shot/meta-llama/Meta-Llama-3-8B-Instruct/0_to_100_completions.csv
../../results/outputs/non-verifiable-2-shot/mistralai/Mistral-7B-Instruct-v0.2/0_to_100_completions.csv
../../results/outputs/non-verifiable-0-shot/allenai/OLMo-7B-Instruct/0_to_100_completions.csv
../../results/outputs/non-verifiable-0-shot/google/gemma-1.1-2b-it/0_to_100_completions.csv
../../results/outputs/non-verifiable-0-shot/lmsys/vicuna-13b-v1.5/0_to_100_completions.csv
../../results/outputs/non-verifiable-0-shot/meta-llama/Meta-Llama-3-70B-Instruct/0_to_100_completions.csv
../../results/outputs/non

## Approach 3: Sampling-based

In [20]:
def map_sampling_based(df, mapping, id_cols=["statement_uuid", "statement_type", "uncertainty_expression"]) -> "pd.DataFrame | None":
    """Canonicalize a sampling-based results frame, or reject it as inconsistent.

    Args:
        df: frame holding the sampled completions.
        mapping: dict whose keys are the columns to keep and whose values are
            the canonical names they are renamed to.
        id_cols: columns that together identify one group of samples.

    Returns:
        ``None`` when the per-group non-null counts (``groupby(id_cols).count()``)
        do not all share one single value across every group and column.
        Otherwise a new frame restricted to ``mapping``'s keys, with the raw
        ``statement_type`` preserved in ``__orig_statement_type``,
        ``statement_type`` passed through ``map_statement_type``, and columns
        renamed per ``mapping``. ``df`` itself is not modified.
    """
    # Consistency check: exactly one distinct count value must appear.
    group_counts = df.groupby(id_cols).count().values
    if len(np.unique(group_counts)) != 1:
        return None

    # Copy explicitly: adding columns to a column-subset slice would otherwise
    # trigger pandas' SettingWithCopyWarning.
    can_df = df[list(mapping.keys())].copy()
    can_df["__orig_statement_type"] = can_df["statement_type"]
    can_df["statement_type"] = can_df["__orig_statement_type"].apply(map_statement_type)
    return can_df.rename(mapping, axis=1)
    

# NOTE: this loop appends to the shared ALL_RESULTS list, so running the cell
# twice without resetting ALL_RESULTS in between duplicates these rows.
# glob's result order depends on the file system; sorting makes the order of
# the combined results reproducible across machines and runs.
for fp in sorted(glob.glob("../../results/outputs/**/sample_completions.csv", recursive=True)):
    print(fp)
    raw_df = pd.read_csv(fp)
    model_name = raw_df.loc[0, "model"]
    assert model_name in fp, f"model {model_name!r} does not appear in path {fp!r}"
    # Path segment between "results/outputs" and the model folder, e.g.
    # "/non-verifiable-2-shot" or "/non-verifiable-0-shot" (leading "/" is kept).
    base_name = fp.rpartition("results/outputs")[-1]\
                    .rpartition("/" + model_name)[0]
    # map_sampling_based does not modify its input, so raw_df is passed directly.
    df = map_sampling_based(raw_df, MAPPING_2_CANONIC["sampling-based"])
    if df is None:
        # The mapper signalled that this file should not be included.
        print("- Skipping...")
        continue

    # Provenance columns derived from the file location.
    df["__results_filepath"] = fp
    df["__basename"] = base_name
    df["__n_shots"] = 2 if "2-shot" in base_name else 0
    df["__dataset"] = "ai2arc" if "ai2arc" in base_name else "main"
    df["__methodology"] = "sampling-based"
    ALL_RESULTS.append(df)

../../results/outputs/non-verifiable-2-shot/meta-llama/Llama-3-70b-chat-hf/sample_completions.csv
../../results/outputs/non-verifiable-2-shot/mistralai/Mixtral-8x22B-Instruct-v0.1/sample_completions.csv
../../results/outputs/non-verifiable-2-shot/mistralai/Mixtral-8x7B-Instruct-v0.1/sample_completions.csv
../../results/outputs/non-verifiable-2-shot/models/gemini-pro/sample_completions.csv
../../results/outputs/non-verifiable-0-shot/mistralai/Mixtral-8x22B-Instruct-v0.1/sample_completions.csv
- Skipping...
../../results/outputs/non-verifiable-0-shot/mistralai/Mixtral-8x7B-Instruct-v0.1/sample_completions.csv
- Skipping...
../../results/outputs/non-verifiable-0-shot/models/gemini-pro/sample_completions.csv
../../results/outputs/verifiable-FT-2-shot/allenai/OLMo-7B-Instruct/sample_completions.csv
../../results/outputs/verifiable-FT-2-shot/google/gemma-1.1-2b-it/sample_completions.csv
- Skipping...
../../results/outputs/verifiable-FT-2-shot/meta-llama/Llama-3-70b-chat-hf/sample_completions

In [21]:
# Fail with an actionable message when nothing was collected.
if not ALL_RESULTS:
    raise ValueError("ALL_RESULTS is empty; no result frames were collected to combine.")

# ignore_index=True gives every combined row a unique label instead of
# repeating each source frame's own index. The CSV keeps the same layout
# (an unnamed leading index column followed by the data columns).
CONCAT_RESULTS = pd.concat(ALL_RESULTS, ignore_index=True)

# Create the destination folder on a fresh checkout so to_csv cannot fail on a
# missing directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)
CONCAT_RESULTS.to_csv(f"{OUTPUT_DIR}/canonic_data.csv")