In [1]:
import pandas as pd
import numpy as np
import glob

import utils_greedy_histogram as greedy_hists
import utils_probabilistic_histogram as prob_hists

from utils_io import print_sep, read_json, persist_histograms
from default_vars import BIN_CENTER, BIN_OFFSET, UNCERTAINTY_EXPRESSIONS

# Keyword arguments forwarded unchanged to every
# `create_histogram_for_full_logprobs__hf` call in this notebook (both the
# greedy and the probabilistic variant), so all histograms share one setup.
full_prob_hist_kwargs = {
    # Column names of the long-form completions dataframe.
    "id_cols": ["statement_uuid", "statement_type", "uncertainty_expression"],
    "number_col": "completion__suffix",
    "number_logprob_col": "completion__cond_logscores__corrected",
    "unc_col": "uncertainty_expression",
    # Project-wide defaults imported from `default_vars`.
    "uncertainty_expressions": UNCERTAINTY_EXPRESSIONS,
    "bin_center": BIN_CENTER,
    "bin_offset": BIN_OFFSET,
}


def parse_verifiable(df: pd.DataFrame, gen_study=False) -> pd.DataFrame:
    """Return a copy of a verifiable-statements frame with derived columns.

    The input frame is left untouched. The returned copy has:

    * ``_statement_type_orig``: the original ``statement_type`` values;
    * ``statement_type``: overwritten with the text before the first ``"_"``
      of the original value, stripped of surrounding whitespace;
    * ``statement_truth``: ``"true"`` when the substring ``"true"`` occurs in
      ``statement_id``, ``"false"`` otherwise.

    Args:
        df: frame with (at least) ``statement_type`` and ``statement_id``.
        gen_study: when truthy, 2 distinct statement types are expected
            instead of 6.

    Raises:
        AssertionError: if the number of distinct ``statement_type`` values
            differs from the expected one.
    """
    expected_n_types = 2 if gen_study else 6
    assert df["statement_type"].nunique() == expected_n_types, df["statement_type"].unique()

    def _base_type(raw_type):
        # Keep only what precedes the first underscore.
        return raw_type.split("_")[0].strip()

    def _truth_label(statement_id):
        return "true" if "true" in statement_id else "false"

    original_types = df["statement_type"]
    return df.assign(
        _statement_type_orig=original_types,
        statement_type=original_types.map(_base_type),
        statement_truth=df["statement_id"].map(_truth_label),
    )

In [2]:
# Root folder under which every histogram computed below is persisted.
OUTPUT_DIR = "../../results"

# Few-shot settings to process; each value selects one `*-{n_shot}-shot` folder.
N_SHOTS = (0, 2)

# 1. Greedy Histogram

In this section, we parse the results obtained via the full probability method. This method is applicable only to open-source models, for which we have access to the full probability distribution $p(x \mid \text{prompt})$ over the integers $x \in \{0, 1, \dots, 100\}$. These probabilities are stored in a long-form dataframe, which means that every example corresponds to 101 rows.
Therefore, a greedy histogram can be constructed by taking the argmax over the 101 rows of each example. 
The **full probability** approach uses the corrected probability of generating a number (and no number immediately after). 


**Algorithms**:
---
To compute the greedy histograms we proceed as follows: 
1. Consider the probability distribution over the integer numbers 0, 1, ..., 100 (and no number immediately after).
2. Select the argmax prediction of this probability distribution.
3. Determine the corresponding bin to the argmax prediction.
4. Add 1 to that bin.
5. Repeat steps 2-4 for every statement.
6. Normalize by the number of statements used.



The models which we consider for this analysis are: 
- `allenai/OLMo-7B-Instruct`
- `google/gemma-1.1-2b-it`
- `lmsys/vicuna-13b-v1.5`
- `meta-llama/Meta-Llama-3-8B-Instruct`
- `meta-llama/Meta-Llama-3-70B-Instruct`
- `mistralai/Mistral-7B-Instruct-v0.2`


### 1.1. Non-verifiable

In [3]:
# Greedy histograms for the non-verifiable statements: for every few-shot
# setting and every model file found, persist the overall histogram plus one
# per gender and one per statement type.
for n_shot in N_SHOTS:
    # glob returns matches in arbitrary (filesystem-dependent) order; sorting
    # keeps the processing order and the log below reproducible across runs.
    full_prob_filepaths = sorted(glob.glob(f"../../results/outputs/non-verifiable-{n_shot}-shot/**/0_to_100_completions.csv", recursive=True))

    for fp in full_prob_filepaths:
        print("Processing", fp)
        df = pd.read_csv(fp)
        # ^Note: df size: 101 * n_unique_statements * n_uncertainty expressions
        # .size() counts rows per group; .count() would count non-null cells per
        # column and fail spuriously whenever any other column contains a NaN.
        assert (df.groupby(["uncertainty_expression", "statement_uuid", "statement_type"]).size() == 101).all()

        # Extract name of the model
        model_name = df.loc[0, "completion__model"]
        assert model_name in fp
        prefix = "full__" + model_name.replace("/", "__")

        # Overall
        histograms = greedy_hists.create_histogram_for_full_logprobs__hf(df, **full_prob_hist_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/all/non_verifiable/models-{n_shot}shot", prefix=prefix)

        # By gender
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            # Each gender must be a strict subset of the data.
            assert len(df_gender_subset) < len(df)
            histograms = greedy_hists.create_histogram_for_full_logprobs__hf(df_gender_subset, **full_prob_hist_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_gender/non_verifiable/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # By statement type
        assert 4 == df["statement_type"].nunique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = greedy_hists.create_histogram_for_full_logprobs__hf(df_st_type_subset, **full_prob_hist_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_type/non_verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_type)

Processing ../../results/outputs/non-verifiable-0-shot/allenai/OLMo-7B-Instruct/0_to_100_completions.csv
Processing ../../results/outputs/non-verifiable-0-shot/google/gemma-1.1-2b-it/0_to_100_completions.csv
Processing ../../results/outputs/non-verifiable-0-shot/lmsys/vicuna-13b-v1.5/0_to_100_completions.csv
Processing ../../results/outputs/non-verifiable-0-shot/meta-llama/Meta-Llama-3-70B-Instruct/0_to_100_completions.csv
Processing ../../results/outputs/non-verifiable-0-shot/meta-llama/Meta-Llama-3-8B-Instruct/0_to_100_completions.csv
Processing ../../results/outputs/non-verifiable-0-shot/mistralai/Mistral-7B-Instruct-v0.2/0_to_100_completions.csv
Processing ../../results/outputs/non-verifiable-2-shot/allenai/OLMo-7B-Instruct/0_to_100_completions.csv
Processing ../../results/outputs/non-verifiable-2-shot/google/gemma-1.1-2b-it/0_to_100_completions.csv
Processing ../../results/outputs/non-verifiable-2-shot/lmsys/vicuna-13b-v1.5/0_to_100_completions.csv
Processing ../../results/outputs

### 1.2. Verifiable


In [None]:
# Greedy histograms for the verifiable statements: for every few-shot setting
# and every model file found, persist the overall histogram plus one per
# gender, per statement type and per statement truth value.
for n_shot in N_SHOTS:
    # glob returns matches in arbitrary (filesystem-dependent) order; sorting
    # keeps the processing order and the log below reproducible across runs.
    full_prob_filepaths = sorted(glob.glob(f"../../results/outputs/verifiable-FT-{n_shot}-shot/**/0_to_100_completions.csv", recursive=True))

    for fp in full_prob_filepaths:
        print("Processing", fp)
        df = parse_verifiable(pd.read_csv(fp))
        # ^Note: df size: 101 * n_unique_statements * n_uncertainty expressions
        # .size() counts rows per group; .count() would count non-null cells per
        # column and fail spuriously whenever any other column contains a NaN.
        assert (df.groupby(["uncertainty_expression", "statement_uuid", "statement_type"]).size() == 101).all()

        # Extract name of the model
        model_name = df.loc[0, "completion__model"]
        assert model_name in fp
        prefix = "full__" + model_name.replace("/", "__")

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Overall
        # ---- ---- ---- ---- ---- ---- ---- ----
        histograms = greedy_hists.create_histogram_for_full_logprobs__hf(df, **full_prob_hist_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/all/verifiable/models-{n_shot}shot", prefix=prefix)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Gender
        # ---- ---- ---- ---- ---- ---- ---- ----
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            # Each gender must be a strict subset of the data.
            assert len(df_gender_subset) < len(df)
            histograms = greedy_hists.create_histogram_for_full_logprobs__hf(df_gender_subset, **full_prob_hist_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_gender/verifiable/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement type
        # ---- ---- ---- ---- ---- ---- ---- ----
        # parse_verifiable collapses the original types to their prefix.
        assert 3 == df["statement_type"].nunique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = greedy_hists.create_histogram_for_full_logprobs__hf(df_st_type_subset, **full_prob_hist_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_type/verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_type)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement truth/falsity
        # ---- ---- ---- ---- ---- ---- ---- ----
        assert 2 == df["statement_truth"].nunique()
        for st_truth in df["statement_truth"].unique():
            # .copy() for consistency with the other subsets: the helper receives
            # an independent frame rather than a slice of `df`.
            df_subset_v = df[df["statement_truth"] == st_truth].copy()
            assert len(df_subset_v) < len(df)
            histograms = greedy_hists.create_histogram_for_full_logprobs__hf(df_subset_v, **full_prob_hist_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/greedy/by_statement_truth/verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_truth)

Processing ../../results/outputs/verifiable-FT-0-shot/allenai/OLMo-7B-Instruct/0_to_100_completions.csv
Processing ../../results/outputs/verifiable-FT-0-shot/google/gemma-1.1-2b-it/0_to_100_completions.csv
Processing ../../results/outputs/verifiable-FT-0-shot/lmsys/vicuna-13b-v1.5/0_to_100_completions.csv
Processing ../../results/outputs/verifiable-FT-0-shot/meta-llama/Meta-Llama-3-8B-Instruct/0_to_100_completions.csv
Processing ../../results/outputs/verifiable-FT-0-shot/mistralai/Mistral-7B-Instruct-v0.2/0_to_100_completions.csv
Processing ../../results/outputs/verifiable-FT-2-shot/allenai/OLMo-7B-Instruct/0_to_100_completions.csv


### 1.3. Verifiable (AI2-Arc)

Given the time complexity required to run this strategy, we were not able to include the results for the AI2 ARC in the paper; we leave this for future work. Instead, we used Together AI as the inference server and obtained a greedy continuation. We argue that, because these datasets are larger (>200 examples) than the simpler verifiable and non-verifiable scenarios, sampling a sequence in a greedy fashion and using those values to compute the greedy histogram is a reasonable approximation.

# 2. Probabilistic Histogram

The richer (probabilistic) histogram is built by summing the available probability information of each prediction. 
That is, regardless of the methodology used (sampling or probability), we accumulate the normalized probability assigned to each number (the remaining probability mass is assigned to `-1`), so that the total sums to 1. Whenever a number is not generated, we accumulate a value of 1 in the `-1` bin. 

When computing the probabilistic histogram for models that only expose top-k log probabilities, the data contain number-specific columns that record whether a number was present amongst the top-k predicted tokens.

**Note**: We do include probability information in the computation of the histogram. 


### 2.1. Non-verifiable

In [None]:
# Probabilistic histograms for the non-verifiable statements: for every
# few-shot setting and every model file found, persist the overall histogram
# plus one per gender and one per statement type.
for n_shot in N_SHOTS:
    # glob returns matches in arbitrary (filesystem-dependent) order; sorting
    # keeps the processing order and the printed log reproducible across runs.
    full_prob_filepaths = sorted(glob.glob(f"../../results/outputs/non-verifiable-{n_shot}-shot/**/0_to_100_completions.csv", recursive=True))

    for fp in full_prob_filepaths:
        print("Processing", fp)
        df = pd.read_csv(fp)
        # ^Note: df size: 101 * n_unique_statements * n_uncertainty expressions
        # .size() counts rows per group; .count() would count non-null cells per
        # column and fail spuriously whenever any other column contains a NaN.
        assert (df.groupby(["uncertainty_expression", "statement_uuid", "statement_type"]).size() == 101).all()

        # Extract name of the model
        model_name = df.loc[0, "completion__model"]
        assert model_name in fp
        prefix = "full__" + model_name.replace("/", "__")

        # Overall
        histograms = prob_hists.create_histogram_for_full_logprobs__hf(df, **full_prob_hist_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/all/non_verifiable/models-{n_shot}shot", prefix=prefix)

        # By gender
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            # Each gender must be a strict subset of the data.
            assert len(df_gender_subset) < len(df)
            histograms = prob_hists.create_histogram_for_full_logprobs__hf(df_gender_subset, **full_prob_hist_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_gender/non_verifiable/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # By statement type
        assert 4 == df["statement_type"].nunique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = prob_hists.create_histogram_for_full_logprobs__hf(df_st_type_subset, **full_prob_hist_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_statement_type/non_verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_type)

### 2.2. Verifiable

In [None]:
# Probabilistic histograms for the verifiable statements: for every few-shot
# setting and every model file found, persist the overall histogram plus one
# per gender, per statement type and per statement truth value.
for n_shot in N_SHOTS:
    # glob returns matches in arbitrary (filesystem-dependent) order; sorting
    # keeps the processing order and the printed log reproducible across runs.
    full_prob_filepaths = sorted(glob.glob(f"../../results/outputs/verifiable-FT-{n_shot}-shot/**/0_to_100_completions.csv", recursive=True))

    for fp in full_prob_filepaths:
        print("Processing", fp)
        df = parse_verifiable(pd.read_csv(fp))
        # ^Note: df size: 101 * n_unique_statements * n_uncertainty expressions
        # .size() counts rows per group; .count() would count non-null cells per
        # column and fail spuriously whenever any other column contains a NaN.
        assert (df.groupby(["uncertainty_expression", "statement_uuid", "statement_type"]).size() == 101).all()

        # Extract name of the model
        model_name = df.loc[0, "completion__model"]
        assert model_name in fp
        prefix = "full__" + model_name.replace("/", "__")

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Overall
        # ---- ---- ---- ---- ---- ---- ---- ----
        histograms = prob_hists.create_histogram_for_full_logprobs__hf(df, **full_prob_hist_kwargs)
        persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/all/verifiable/models-{n_shot}shot", prefix=prefix)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Gender
        # ---- ---- ---- ---- ---- ---- ---- ----
        for gender in ("male", "female"):
            df_gender_subset = df[df["gender"] == gender].copy()
            # Each gender must be a strict subset of the data.
            assert len(df_gender_subset) < len(df)
            histograms = prob_hists.create_histogram_for_full_logprobs__hf(df_gender_subset, **full_prob_hist_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_gender/verifiable/models-{n_shot}shot", prefix=prefix+"_"+gender)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement type
        # ---- ---- ---- ---- ---- ---- ---- ----
        # parse_verifiable collapses the original types to their prefix.
        assert 3 == df["statement_type"].nunique()
        for st_type in df["statement_type"].unique():
            df_st_type_subset = df[df["statement_type"] == st_type].copy()
            assert df_st_type_subset["statement_type"].nunique() == 1
            histograms = prob_hists.create_histogram_for_full_logprobs__hf(df_st_type_subset, **full_prob_hist_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_statement_type/verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_type)

        # ---- ---- ---- ---- ---- ---- ---- ----
        # Statement truth/falsity
        # ---- ---- ---- ---- ---- ---- ---- ----
        assert 2 == df["statement_truth"].nunique()
        for st_truth in df["statement_truth"].unique():
            # .copy() for consistency with the other subsets: the helper receives
            # an independent frame rather than a slice of `df`.
            df_subset_v = df[df["statement_truth"] == st_truth].copy()
            assert len(df_subset_v) < len(df)
            histograms = prob_hists.create_histogram_for_full_logprobs__hf(df_subset_v, **full_prob_hist_kwargs)
            persist_histograms(*histograms, results_folder=f"{OUTPUT_DIR}/probabilistic/by_statement_truth/verifiable/models-{n_shot}shot", prefix=prefix+"_"+st_truth)

### 2.3. Verifiable (AI2-Arc)

Given the time complexity required to run this strategy, we were not able to include the results for the AI2 ARC in the paper; we leave this for future work. Instead, we used greedy decoding to obtain a response from the same models. 
Please consider running the notebook `Models_Sampled_Completions_Histograms` instead.