In [1]:
import glob
import string
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def get_llm_correctness_results(results_fp: str, ground_truth_fp: str, single_prompt_substr: str | None=None):
    def remove_puctuation(text):
        import string
        return text.translate(str.maketrans('', '', string.punctuation))

    def majority_voting(df):
        counts = df.value_counts()
        majority = sorted(counts.items(), key=lambda x: x[1])[-1]
        conf = majority[1] / len(df)
        return  majority[0], conf

    answer_per_type = {
        "non-verifiable": "unknown",
        "verifiable-true": "true",
        "verifiable-false": "false",
    }
    
    ground_truth = pd.read_csv(ground_truth_fp)
    ground_truth = ground_truth.drop_duplicates("prompt", keep="first")
    ground_truth["correct_response"] = ground_truth["type"].apply(lambda x: answer_per_type[x])
    
    results = pd.read_csv(results_fp).drop("ix", axis=1)
    # Filter the results by the prompts that contain a specific substring
    if single_prompt_substr is not None:
        results = results[results["prompt"].apply(lambda p: single_prompt_substr in p)]

    
    results["__orig_pred_response"] = results["responses"]
    results["model"] = results_fp.rpartition("/")[-1].rpartition(".csv")[0]
    results["responses"] = results["responses"].apply(str.lower).apply(str.strip)
    results["responses"] = results["responses"].apply(remove_puctuation)
    results = results.rename({"responses": "pred_response"}, axis=1)
    results = ground_truth.set_index("prompt").join(results.set_index("prompt"), how="right")

    results["pred_response__mode"] = None
    results["pred_response_conf__mode"] = None
    # Determine the final prediction and its' confidence (using self-consistency approach)
    for statement in results["statement"].unique():
        st_subset = results[results["statement"] == statement]
        resp, freq = majority_voting(st_subset["pred_response"])
        results.loc[results["statement"] == statement, "pred_response__mode"] = resp
        results.loc[results["statement"] == statement, "pred_response_conf__mode"] = freq
        
    # Select only 1 row per example
    results = results.groupby("statement").head(1)
    results["accuracy"] = results["correct_response"] == results["pred_response__mode"]
    return results

In this section, we analyse the extent to which LLMs' beliefs about correctness are related to the ground-truth correctness of the statements.

In [2]:
# Relative path of the directory for the correctness-vs-LLM-belief results.
# NOTE(review): not referenced in the cells visible here; presumably later cells write to it (confirm).
OUTPUT_DIR = "../../results/correctness-vs-llm-belief"

## 1. Main Experiment (single prompt only: True, False, Unknown)

We obtained results for 4 different models using three different prompts. The three different prompts aim to marginalize over potential biases that the models may have in predicting one of the labels `True`, `False`, `Unknown`. 

The evaluated models are: 
- `gpt-3.5-turbo-0125`
- `gpt-4-turbo-2024-04-09`
- `gpt-4o-2024-05-13`
- `gemini-pro`

Each prompt is executed 7 times using the following configuration:
```python
max_tokens = 30
temperature = 0.5
n_samples = 7
```

In [3]:
# Column name -> list of values, one entry per evaluated model (becomes the LaTeX table).
results = defaultdict(list)

ground_truth_fp = "../../data/assumptions/main-exp_true_false3.csv"
# glob returns paths in arbitrary order; sorting keeps the table rows reproducible across runs.
main_exp_filepaths = sorted(
    glob.glob("../../results/outputs/correctness-vs-llm-belief/main_exp_true_false3/**/*.csv", recursive=True)
)
for fp in main_exp_filepaths:
    print(fp)
    # Score only the responses to the prompt variant that lists "True, False, Unknown".
    df = get_llm_correctness_results(fp, ground_truth_fp, single_prompt_substr="True, False, Unknown")
    results["model"].append(df["model"].values[0])
    results["avg acc"].append(np.round((df["accuracy"].mean() * 100), 2))

    # Accuracy (in %) broken down by statement type.
    for typ in ("non-verifiable", "verifiable-true", "verifiable-false"):
        subset_typ = df[df["type"] == typ]
        # Sanity check: number of statements of this type that were scored.
        print(len(subset_typ))
        results[typ].append(np.round((subset_typ["accuracy"].mean() * 100), 2))
# Without float_format the rounded values are rendered with six decimals (e.g. 90.000000).
print(pd.DataFrame(results).set_index("model").to_latex(float_format="%.2f"))

../../results/outputs/correctness-vs-llm-belief/main_exp_true_false3/gpt-3.5-turbo-0125.csv
110
30
30
../../results/outputs/correctness-vs-llm-belief/main_exp_true_false3/gpt-4-turbo-2024-04-09.csv
110
30
30
../../results/outputs/correctness-vs-llm-belief/main_exp_true_false3/gpt-4o-2024-05-13.csv
110
30
30
../../results/outputs/correctness-vs-llm-belief/main_exp_true_false3/models/gemini-pro.csv
110
30
30
\begin{tabular}{lrrrr}
\toprule
 & avg acc & non-verifiable & verifiable-true & verifiable-false \\
model &  &  &  &  \\
\midrule
gpt-3.5-turbo-0125 & 90.000000 & 90.000000 & 90.000000 & 90.000000 \\
gpt-4-turbo-2024-04-09 & 98.820000 & 100.000000 & 100.000000 & 93.330000 \\
gpt-4o-2024-05-13 & 98.820000 & 99.090000 & 100.000000 & 96.670000 \\
gemini-pro & 90.000000 & 90.910000 & 93.330000 & 83.330000 \\
\bottomrule
\end{tabular}

