## Import libraries

In [1]:
import os
import jsonlines

import numpy as np
from collections import defaultdict

In [2]:
# Root directory of the result files; each dataset/model pair is read from
# "<result_dir>/<dataset_name>/<model_name>/". The path is relative, so it is
# resolved against the current working directory.
result_dir = "../../../results"

## Confidence and majority level (bin count)

In [3]:
dataset_names = ["CommonsenseQA", "QASC", "100TFQA", "GSM8K"]
model_names = ["Llama-3.1-8B-Instruct", "gpt-4o-2024-11-20"]
prompting_strategies = ["zero-shot", "zero-shot-cot", "few-shot", "few-shot-cot"]

# Number of prompt formats expected per example. Only examples for which a
# confidence was found for every format are counted, and the agreement levels
# reported below range over 1..NUM_FORMATS.
NUM_FORMATS = 8


def _prediction_confidence(example, format_id, predicted):
    """Return the confidence of `predicted` for one format, or None if absent.

    Token positions are scanned from the last one backwards. The first
    position whose leading `top_tokens` entry equals `predicted` supplies the
    value: taken directly from `top_probs` when the example has that key,
    otherwise the exponential of the matching `top_logprobs` entry.
    """
    top_tokens = example["top_tokens"][format_id]
    # Negative offsets are used for all three sequences so that they are
    # aligned from the end, exactly as the answer token is searched.
    for offset in range(1, len(top_tokens) + 1):
        if top_tokens[-offset][0] != predicted:
            continue
        if "top_probs" in example:
            return example["top_probs"][format_id][-offset][0]
        return np.exp(example["top_logprobs"][format_id][-offset][0])
    return None


# Read score file
scores = {}
for model_name in model_names:
    for dataset_name in dataset_names:
        for prompting_strategy in prompting_strategies:
            # Agreement level (how many formats gave the same prediction)
            # -> list of per-format confidences observed at that level.
            dd = defaultdict(list)

            output_dir = f"{result_dir}/{dataset_name}/{model_name}"
            predictions_path = os.path.join(output_dir, f"{prompting_strategy}_predictions.jsonl")
            raw_predictions_path = os.path.join(output_dir, f"{prompting_strategy}_raw_predictions.jsonl")
            try:
                id_predictions_map, id_consistency_map = {}, {}
                with jsonlines.open(predictions_path) as fin:
                    for example in fin.iter():
                        id_predictions_map[example["id"]] = example["predictions"]
                        id_consistency_map[example["id"]] = example["consistency"]["mean"]

                # X: mean confidence, Y: mean consistency, Z: 1.0 when the
                # consistency is (numerically) perfect, else 0.0.
                X, Y, Z = [], [], []
                with jsonlines.open(raw_predictions_path) as fin:
                    for example in fin.iter():
                        predictions = id_predictions_map[example["id"]]

                        # Keyed by format id so that each confidence is later
                        # paired with the prediction of the same format.
                        confidences = {}
                        for format_id in example["top_tokens"]:
                            confidence = _prediction_confidence(
                                example, format_id, predictions[format_id]
                            )
                            if confidence is not None:
                                confidences[format_id] = confidence

                        # Skip examples where the predicted answer could not be
                        # located among the top tokens for every format.
                        if len(confidences) != NUM_FORMATS:
                            continue

                        consistency = id_consistency_map[example["id"]]
                        X.append(np.mean(list(confidences.values())))
                        Y.append(consistency)
                        Z.append(1.0 * (consistency >= 0.99))

                        # Per-format predictions only; the aggregated
                        # "majority_voting" entry is left out without mutating
                        # the loaded predictions.
                        votes = [
                            prediction
                            for key, prediction in predictions.items()
                            if key != "majority_voting"
                        ]
                        for format_id, confidence in confidences.items():
                            agreement = votes.count(predictions[format_id])
                            dd[agreement].append(confidence)
            except FileNotFoundError:
                # No result files for this combination: nothing to report.
                continue
            except Exception as err:
                # Stay best-effort, but make the failure visible instead of
                # silently dropping the whole combination.
                print(f"[skipped] {dataset_name} / {model_name} / {prompting_strategy}: {err!r}")
                continue

            print(f"{dataset_name} / {model_name} / {prompting_strategy}")
            for level in range(1, NUM_FORMATS + 1):
                bucket = dd[level]
                # np.mean([]) emits a RuntimeWarning; report nan directly.
                level_mean = np.mean(bucket) if bucket else float("nan")
                print(f"{level} / {len(bucket):4} / {level_mean:.3f}")
            print()

CommonsenseQA / Llama-3.1-8B-Instruct / zero-shot
1 /   48 / 0.511
2 /   50 / 0.552
3 /   69 / 0.576
4 /   68 / 0.584
5 /   95 / 0.598
6 /  144 / 0.638
7 /  294 / 0.687
8 / 7048 / 0.948

CommonsenseQA / Llama-3.1-8B-Instruct / zero-shot-cot
1 /  133 / 0.880
2 /  158 / 0.862
3 /  183 / 0.908
4 /  232 / 0.899
5 /  315 / 0.903
6 /  366 / 0.938
7 /  581 / 0.947
8 / 5832 / 0.991

CommonsenseQA / Llama-3.1-8B-Instruct / few-shot
1 /  262 / 0.858
2 /  478 / 0.917
3 /  165 / 0.776
4 /  168 / 0.759
5 /  230 / 0.772
6 / 1236 / 0.901
7 / 1421 / 0.930
8 / 3856 / 0.954

QASC / Llama-3.1-8B-Instruct / zero-shot
1 /   24 / 0.517
2 /   50 / 0.492
3 /   30 / 0.502
4 /   84 / 0.531
5 /   60 / 0.583
6 /  132 / 0.597
7 /  140 / 0.681
8 / 5408 / 0.948

QASC / Llama-3.1-8B-Instruct / zero-shot-cot
1 /   83 / 0.804
2 /  104 / 0.851
3 /  114 / 0.815
4 /   76 / 0.912
5 /  180 / 0.938
6 /  222 / 0.927
7 /  301 / 0.959
8 / 4800 / 0.991

QASC / Llama-3.1-8B-Instruct / few-shot
1 /   83 / 0.750
2 /  866 / 0.942
3 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


100TFQA / gpt-4o-2024-11-20 / few-shot-cot
1 /    3 / 1.000
2 /    0 / nan
3 /    3 / 1.000
4 /    0 / nan
5 /    5 / 1.000
6 /    0 / nan
7 /   21 / 1.000
8 /  608 / 1.000

GSM8K / gpt-4o-2024-11-20 / zero-shot-cot
1 /   73 / 0.672
2 /   94 / 0.719
3 /   81 / 0.782
4 /   84 / 0.735
5 /  115 / 0.840
6 /  132 / 0.977
7 /  189 / 0.967
8 / 6568 / 0.997

GSM8K / gpt-4o-2024-11-20 / few-shot-cot
1 /   19 / 0.993
2 /   20 / 1.000
3 /   30 / 1.000
4 /   32 / 1.000
5 /   35 / 1.000
6 /   60 / 1.000
7 /   84 / 1.000
8 / 7184 / 1.000

