In [None]:
import json
import pandas as pd
from tqdm import tqdm
from tabulate import tabulate
import random
random.seed(42)

In [41]:
def _bootstrap_case_intervals(data, confidence, num_bootstrap, case_ids):
    """Percentile-bootstrap bounds of mean EM / F1 for each requested case.

    Args:
        data: Sequence of per-question dicts holding ``case_<id>_em`` (truthy /
            falsy) and ``case_<id>_f1`` (numeric) for every id in ``case_ids``.
        confidence: Two-sided confidence level, between 0 and 1 inclusive.
        num_bootstrap: Number of resamples drawn with replacement.
        case_ids: Iterable of case numbers to evaluate.

    Returns:
        Dict with ``case_<id>_lower_em``, ``case_<id>_upper_em``,
        ``case_<id>_lower_f1`` and ``case_<id>_upper_f1`` for each case id,
        each expressed as a percentage (mean * 100).

    Raises:
        ValueError: If ``data`` is empty, ``num_bootstrap`` is below 1, or
            ``confidence`` lies outside [0, 1].
    """
    n = len(data)
    if n == 0:
        raise ValueError("data must contain at least one entry")
    if num_bootstrap < 1:
        raise ValueError("num_bootstrap must be at least 1")
    if not 0 <= confidence <= 1:
        raise ValueError("confidence must be between 0 and 1")

    case_ids = list(case_ids)
    em_means = {case_id: [] for case_id in case_ids}
    f1_means = {case_id: [] for case_id in case_ids}

    # Every case is scored on the same resample, so one draw per iteration.
    for _ in range(num_bootstrap):
        resample = random.choices(data, k=n)
        for case_id in case_ids:
            em_means[case_id].append(
                sum([int(entry[f"case_{case_id}_em"]) for entry in resample]) / n
            )
            f1_means[case_id].append(
                sum([entry[f"case_{case_id}_f1"] for entry in resample]) / n
            )

    # Clamp to the last valid position: at confidence == 1.0 the raw upper
    # index equals num_bootstrap, which is one past the end of the list.
    last = num_bootstrap - 1
    lower_idx = min(int(num_bootstrap * (1 - confidence) / 2), last)
    upper_idx = min(int(num_bootstrap * (1 - (1 - confidence) / 2)), last)

    res = dict()
    for case_id in case_ids:
        em_sorted = sorted(em_means[case_id])
        f1_sorted = sorted(f1_means[case_id])
        res[f"case_{case_id}_lower_em"] = em_sorted[lower_idx] * 100
        res[f"case_{case_id}_upper_em"] = em_sorted[upper_idx] * 100
        res[f"case_{case_id}_lower_f1"] = f1_sorted[lower_idx] * 100
        res[f"case_{case_id}_upper_f1"] = f1_sorted[upper_idx] * 100
    return res


def bootstrap_confidence_interval(data, confidence=0.95, num_bootstrap=1000, num_cases=6):
    """Bootstrap EM / F1 confidence bounds for cases ``1..num_cases``.

    Args:
        data: Sequence of per-question result dicts (see
            ``_bootstrap_case_intervals`` for the keys that are read).
        confidence: Two-sided confidence level (default 0.95).
        num_bootstrap: Number of bootstrap resamples (default 1000).
        num_cases: Highest case number to evaluate (default 6).

    Returns:
        Dict of lower / upper bounds per case and metric, in percent.

    Raises:
        ValueError: On empty data or out-of-range ``confidence`` /
            ``num_bootstrap``.
    """
    return _bootstrap_case_intervals(
        data, confidence, num_bootstrap, range(1, num_cases + 1)
    )


def bootstrap_confidence_interval_baseline(data, confidence=0.95, num_bootstrap=1000):
    """Bootstrap EM / F1 confidence bounds for the two cases (1 and 2) of a baseline run.

    Args:
        data: Sequence of per-question result dicts with case 1 and case 2 keys.
        confidence: Two-sided confidence level (default 0.95).
        num_bootstrap: Number of bootstrap resamples (default 1000).

    Returns:
        Dict of lower / upper bounds for cases 1 and 2, in percent.

    Raises:
        ValueError: On empty data or out-of-range ``confidence`` /
            ``num_bootstrap``.
    """
    return _bootstrap_case_intervals(data, confidence, num_bootstrap, range(1, 3))

In [None]:
def load_entries(model, strategy):
    """Load the per-question results JSON for one model / strategy run."""
    with open(f'results/morehopqa_final_{model}_{strategy}.json', 'r') as f:
        return json.load(f)


def summarize_run(entries, num_cases, bootstrap_fn):
    """Aggregate one run into mean EM / F1 percentages plus bootstrap margins.

    Args:
        entries: Dict of per-question result dicts; only its values are used.
        num_cases: Cases ``1..num_cases`` are read from every entry.
        bootstrap_fn: Callable taking the list of entries and returning
            ``case_<n>_lower_<metric>`` / ``case_<n>_upper_<metric>`` bounds
            in percent.

    Returns:
        Dict with ``case_<n>_em`` and ``case_<n>_f1`` (percent) followed by
        ``case_<n>_em_pm`` and ``case_<n>_f1_pm``, the larger of the two
        distances from the mean to the interval bounds.
    """
    records = list(entries.values())
    case_numbers = range(1, num_cases + 1)

    summary = dict()
    for case_number in case_numbers:
        summary[f"case_{case_number}_em"] = 0
        summary[f"case_{case_number}_f1"] = 0
    for entry in records:
        for case_number in case_numbers:
            summary[f"case_{case_number}_em"] += 1 if entry[f"case_{case_number}_em"] else 0
            summary[f"case_{case_number}_f1"] += entry[f"case_{case_number}_f1"]
    for case_number in case_numbers:
        for metric in ("em", "f1"):
            summary[f"case_{case_number}_{metric}"] /= len(records)
            summary[f"case_{case_number}_{metric}"] *= 100

    bootstrapped = bootstrap_fn(records)
    for case_number in case_numbers:
        for metric in ("em", "f1"):
            point = summary[f"case_{case_number}_{metric}"]
            lower = bootstrapped[f"case_{case_number}_lower_{metric}"]
            upper = bootstrapped[f"case_{case_number}_upper_{metric}"]
            # The interval need not be symmetric; report the wider side.
            summary[f"case_{case_number}_{metric}_pm"] = max(abs(point - lower), abs(upper - point))
    return summary


results = dict()
for model in tqdm(["llama-8b", "mistral-7b", "gemma-7b", "llama-70b", "gpt-4-turbo-direct"]):
    for strategy in tqdm(["zeroshot", "2-shot", "3-shot", "zeroshot-cot", "2-shot-cot", "3-shot-cot"]):
        entries = load_entries(model, strategy)
        result = summarize_run(entries, 6, bootstrap_confidence_interval)
        results[f"{model}_{strategy}"] = result

# The baseline is scored on cases 1 and 2 only, so it must use the two-case
# bootstrap helper; the six-case one would fail with a KeyError at case 3.
model = "baseline"
strategy = "zeroshot-cot"
entries = load_entries(model, strategy)
result = summarize_run(entries, 2, bootstrap_confidence_interval_baseline)
results[f"{model}_{strategy}"] = result

In [None]:
rows = []

# One row per model/strategy: {'Model': name, 'case_x': {'em': str, 'f1': str}}
for model, scores in results.items():
    row = {'Model': model}
    for key, value in scores.items():
        if key.endswith('_pm'):
            continue  # margins are folded into the matching score cell below
        # 'case_3_f1' -> ('case_3', 'f1'); 'case_3_em' -> ('case_3', 'em')
        case_name, _, metric = key.rpartition('_')

        cell = f"{value:.2f}"
        pm_key = f"{key}_pm"
        if pm_key in scores:
            # LaTeX subscript carrying the plus-minus margin
            cell = cell + "$_{\\pm" + f"{scores[pm_key]:.2f}" + "}$"
        row.setdefault(case_name, {})[metric] = cell
    rows.append(row)

# Collect (case, metric) column labels in first-seen order across all rows.
column_keys = list(dict.fromkeys(
    (case_name, metric)
    for row in rows
    for case_name, metrics in row.items()
    if case_name != 'Model'
    for metric in metrics
))

# Build the two-level table in one step. Rows that lack a case (for example a
# run scored on fewer cases) get NaN cells instead of a spurious extra column.
df = pd.DataFrame(
    [
        [row.get(case_name, {}).get(metric, float('nan')) for case_name, metric in column_keys]
        for row in rows
    ],
    index=pd.Index([row['Model'] for row in rows], name='Model'),
    # from_tuples cannot infer levels from an empty list, so fall back to None
    columns=pd.MultiIndex.from_tuples(column_keys) if column_keys else None,
)

print(df)

In [None]:
# Render the results table as an ASCII grid, using the column labels as headers.
grid_table = tabulate(df, headers='keys', tablefmt='grid')
print(grid_table)

In [45]:
# Write the results table to 'results.xlsx' (path is relative to the working directory).
df.to_excel('results.xlsx')