In [27]:
import pandas as pd
import numpy as np
import json
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from sklearn.metrics import cohen_kappa_score
from scipy.stats import pearsonr, spearmanr




In [43]:
# Display name for each dataset key; used when printing the LaTeX table rows.
dataset_names = dict(
    massmaps="Mass Maps",
    supernova="Supernova",
    politeness="Politeness",
    emotion="Emotion",
    cholec="Cholecystectomy",
    cardiac="Cardiac",
    sepsis="Sepsis",
)

# Dataset keys in processing order (dicts preserve insertion order, so this
# is the same order as the entries above).
datasets = list(dataset_names)

# File-name templates for the two annotation files of a dataset; the `{}`
# placeholder is filled with the dataset key.
filepath = "vanilla_{}_gpt-4o_annot.json"
filepath2 = "vanilla_{}_gpt-4o_annot2.json"

# Accumulators across all datasets, one list per stage and per annotator
# (suffix _1 = first annotation file, _2 = second annotation file).
claim_vals_1, claim_vals_2 = [], []
relevance_vals_1, relevance_vals_2 = [], []
alignment_vals_1, alignment_vals_2 = [], []

# Per-dataset pass: read both annotators' files, collect the per-stage
# accuracy values, and print one LaTeX table row with the counts, the mean
# accuracy of each stage, and Cohen's kappa between the two annotators.
for dataset in datasets:
    # Load the data; a dataset is skipped when either annotation file is missing.
    try:
        with open(filepath.format(dataset), "r") as f:
            data = json.load(f)
        with open(filepath2.format(dataset), "r") as f:
            data2 = json.load(f)
    except FileNotFoundError:
        print(f"File not found for dataset: {dataset}")
        continue

    # Values of both annotators pooled together, per stage (used for the means
    # and the counts in the table row).
    claim_accs_all = []
    relevance_accs_all = []
    alignment_accs_all = []

    # All values of annotator 1 / annotator 2 in the same order, so that
    # position i in one list is paired with position i in the other.
    all_values_1 = []
    all_values_2 = []

    for key in data:
        # Read in annotator 1 examples.
        # NOTE(review): only element [0] of each relevance/alignment entry is
        # used; presumably it is the score and the rest is metadata — confirm.
        example = data[key]
        claim_accs = example["claim_decomposition_accuracy"]
        relevance_accs = [entry[0] for entry in example["relevance_filtering_accuracy"]]
        alignment_accs = [entry[0] for entry in example["expert_alignment_accuracy"]]

        claim_vals_1.append(claim_accs)
        relevance_vals_1.extend(relevance_accs)
        alignment_vals_1.extend(alignment_accs)

        # Read in annotator 2 examples (same key as annotator 1).
        example2 = data2[key]
        claim_accs2 = example2["claim_decomposition_accuracy"]
        relevance_accs2 = [entry[0] for entry in example2["relevance_filtering_accuracy"]]
        alignment_accs2 = [entry[0] for entry in example2["expert_alignment_accuracy"]]

        # Kappa compares the two annotators position by position, so both must
        # have rated the same number of items in every example; otherwise the
        # flattened sequences would silently drift out of alignment.
        assert (
            len(relevance_accs) == len(relevance_accs2)
            and len(alignment_accs) == len(alignment_accs2)
        ), f"Annotators rated a different number of items for {dataset}/{key}"

        claim_vals_2.append(claim_accs2)
        relevance_vals_2.extend(relevance_accs2)
        alignment_vals_2.extend(alignment_accs2)

        claim_accs_all.append(claim_accs)
        claim_accs_all.append(claim_accs2)
        relevance_accs_all.extend(relevance_accs)
        relevance_accs_all.extend(relevance_accs2)
        alignment_accs_all.extend(alignment_accs)
        alignment_accs_all.extend(alignment_accs2)

        all_values_1 += [claim_accs] + relevance_accs + alignment_accs
        all_values_2 += [claim_accs2] + relevance_accs2 + alignment_accs2

    # Calculate Cohen's kappa over every value of the dataset.
    possible_values = [0, 0.5, 1]
    # `assert [..]` on a non-empty list is always true; `all(...)` is what
    # actually validates the values.
    assert all(x in possible_values for x in all_values_1), (
        f"Unexpected annotation value from annotator 1 in {dataset}"
    )
    assert all(x in possible_values for x in all_values_2), (
        f"Unexpected annotation value from annotator 2 in {dataset}"
    )
    # Go through float() so that an int 1 and a float 1.0 map to the same
    # label ("1.0") instead of being counted as a disagreement.
    all_values_1_str = [str(float(x)) for x in all_values_1]
    all_values_2_str = [str(float(x)) for x in all_values_2]
    kappa = cohen_kappa_score(all_values_1_str, all_values_2_str)

    # Print a LaTeX table row.
    # NOTE: the two count columns are lengths of the pooled lists, i.e. they
    # include the values of both annotators.
    latex_row = (
        f"\\textbf{{{dataset_names[dataset]}}} & "
        f"{len(relevance_accs_all)} & "
        f"{len(alignment_accs_all)} & "
        f"{np.mean(claim_accs_all):.3f} & "
        f"{np.mean(relevance_accs_all):.3f} & "
        f"{np.mean(alignment_accs_all):.3f} & "
        f"{kappa:.4f} \\\\")

    print(latex_row)


def _print_stage_summary(header, vals_1, vals_2):
    """Print accuracy, N and Cohen's kappa for one stage.

    Args:
        header: Line printed before the statistics.
        vals_1: Values collected from the first annotation file.
        vals_2: Values collected from the second annotation file.

    Returns:
        A ``(labels_1, labels_2)`` tuple with every value converted via
        ``str``; these are the label sequences passed to
        ``cohen_kappa_score``.
    """
    print(header)
    # Accuracy is the mean over both annotators; N counts annotator 1 only.
    print(f"Accuracy:{np.mean(vals_1 + vals_2)}")
    print(f"N:{len(vals_1)}")
    labels_1 = [str(value) for value in vals_1]
    labels_2 = [str(value) for value in vals_2]
    print(f"Kappa:{cohen_kappa_score(labels_1, labels_2)}")
    return labels_1, labels_2


print("\n")
# Average for each stage. The accumulator names are rebound to their string
# label lists afterwards.
claim_vals_1, claim_vals_2 = _print_stage_summary(
    "---- Claim Decomposition ----", claim_vals_1, claim_vals_2
)
relevance_vals_1, relevance_vals_2 = _print_stage_summary(
    "\n---- Relevance Filtering ----", relevance_vals_1, relevance_vals_2
)
alignment_vals_1, alignment_vals_2 = _print_stage_summary(
    "\n---- Expert Alignment ----", alignment_vals_1, alignment_vals_2
)


\textbf{Mass Maps} & 66 & 48 & 0.900 & 0.826 & 0.979 & 0.4059 \\
\textbf{Supernova} & 74 & 62 & 0.950 & 0.892 & 0.903 & 0.4946 \\
\textbf{Politeness} & 72 & 58 & 0.950 & 0.931 & 0.914 & 0.6604 \\
\textbf{Emotion} & 70 & 44 & 1.000 & 0.929 & 0.943 & 0.6233 \\
\textbf{Cholecystectomy} & 134 & 92 & 1.000 & 0.851 & 0.902 & 0.4396 \\
\textbf{Cardiac} & 66 & 52 & 0.900 & 0.841 & 0.962 & 0.4845 \\
\textbf{Sepsis} & 108 & 66 & 0.900 & 0.852 & 0.894 & 0.3500 \\


---- Claim Decomposition ----
Accuracy:0.9428571428571428
N:35
Kappa:0.7169811320754718

---- Relevance Filtering ----
Accuracy:0.8711864406779661
N:295
Kappa:0.4023551259865784

---- Expert Alignment ----
Accuracy:0.9229857819905213
N:211
Kappa:0.40516463689670723
