In [1]:
import pandas as pd
import numpy as np
import json
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from scipy.stats import pearsonr





In [8]:
datasets = ["massmaps", "supernova", "politeness", "emotion", "cholec", "cardiac", "sepsis"]
filepath = "vanilla_{}_gpt-4o_annot.json"

for dataset in datasets:
    # Load the data
    try:
        with open(filepath.format(dataset), "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"File not found for dataset: {dataset}")
        continue
   
    keys = data.keys()
    claim_accs_all = []
    relevance_accs_all = []
    alignment_accs_all = []
    for key in keys:
        example = data[key]
        claim_accs = example["claim_decomposition_accuracy"]
        relevance_accs = example["relevance_filtering_accuracy"]
        relevance_accs = [relevance_accs[x][0] for x in range(len(relevance_accs))]
        alignment_accs = example["expert_alignment_accuracy"]
        alignment_accs = [alignment_accs[x][0] for x in range(len(alignment_accs))]
        
        
        claim_accs_all.append(claim_accs)
        for r in relevance_accs: relevance_accs_all.append(r)
        for a in alignment_accs: alignment_accs_all.append(a)

    print(f"\nDataset: {dataset}")
    print(f"Claim Decomposition Accuracy: {np.mean(claim_accs_all):.4f} ± {np.std(claim_accs_all):.4f}")
    print(f"Relevance Filtering Accuracy: {np.mean(relevance_accs_all):.4f} ± {np.std(relevance_accs_all):.4f}")
    print(f"Expert Alignment Accuracy: {np.mean(alignment_accs_all):.4f} ± {np.std(alignment_accs_all):.4f}")
    # claim_scores = [entry["claim_decomposition_accuracy"] for entry in data]
    # print(claim_scores)


Dataset: massmaps
Claim Decomposition Accuracy: 0.8000 ± 0.2449
Relevance Filtering Accuracy: 0.7576 ± 0.3718
Expert Alignment Accuracy: 0.9583 ± 0.1998

Dataset: supernova
Claim Decomposition Accuracy: 0.9000 ± 0.2000
Relevance Filtering Accuracy: 0.8378 ± 0.3686
Expert Alignment Accuracy: 0.8710 ± 0.3352

Dataset: politeness
Claim Decomposition Accuracy: 1.0000 ± 0.0000
Relevance Filtering Accuracy: nan ± nan
Expert Alignment Accuracy: nan ± nan

Dataset: emotion
Claim Decomposition Accuracy: 1.0000 ± 0.0000
Relevance Filtering Accuracy: nan ± nan
Expert Alignment Accuracy: nan ± nan

Dataset: cholec
Claim Decomposition Accuracy: 1.0000 ± 0.0000
Relevance Filtering Accuracy: 0.7537 ± 0.3994
Expert Alignment Accuracy: 0.8478 ± 0.2733

Dataset: cardiac
Claim Decomposition Accuracy: 0.8660 ± 0.1945
Relevance Filtering Accuracy: 0.8333 ± 0.3624
Expert Alignment Accuracy: 1.0000 ± 0.0000

Dataset: sepsis
Claim Decomposition Accuracy: 0.9000 ± 0.2000
Relevance Filtering Accuracy: 0.7593 ±