In [94]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import seaborn as sns
import numpy as np
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

In [95]:
def calculate_row_entropy(df):
    def entropy(row):
        counts = row.value_counts(normalize=True)
        # Calculate entropy
        return -np.sum(counts * np.log2(counts)) + np.finfo(float).eps # np.finfo(float).eps prevents log(0)

    # Apply the entropy function to each row
    return df.apply(entropy, axis=1)

def calculate_auc(y_true, y_scores):
    return roc_auc_score(y_true, y_scores)


def replace_yes_no(df):
    """
    Replace items in the DataFrame:
    - 'Yes' in the string gets replaced with 1
    - 'No' in the string gets replaced with 0

    Parameters:
        df (pd.DataFrame): The DataFrame to process.
        
    Returns:
        pd.DataFrame: The DataFrame with modified values.
    """
    def replace_value(item):
        if isinstance(item, str):  # Only process string items
            if 'Yes' in item:
                return 1
            elif 'No' in item:
                return 0
            elif 'no' in item:
                return 0
        return item  # Return the item unchanged if it is not a string or doesn't match 'Yes' or 'No'

    return df.applymap(replace_value)

def replace_yes_no_v2(df):
    """
    Replace items in the DataFrame:
    - 'Yes' in the string gets replaced with 1
    - 'No' in the string gets replaced with 0

    Parameters:
        df (pd.DataFrame): The DataFrame to process.
        
    Returns:
        pd.DataFrame: The DataFrame with modified values.
    """
    def replace_value(item):
        if isinstance(item, str):  # Only process string items
            if 'Yes' in item:
                return 1
            elif 'No' in item:
                return 0
            elif 'no' in item:
                return 0
        return item  # Return the item unchanged if it is not a string or doesn't match 'Yes' or 'No'

    return df.applymap(replace_value)

def evaluate_acc_uq(df_list):
    for k in range(len(df_list)):
        df_name = df_list[k]
        df_pred = pd.read_csv(df_name)
        gt_list = df_pred['gt']
        df_pred_baseline = df_pred.loc[:, ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']]
        print(df_name)
        if df_name == 'results/gpt35/baseline/LOS_baseline.csv':
            print(df_name)
            df_pred_baseline = replace_yes_no_v2(df_pred_baseline)
            df_pred_baseline.replace({'Yes': 0, 'No': 1, 'No, the patient will not be discharged within 7 days.':1}, inplace=True)
        else:
            df_pred_baseline = replace_yes_no(df_pred_baseline)
            df_pred_baseline.replace({'Yes': 1, 'No': 0, 'No, the patient will not be readmitted in 30 days.':0, 'No.':0, 
                                    'No, based on the information provided, it is not possible to determine whether the patient will be transferred to the ICU on the same admission date.':0}, inplace=True)

        uq_metric_baseline = calculate_row_entropy(df_pred_baseline).values
        uq_metric_baseline = np.round(uq_metric_baseline, 10) + 0.0000000001


        auc_list = []
        acc_list = []
        for c in df_pred_baseline.columns:
            p = df_pred_baseline[c].values
            auc = calculate_auc(df_pred['gt'].astype(int), df_pred_baseline[c])
            acc = np.mean(df_pred['gt'].astype(int) == df_pred_baseline[c])
            auc_list.append(auc)
            acc_list.append(acc)
        print(np.mean(acc_list).round(3), np.std(acc_list).round(3))
        print(np.mean(auc_list).round(3), np.std(auc_list).round(3))

        pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']
        pred_list = ['pred_1']
        uq_list = []
        for p in pred_list: 
            uq_auc = calculate_auc(df_pred['gt'].astype(int) == df_pred_baseline[p].astype(int), uq_metric_baseline)
            uq_list.append(uq_auc)
            # print(uq_auc)
        print(np.mean(uq_list).round(3), np.std(uq_list).round(3))
        print('')

In [123]:
def calculate_single_uq(eval_dir, f, pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']):
    df_gpt = pd.read_csv(os.path.join(eval_dir, f))
    if 'Unnamed: 0' in df_gpt.columns:
        df_gpt = df_gpt.drop(columns = ['Unnamed: 0'])
    else:
        pass
    pred_0 = df_gpt.loc[:, pred_list]
    pred_df = replace_yes_no_v2(pred_0)
    pred_mat = pred_df.to_numpy()
    
    gt_list = df_gpt['gt'].values
    pred_list = pred_mat.mean(axis = 1)
    
    whether_correct_or_not = np.array([(pred_mat[j] == gt_list[j]).mean() for j in range(len(gt_list))]) >= 0.5
    entropy = calculate_row_entropy(pred_df).values
    
    uq_auc = calculate_auc(whether_correct_or_not, entropy)
    auc = calculate_auc(gt_list, pred_list)
    print(f"Auc. score: {auc.round(4)}, Uncertainty Auc.: {uq_auc.round(4)}")
#     return uq_auc, auc
    return auc, uq_auc

def calculate_ensemble_uq(eval_dir1, eval_dir2, f1, f2, 
                          pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']):
    df_gpt1 = pd.read_csv(os.path.join(eval_dir1, f))
    df_gpt2 = pd.read_csv(os.path.join(eval_dir2, f))
    
    if 'Unnamed: 0' in df_gpt1.columns:
        df_gpt1 = df_gpt1.drop(columns = ['Unnamed: 0'])
    else:
        pass
    if 'Unnamed: 0' in df_gpt2.columns:
        df_gpt2 = df_gpt2.drop(columns = ['Unnamed: 0'])
    else:
        pass
    pred_1 = df_gpt1.loc[:, pred_list]
    pred_2 = df_gpt2.loc[:, pred_list]
    
    pred_all = pd.concat([pred_1, pred_2], axis = 1)
    pred_df = replace_yes_no_v2(pred_all)
    pred_mat = pred_df.to_numpy()
    gt_list_1 = df_gpt1['gt'].values
    gt_list_2 = df_gpt2['gt'].values
    
    assert np.all(gt_list_1 == gt_list_2)
    gt_list = gt_list_1
    pred_list = pred_mat.mean(axis = 1)
    
    whether_correct_or_not = np.array([(pred_mat[j] == gt_list_1[j]).mean() for j in range(len(gt_list_1))]) >= 0.5
    entropy = calculate_row_entropy(pred_df).values
    
    uq_auc = calculate_auc(whether_correct_or_not, entropy)
    auc = calculate_auc(gt_list, pred_list)
    print(f"Auc. score: {auc.round(4)}, Uncertainty Auc.: {uq_auc.round(4)}")
#     return uq_auc, auc
    return auc, uq_auc

In [124]:
# df_list = ['results/gpt35/baseline/ICU_baseline.csv', 'results/gpt35/baseline/LOS_baseline.csv', 'results/gpt35/baseline/Readmin_baseline.csv']
# # df_list = ['results/gpt35/cross_task/ICU_cross_task.csv', 'results/gpt35/cross_task/LOS_cross_task.csv', 'results/gpt35/cross_task/Readmin_cross_task.csv']


# df_name = df_list[2]
# df_pred = pd.read_csv(df_name)
# gt_list = df_pred['gt']
# df_pred_baseline = df_pred.loc[:, ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']]

# if df_name == 'results/gpt35/baseline/LOS_baseline.csv':
#     df_pred_baseline.replace({'Yes': 0, 'No': 1, 'No, the patient will not be discharged within 7 days.':1}, inplace=True)
# else:
#     df_pred_baseline.replace({'Yes': 1, 'No': 0, 'No, the patient will not be readmitted in 30 days.':0, 'No.':0, 
#                               'No, based on the information provided, it is not possible to determine whether the patient will be transferred to the ICU on the same admission date.':0}, inplace=True)

In [125]:
import os

# eval_dir = "results/gpt35/baseline/"
# eval_dir = "results/gpt4/baseline/"
# eval_dir = 'results/gpt4/baseline_labtest/'
# eval_dir = 'results/gpt4/cross_task_labtest/'
# eval_dir = 'results/gpt35/baseline_labtest/'
# eval_dir = 'results/gpt35/cross_task_labtest/'
# eval_dir = "results/gpt4_new/cross_task_labtest_reformulated/"
# eval_dir = "results/gpt35_new/cross_task_labtest_reformulated/"
# eval_dir = "results/gpt35_new/cross_task_general_operation_reformulated/"
# eval_dir = "results/gpt4_new/cross_task_general_operation_reformulated/"
# eval_dir = "results/gpt35/baseline_new_diagnose/"
# eval_dir = "results/gpt4/baseline_new_diagnose/"
eval_dir = "results/gpt4/baseline_new_diagnose_v2/"
# eval_dir = "results/gpt35_new/cross_task_new_diagnose_reformulated/"
# eval_dir = "results/gpt4_new/cross_task_new_diagnose_reformulated/"

# eval_dir = "results/gpt35_new/cross_task_new_diagnose_blood_reformulated/"
# eval_dir = "results/gpt4_new/cross_task_new_diagnose_blood_reformulated/"

# eval_dir = "results/gpt4_new/cross_task_new_diagnose_blood_reformulated_v2/"
f = os.listdir(eval_dir)
f_csv = [j for j in f if '.csv' in j]
f_fullname_csv = [eval_dir + k for k in f_csv]
f_fullname_csv

['results/gpt4/baseline_new_diagnose_v2/value_new_hyperlipidemia.csv',
 'results/gpt4/baseline_new_diagnose_v2/value_new_hypertension.csv',
 'results/gpt4/baseline_new_diagnose_v2/value_new_acutemi.csv']

In [126]:
# os.listdir(eval_dir1), os.listdir(eval_dir2)

In [165]:
uq_list = []
eval_dir1 = "results/gpt35/baseline_new_diagnose_v2/"
eval_dir12 = 'results/gpt35/baseline_new_diagnose/'

eval_dir2 = "results/gpt4/baseline_new_diagnose_v2/"
eval_dir22 = "results/gpt4/baseline_new_diagnose/"

eval_dir3 = "results/gpt35_new/cross_task_new_diagnose_blood_reformulated/"

eval_dir4 = "results/gpt4_new/cross_task_new_diagnose_blood_reformulated/"

pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']

f_list_dir1 = os.listdir(eval_dir1)
f_list_dir2 = os.listdir(eval_dir2)
f_list_dir3 = os.listdir(eval_dir3)
f_list_dir4 = os.listdir(eval_dir4)

tab1 = []
tab2 = []
tab3 = []
tab4 = []
tab5 = []
tab6 = []
print('GPT3.5')
for f in f_list_dir1:
    print(f.split('_')[-1].split('.csv')[0])
    a = calculate_single_uq(eval_dir1, f)
    tab1.append(a)
print('')
print('GPT 4')
for f in f_list_dir2:
    print(f.split('_')[-1].split('.csv')[0])
    a = calculate_single_uq(eval_dir2, f)
    tab2.append(a)
    
print('')
print('GPT3.5-multi-task')
for f in f_list:
    print(f.split('_')[-1].split('.csv')[0])
    a = calculate_single_uq(eval_dir3, f)
    tab3.append(a)
    print('')
print('GPT4-multi-task')
for f in f_list:
    print(f.split('_')[-1].split('.csv')[0])
    a = calculate_single_uq(eval_dir4, f)
    tab4.append(a)
print('')
print('Ensemble')
for f in f_list_dir1:
    print(f.split('_')[-1].split('.csv')[0])
    assert f in f_list_dir2
    a = calculate_ensemble_uq(eval_dir1, eval_dir2, f, f)
    tab5.append(a)
print('')
print('Ensemble-multi-task')
for f in f_list_dir3:
    print(f.split('_')[-1].split('.csv')[0])
    assert f in f_list_dir3
    a = calculate_ensemble_uq(eval_dir3, eval_dir4, f, f)
    tab6.append(a)

GPT3.5
hyperlipidemia
Auc. score: 0.5559, Uncertainty Auc.: 0.4203
hypertension
Auc. score: 0.4762, Uncertainty Auc.: 0.5238
acutemi
Auc. score: 0.5455, Uncertainty Auc.: 0.4545

GPT 4
hyperlipidemia
Auc. score: 0.6289, Uncertainty Auc.: 0.4077
hypertension
Auc. score: 0.7136, Uncertainty Auc.: 0.4488
acutemi
Auc. score: 0.7317, Uncertainty Auc.: 0.3394

GPT3.5-multi-task
Acute Myocardial Infarction
Auc. score: 0.4929, Uncertainty Auc.: 0.6379

Hypertension
Auc. score: 0.5758, Uncertainty Auc.: 0.4707

Hyperlipidemia
Auc. score: 0.5478, Uncertainty Auc.: 0.5032

GPT4-multi-task
Acute Myocardial Infarction
Auc. score: 0.6149, Uncertainty Auc.: 0.3436
Hypertension
Auc. score: 0.7136, Uncertainty Auc.: 0.4222
Hyperlipidemia
Auc. score: 0.6845, Uncertainty Auc.: 0.4127

Ensemble
hyperlipidemia
Auc. score: 0.6403, Uncertainty Auc.: 0.6053
hypertension
Auc. score: 0.692, Uncertainty Auc.: 0.5606
acutemi
Auc. score: 0.7109, Uncertainty Auc.: 0.4014

Ensemble-multi-task
Acute Myocardial Infarc

In [166]:
tab1 = pd.DataFrame(tab1).loc[[1, 0, 2]].round(4)
tab2 = pd.DataFrame(tab2).loc[[1, 0, 2]].round(4)
tab3 = pd.DataFrame(tab3).loc[[1, 2, 0]].round(4)
tab4 = pd.DataFrame(tab4).loc[[1, 2, 0]].round(4)
tab5 = pd.DataFrame(tab5).loc[[1, 2, 0]].round(4)
tab6 = pd.DataFrame(tab6).loc[[1, 2, 0]].round(4)
tab1 = tab1.reset_index(drop=True)
tab2 = tab2.reset_index(drop=True)
tab3 = tab3.reset_index(drop=True)
tab4 = tab4.reset_index(drop=True)
tab5 = tab5.reset_index(drop=True)
tab6 = tab6.reset_index(drop=True)

In [168]:
df_new_diag = pd.concat([tab1, tab3, tab2, tab4, tab5, tab6], axis = 1).round(4)
df_new_diag = df_new_diag.reset_index(drop = True)
# df_new_diag.to_csv('df_diag_results.csv')

In [169]:
uq_list = []
eval_dir1 = "results/gpt35/baseline_labtest_v2/"
eval_dir2 = "results/gpt4/baseline_labtest_v2/"
eval_dir3 = "results/gpt35_new/cross_task_labtest_reformulated/"
eval_dir4 = "results/gpt4_new/cross_task_labtest_reformulated/"

pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']

f_list_dir1 = os.listdir(eval_dir1)
f_list_dir2 = os.listdir(eval_dir2)
f_list_dir3 = os.listdir(eval_dir3)
f_list_dir4 = os.listdir(eval_dir4)

tab1 = []
tab2 = []
tab3 = []
tab4 = []
tab5 = []
tab6 = []

print('GPT3.5')
for f in f_list_dir1:
    print(f.split('_')[-1].split('.csv')[0])
    assert f in f_list_dir2
    a = calculate_single_uq(eval_dir1, f)
    tab1.append(a)
print('')
print('GPT 4')
for f in f_list_dir2:
    print(f.split('_')[-1].split('.csv')[0])
    assert f in f_list_dir1
    a = calculate_single_uq(eval_dir2, f)
    tab2.append(a)
print("")
print('GPT3.5-multi-task')
for f in f_list_dir3:
    print(f.split('_')[-1].split('.csv')[0])
#     assert f in f_list_dir4
    a = calculate_single_uq(eval_dir3, f)
    tab3.append(a)
print("")
print('GPT4-multi-task')
for f in f_list_dir4:
    print(f.split('_')[-1].split('.csv')[0])
#     assert f in f_list_dir4
    a = calculate_single_uq(eval_dir4, f)
    tab4.append(a)
print('')
print('Ensemble')
for f in f_list_dir1:
    print(f.split('_')[-1].split('.csv')[0])
#     assert f in f_list_dir2
    a = calculate_ensemble_uq(eval_dir1, eval_dir2, f, f)
    tab5.append(a)
print('')
print('Ensemble-multi-task')
for f in f_list_dir3:
    print(f.split('_')[-1].split('.csv')[0])
#     assert f in f_list_dir2
    a = calculate_ensemble_uq(eval_dir3, eval_dir4, f, f)
    tab6.append(a)

GPT3.5
thrombocytopenia
Auc. score: 0.5327, Uncertainty Auc.: 0.446
anemia
Auc. score: 0.4433, Uncertainty Auc.: 0.61
hyperkalemia
Auc. score: 0.4795, Uncertainty Auc.: 0.4821
hyponatremia
Auc. score: 0.4593, Uncertainty Auc.: 0.522
hypoglycemia
Auc. score: 0.5404, Uncertainty Auc.: 0.441

GPT 4
thrombocytopenia
Auc. score: 0.2917, Uncertainty Auc.: 0.5548
anemia
Auc. score: 0.1962, Uncertainty Auc.: 0.6877
hyperkalemia
Auc. score: 0.2508, Uncertainty Auc.: 0.5094
hyponatremia
Auc. score: 0.2652, Uncertainty Auc.: 0.5347
hypoglycemia
Auc. score: 0.7131, Uncertainty Auc.: 0.4388

GPT3.5-multi-task
thrombocytopenia
Auc. score: 0.3745, Uncertainty Auc.: 0.5464
anemia
Auc. score: 0.2739, Uncertainty Auc.: 0.7254
hyperkalemia
Auc. score: 0.3673, Uncertainty Auc.: 0.5988
hyponatremia
Auc. score: 0.3189, Uncertainty Auc.: 0.6303
hypoglycemia
Auc. score: 0.5416, Uncertainty Auc.: 0.4352

GPT4-multi-task
thrombocytopenia
Auc. score: 0.2062, Uncertainty Auc.: 0.5246
anemia
Auc. score: 0.2173, Un

In [170]:
tab1 = pd.DataFrame(tab1).loc[[0, 2, 4, 3, 1]].round(4)
tab2 = pd.DataFrame(tab2).loc[[0, 2, 4, 3, 1]].round(4)
tab3 = pd.DataFrame(tab3).loc[[0, 2, 4, 3, 1]].round(4)
tab4 = pd.DataFrame(tab4).loc[[0, 2, 4, 3, 1]].round(4)
tab5 = pd.DataFrame(tab5).loc[[0, 2, 4, 3, 1]].round(4)
tab6 = pd.DataFrame(tab6).loc[[0, 2, 4, 3, 1]].round(4)

tab1 = tab1.reset_index(drop=True)
tab2 = tab2.reset_index(drop=True)
tab3 = tab3.reset_index(drop=True)
tab4 = tab4.reset_index(drop=True)
tab5 = tab5.reset_index(drop=True)
tab6 = tab6.reset_index(drop=True)

In [172]:
df_lab = pd.concat([tab1, tab3, tab2, tab4, tab5, tab6], axis = 1).round(4)
df_lab = df_lab.reset_index(drop = True)
df_lab.to_csv('df_lab_result.csv')

In [190]:
uq_list = []
eval_dir1 = "results/gpt35/baseline_general_operation/"
eval_dir2 = "results/gpt4/baseline_general_operation/"
eval_dir3 = "results/gpt35_new/cross_task_general_operation_reformulated/"
eval_dir4 = "results/gpt4_new//cross_task_general_operation_reformulated/"

pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']

tab1 = []
tab2 = []
tab3 = []
tab4 = []
tab5 = []
tab6 = []

f_list_dir1 = os.listdir(eval_dir1)
f_list_dir2 = os.listdir(eval_dir2)
f_list_dir3 = os.listdir(eval_dir3)
f_list_dir4 = os.listdir(eval_dir4)

print('GPT3.5')
for f in f_list_dir1:
    print(f.split('_')[0].split('.csv')[0])
    assert f in f_list_dir2
    a = calculate_single_uq(eval_dir1, f)
    tab1.append(a)
print('')
print('GPT 4')
for f in f_list_dir2:
    print(f.split('_')[0].split('.csv')[0])
    assert f in f_list_dir1
    a = calculate_single_uq(eval_dir2, f)
    tab2.append(a)
print('')
print('GPT3.5-multi-task')
for f in f_list_dir3:
    print(f.split('_')[0].split('.csv')[0])
#     assert f in f_list_dir1
    a = calculate_single_uq(eval_dir3, f)
    tab3.append(a)
print('')
print('GPT4-multi-task')
for f in f_list_dir4:
    print(f.split('_')[0].split('.csv')[0])
#     assert f in f_list_dir1
    a = calculate_single_uq(eval_dir4, f)
    tab4.append(a)
print('')
print('Ensemble')
for f in f_list_dir1:
    print(f.split('_')[0].split('.csv')[0])
    assert f in f_list_dir2
    a = calculate_ensemble_uq(eval_dir1, eval_dir2, f, f)
    tab5.append(a)
print('')
print('Ensemble-multi-task')
for f in f_list_dir3:
    print(f.split('_')[0].split('.csv')[0])
#     assert f in f_list_dir
    a = calculate_ensemble_uq(eval_dir3, eval_dir4, f, f)
    tab6.append(a)

GPT3.5
ICU
Auc. score: 0.5047, Uncertainty Auc.: 0.5331
Readmin
Auc. score: 0.5408, Uncertainty Auc.: 0.4592
LOS
Auc. score: 0.543, Uncertainty Auc.: 0.457

GPT 4
ICU
Auc. score: 0.5938, Uncertainty Auc.: 0.514
Readmin
Auc. score: 0.5024, Uncertainty Auc.: 0.4967
LOS
Auc. score: 0.3614, Uncertainty Auc.: 0.4992

GPT3.5-multi-task
ICU transfer
Auc. score: 0.6853, Uncertainty Auc.: 0.3596
Long length of stay
Auc. score: 0.6153, Uncertainty Auc.: 0.4265
Readmission
Auc. score: 0.4327, Uncertainty Auc.: 0.4633

GPT4-multi-task
ICU transfer
Auc. score: 0.7083, Uncertainty Auc.: 0.4888
Long length of stay
Auc. score: 0.5125, Uncertainty Auc.: 0.4875
Readmission
Auc. score: 0.536, Uncertainty Auc.: 0.6566

Ensemble
ICU
Auc. score: 0.5538, Uncertainty Auc.: 0.6455
Readmin
Auc. score: 0.5461, Uncertainty Auc.: 0.8385
LOS
Auc. score: 0.4126, Uncertainty Auc.: 0.5671

Ensemble-multi-task
ICU transfer
Auc. score: 0.7552, Uncertainty Auc.: 0.4831
Long length of stay
Auc. score: 0.6166, Uncertainty 

In [195]:
pd.DataFrame(tab6).loc[[1, 0, 2]]

Unnamed: 0,0,1
1,0.616562,0.723684
0,0.755208,0.483073
2,0.472826,0.850055


In [196]:
tab1 = pd.DataFrame(tab1).loc[[2, 0, 1]].round(4)
tab2 = pd.DataFrame(tab2).loc[[2, 0, 1]].round(4)
tab3 = pd.DataFrame(tab3).loc[[1, 0, 2]].round(4)
tab4 = pd.DataFrame(tab4).loc[[1, 0, 2]].round(4)
tab5 = pd.DataFrame(tab5).loc[[1, 0, 2]].round(4)
tab6 = pd.DataFrame(tab6).loc[[1, 0, 2]].round(4)
df_general = pd.concat([tab1, tab3, tab2, tab4, tab5, tab6], axis = 1).round(4)
df_general = df_general.reset_index(drop = True)
df_general.to_csv('df_general_result.csv')

In [197]:
df_general

Unnamed: 0,0,1,0.1,1.1,0.2,1.2,0.3,1.3,0.4,1.4,0.5,1.5
0,0.543,0.457,0.4327,0.4633,0.3614,0.4992,0.536,0.6566,0.4126,0.5671,0.4728,0.8501
1,0.5047,0.5331,0.6853,0.3596,0.5938,0.514,0.7083,0.4888,0.5538,0.6455,0.7552,0.4831
2,0.5408,0.4592,0.6153,0.4265,0.5024,0.4967,0.5125,0.4875,0.5461,0.8385,0.6166,0.7237


In [21]:
uq_list = []
eval_dir1 = "results/gpt35/baseline_new_diagnose_v2/"
eval_dir12 = 'results/gpt35/baseline_new_diagnose_v1/'

eval_dir2 = "results/gpt4/baseline_new_diagnose_v2/"
eval_dir22 = "results/gpt4/baseline_new_diagnose_v1/"

pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']

f_list_dir1 = os.listdir(eval_dir1)
f_list_dir2 = os.listdir(eval_dir2)

print('GPT3.5')
for f in f_list_dir1:
    print(f.split('_')[-1].split('.csv')[0])
    assert f in f_list_dir2
    calculate_ensemble_uq(eval_dir1, eval_dir12, f, f)
print('')
print('GPT 4')
for f in f_list_dir2:
    print(f.split('_')[-1].split('.csv')[0])
    assert f in f_list_dir1
    calculate_ensemble_uq(eval_dir2, eval_dir22, f, f)

print('')
print('Ensemble')
for f in f_list_dir1:
    print(f.split('_')[-1].split('.csv')[0])
    assert f in f_list_dir2
    calculate_ensemble_uq(eval_dir1, eval_dir2, f, f)

GPT3.5
hyperlipidemia
Auc. score: 0.6076, Uncertainty Auc.: 0.3924
hypertension
Auc. score: 0.4594, Uncertainty Auc.: 0.5406
acutemi
Auc. score: 0.5208, Uncertainty Auc.: 0.4792

GPT 4
hyperlipidemia
Auc. score: 0.6367, Uncertainty Auc.: 0.4407
hypertension
Auc. score: 0.693, Uncertainty Auc.: 0.4821
acutemi
Auc. score: 0.7356, Uncertainty Auc.: 0.3602

Ensemble
hyperlipidemia
Auc. score: 0.6403, Uncertainty Auc.: 0.6053
hypertension
Auc. score: 0.692, Uncertainty Auc.: 0.5606
acutemi
Auc. score: 0.7109, Uncertainty Auc.: 0.4014


In [22]:
eval_dir1 = "results/gpt35/baseline_new_diagnose_v2/"
eval_dir12 = 'results/gpt35/baseline_new_diagnose_v1/'

eval_dir2 = "results/gpt4/baseline_new_diagnose_v2/"
eval_dir22 = "results/gpt4/baseline_new_diagnose_v1/"

pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']

f_list_dir1 = os.listdir(eval_dir1)
f_list_dir2 = os.listdir(eval_dir2)

print('GPT3.5')
for f in f_list_dir1:
    print(f.split('_')[-1].split('.csv')[0])
    assert f in f_list_dir2
    calculate_ensemble_uq(eval_dir1, eval_dir12, f, f)
print('')
print('GPT 4')
for f in f_list_dir2:
    print(f.split('_')[-1].split('.csv')[0])
    assert f in f_list_dir1
    calculate_ensemble_uq(eval_dir2, eval_dir22, f, f)

print('')
print('Ensemble')
for f in f_list_dir1:
    print(f.split('_')[-1].split('.csv')[0])
    assert f in f_list_dir2
    calculate_ensemble_uq(eval_dir1, eval_dir2, f, f)

GPT3.5
hyperlipidemia
Auc. score: 0.6076, Uncertainty Auc.: 0.3924
hypertension
Auc. score: 0.4594, Uncertainty Auc.: 0.5406
acutemi
Auc. score: 0.5208, Uncertainty Auc.: 0.4792

GPT 4
hyperlipidemia
Auc. score: 0.6367, Uncertainty Auc.: 0.4407
hypertension
Auc. score: 0.693, Uncertainty Auc.: 0.4821
acutemi
Auc. score: 0.7356, Uncertainty Auc.: 0.3602

Ensemble
hyperlipidemia
Auc. score: 0.6403, Uncertainty Auc.: 0.6053
hypertension
Auc. score: 0.692, Uncertainty Auc.: 0.5606
acutemi
Auc. score: 0.7109, Uncertainty Auc.: 0.4014


In [87]:
calculate_auc(gt_list, pred_list)

0.7108753315649868

In [56]:
pred

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_1.1,pred_2.1,pred_3.1,pred_4.1,pred_5.1
0,0,0,1,0,0,1,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0


In [10]:
df_name = f_fullname_csv[0]
df_pred = pd.read_csv(df_name)
gt_list = df_pred['gt']
df_pred_baseline = df_pred.loc[:, ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']]

In [11]:
df_pred_baseline

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
0,Yes,Yes,No,Yes,Yes
1,No,No,No,No,No
2,No,Yes,No,No,Yes
3,Yes,Yes,Yes,Yes,Yes
4,Yes,Yes,Yes,Yes,Yes
...,...,...,...,...,...
95,No,No,No,No,No
96,No,No,No,No,No
97,No,No,No,No,No
98,No,No,No,No,No


In [10]:
eval_dir1 = "results/gpt4/baseline_new_diagnose/"

eval_dir2 = "results/gpt35/baseline_new_diagnose/"

f1 = os.listdir(eval_dir1)
f_csv1 = [j for j in f1 if '.csv' in j]
f_fullname_csv1 = [eval_dir1 + k for k in f_csv1]

f2 = os.listdir(eval_dir1)
f_csv2 = [j for j in f2 if '.csv' in j]
f_fullname_csv2 = [eval_dir2 + k for k in f_csv2]



['results/gpt35/baseline_new_diagnose/new_celiac.csv',
 'results/gpt35/baseline_new_diagnose/new_acutemi.csv',
 'results/gpt35/baseline_new_diagnose/new_lupus.csv',
 'results/gpt35/baseline_new_diagnose/new_pancan.csv',
 'results/gpt35/baseline_new_diagnose/new_hypertension.csv',
 'results/gpt35/baseline_new_diagnose/new_hyperlipidemia.csv']

In [55]:
# print('gpt3.5 baseline')
# evaluate_acc_uq(df_list= ['results/gpt35/baseline/ICU_baseline.csv', 'results/gpt35/baseline/LOS_baseline.csv', 'results/gpt35/baseline/Readmin_baseline.csv'])
# print('gpt3.5 cross task')
# evaluate_acc_uq(df_list= ['results/gpt35/cross_task/ICU_cross_task.csv', 'results/gpt35/cross_task/LOS_cross_task.csv', 'results/gpt35/cross_task/Readmin_cross_task.csv'])

# evaluate_acc_uq(df_list= ['results/gpt35/baseline2/ICU_baseline.csv', 'results/gpt35/baseline2/LOS_baseline.csv', 'results/gpt35/baseline/Readmin_baseline.csv'])
evaluate_acc_uq(f_fullname_csv)


results/gpt4/baseline_new_diagnose_v2/value_new_hypertension.csv
0.594 0.022
0.664 0.024
0.43 0.0

results/gpt4/baseline_new_diagnose_v2/value_new_acutemi.csv
0.7 0.023
0.664 0.034
0.432 0.0

results/gpt4/baseline_new_diagnose_v2/value_new_hyperlipidemia.csv
0.62 0.011
0.594 0.011
0.432 0.0



  return df.applymap(replace_value)
  return df.applymap(replace_value)
  return df.applymap(replace_value)


In [None]:
print('gpt3.5 baseline')
evaluate_acc_uq(df_list= ['results/gpt35/baseline/ICU_baseline.csv', 'results/gpt35/baseline/LOS_baseline.csv', 'results/gpt35/baseline/Readmin_baseline.csv'])

evaluate_acc_uq(df_list= ['results/gpt35/cross_task/ICU_cross_task.csv', 'results/gpt35/cross_task/LOS_cross_task.csv', 'results/gpt35/cross_task/Readmin_cross_task.csv'])

In [45]:
df_list = ['results/gpt35/cross_task/ICU_cross_task.csv', 'results/gpt35/cross_task/LOS_cross_task.csv', 'results/gpt35/cross_task/Readmin_cross_task.csv']
for k in range(3):
    df_name = df_list[k]
    df_pred = pd.read_csv(df_name)
    gt_list = df_pred['gt']
    df_pred_baseline = df_pred.loc[:, ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']]

    if df_name == 'results/gpt35/baseline/LOS_baseline.csv':
        df_pred_baseline.replace({'Yes': 0, 'No': 1, 'No, the patient will not be discharged within 7 days.':1}, inplace=True)
    else:
        df_pred_baseline.replace({'Yes': 1, 'No': 0, 'No, the patient will not be readmitted in 30 days.':0, 'No.':0, 
                                'No, based on the information provided, it is not possible to determine whether the patient will be transferred to the ICU on the same admission date.':0}, inplace=True)

    uq_metric_baseline = calculate_row_entropy(df_pred_baseline).values
    uq_metric_baseline = np.round(uq_metric_baseline, 10) + 0.0000000001


    auc_list = []
    acc_list = []
    for c in df_pred_baseline.columns:
        p = df_pred_baseline[c].values
        auc = calculate_auc(df_pred['gt'].astype(int), df_pred_baseline[c])
        acc = np.mean(df_pred['gt'].astype(int) == df_pred_baseline[c])
        auc_list.append(auc)
        acc_list.append(acc)
    print(np.mean(acc_list), np.std(acc_list))
    print(np.mean(auc_list), np.std(auc_list))

    pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']
    pred_list = ['pred_5']
    uq_list = []
    for p in pred_list: 
        uq_auc = calculate_auc(df_pred['gt'].astype(int) == df_pred_baseline[p].astype(int), uq_metric_baseline)
        uq_list.append(uq_auc)
        # print(uq_auc)
    print(np.mean(uq_list), np.std(uq_list))
    print('')

0.58 0.06196773353931864
0.49749999999999994 0.054857770643729216
0.60125 0.0

0.6759999999999999 0.030724582991474406
0.5129499766245909 0.03369290952600669
0.5752380952380952 0.0

0.688 0.029257477676655565
0.49674599320058277 0.033386100704494735
0.5857843137254902 0.0



In [46]:
df_list = ['results/gpt35/baseline/ICU_baseline.csv', 'results/gpt35/baseline/LOS_baseline.csv', 'results/gpt35/baseline/Readmin_baseline.csv']

for k in range(3):
    df_name = df_list[k]
    df_pred = pd.read_csv(df_name)
    gt_list = df_pred['gt']
    df_pred_baseline = df_pred.loc[:, ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']]

    if df_name == 'results/gpt35/baseline/LOS_baseline.csv':
        df_pred_baseline.replace({'Yes': 0, 'No': 1, 'No, the patient will not be discharged within 7 days.':1}, inplace=True)
    else:
        df_pred_baseline.replace({'Yes': 1, 'No': 0, 'No, the patient will not be readmitted in 30 days.':0, 'No.':0, 
                                'No, based on the information provided, it is not possible to determine whether the patient will be transferred to the ICU on the same admission date.':0}, inplace=True)

    uq_metric_baseline = calculate_row_entropy(df_pred_baseline).values
    uq_metric_baseline = np.round(uq_metric_baseline, 10) + 0.0000000001


    auc_list = []
    acc_list = []
    for c in df_pred_baseline.columns:
        p = df_pred_baseline[c].values
        auc = calculate_auc(df_pred['gt'].astype(int), df_pred_baseline[c])
        acc = np.mean(df_pred['gt'].astype(int) == df_pred_baseline[c])
        auc_list.append(auc)
        acc_list.append(acc)
    print(np.mean(acc_list), np.std(acc_list))
    print(np.mean(auc_list), np.std(auc_list))

    pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']
    pred_list = ['pred_5']
    uq_list = []
    for p in pred_list: 
        uq_auc = calculate_auc(df_pred['gt'].astype(int) == df_pred_baseline[p].astype(int), uq_metric_baseline)
        uq_list.append(uq_auc)
        # print(uq_auc)
    print(np.mean(uq_list), np.std(uq_list))
    print('')

0.41800000000000004 0.04621688003316537
0.50125 0.06304760106459247
0.41339396444811904 0.0

0.31799999999999995 0.019390719429665314
0.4898083216456288 0.023819346854945612
0.49805730937348225 0.0

0.698 0.024819347291981687
0.5099077221952404 0.025904797942854055
0.5040588533739219 0.0



In [47]:
df_list = ['results/gpt4/baseline/ICU_baseline.csv', 'results/gpt4/baseline/LOS_baseline.csv', 'results/gpt4/baseline/Readmin_baseline.csv']
for k in range(3):
    df_name = df_list[k]
    df_pred = pd.read_csv(df_name)
    gt_list = df_pred['gt']
    df_pred_baseline = df_pred.loc[:, ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']]

    if df_name == 'results/gpt35/baseline/LOS_baseline.csv':
        df_pred_baseline.replace({'Yes': 0, 'No': 1, 'No, the patient will not be discharged within 7 days.':1}, inplace=True)
    else:
        df_pred_baseline.replace({'Yes': 1, 'No': 0, 'No, the patient will not be readmitted in 30 days.':0, 'No.':0, 
                                'No, based on the information provided, it is not possible to determine whether the patient will be transferred to the ICU on the same admission date.':0}, inplace=True)

    uq_metric_baseline = calculate_row_entropy(df_pred_baseline).values
    uq_metric_baseline = np.round(uq_metric_baseline, 10) + 0.0000000001


    auc_list = []
    acc_list = []
    for c in df_pred_baseline.columns:
        p = df_pred_baseline[c].values
        auc = calculate_auc(df_pred['gt'].astype(int), df_pred_baseline[c])
        acc = np.mean(df_pred['gt'].astype(int) == df_pred_baseline[c])
        auc_list.append(auc)
        acc_list.append(acc)
    print(np.mean(acc_list), np.std(acc_list))
    print(np.mean(auc_list), np.std(auc_list))

    pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']
    pred_list = ['pred_5']
    uq_list = []
    for p in pred_list: 
        uq_auc = calculate_auc(df_pred['gt'].astype(int) == df_pred_baseline[p].astype(int), uq_metric_baseline)
        uq_list.append(uq_auc)
        # print(uq_auc)
    print(np.mean(uq_list), np.std(uq_list))
    print('')

0.306 0.01200000000000001
0.5662499999999999 0.007499999999999973
0.44370404411764697 0.0

0.5479999999999999 0.018330302779823324
0.40598410472183255 0.02124483926448054
0.4869441044471644 0.0

0.35200000000000004 0.019390719429665308
0.5069451189898009 0.014140165686196084
0.5420496323529412 0.0



In [25]:
df_list = ['results/gpt4/cross_task/ICU_cross_task.csv', 'results/gpt4/cross_task/LOS_cross_task.csv', 'results/gpt4/cross_task/Readmin_cross_task.csv']
for k in range(3):
    df_name = df_list[k]
    df_pred = pd.read_csv(df_name)
    gt_list = df_pred['gt']
    df_pred_baseline = df_pred.loc[:, ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']]

    if df_name == 'results/gpt35/baseline/LOS_baseline.csv':
        df_pred_baseline = replace_yes_no_v2(df_pred_baseline)
        df_pred_baseline.replace({'Yes': 0, 'No': 1, 'No, the patient will not be discharged within 7 days.':1}, inplace=True)
        
    else:
        df_pred_baseline = replace_yes_no(df_pred_baseline)
        df_pred_baseline.replace({'Yes': 1, 'No': 0, 'No, the patient will not be readmitted in 30 days.':0, 'No.':0, 
                                'No, based on the information provided, it is not possible to determine whether the patient will be transferred to the ICU on the same admission date.':0}, inplace=True)

    uq_metric_baseline = calculate_row_entropy(df_pred_baseline).values
    uq_metric_baseline = np.round(uq_metric_baseline, 10) + 0.0000000001


    auc_list = []
    acc_list = []
    for c in df_pred_baseline.columns:
        p = df_pred_baseline[c].values
        auc = calculate_auc(df_pred['gt'].astype(int), df_pred_baseline[c])
        acc = np.mean(df_pred['gt'].astype(int) == df_pred_baseline[c])
        auc_list.append(auc)
        acc_list.append(acc)
    print(np.mean(acc_list), np.std(acc_list))
    print(np.mean(auc_list), np.std(auc_list))

    pred_list = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']
    pred_list = ['pred_1']
    uq_list = []
    for p in pred_list: 
        uq_auc = calculate_auc(df_pred['gt'].astype(int) == df_pred_baseline[p].astype(int), uq_metric_baseline)
        uq_list.append(uq_auc)
        # print(uq_auc)
    print(np.mean(uq_list), np.std(uq_list))
    print('')

0.404 0.022449944320643647
0.5225 0.01837117307087385
0.4800509337860781 0.0

0.434 0.0349857113690718
0.37489481065918656 0.03466042724909326
0.5341050020669699 0.0

0.36200000000000004 0.028565713714171402
0.47115104419621173 0.03137249883769452
0.6530135823429541 0.0



  return df.applymap(replace_value)
  return df.applymap(replace_value)
  return df.applymap(replace_value)


In [20]:
df_pred_baseline

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,No. The patient will not be discharged within ...,0,0,No. The patient is 85 years old and has underg...,0
4,0,0,0,0,0
...,...,...,...,...,...
95,0,1,1,1,1
96,1,1,0,1,1
97,0,0,0,0,1
98,1,1,1,1,1
