In [1]:
from torch_uncertainty.metrics.classification import BrierScore, CategoricalNLL
from torch_uncertainty.metrics.classification.adaptive_calibration_error import BinaryAdaptiveCalibrationError

from typing import List, Tuple
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [2]:
# Locations of the uncertainty-quantification artifacts, relative to this notebook.
# NOTE(review): `results_dir` is not read by any cell visible here — confirm it is still needed.
results_dir = "../uncertainty_quantification/single_model_results/"
# Directory holding one `<task>_results.csv` file per task (see `read_results`).
single_task_results_dir = "../uncertainty_quantification/single_task_results/"
# Text file with one task (labeling function) name per line.
labeling_functions_path = "../uncertainty_quantification/labeling_function_names.txt"
def read_labeling_functions(path: str) -> List[str]:
    """Read task (labeling function) names from a text file, one name per line.

    Blank and whitespace-only lines are skipped so that a trailing empty line
    in the file does not produce an empty task name (which would later be
    turned into the bogus file name ``'_results.csv'``).

    Args:
        path: Path of the text file to read.

    Returns:
        The non-blank lines of the file, in file order, with the trailing
        newline removed. Other whitespace inside a name is left untouched.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, 'r') as f:
        names = (line.rstrip('\n') for line in f)
        # Materialise the list while the file is still open.
        return [name for name in names if name.strip()]

def convert_string_to_probs(s):
    """Parse a bracketed, whitespace-separated number string into a list of floats.

    Leading/trailing ``[`` and ``]`` characters are stripped, the remainder is
    split on whitespace, and every token is converted with ``float``.
    For example ``'[0.25 0.75]'`` becomes ``[0.25, 0.75]``.

    Raises:
        ValueError: If any token is not a valid float literal.
    """
    tokens = s.strip('[]').split()
    return list(map(float, tokens))

def read_results(results_dir: str, task_name: str):
    """Load the per-task results CSV and split it into its four columns.

    The file ``<results_dir>/<task_name>_results.csv`` is read with pandas and
    its path is printed as a progress trace.

    Args:
        results_dir: Directory containing the results CSV files.
        task_name: Task name used as the file-name prefix.

    Returns:
        A tuple ``(gt, pred, predicted_prob, binary_prob)`` taken from the
        columns named ``'0'``, ``'1'``, ``'2'`` and ``'3'`` respectively. The
        first three are the raw column value arrays; the last is a list where
        each cell of column ``'3'`` has been parsed by
        ``convert_string_to_probs``.
        NOTE(review): the meaning of each column is inferred from the variable
        names only — confirm against the script that writes these CSVs.
    """
    csv_path = os.path.join(results_dir, task_name + '_results.csv')
    print(csv_path)
    table = pd.read_csv(csv_path)
    gt, pred, predicted_prob = (table[column].values for column in ('0', '1', '2'))
    binary_prob = list(map(convert_string_to_probs, table['3'].values))
    return gt, pred, predicted_prob, binary_prob

def calcualte_brier_score(gt, binary_prob):
    """Compute the two-class Brier score, rounded to three decimals.

    Args:
        gt: Ground-truth labels; converted with ``torch.tensor``.
        binary_prob: Per-sample class probabilities; converted with
            ``torch.tensor``. Presumably shaped (n_samples, 2) — TODO confirm.

    Returns:
        The value produced by ``BrierScore(num_classes=2, top_class=False)``
        after a single ``update``, rounded with ``np.round(..., 3)``.
    """
    scorer = BrierScore(num_classes=2, top_class=False)
    probs = torch.tensor(binary_prob)
    targets = torch.tensor(gt)
    scorer.update(probs, targets)
    return np.round(scorer.compute().item(), 3)

# Categorical negative log-likelihood of the predicted class probabilities.
def calcualte_categorical_nll(gt, binary_prob):
    """Compute the categorical NLL metric, rounded to three decimals.

    Args:
        gt: Ground-truth labels; converted with ``torch.tensor``.
        binary_prob: Per-sample class probabilities; converted with
            ``torch.tensor``. Presumably shaped (n_samples, 2) — TODO confirm.

    Returns:
        The value produced by ``CategoricalNLL()`` after a single ``update``,
        rounded with ``np.round(..., 3)``.
    """
    nll_metric = CategoricalNLL()
    probs = torch.tensor(binary_prob)
    targets = torch.tensor(gt)
    nll_metric.update(probs, targets)
    return np.round(nll_metric.compute().item(), 3)

def calculate_calibration_error(gt, pred):
    metric_calibration = BinaryAdaptiveCalibrationError(n_bins=10, norm='l1')
    metric_calibration.update(torch.tensor(predicted_prob), torch.tensor(gt))
    calibration_error = metric_calibration.compute()
    return np.round(calibration_error.item(), 3)

In [3]:
# def read_results(path: str) -> pd.DataFrame:

In [4]:
labeling_functions = read_labeling_functions(path = labeling_functions_path)
brier_score_list = []
categorical_nll_list = []
calibration_error_list = []
task_name_list = []

for i in range(len(labeling_functions)):

    task_name = labeling_functions[i]

    gt, pred, predicted_prob, binary_prob = read_results(single_task_results_dir, task_name)

    brier_score = calcualte_brier_score(gt, binary_prob)

    categorical_nll = calcualte_categorical_nll(gt, binary_prob)

    calibration_error = calculate_calibration_error(gt, pred)

    brier_score_list.append(brier_score)

    categorical_nll_list.append(calibration_error)

    calibration_error_list.append(calibration_error)



../uncertainty_quantification/single_task_results/guo_los_results.csv
../uncertainty_quantification/single_task_results/guo_readmission_results.csv
../uncertainty_quantification/single_task_results/guo_icu_results.csv
../uncertainty_quantification/single_task_results/new_hypertension_results.csv
../uncertainty_quantification/single_task_results/new_hyperlipidemia_results.csv
../uncertainty_quantification/single_task_results/new_pancan_results.csv
../uncertainty_quantification/single_task_results/new_celiac_results.csv
../uncertainty_quantification/single_task_results/new_lupus_results.csv
../uncertainty_quantification/single_task_results/new_acutemi_results.csv
../uncertainty_quantification/single_task_results/lab_thrombocytopenia_results.csv
../uncertainty_quantification/single_task_results/lab_hyperkalemia_results.csv
../uncertainty_quantification/single_task_results/lab_hyponatremia_results.csv
../uncertainty_quantification/single_task_results/lab_anemia_results.csv
../uncertainty_q

In [7]:
# Inspect `binary_prob`, which is assigned inside the per-task loop, so this
# shows whatever task was read last before this cell ran.
# NOTE(review): this echoes the entire list; consider a slice such as
# `binary_prob[:5]`, or remove this debugging cell before sharing.
binary_prob

[[0.568033, 0.43196702],
 [0.49982902, 0.500171],
 [0.842721, 0.15727907],
 [0.81384706, 0.186153],
 [0.49334276, 0.50665724],
 [0.70217156, 0.29782844],
 [0.26723117, 0.7327688],
 [0.73102444, 0.2689756],
 [0.87041926, 0.1295807],
 [0.8852202, 0.1147798],
 [0.94720614, 0.05279393],
 [0.7882363, 0.21176368],
 [0.9297165, 0.07028349],
 [0.99153477, 0.0084653],
 [0.9755712, 0.02442885],
 [0.9328172, 0.06718276],
 [0.96783584, 0.03216419],
 [0.97038704, 0.02961298],
 [0.94173586, 0.05826418],
 [0.9674499, 0.03255006],
 [0.98517215, 0.01482782],
 [0.971199, 0.02880106],
 [0.91164094, 0.08835906],
 [0.9111477, 0.08885234],
 [0.9517331, 0.04826694],
 [0.9747395, 0.02526052],
 [0.99112695, 0.0088731],
 [0.9918657, 0.00813428],
 [0.9853172, 0.01468283],
 [0.98135144, 0.01864856],
 [0.9452896, 0.05471032],
 [0.9738921, 0.02610785],
 [0.97585243, 0.02414755],
 [0.9316923, 0.06830771],
 [0.95470333, 0.04529666],
 [0.92986315, 0.07013689],
 [0.8014662, 0.19853374],
 [0.8501756, 0.14982435],
 [0.82

In [5]:
# Summary table: one row per task, one column per metric collected by the loop.
pd.DataFrame(
    {
        'Task Name': labeling_functions,
        'Brier Score': brier_score_list,
        'Categorical NLL': categorical_nll_list,
        'Calibration Error': calibration_error_list,
    }
)

Unnamed: 0,Task Name,Brier Score,Categorical NLL,Calibration Error
0,guo_los,0.414,0.547,0.547
1,guo_readmission,0.326,0.619,0.619
2,guo_icu,0.196,0.836,0.836
3,new_hypertension,0.377,0.602,0.602
4,new_hyperlipidemia,0.446,0.622,0.622
5,new_pancan,0.292,0.741,0.741
6,new_celiac,0.237,0.798,0.798
7,new_lupus,0.158,0.838,0.838
8,new_acutemi,0.322,0.638,0.638
9,lab_thrombocytopenia,0.307,0.439,0.439
