In [1]:
'''Use multiple rounds to get a more robust results'''
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, balanced_accuracy_score
import matplotlib.pyplot as plt
import torchvision
import shutil
import os

In [20]:
def _as_index(values, name):
    '''Cast a column to non-negative integer indices; reject missing or negative entries.'''
    arr = np.asarray(values)
    if pd.isna(arr).any():
        raise ValueError(f"column '{name}' contains missing values")
    idx = arr.astype(int)
    # A negative value would silently wrap around to the last rows/columns of
    # the count matrices, so refuse it instead of producing wrong metrics.
    if (idx < 0).any():
        raise ValueError(f"column '{name}' contains negative values")
    return idx


def _count_matrix(row_idx, col_idx, shape):
    '''Return a float array of `shape` whose cell (r, c) counts the given (r, c) pairs.'''
    counts = np.zeros(shape)
    np.add.at(counts, (row_idx, col_idx), 1)  # unbuffered: repeated pairs accumulate
    return counts


def _one_vs_rest_rates(conf_matrix):
    '''Return per-class (TPR, TNR, FPR) arrays from a square confusion matrix (rows = true class).'''
    true_pos = np.diag(conf_matrix)
    actual_pos = conf_matrix.sum(axis=1)
    predicted_pos = conf_matrix.sum(axis=0)
    false_pos = predicted_pos - true_pos
    true_neg = conf_matrix.sum() - actual_pos - predicted_pos + true_pos
    # A class with no positives (or no negatives) has an undefined rate -> NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        tpr = true_pos / actual_pos
        tnr = true_neg / (true_neg + false_pos)
    return tpr, tnr, 1 - tnr


def cal_metrics(csv_path, is_binary=False):
    '''
    Calculate average accuracy, accuracy per skin type, PQD, DPM, EOM, EOpp0,
    EOpp1, EOdd, NAR and (optionally) AUC from a validation-results CSV.

    The CSV must provide the columns 'label', 'prediction' and 'fitzpatrick'
    (plus 'prediction_probability' when is_binary=True). Labels, predictions
    and skin types are used as array indices and must be non-negative integers
    (skin types are expected in [0, 5]; larger values are accommodated).

    EOpp0 / EOpp1 / EOdd compare two groups: skin types {0, 1, 2} versus all
    others. Both groups are evaluated on the same class axis, so a class that
    is absent from one group stays aligned; any rate that is undefined for a
    group propagates as NaN into the corresponding metric.

    Parameters
    ----------
    csv_path : path of the validation-results CSV.
    is_binary : if True, also compute the ROC AUC from
        'prediction_probability' (treated as the confidence of the predicted
        class); otherwise 'AUC' is -1.

    Returns
    -------
    dict with keys 'acc_avg', 'acc_per_type' (array, one entry per skin type
    present, ascending), 'PQD', 'DPM', 'EOM', 'EOpp0', 'EOpp1', 'EOdd',
    'NAR', 'AUC'.

    Raises
    ------
    ValueError : empty input, or missing / negative values in an index column.
    '''
    df = pd.read_csv(csv_path)
    if df.empty:
        raise ValueError("no rows to evaluate")

    label_raw = df["label"].to_numpy()
    prediction_raw = df["prediction"].to_numpy()
    skin_type_raw = df["fitzpatrick"].to_numpy()

    labels = _as_index(label_raw, "label")
    predictions = _as_index(prediction_raw, "prediction")
    skin_types = _as_index(skin_type_raw, "fitzpatrick")
    is_correct = prediction_raw == label_raw

    # One shared class axis for every matrix below.
    n_classes = int(max(labels.max(), predictions.max())) + 1
    n_types = max(6, int(skin_types.max()) + 1)
    type_indices = np.unique(skin_types)  # sorted skin types actually present
    count_shape = (n_types, n_classes)

    # (skin type x class) counts, restricted to the skin types that occur.
    labels_array = _count_matrix(skin_types, labels, count_shape)[type_indices]
    predictions_array = _count_matrix(skin_types, predictions, count_shape)[type_indices]
    correct_array = _count_matrix(
        skin_types[is_correct], labels[is_correct], count_shape
    )[type_indices]

    # avg acc, acc per type
    acc_array = correct_array.sum(axis=1) / labels_array.sum(axis=1)
    avg_acc = correct_array.sum() / labels_array.sum()

    # PQD: worst-to-best accuracy ratio across skin types
    PQD = acc_array.min() / acc_array.max()

    # DPM: per-class min/max ratio of the prediction distribution across skin types
    demo_array = predictions_array / predictions_array.sum(axis=1, keepdims=True)
    DPM = np.mean(demo_array.min(axis=0) / demo_array.max(axis=0))

    # EOM: per-class min/max ratio of the recall across skin types
    eo_array = correct_array / labels_array
    EOM = np.mean(np.min(eo_array, axis=0) / np.max(eo_array, axis=0))

    # NAR: accuracy range normalised by the mean accuracy
    NAR = (acc_array.max() - acc_array.min()) / acc_array.mean()

    # Two-group split: skin types 0-2 versus the rest.
    in_group0 = np.isin(skin_type_raw, [0, 1, 2])
    conf_shape = (n_classes, n_classes)
    conf_matrix_fitz0 = _count_matrix(labels[in_group0], predictions[in_group0], conf_shape)
    conf_matrix_fitz1 = _count_matrix(labels[~in_group0], predictions[~in_group0], conf_shape)
    tpr0, tnr0, fpr0 = _one_vs_rest_rates(conf_matrix_fitz0)
    tpr1, tnr1, fpr1 = _one_vs_rest_rates(conf_matrix_fitz1)

    EOpp0 = np.sum(np.abs(tnr1 - tnr0))
    EOpp1 = np.sum(np.abs(tpr1 - tpr0))
    EOdd = np.sum(np.abs(tpr1 - tpr0 + fpr1 - fpr0))

    # if is binary classification, output AUC
    if is_binary:
        confidence = df["prediction_probability"].to_numpy()
        # Convert "confidence of the predicted class" into a score for class 1.
        positive_scores = np.where(prediction_raw == 0, 1.0 - confidence, confidence)
        fpr, tpr, _ = roc_curve(df["label"], positive_scores, drop_intermediate=True)
        AUC = auc(fpr, tpr)
    else:
        AUC = -1

    return {'acc_avg': avg_acc, 'acc_per_type': acc_array, 'PQD': PQD, 'DPM': DPM, 'EOM': EOM, 'EOpp0': EOpp0, 'EOpp1': EOpp1, 'EOdd': EOdd, 'NAR': NAR, 'AUC': AUC}

# Temp

In [4]:

# Join every validation prediction with its Fitzpatrick annotation (matched on
# the image hash) and shift the 1-based scale to the 0-based index that
# cal_metrics expects.
# NOTE(review): any non-positive `fitzpatrick_scale` (e.g. an "unknown" marker)
# would become negative after the shift — confirm such rows are absent.
df_main = pd.read_csv('/home/ali/Datasets/Fitz17k/fitzpatrick17k.csv')
df_preds = pd.read_csv('/home/ali/Outputs/SkinFormer_baseline/validation_results_DiT_S_LRP_50_random_holdout_BASE.csv')

df_merged = (
    pd.merge(df_preds, df_main, left_on="hasher", right_on="md5hash")
    .loc[
        :,
        [
            "hasher",
            "label_x",
            "fitzpatrick_scale",
            "prediction_probability",
            "prediction",
        ],
    ]
    .rename(columns={"label_x": "label", "fitzpatrick_scale": "fitzpatrick"})
    .assign(fitzpatrick=lambda merged: merged["fitzpatrick"] - 1)
)

In [6]:
# Persist the merged predictions (row index omitted).
# NOTE(review): the later cells read "..._BASE_6fitz.csv" (no "_new" suffix) — confirm which file is intended.
df_merged.to_csv(
    path_or_buf='/home/ali/Outputs/SkinFormer_baseline/validation_results_DiT_S_LRP_50_random_holdout_BASE_6fitz_new.csv',
    index=False,
)

# Baseline

In [22]:
# Fairness metrics of the baseline run; multi-class task, so AUC is reported as -1.
cal_metrics(
    csv_path='/home/ali/Outputs/SkinFormer_baseline/validation_results_DiT_S_LRP_50_random_holdout_BASE_6fitz.csv',
    is_binary=False,
)

{'acc_avg': 0.8616921635966281,
 'acc_per_type': array([0.83946488, 0.843987  , 0.85901163, 0.8966725 , 0.90938511,
        0.83333333]),
 'PQD': 0.9163701067615659,
 'DPM': 0.5278897673948356,
 'EOM': 0.6284678839710286,
 'EOpp0': 0.09241948268945266,
 'EOpp1': 0.13334040160654603,
 'EOdd': 0.19503690128792983,
 'NAR': 0.08805933919376745,
 'AUC': -1}

# temp 2

In [10]:
def _as_index(values, name):
    """Cast a column to non-negative integer indices; reject missing or negative entries."""
    arr = np.asarray(values)
    if pd.isna(arr).any():
        raise ValueError(f"column '{name}' contains missing values")
    idx = arr.astype(int)
    # A negative value would silently wrap around to the last rows/columns of
    # the count matrices, so refuse it instead of producing wrong metrics.
    if (idx < 0).any():
        raise ValueError(f"column '{name}' contains negative values")
    return idx


def _count_matrix(row_idx, col_idx, shape):
    """Return a float array of `shape` whose cell (r, c) counts the given (r, c) pairs."""
    counts = np.zeros(shape)
    np.add.at(counts, (row_idx, col_idx), 1)  # unbuffered: repeated pairs accumulate
    return counts


def _one_vs_rest_rates(conf_matrix):
    """Return per-class (TPR, TNR, FPR) arrays from a square confusion matrix (rows = true class)."""
    true_pos = np.diag(conf_matrix)
    actual_pos = conf_matrix.sum(axis=1)
    predicted_pos = conf_matrix.sum(axis=0)
    false_pos = predicted_pos - true_pos
    true_neg = conf_matrix.sum() - actual_pos - predicted_pos + true_pos
    # A class with no positives (or no negatives) has an undefined rate -> NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        tpr = true_pos / actual_pos
        tnr = true_neg / (true_neg + false_pos)
    return tpr, tnr, 1 - tnr


def _fairness_summary(labels_array, correct_array, predictions_array):
    """Return (avg_acc, acc_per_group, PQD, DPM, EOM, NAR) from (group x class) count matrices."""
    acc_array = correct_array.sum(axis=1) / labels_array.sum(axis=1)
    avg_acc = correct_array.sum() / labels_array.sum()

    # PQD: worst-to-best accuracy ratio across groups
    pqd = acc_array.min() / acc_array.max()

    # DPM: per-class min/max ratio of the prediction distribution across groups
    demo_array = predictions_array / predictions_array.sum(axis=1, keepdims=True)
    dpm = np.mean(demo_array.min(axis=0) / demo_array.max(axis=0))

    # EOM: per-class min/max ratio of the recall across groups
    eo_array = correct_array / labels_array
    eom = np.mean(np.min(eo_array, axis=0) / np.max(eo_array, axis=0))

    # NAR: accuracy range normalised by the mean accuracy
    nar = (acc_array.max() - acc_array.min()) / acc_array.mean()
    return avg_acc, acc_array, pqd, dpm, eom, nar


def cal_metrics(df, is_binaryCLF=False):
    """
    Calculate average accuracy, accuracy per skin type, PQD, DPM, EOM, EOpp0,
    EOpp1, EOdd and NAR, once per Fitzpatrick type and once per binary group.

    `df` must provide the columns 'label', 'prediction', 'fitzpatrick'
    (expected in [0, 5]) and 'fitzpatrick_binary' (0 or 1), plus
    'prediction_probability' when is_binaryCLF=True. These index columns must
    hold non-negative integers. `df` is not modified.

    EOpp0 / EOpp1 / EOdd compare the rows with fitzpatrick_binary == 0 against
    all other rows. Both groups are evaluated on the same class axis, so a
    class that is absent from one group stays aligned; any rate that is
    undefined for a group propagates as NaN into the corresponding metric.

    Parameters
    ----------
    df : validation results, one row per sample.
    is_binaryCLF : if True, also compute the ROC AUC from
        'prediction_probability' (treated as the confidence of the predicted
        class); otherwise 'AUC' is -1.

    Returns
    -------
    dict with keys 'acc_avg', 'acc_per_type', 'PQD', 'DPM', 'EOM', 'EOpp0',
    'EOpp1', 'EOdd', 'NAR', 'AUC' and the binary-group counterparts
    'acc_avg_binary', 'acc_per_type_binary', 'PQD_binary', 'DPM_binary',
    'EOM_binary', 'NAR_binary'.

    Raises
    ------
    ValueError : empty input, or missing / negative values in an index column.
    IndexError : 'fitzpatrick_binary' holds a value greater than 1.
    """
    if df.empty:
        raise ValueError("no rows to evaluate")

    label_raw = df["label"].to_numpy()
    prediction_raw = df["prediction"].to_numpy()
    group_raw = df["fitzpatrick_binary"].to_numpy()

    labels = _as_index(label_raw, "label")
    predictions = _as_index(prediction_raw, "prediction")
    skin_types = _as_index(df["fitzpatrick"].to_numpy(), "fitzpatrick")
    groups = _as_index(group_raw, "fitzpatrick_binary")
    is_correct = prediction_raw == label_raw

    # One shared class axis for every matrix below.
    n_classes = int(max(labels.max(), predictions.max())) + 1

    def counts_by(group_idx, n_groups):
        # (group x class) counts of labels, correct predictions and predictions,
        # restricted to the groups that actually occur (ascending order).
        shape = (n_groups, n_classes)
        present = np.unique(group_idx)
        return (
            _count_matrix(group_idx, labels, shape)[present],
            _count_matrix(group_idx[is_correct], labels[is_correct], shape)[present],
            _count_matrix(group_idx, predictions, shape)[present],
        )

    # Metrics over the individual Fitzpatrick types.
    avg_acc, acc_array, PQD, DPM, EOM, NAR = _fairness_summary(
        *counts_by(skin_types, max(6, int(skin_types.max()) + 1))
    )

    # Metrics with the binary sensitive attribute.
    (
        avg_acc_binary,
        acc_array_binary,
        PQD_binary,
        DPM_binary,
        EOM_binary,
        NAR_binary,
    ) = _fairness_summary(*counts_by(groups, 2))

    # Class-wise TPR / TNR / FPR for each binary group.
    in_group0 = group_raw == 0
    conf_shape = (n_classes, n_classes)
    conf_matrix_fitz0 = _count_matrix(labels[in_group0], predictions[in_group0], conf_shape)
    conf_matrix_fitz1 = _count_matrix(labels[~in_group0], predictions[~in_group0], conf_shape)
    tpr0, tnr0, fpr0 = _one_vs_rest_rates(conf_matrix_fitz0)
    tpr1, tnr1, fpr1 = _one_vs_rest_rates(conf_matrix_fitz1)

    EOpp0 = np.sum(np.abs(tnr1 - tnr0))
    EOpp1 = np.sum(np.abs(tpr1 - tpr0))
    EOdd = np.sum(np.abs(tpr1 - tpr0 + fpr1 - fpr0))

    # if is binary classification, output AUC
    if is_binaryCLF:
        confidence = df["prediction_probability"].to_numpy()
        # Convert "confidence of the predicted class" into a score for class 1.
        positive_scores = np.where(prediction_raw == 0, 1.0 - confidence, confidence)
        fpr, tpr, _ = roc_curve(df["label"], positive_scores, drop_intermediate=True)
        AUC = auc(fpr, tpr)
    else:
        AUC = -1

    return {
        "acc_avg": avg_acc,
        "acc_per_type": acc_array,
        "PQD": PQD,
        "DPM": DPM,
        "EOM": EOM,
        "EOpp0": EOpp0,
        "EOpp1": EOpp1,
        "EOdd": EOdd,
        "NAR": NAR,
        "AUC": AUC,
        "acc_avg_binary": avg_acc_binary,
        "acc_per_type_binary": acc_array_binary,
        "PQD_binary": PQD_binary,
        "DPM_binary": DPM_binary,
        "EOM_binary": EOM_binary,
        "NAR_binary": NAR_binary,
    }


In [4]:
# Load the merged validation results and derive the two-group attribute:
# Fitzpatrick types 0-2 -> 0, every other value -> 1.
df = pd.read_csv('/home/ali/Outputs/SkinFormer_baseline/validation_results_DiT_S_LRP_50_random_holdout_BASE_6fitz.csv')
df['fitzpatrick_binary'] = (~df['fitzpatrick'].isin([0, 1, 2])).astype('int64')
df

Unnamed: 0,hasher,label,fitzpatrick,prediction_probability,prediction,fitzpatrick_binary
0,2902cdd1ebaa4408346ad45372b67de3,2,3,0.999881,2,1
1,1663f45b286763ea93868bd833c00496,2,2,0.999801,2,0
2,79afe212abe2babed2e8c270d549da55,2,0,0.990119,2,0
3,7667396577c8a5828bf9070dd24a67a3,2,5,0.998609,2,1
4,bcd558c86187bde3ed013eed7b7cfe5f,2,1,0.999148,2,0
...,...,...,...,...,...,...
3198,186240638de7f1c3cc66e576e49cef89,2,3,0.999695,2,1
3199,c52074f991a37d6570d1b82ff645a0d4,2,1,0.999495,2,0
3200,8d7cda0d2bfbdcd6d5496e8a6e342788,2,2,0.999840,2,0
3201,e97996fd8e9a1245be0acb3fe3f6a707,2,2,0.999824,2,0


In [11]:
# Six-type and binary-group fairness metrics for the frame loaded above (multi-class, so AUC is -1).
cal_metrics(
    df=df,
    is_binaryCLF=False,
)

{'acc_avg': 0.8616921635966281,
 'acc_per_type': array([0.83946488, 0.843987  , 0.85901163, 0.8966725 , 0.90938511,
        0.83333333]),
 'PQD': 0.9163701067615659,
 'DPM': 0.5278897673948356,
 'EOM': 0.6284678839710286,
 'EOpp0': 0.09241948268945266,
 'EOpp1': 0.13334040160654603,
 'EOdd': 0.19503690128792983,
 'NAR': 0.08805933919376745,
 'AUC': -1,
 'acc_avg_binary': 0.8616921635966281,
 'acc_per_type_binary': array([0.84744228, 0.89336016]),
 'PQD_binary': 0.9486009323039654,
 'DPM_binary': 0.743366066518016,
 'EOM_binary': 0.9361077616382222,
 'NAR_binary': 0.05275484255800076}