In [2]:
'''Use multiple rounds to get more robust results.'''
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, balanced_accuracy_score, roc_auc_score, f1_score
import matplotlib.pyplot as plt
import torchvision
import shutil
import os

In [3]:
def _count_matrix(row_idx, col_idx, n_rows, n_cols):
    """Return an (n_rows, n_cols) float array counting every (row, col) index pair."""
    counts = np.zeros((n_rows, n_cols))
    # np.add.at accumulates repeated index pairs; plain fancy-index "+=" would
    # only count each distinct pair once.
    np.add.at(counts, (row_idx, col_idx), 1)
    return counts


def _group_fairness(correct, total, predicted, acc_scale):
    """
    Summarise per-group count matrices (rows = groups, columns = classes).

    correct / total / predicted hold, per group and class, the number of
    correctly classified samples, of ground-truth samples and of predictions.
    Returns (acc_per_group, avg_acc, PQD, DPM, EOM, NAR) where
      acc_per_group = correct / total per group, multiplied by acc_scale
      avg_acc       = overall accuracy in percent
      PQD           = min(acc_per_group) / max(acc_per_group)
      DPM           = mean over classes of min/max (across groups) of the
                      share of a group's predictions assigned to that class
      EOM           = mean over classes of min/max (across groups) of
                      correct / total for that class
      NAR           = (max - min) / mean of acc_per_group
    """
    acc_per_group = (np.sum(correct, axis=1) / np.sum(total, axis=1)) * acc_scale
    avg_acc = (np.sum(correct) / np.sum(total)) * 100

    pqd = acc_per_group.min() / acc_per_group.max()

    demo = predicted / np.sum(predicted, axis=1, keepdims=True)
    dpm = np.mean(demo.min(axis=0) / demo.max(axis=0))

    eo = correct / total
    eom = np.mean(np.min(eo, axis=0) / np.max(eo, axis=0))

    nar = (acc_per_group.max() - acc_per_group.min()) / acc_per_group.mean()
    return acc_per_group, avg_acc, pqd, dpm, eom, nar


def _class_rates(y_true, y_pred, class_ids):
    """
    Return one-vs-rest (TPR, TNR, FPR) arrays, one entry per id in class_ids.

    The confusion matrix is built with an explicit label list so that the
    result always has len(class_ids) rows/columns in the same order, even when
    a class does not occur in this subset. A class with no ground-truth
    samples here yields NaN for its TPR (0 / 0).
    """
    conf = confusion_matrix(y_true, y_pred, labels=class_ids)
    true_pos = np.diag(conf)
    per_true = conf.sum(axis=1)  # samples whose ground truth is class i
    per_pred = conf.sum(axis=0)  # samples predicted as class i

    tpr = true_pos / per_true
    false_pos = per_pred - true_pos
    true_neg = conf.sum() - per_true - per_pred + true_pos
    tnr = true_neg / (true_neg + false_pos)
    fpr = 1 - tnr
    return tpr, tnr, fpr


def cal_metrics(df):
    """
    Calculate accuracy, F1, AUC and fairness metrics (PQD, DPM, EOM, EOpp0,
    EOpp1, EOdd, NAR) from a table of per-sample predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. Columns read:
          "label", "prediction"   integer class codes; they are used directly
                                  as array indices, so they are expected to be
                                  0 .. n_classes-1
          "fitzpatrick"           integer skin-type code (non-negative)
          "fitzpatrick_binary"    0 or 1
          "prediction_probability"  read only when "label" has exactly two
                                  distinct values (binary classification)

    Returns
    -------
    dict with keys
        acc_avg, acc_per_type, acc_gap, Min_acc      accuracy in percent,
            overall and per skin type present in df
        F1_score, F1_per_type, F1_gap, Min_F1        weighted F1 in percent
        PQD, DPM, EOM, NAR                           computed over skin types
        EOpp0, EOpp1, EOdd                           sums over classes of
            |TNR1 - TNR0|, |TPR1 - TPR0| and |TPR1 - TPR0 + FPR1 - FPR0|
            between fitzpatrick_binary == 0 and the remaining samples
        AUC, AUC_per_type, AUC_Gap, AUC_min          ROC-AUC in percent for
            binary classification, otherwise -1 placeholders
        acc_avg_binary, acc_per_type_binary, PQD_binary, DPM_binary,
        EOM_binary, NAR_binary                       same as above but grouped
            by fitzpatrick_binary. Note that acc_per_type_binary holds
            fractions in [0, 1] (not percent); this is kept as-is so existing
            consumers of the value are not affected.

    Per-type lists (acc_per_type, F1_per_type, AUC_per_type) have one entry
    per skin type that occurs in df, in ascending order of the type code.

    Raises
    ------
    ValueError
        If df has no rows.
    """
    if len(df) == 0:
        raise ValueError("cal_metrics requires a non-empty DataFrame")

    n_classes = len(df["label"].unique())
    is_binaryCLF = n_classes == 2
    class_ids = np.arange(n_classes)

    # Integer views of the columns, used as array indices and masks.
    labels = df["label"].to_numpy().astype(int)
    preds = df["prediction"].to_numpy().astype(int)
    skin_types = df["fitzpatrick"].to_numpy().astype(int)
    skin_binary = df["fitzpatrick_binary"].to_numpy().astype(int)
    is_correct = preds == labels

    type_indices = np.unique(skin_types)  # sorted skin types present in df
    type_indices_binary = np.unique(skin_binary)

    # Size the count tables from the data instead of assuming exactly six
    # contiguous skin types; only the rows of types that occur are kept.
    n_type_rows = int(type_indices.max()) + 1
    labels_array = _count_matrix(skin_types, labels, n_type_rows, n_classes)[
        type_indices
    ]
    predictions_array = _count_matrix(skin_types, preds, n_type_rows, n_classes)[
        type_indices
    ]
    correct_array = _count_matrix(
        skin_types[is_correct], labels[is_correct], n_type_rows, n_classes
    )[type_indices]

    # avg acc, acc per type, PQD, DPM, EOM, NAR (per-type accuracy in percent)
    acc_array, avg_acc, PQD, DPM, EOM, NAR = _group_fairness(
        correct_array, labels_array, predictions_array, acc_scale=100
    )

    # f1_score, f1-score per type (only for the skin types that are present,
    # so the list lines up with acc_per_type)
    F1 = f1_score(df["label"], df["prediction"], average="weighted") * 100
    F1_array = []
    for skin_type in type_indices:
        in_type = skin_types == skin_type
        F1_array.append(
            f1_score(
                df.loc[in_type, "label"],
                df.loc[in_type, "prediction"],
                average="weighted",
            )
            * 100
        )

    # AUC
    if is_binaryCLF:
        # NOTE(review): prediction_probability is scored directly, which is only
        # valid if the column holds P(label == 1). If it instead stores the
        # confidence of the predicted class it must be converted first
        # (1 - p where prediction == 0) -- confirm against the code that
        # writes the CSV.
        probabilities = df["prediction_probability"].to_numpy()
        AUC = roc_auc_score(df["label"], df["prediction_probability"]) * 100
        # AUC per skin type
        AUC_per_type = []
        for skin_type in type_indices:
            in_type = skin_types == skin_type
            AUC_per_type.append(
                roc_auc_score(labels[in_type], probabilities[in_type]) * 100
            )
        AUC_Gap = max(AUC_per_type) - min(AUC_per_type)
    else:
        AUC = -1
        AUC_per_type = [-1] * len(type_indices)
        AUC_Gap = -1

    ##############################          Metrics with binary sensitive attribute         ##############################

    labels_array_binary = _count_matrix(skin_binary, labels, 2, n_classes)[
        type_indices_binary
    ]
    predictions_array_binary = _count_matrix(skin_binary, preds, 2, n_classes)[
        type_indices_binary
    ]
    correct_array_binary = _count_matrix(
        skin_binary[is_correct], labels[is_correct], 2, n_classes
    )[type_indices_binary]

    # acc_scale=1 keeps acc_per_type_binary as fractions, as before.
    (
        acc_array_binary,
        avg_acc_binary,
        PQD_binary,
        DPM_binary,
        EOM_binary,
        NAR_binary,
    ) = _group_fairness(
        correct_array_binary,
        labels_array_binary,
        predictions_array_binary,
        acc_scale=1,
    )

    # class-wise TPR, TNR, FPR for fitzpatrick_binary == 0 and for the rest
    in_group0 = skin_binary == 0
    class_tpr_fitz0, class_tnr_fitz0, class_fpr_fitz0 = _class_rates(
        labels[in_group0], preds[in_group0], class_ids
    )
    class_tpr_fitz1, class_tnr_fitz1, class_fpr_fitz1 = _class_rates(
        labels[~in_group0], preds[~in_group0], class_ids
    )

    # Built-in sum() adds the per-class terms left to right, matching a plain
    # accumulation loop exactly.
    # EOpp0
    EOpp0 = sum(np.abs(class_tnr_fitz1 - class_tnr_fitz0))

    # EOpp1
    EOpp1 = sum(np.abs(class_tpr_fitz1 - class_tpr_fitz0))

    # EOdd
    EOdd = sum(
        np.abs(
            class_tpr_fitz1 - class_tpr_fitz0 + class_fpr_fitz1 - class_fpr_fitz0
        )
    )

    return {
        "acc_avg": avg_acc,
        "acc_per_type": acc_array,
        "acc_gap": max(acc_array) - min(acc_array),
        "Min_acc": min(acc_array),
        "F1_score": F1,
        "F1_per_type": F1_array,
        "F1_gap": max(F1_array) - min(F1_array),
        "Min_F1": min(F1_array),
        "PQD": PQD,
        "DPM": DPM,
        "EOM": EOM,
        "EOpp0": EOpp0,
        "EOpp1": EOpp1,
        "EOdd": EOdd,
        "NAR": NAR,
        "AUC": AUC,
        "AUC_per_type": AUC_per_type,
        "AUC_Gap": AUC_Gap,
        "AUC_min": min(AUC_per_type),
        "acc_avg_binary": avg_acc_binary,
        "acc_per_type_binary": acc_array_binary,
        "PQD_binary": PQD_binary,
        "DPM_binary": DPM_binary,
        "EOM_binary": EOM_binary,
        "NAR_binary": NAR_binary,
    }

# Baseline

In [53]:
# Load the saved validation predictions of the 3-class baseline run and
# display every metric returned by cal_metrics as the cell output.
df = pd.read_csv(
    '/home/ali/Outputs/SkinFormer_baseline_3class/validation_results_DiT_S_LRP_level=high_epoch=50_random_holdout.csv'
)
cal_metrics(df)

{'acc_avg': 86.16921635966281,
 'acc_per_type': array([83.94648829, 84.39869989, 85.90116279, 89.66725044, 90.93851133,
        83.33333333]),
 'acc_gap': 7.605177993527491,
 'Min_acc': 83.33333333333334,
 'F1_score': 85.57893397463474,
 'F1_per_type': [83.40888098529375,
  83.78928603956243,
  85.37177898427865,
  89.01159924768479,
  90.69159774661392,
  79.75499454377571],
 'F1_gap': 10.936603202838214,
 'Min_F1': 79.75499454377571,
 'PQD': 0.916370106761566,
 'DPM': 0.5278897673948356,
 'EOM': 0.6284678839710286,
 'EOpp0': 0.09241948268945266,
 'EOpp1': 0.13334040160654603,
 'EOdd': 0.19503690128792983,
 'NAR': 0.08805933919376732,
 'AUC': -1,
 'AUC_per_type': [-1, -1, -1, -1, -1, -1],
 'AUC_Gap': -1,
 'AUC_min': -1,
 'acc_avg_binary': 86.16921635966281,
 'acc_per_type_binary': array([0.84744228, 0.89336016]),
 'PQD_binary': 0.9486009323039654,
 'DPM_binary': 0.743366066518016,
 'EOM_binary': 0.9361077616382222,
 'NAR_binary': 0.05275484255800076}

In [54]:
# Load the saved validation predictions of the 2-class baseline run and
# display every metric returned by cal_metrics as the cell output.
df = pd.read_csv(
    '/home/ali/Outputs/SkinFormer_baseline_2class/validation_results_DiT_S_LRP_level=binary_epoch=50_random_holdout.csv'
)
cal_metrics(df)

{'acc_avg': 93.78707461754605,
 'acc_per_type': array([93.81270903, 92.30769231, 92.29651163, 96.84763573, 96.76375405,
        91.22807018]),
 'acc_gap': 5.619565551356516,
 'Min_acc': 91.22807017543859,
 'F1_score': 93.58431062096106,
 'F1_per_type': [93.67097036561044,
  92.17603050992126,
  91.90241448181142,
  96.76754633797887,
  96.60765906058379,
  89.89139515455304],
 'F1_gap': 6.876151183425833,
 'Min_F1': 89.89139515455304,
 'PQD': 0.9419751911424128,
 'DPM': 0.6443480064767824,
 'EOM': 0.6964731050522976,
 'EOpp0': 0.044660586462973995,
 'EOpp1': 0.044660586462973995,
 'EOdd': 0.00828135841152311,
 'NAR': 0.05986153895383024,
 'AUC': 93.51590984421611,
 'AUC_per_type': [93.76476145488898,
  92.24324621935105,
  92.00706736691954,
  98.11373092926492,
  96.14759805059178,
  86.19281045751633],
 'AUC_Gap': 11.920920471748587,
 'AUC_min': 86.19281045751633,
 'acc_avg_binary': 93.78707461754605,
 'acc_per_type_binary': array([0.92711634, 0.96177062]),
 'PQD_binary': 0.963968247

# FairDisCo

In [5]:
# Load the FairDisCo results, shift the skin-type codes down by one, and
# derive the two-group attribute (codes 0, 1, 2 -> 0; anything else -> 1)
# before computing the metrics.
df = pd.read_csv(
    '/home/ali/Repos/FairDisCo/results_FairDisCO_50_high_random_holdout.csv'
).assign(
    fitzpatrick=lambda frame: frame["fitzpatrick"] - 1,
    # Later assign() entries see the already-shifted "fitzpatrick" column.
    fitzpatrick_binary=lambda frame: (
        ~frame["fitzpatrick"].isin([0, 1, 2])
    ).astype(int),
)
cal_metrics(df)

{'acc_avg': 86.04433343740243,
 'acc_per_type': array([84.11371237, 82.77356446, 86.19186047, 89.31698774, 93.52750809,
        85.0877193 ]),
 'acc_gap': 10.753943626909575,
 'Min_acc': 82.77356446370531,
 'F1_score': 85.40178852870271,
 'F1_per_type': [83.46605343183822,
  82.04096074015393,
  85.51682097590682,
  88.82539106272448,
  93.37276263150967,
  82.45370444635752],
 'F1_gap': 11.331801891355738,
 'Min_F1': 82.04096074015393,
 'PQD': 0.8850183882105516,
 'DPM': 0.478434030160005,
 'EOM': 0.6226869024695112,
 'EOpp0': 0.04858410420571213,
 'EOpp1': 0.14267129135206158,
 'EOdd': 0.10489253203620119,
 'NAR': 0.12384310142214475,
 'AUC': -1,
 'AUC_per_type': [-1, -1, -1, -1, -1, -1],
 'AUC_Gap': -1,
 'AUC_min': -1,
 'acc_avg_binary': 86.04433343740243,
 'acc_per_type_binary': array([0.84200996, 0.90140845]),
 'PQD_binary': 0.9341047985513806,
 'DPM_binary': 0.7766552687660875,
 'EOM_binary': 0.9310292841461735,
 'NAR_binary': 0.0681402595122808}

# FairPrune

In [6]:
# Load the saved validation predictions of the FairPrune run and display
# every metric returned by cal_metrics as the cell output.
df = pd.read_csv(
    "/home/ali/Outputs/FairPrune/validation_results_ResNet18_FairPrune_PIter4_epoch=50_random_holdout.csv"
)
cal_metrics(df)

{'acc_avg': 81.33000312207305,
 'acc_per_type': array([79.43143813, 78.65655471, 82.26744186, 83.71278459, 85.4368932 ,
        84.21052632]),
 'acc_gap': 6.780338490990744,
 'Min_acc': 78.65655471289274,
 'F1_score': 78.08004156764137,
 'F1_per_type': [76.67339823995785,
  74.62891329526849,
  79.29608888191215,
  80.51718593995612,
  82.560866324101,
  80.13335158624854],
 'F1_gap': 7.93195302883251,
 'Min_F1': 74.62891329526849,
 'PQD': 0.9206392199349946,
 'DPM': 0.49776309188734347,
 'EOM': 0.7074291673232631,
 'EOpp0': 0.13819241627628487,
 'EOpp1': 0.09420496905121217,
 'EOdd': 0.23239738532749704,
 'NAR': 0.08239972111095889,
 'AUC': -1,
 'AUC_per_type': [-1, -1, -1, -1, -1, -1],
 'AUC_Gap': -1,
 'AUC_min': -1,
 'acc_avg_binary': 81.33000312207305,
 'acc_per_type_binary': array([0.79990946, 0.84305835]),
 'PQD_binary': 0.9488186211538607,
 'DPM_binary': 0.7022129448437271,
 'EOM_binary': 0.9237732879362852,
 'NAR_binary': 0.05252554372231485}