In [1]:
'''Use multiple rounds to get a more robust results'''
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, balanced_accuracy_score, roc_auc_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
import torchvision
import shutil
import os

In [2]:
def cal_metrics(df):
    """
    calculate average accuracy, accuracy per skin type, PQD, DPM, EOM, EOpp0, EOpp1, EOdd, and NAR.
    Skin type in the input df should be in the range of [0,5].
    input val results csv path, type_indices: a list
    output a dic, 'acc_avg': value, 'acc_per_type': array[x,x,x], 'PQD', 'DPM', 'EOM'
    """
    is_binaryCLF = len(df["label"].unique()) == 2

    type_indices = sorted(list(df["fitzpatrick"].unique()))
    type_indices_binary = sorted(list(df["fitzpatrick_binary"].unique()))

    labels_array = np.zeros((len(type_indices), len(df["label"].unique())))
    correct_array = np.zeros((len(type_indices), len(df["label"].unique())))
    predictions_array = np.zeros((len(type_indices), len(df["label"].unique())))
    prob_array = [[] for i in range(len(type_indices))]
    label_array_per_fitz = [[] for i in range(len(type_indices))]

    labels_array_binary = np.zeros((2, len(df["label"].unique())))
    correct_array_binary = np.zeros((2, len(df["label"].unique())))
    predictions_array_binary = np.zeros((2, len(df["label"].unique())))

    positive_list = []  # get positive probability for binary classification
    labels_ft0 = []
    labels_ft1 = []
    predictions_ft0 = []
    predictions_ft1 = []

    for i in range(df.shape[0]):
        prediction = df.iloc[i]["prediction"]
        label = df.iloc[i]["label"]
        type = df.iloc[i]["fitzpatrick"]
        type_binary = df.iloc[i]["fitzpatrick_binary"]

        labels_array[int(type), int(label)] += 1
        predictions_array[int(type), int(prediction)] += 1
        if prediction == label:
            correct_array[int(type), int(label)] += 1

        labels_array_binary[int(type_binary), int(label)] += 1
        predictions_array_binary[int(type_binary), int(prediction)] += 1
        if prediction == label:
            correct_array_binary[int(type_binary), int(label)] += 1

        if is_binaryCLF:
            prob_array[int(type)].append(df.iloc[i]["prediction_probability"])
            label_array_per_fitz[int(type)].append(label)
            if prediction == 0:
                positive_list.append(1.0 - df.iloc[i]["prediction_probability"])
            else:
                positive_list.append(df.iloc[i]["prediction_probability"])

        if type_binary == 0:
            labels_ft0.append(label)
            predictions_ft0.append(prediction)
        else:
            labels_ft1.append(label)
            predictions_ft1.append(prediction)

    correct_array = correct_array[type_indices]
    labels_array = labels_array[type_indices]
    predictions_array = predictions_array[type_indices]

    # Accuracy, accuracy per type
    Accuracy = accuracy_score(df["label"], df["prediction"]) * 100

    acc_array = []
    for i in range(len(type_indices)):
        acc_array.append(
            accuracy_score(
                df[df["fitzpatrick"] == i]["label"],
                df[df["fitzpatrick"] == i]["prediction"],
            )
            * 100
        )
    acc_array = np.array(acc_array)

    # f1_score, f1-score per type (Weighted average)
    F1_W = f1_score(df["label"], df["prediction"], average="weighted") * 100

    F1_W_array = []
    for i in range(len(type_indices)):
        F1_W_array.append(
            f1_score(
                df[df["fitzpatrick"] == i]["label"],
                df[df["fitzpatrick"] == i]["prediction"],
                average="weighted",
            )
            * 100
        )
    F1_W_array = np.array(F1_W_array)

    # f1_score, f1-score per type (Macro average)
    F1_Mac = f1_score(df["label"], df["prediction"], average="macro") * 100

    F1_Mac_array = []
    for i in range(len(type_indices)):
        F1_Mac_array.append(
            f1_score(
                df[df["fitzpatrick"] == i]["label"],
                df[df["fitzpatrick"] == i]["prediction"],
                average="macro",
            )
            * 100
        )
    F1_Mac_array = np.array(F1_Mac_array)

    # PQD
    PQD = acc_array.min() / acc_array.max()

    # DPM
    demo_array = predictions_array / np.sum(predictions_array, axis=1, keepdims=True)
    DPM = np.mean(demo_array.min(axis=0) / demo_array.max(axis=0))

    # EOM
    eo_array = correct_array / labels_array
    EOM = np.mean(np.nanmin(eo_array, axis=0) / np.nanmax(eo_array, axis=0))

    # NAR
    NAR = (acc_array.max() - acc_array.min()) / acc_array.mean()

    # NFR (Weighted)
    NFR_W = (F1_W_array.max() - F1_W_array.min()) / F1_W_array.mean()

    # NAR (Macro)
    NFR_Mac = (F1_Mac_array.max() - F1_Mac_array.min()) / F1_Mac_array.mean()

    # AUC
    if is_binaryCLF:
        # AUC per skin type
        AUC = roc_auc_score(df["label"], df["prediction_probability"]) * 100
        AUC_per_type = []
        for i in range(len(label_array_per_fitz)):
            try:
                AUC_per_type.append(
                    roc_auc_score(label_array_per_fitz[i], prob_array[i]) * 100
                )
            except:
                AUC_per_type.append(np.nan)
        AUC_Gap = max(AUC_per_type) - min(AUC_per_type)
    else:
        AUC = -1
        AUC_per_type = [-1] * len(type_indices)
        AUC_Gap = -1

    ##############################          Metrics with binary Sensative attribute         ##############################

    correct_array_binary = correct_array_binary[type_indices_binary]
    labels_array_binary = labels_array_binary[type_indices_binary]
    predictions_array_binary = predictions_array_binary[type_indices_binary]

    # avg acc, acc per type
    correct_array_sumc_binary, labels_array_sumc_binary = np.sum(
        correct_array_binary, axis=1
    ), np.sum(
        labels_array_binary, axis=1
    )  # sum skin conditions
    acc_array_binary = correct_array_sumc_binary / labels_array_sumc_binary
    avg_acc_binary = (np.sum(correct_array_binary) / np.sum(labels_array_binary)) * 100

    # PQD
    PQD_binary = acc_array_binary.min() / acc_array_binary.max()

    # DPM
    demo_array_binary = predictions_array_binary / np.sum(
        predictions_array_binary, axis=1, keepdims=True
    )
    DPM_binary = np.mean(demo_array_binary.min(axis=0) / demo_array_binary.max(axis=0))

    # EOM
    eo_array_binary = correct_array_binary / labels_array_binary
    EOM_binary = np.mean(
        np.nanmin(eo_array_binary, axis=0) / np.nanmax(eo_array_binary, axis=0)
    )

    # getting class-wise TPR, FPR, TNR for fitzpatrick 0
    conf_matrix_fitz0 = confusion_matrix(labels_ft0, predictions_ft0)

    # Initialize lists to store TPR, TNR, FPR for each class
    class_tpr_fitz0 = []
    class_tnr_fitz0 = []
    class_fpr_fitz0 = []

    for i in range(len(conf_matrix_fitz0)):
        # Calculate TPR for class i
        tpr = conf_matrix_fitz0[i, i] / sum(conf_matrix_fitz0[i, :])
        class_tpr_fitz0.append(tpr)

        # Calculate TNR for class i
        tn = (
            sum(sum(conf_matrix_fitz0))
            - sum(conf_matrix_fitz0[i, :])
            - sum(conf_matrix_fitz0[:, i])
            + conf_matrix_fitz0[i, i]
        )
        fp = sum(conf_matrix_fitz0[:, i]) - conf_matrix_fitz0[i, i]
        fn = sum(conf_matrix_fitz0[i, :]) - conf_matrix_fitz0[i, i]
        tnr = tn / (tn + fp)
        class_tnr_fitz0.append(tnr)

        # Calculate FPR for class i
        fpr = 1 - tnr
        class_fpr_fitz0.append(fpr)

    # getting class-wise TPR, FPR, TNR for fitzpatrick 1

    conf_matrix_fitz1 = confusion_matrix(labels_ft1, predictions_ft1)
    
    # Check if there is any class that is not in both subgroups to handle it
    try:
        class_idx = (set(df[df['fitzpatrick_binary'] == 0]['label'].unique()) - set(df[df['fitzpatrick_binary'] == 1]['label'].unique())).pop()
        conf_matrix_fitz1 = np.insert(conf_matrix_fitz1, class_idx, 0, axis=1)
        conf_matrix_fitz1 = np.insert(conf_matrix_fitz1, class_idx, 0, axis=0)
        print(f"INFO: class {class_idx} is not in both binary subgroups")
    except:
        class_idx = None

    # Initialize lists to store TPR, TNR, FPR for each class
    class_tpr_fitz1 = []
    class_tnr_fitz1 = []
    class_fpr_fitz1 = []

    for i in range(len(conf_matrix_fitz1)):
        # Calculate TPR for class i
        tpr = conf_matrix_fitz1[i, i] / sum(conf_matrix_fitz1[i, :])
        class_tpr_fitz1.append(tpr)

        # Calculate TNR for class i
        tn = (
            sum(sum(conf_matrix_fitz1))
            - sum(conf_matrix_fitz1[i, :])
            - sum(conf_matrix_fitz1[:, i])
            + conf_matrix_fitz1[i, i]
        )
        fp = sum(conf_matrix_fitz1[:, i]) - conf_matrix_fitz1[i, i]
        fn = sum(conf_matrix_fitz1[i, :]) - conf_matrix_fitz1[i, i]
        tnr = tn / (tn + fp)
        class_tnr_fitz1.append(tnr)

        # Calculate FPR for class i
        fpr = 1 - tnr
        class_fpr_fitz1.append(fpr)

    if class_idx is not None:
        class_tpr_fitz1[class_idx] = np.nan
        class_tnr_fitz1[class_idx] = np.nan
        class_fpr_fitz1[class_idx] = np.nan
    
    # EOpp0
    EOpp0 = 0
    for c in range(len(class_tnr_fitz0)):
        val = abs(class_tnr_fitz1[c] - class_tnr_fitz0[c])
        if not np.isnan(val):
            EOpp0 += val

    # EOpp1
    EOpp1 = 0
    for c in range(len(class_tpr_fitz0)):
        val = abs(class_tpr_fitz1[c] - class_tpr_fitz0[c])
        if not np.isnan(val):
            EOpp1 += val

    # EOdd
    EOdd = 0
    for c in range(len(class_tpr_fitz0)):
        val = abs(
            class_tpr_fitz1[c]
            - class_tpr_fitz0[c]
            + class_fpr_fitz1[c]
            - class_fpr_fitz0[c]
        )
        if not np.isnan(val):
            EOdd += val

    # NAR
    NAR_binary = (
        acc_array_binary.max() - acc_array_binary.min()
    ) / acc_array_binary.mean()

    return {
        "accuracy": Accuracy,
        "acc_per_type": acc_array,
        "acc_gap": acc_array.max() - acc_array.min(),
        "F1_W": F1_W,
        "F1_per_type_W": F1_W_array,
        "F1_W_gap": max(F1_W_array) - min(F1_W_array),
        "F1_Mac": F1_Mac,
        "F1_per_type_Mac": F1_Mac_array,
        "F1_Mac_gap": max(F1_Mac_array) - min(F1_Mac_array),
        "PQD": PQD,
        "DPM": DPM,
        "EOM": EOM,
        "EOpp0": EOpp0,
        "EOpp1": EOpp1,
        "EOdd": EOdd,
        "NAR": NAR,
        "NFR_W": NFR_W,
        "NFR_Mac": NFR_Mac,
        "AUC": AUC,
        "AUC_per_type": AUC_per_type,
        "AUC_Gap": AUC_Gap,
        "AUC_min": min(AUC_per_type),
        "acc_avg_binary": avg_acc_binary,
        "acc_per_type_binary": acc_array_binary,
        "PQD_binary": PQD_binary,
        "DPM_binary": DPM_binary,
        "EOM_binary": EOM_binary,
        "NAR_binary": NAR_binary,
    }

## Fitzpatrick

In [3]:
df = pd.read_csv("/home/ali/Outputs/Fitzpatrick17k/FairPrune/validation_results_ResNet18_FairPrune_PIter4_epoch=50_random_holdout.csv")
cal_metrics(df=df)

{'accuracy': 81.33000312207305,
 'acc_per_type': array([79.43143813, 78.65655471, 82.26744186, 83.71278459, 85.4368932 ,
        84.21052632]),
 'acc_gap': 6.780338490990744,
 'F1_W': 78.08004156764137,
 'F1_per_type_W': array([76.67339824, 74.6289133 , 79.29608888, 80.51718594, 82.56086632,
        80.13335159]),
 'F1_W_gap': 7.93195302883251,
 'F1_Mac': 61.559112110539715,
 'F1_per_type_Mac': array([65.53734189, 57.86020998, 61.70303263, 61.27168844, 60.60239272,
        58.76884087]),
 'F1_Mac_gap': 7.6771319134357014,
 'PQD': 0.9206392199349946,
 'DPM': 0.49776309188734347,
 'EOM': 0.7074291673232631,
 'EOpp0': 0.13819241627628487,
 'EOpp1': 0.09420496905121217,
 'EOdd': 0.23239738532749704,
 'NAR': 0.08239972111095889,
 'NFR_W': 0.10044477286951978,
 'NFR_Mac': 0.12594288253709293,
 'AUC': -1,
 'AUC_per_type': [-1, -1, -1, -1, -1, -1],
 'AUC_Gap': -1,
 'AUC_min': -1,
 'acc_avg_binary': 81.33000312207305,
 'acc_per_type_binary': array([0.79990946, 0.84305835]),
 'PQD_binary': 0.948

## PAD-UFES-20

### Baseline

In [4]:
df = pd.read_csv("/home/ali/Outputs/PAD-UFES-20/FairPrune/Baseline-Resnet18/validation_results_Resnet18_BASE_low_epoch=50_random_holdout.csv")
cal_metrics(df=df)

INFO: class 2 is not in both binary subgroups


  eo_array = correct_array / labels_array
  eo_array_binary = correct_array_binary / labels_array_binary
  tpr = conf_matrix_fitz1[i, i] / sum(conf_matrix_fitz1[i, :])


{'accuracy': 71.23745819397993,
 'acc_per_type': array([ 62.06896552,  71.18644068,  78.37837838,  55.55555556,
        100.        ]),
 'acc_gap': 44.44444444444444,
 'F1_W': 69.82307420138476,
 'F1_per_type_W': array([ 57.52351097,  70.38258133,  75.87588138,  55.66137566,
        100.        ]),
 'F1_W_gap': 44.33862433862434,
 'F1_Mac': 63.6782640401112,
 'F1_per_type_Mac': array([ 40.34090909,  68.53893171,  73.60283911,  26.03174603,
        100.        ]),
 'F1_Mac_gap': 73.96825396825396,
 'PQD': 0.5555555555555556,
 'DPM': 0.009259259259259259,
 'EOM': 0.36959064327485375,
 'EOpp0': 0.11701729062580335,
 'EOpp1': 1.1485360055369986,
 'EOdd': 1.093800084122928,
 'NAR': 0.6051979127282565,
 'NFR_W': 0.6167679054167151,
 'NFR_Mac': 1.1987811225086122,
 'AUC': -1,
 'AUC_per_type': [-1, -1, -1, -1, -1],
 'AUC_Gap': -1,
 'AUC_min': -1,
 'acc_avg_binary': 71.23745819397993,
 'acc_per_type_binary': array([0.72142857, 0.57894737]),
 'PQD_binary': 0.8025013027618552,
 'DPM_binary': 0.54

### FairPrune - FAILED

In [5]:
df = pd.read_csv("/home/ali/Outputs/PAD-UFES-20/FairPrune/B=33_PR=35/validation_results_ResNet18_FairPrune_PIter1_epoch=50_random_holdout.csv")
cal_metrics(df=df)

INFO: class 2 is not in both binary subgroups


  DPM = np.mean(demo_array.min(axis=0) / demo_array.max(axis=0))
  eo_array = correct_array / labels_array
  EOM = np.mean(np.nanmin(eo_array, axis=0) / np.nanmax(eo_array, axis=0))
  DPM_binary = np.mean(demo_array_binary.min(axis=0) / demo_array_binary.max(axis=0))
  eo_array_binary = correct_array_binary / labels_array_binary
  np.nanmin(eo_array_binary, axis=0) / np.nanmax(eo_array_binary, axis=0)
  tpr = conf_matrix_fitz1[i, i] / sum(conf_matrix_fitz1[i, :])


{'accuracy': 40.13377926421405,
 'acc_per_type': array([ 37.93103448,  38.4180791 ,  41.89189189,  50.        ,
        100.        ]),
 'acc_gap': 62.06896551724138,
 'F1_W': 37.45754287994282,
 'F1_per_type_W': array([ 36.58170915,  35.55104974,  39.7216587 ,  52.43386243,
        100.        ]),
 'F1_W_gap': 64.44895025824651,
 'F1_Mac': 21.48980577016143,
 'F1_per_type_Mac': array([ 30.86956522,  19.97444014,  21.01425881,  34.47619048,
        100.        ]),
 'F1_Mac_gap': 80.02555985821706,
 'PQD': 0.3793103448275862,
 'DPM': nan,
 'EOM': nan,
 'EOpp0': 0.25889509872449457,
 'EOpp1': 0.6209834190966268,
 'EOdd': 0.41167509723163637,
 'NAR': 1.156962661400816,
 'NFR_W': 1.2192926272017524,
 'NFR_Mac': 1.9392195063937132,
 'AUC': -1,
 'AUC_per_type': [-1, -1, -1, -1, -1],
 'AUC_Gap': -1,
 'AUC_min': -1,
 'acc_avg_binary': 40.13377926421405,
 'acc_per_type_binary': array([0.39285714, 0.52631579]),
 'PQD_binary': 0.7464285714285714,
 'DPM_binary': nan,
 'EOM_binary': nan,
 'NAR_bina