In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score

from libauc.metrics import auc_roc_score # for multi-task

In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: label vocabularies and file locations.
# ---------------------------------------------------------------------------

# All 14 CheXpert observation names.
labels = [
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices',
    'No Finding'
]

# Subset of observations evaluated in this notebook (ground-truth column names).
labels_small = [
    'Cardiomegaly',
    'Pneumonia',
    'Pleural Effusion',
    'Fracture',
    'No Finding'
]

# Observation name -> short code. The short codes are also the names of the
# prediction columns in the result CSV files.
observations_abbr = {
    'Enlarged Cardiomediastinum': 'EC',
    'Cardiomegaly': 'Cd',
    'Lung Opacity': 'LO',
    'Lung Lesion': 'LL',
    'Edema': 'Ed',
    'Consolidation': 'Co',
    'Pneumonia': 'Pa',
    'Atelectasis': 'A',
    'Pneumothorax': 'Px',
    'Pleural Effusion': 'Ef',
    'Pleural Other': 'PO',
    'Fracture': 'Fr',
    'Support Devices': 'SD',
    'No Finding': 'NF'
}

# Derived from the mapping (rather than typed out a second time) so that the
# prediction columns always stay aligned, in order, with `labels_small`.
labels_abbr_small = [observations_abbr[name] for name in labels_small]

sex_attrib = [
    'Female',
    'Male'
]

results_path = './MT_files/'

# NOTE(review): machine-specific absolute paths; they are not read anywhere in
# the visible part of this notebook. Consider making them configurable.
train_file_path = "D:/chexpertchestxrays/CheXpert-v1.0/CheXpert-v1.0/train.csv"
valid_file_path = "D:/chexpertchestxrays/CheXpert-v1.0/CheXpert-v1.0/valid.csv"

# Per-model prediction files (baseline, FIS on sex, FIS on age).
results_Deep_AUC_file_path = results_path + "pred_results_Deep_AUC.csv"
results_sex_group_file_path = results_path + "pred_results_FIS_sex.csv"
results_age_group_file_path = results_path + "pred_results_FIS_age.csv"


In [3]:
def compute_group_auc(y_true, y_prob, sensitive_attr):
    """Compute ROC AUC separately for every value of a sensitive attribute.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; indexed along the first axis with a boolean mask.
    y_prob : array-like
        Predicted scores, row-aligned with ``y_true``.
    sensitive_attr : array-like
        Group membership of each row (e.g. sex or age group).

    Returns
    -------
    dict
        Maps each unique group value to the result of ``auc_roc_score`` on the
        rows of that group, with NaN entries replaced by 0.0.
    """
    group_aucs = {}

    for g in np.unique(sensitive_attr):
        mask = sensitive_attr == g

        # Bug fix: score this function's own `y_prob` argument. The previous
        # code read a global `y_pred`, which either raised NameError or
        # silently evaluated whichever model happened to be in that global.
        aucs = auc_roc_score(y_true[mask], y_prob[mask])
        group_aucs[g] = np.nan_to_num(aucs, nan=0.0)

    return group_aucs

def compute_group_auc_table(y_true, y_prob, sensitive_attr):
    """Build a per-group ROC AUC table.

    One row is produced per unique value of ``sensitive_attr``. When
    ``auc_roc_score`` yields several values they are stored in columns
    ``auc_label_0``, ``auc_label_1``, ...; a scalar result is stored in a
    single ``auc`` column. NaN scores are replaced by 0.0 before being stored.

    Returns
    -------
    pandas.DataFrame
        Sorted by ``group`` with an extra ``mean_auc`` column holding the row
        mean over all AUC columns.

    NOTE(review): because NaN is mapped to 0.0, a label whose AUC is undefined
    for a group pulls ``mean_auc`` down - confirm this is intended.
    """
    records = []

    for group_value in np.unique(sensitive_attr):
        in_group = (sensitive_attr == group_value)
        scores = np.nan_to_num(
            auc_roc_score(y_true[in_group], y_prob[in_group]), nan=0.0
        )

        record = {"group": group_value}
        if np.ndim(scores) > 0:
            # one column per label, numbered by position
            record.update(
                {f"auc_label_{pos}": float(value) for pos, value in enumerate(scores)}
            )
        else:
            record["auc"] = float(scores)
        records.append(record)

    table = pd.DataFrame(records).sort_values("group").reset_index(drop=True)

    metric_columns = [name for name in table.columns if name.startswith("auc")]
    table["mean_auc"] = table[metric_columns].mean(axis=1)

    return table
    
def prob_to_binary_multilabel(y_pred, threshold=0.6):
    """Turn scores into 0/1 decisions: 1 where ``y_pred >= threshold``, else 0.

    The same ``threshold`` (default 0.6) is applied to every element.
    """
    meets_threshold = y_pred >= threshold
    return meets_threshold.astype(int)

def find_best_threshold(y_true, y_prob, metric=f1_score):
    """Grid-search the decision threshold that maximises ``metric``.

    Candidate thresholds are the 99 evenly spaced values from 0.01 to 0.99.
    For each one the scores are binarized with ``>=`` and ``metric(y_true,
    y_bin)`` is evaluated. The first threshold reaching the maximum is returned.
    """
    candidates = np.linspace(0.01, 0.99, 99)
    scores = [metric(y_true, (y_prob >= cut).astype(int)) for cut in candidates]
    return candidates[np.argmax(scores)]


def find_best_threshold_multilabel(y_true, y_prob, average="macro", metric=f1_score):
    """Grid-search a single threshold, shared by all labels, that maximises ``metric``.

    Each of the 99 evenly spaced thresholds in [0.01, 0.99] is applied to the
    whole score array; ``metric`` is then called with ``average=average`` and
    ``zero_division=0``.

    Returns
    -------
    tuple
        ``(best_threshold, best_score)``; on ties the lowest threshold wins.
    """
    candidates = np.linspace(0.01, 0.99, 99)
    scores = [
        metric(y_true, (y_prob >= cut).astype(int), average=average, zero_division=0)
        for cut in candidates
    ]

    winner = np.argmax(scores)
    return candidates[winner], scores[winner]

def compute_auprc_multilabel(y_true, y_prob):
    """Average precision (AUPRC) for each label column plus their unweighted mean.

    Both inputs are indexed as ``[:, i]``, i.e. one column per label.

    Returns
    -------
    dict
        ``"per_label"``: array with one average-precision value per column;
        ``"macro"``: mean of those values.
    """
    per_label = [
        average_precision_score(y_true[:, col], y_prob[:, col])
        for col in range(y_true.shape[1])
    ]
    return {"per_label": np.array(per_label), "macro": np.mean(per_label)}

def compute_group_auprc(y_true, y_prob, sensitive_attr):
    """Per-label average precision computed separately within each group.

    Returns a dict mapping every unique value of ``sensitive_attr`` to the
    un-averaged (``average=None``) output of ``average_precision_score`` for
    that group's rows, with NaN replaced by 0.0.
    """
    per_group = {}

    for group_value in np.unique(sensitive_attr):
        in_group = sensitive_attr == group_value
        # average=None keeps one score per label instead of collapsing them
        scores = average_precision_score(y_true[in_group], y_prob[in_group], average=None)
        per_group[group_value] = np.nan_to_num(scores, nan=0.0)

    return per_group

def compute_group_auprc_table(y_true, y_prob, sensitive_attr):
    """Build a per-group average-precision (AUPRC) table.

    One row is produced per unique value of ``sensitive_attr``. Per-label
    scores go to columns ``auprc_label_0``, ``auprc_label_1``, ...; a scalar
    result goes to a single ``auprc`` column. NaN scores become 0.0.

    Returns
    -------
    pandas.DataFrame
        Sorted by ``group`` with an extra ``mean_auprc`` column holding the row
        mean over all AUPRC columns.
    """
    records = []

    for group_value in np.unique(sensitive_attr):
        in_group = (sensitive_attr == group_value)
        scores = np.nan_to_num(
            # average=None keeps one score per label
            average_precision_score(y_true[in_group], y_prob[in_group], average=None),
            nan=0.0,
        )

        record = {"group": group_value}
        if np.ndim(scores) > 0:
            record.update(
                {f"auprc_label_{pos}": float(value) for pos, value in enumerate(scores)}
            )
        else:
            record["auprc"] = float(scores)
        records.append(record)

    table = pd.DataFrame(records).sort_values("group").reset_index(drop=True)

    metric_columns = [name for name in table.columns if name.startswith("auprc")]
    table["mean_auprc"] = table[metric_columns].mean(axis=1)

    return table



In [4]:
# Load the baseline (Deep AUC) predictions and print a schema summary.
results_data = pd.read_csv(results_Deep_AUC_file_path)
results_data.info()

# Load the predictions of the two FIS variants (sex- and age-based).
results_data_sex = pd.read_csv(results_sex_group_file_path)
results_data_age = pd.read_csv(results_age_group_file_path)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age_group         234 non-null    object 
 1   Sex               234 non-null    object 
 2   Age               234 non-null    int64  
 3   Cardiomegaly      234 non-null    float64
 4   Pneumonia         234 non-null    float64
 5   Pleural Effusion  234 non-null    float64
 6   Fracture          234 non-null    float64
 7   No Finding        234 non-null    float64
 8   Cd                234 non-null    float64
 9   Pa                234 non-null    float64
 10  Ef                234 non-null    float64
 11  Fr                234 non-null    float64
 12  NF                234 non-null    float64
dtypes: float64(10), int64(1), object(2)
memory usage: 23.9+ KB


In [5]:
# Preview the first rows of the baseline results: ground-truth columns use the
# full observation names, prediction columns use the abbreviations.
results_data.head()

Unnamed: 0,age_group,Sex,Age,Cardiomegaly,Pneumonia,Pleural Effusion,Fracture,No Finding,Cd,Pa,Ef,Fr,NF
0,15–19,Female,19,0.0,0.0,0.0,0.0,1.0,0.030579,0.412787,0.017696,0.836949,0.762922
1,15–19,Female,18,0.0,0.0,0.0,0.0,0.0,0.446477,0.405428,0.017631,0.648013,0.059345
2,15–19,Female,18,0.0,0.0,0.0,0.0,0.0,0.912972,0.123558,0.054164,0.133802,0.094939
3,15–19,Male,19,0.0,0.0,0.0,0.0,0.0,0.546239,0.332704,0.223092,0.022522,0.081227
4,20–24,Female,23,0.0,0.0,0.0,0.0,1.0,0.012227,0.304598,0.01157,0.825408,0.803511


In [6]:
# Row counts per value of each sensitive attribute in the baseline results.
for attribute_column in ('Sex', 'age_group'):
    df_calc = results_data.groupby(attribute_column)[[attribute_column]].count()
    print(df_calc)

        Sex
Sex        
Female  106
Male    128
           age_group
age_group           
15–19              4
20–24              9
25–29              5
30–34              9
35–39              7
40–44              6
45–49             21
50–54             13
55–59             29
60–64             25
65–69             23
70–74             23
75–79             18
80–84             18
85–89             13
90–94             11


In [7]:
# Baseline (Deep AUC) model: ground truth, predicted scores and the two
# sensitive attributes, all row-aligned. These globals feed every later cell.
y_true = np.array(results_data[labels_small])
y_pred = np.array(results_data[labels_abbr_small])
sex_attr_baseline = np.array(results_data['Sex'])
age_attr_baseline = np.array(results_data['age_group'])

# Overall ROC AUC, one value per label in `labels_small` order
print('AUC ROC score for the Deep AUC model:')
print(auc_roc_score(y_true, y_pred))

# Same arrays for the model trained with FIS on the sex attribute.
y_true_sex = np.array(results_data_sex[labels_small])
y_pred_sex = np.array(results_data_sex[labels_abbr_small])
sex_attr_sex = np.array(results_data_sex['Sex'])
age_attr_sex = np.array(results_data_sex['age_group'])

# Overall ROC AUC, one value per label in `labels_small` order
print('AUC ROC score for the Deep AUC nd FIS model, Sex attribute:')
print(auc_roc_score(y_true_sex, y_pred_sex))


# Same arrays for the model trained with FIS on the age attribute.
y_true_age = np.array(results_data_age[labels_small])
y_pred_age = np.array(results_data_age[labels_abbr_small])
sex_attr_age = np.array(results_data_age['Sex'])
age_attr_age = np.array(results_data_age['age_group'])

# Overall ROC AUC, one value per label in `labels_small` order
print('AUC ROC score for the Deep AUC nd FIS model, Age attribute:')
print(auc_roc_score(y_true_age, y_pred_age))


AUC ROC score for the Deep AUC model:
[0.7668320340184267, 0.8722345132743363, 0.9159889176870141, nan, 0.8913802363050484]
AUC ROC score for the Deep AUC nd FIS model, Sex attribute:
[0.7711729270021261, 0.8960176991150441, 0.9029403878809545, nan, 0.9081632653061225]
AUC ROC score for the Deep AUC nd FIS model, Age attribute:
[0.7786144578313253, 0.8478982300884956, 0.8936455447314327, nan, 0.8988990332975295]


In [8]:
# Per-group ROC AUC tables for each model (baseline, FIS-sex, FIS-age) and each
# sensitive attribute (sex, age group). Variable names read
# df_group_auc_<model>_<attribute>; print any of them to inspect it inline.
df_group_auc_baseline_sex = compute_group_auc_table(y_true, y_pred, sex_attr_baseline)
df_group_auc_baseline_age = compute_group_auc_table(y_true, y_pred, age_attr_baseline)

df_group_auc_sex_sex = compute_group_auc_table(y_true_sex, y_pred_sex, sex_attr_sex)
df_group_auc_sex_age = compute_group_auc_table(y_true_sex, y_pred_sex, age_attr_sex)

df_group_auc_age_sex = compute_group_auc_table(y_true_age, y_pred_age, sex_attr_age)
df_group_auc_age_age = compute_group_auc_table(y_true_age, y_pred_age, age_attr_age)

output_path = f"{results_path}results_group_auc.xlsx"

# Sheet names read <attribute>_<model>; insertion order fixes the sheet order.
group_auc_sheets = {
    "Sex_baseline": df_group_auc_baseline_sex,
    "Age_baseline": df_group_auc_baseline_age,
    "Sex_sex": df_group_auc_sex_sex,
    "Age_sex": df_group_auc_sex_age,
    "Sex_age": df_group_auc_age_sex,
    "Age_age": df_group_auc_age_age,
}

with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    for sheet_title, sheet_frame in group_auc_sheets.items():
        sheet_frame.to_excel(writer, sheet_name=sheet_title, index=False)



In [9]:
# Overall AUPRC per label for each of the three models.
auprc_reports = (
    ('AUPRC score for the Deep AUC model:', y_true, y_pred),
    ('AUPRC score for the Deep AUC nd FIS model, Sex attribute:', y_true_sex, y_pred_sex),
    ('AUPRC score for the Deep AUC nd FIS model, Age attribute:', y_true_age, y_pred_age),
)

for heading, truth, scores in auprc_reports:
    print(heading)
    auprc_results = compute_auprc_multilabel(truth, scores)
    print(auprc_results["per_label"])

AUPRC score for the Deep AUC model:
[0.56572905 0.2659063  0.84178137 0.         0.58865502]
AUPRC score for the Deep AUC nd FIS model, Sex attribute:
[0.55098741 0.35628319 0.79362947 0.         0.58714688]
AUPRC score for the Deep AUC nd FIS model, Age attribute:
[0.56208603 0.3370205  0.77792984 0.         0.59461563]


In [10]:
# Per-group AUPRC tables for each model (baseline, FIS-sex, FIS-age) and each
# sensitive attribute. Variable names read df_group_auprc_<model>_<attribute>.
df_group_auprc_baseline_sex = compute_group_auprc_table(y_true, y_pred, sex_attr_baseline)
df_group_auprc_baseline_age = compute_group_auprc_table(y_true, y_pred, age_attr_baseline)

df_group_auprc_sex_sex = compute_group_auprc_table(y_true_sex, y_pred_sex, sex_attr_sex)
df_group_auprc_sex_age = compute_group_auprc_table(y_true_sex, y_pred_sex, age_attr_sex)

df_group_auprc_age_sex = compute_group_auprc_table(y_true_age, y_pred_age, sex_attr_age)
df_group_auprc_age_age = compute_group_auprc_table(y_true_age, y_pred_age, age_attr_age)

output_path = f"{results_path}results_group_auprc.xlsx"

# Sheet names read <attribute>_<model>; insertion order fixes the sheet order.
group_auprc_sheets = {
    "Sex_baseline": df_group_auprc_baseline_sex,
    "Age_baseline": df_group_auprc_baseline_age,
    "Sex_sex": df_group_auprc_sex_sex,
    "Age_sex": df_group_auprc_sex_age,
    "Sex_age": df_group_auprc_age_sex,
    "Age_age": df_group_auprc_age_age,
}

with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    for sheet_title, sheet_frame in group_auprc_sheets.items():
        sheet_frame.to_excel(writer, sheet_name=sheet_title, index=False)

In [11]:
# Tune one decision threshold (shared by all labels) on the baseline model by
# maximising macro-F1.
# NOTE(review): the threshold is selected on the same rows that are evaluated
# below, so the downstream metrics are optimistic.
best_t, best_f1 = find_best_threshold_multilabel(y_true, y_pred, average="macro")

print("Best threshold:", best_t)
print("Best F1:", best_f1)

# Binarize all three models with the tuned threshold. Previously the tuned
# value was ignored and the function's hard-coded default (0.6) was used, which
# only coincided with `best_t` for this particular data set. The baseline-tuned
# threshold is applied to every model so their decisions stay comparable.
y_pred_binary = prob_to_binary_multilabel(y_pred, threshold=best_t)
y_pred_sex_binary = prob_to_binary_multilabel(y_pred_sex, threshold=best_t)
y_pred_age_binary = prob_to_binary_multilabel(y_pred_age, threshold=best_t)


Best threshold: 0.6
Best F1: 0.4186810825108697


In [12]:
# Equalized Odds Metric

from itertools import combinations # Generates all unordered pairs of elements from groups

def compute_group_rates_multiclass(y_true, y_pred, classes):
    """Macro-averaged one-vs-rest TPR and FPR over ``classes``.

    For each class ``c`` the samples with ``y_true == c`` are the positives and
    all others the negatives. A rate whose denominator is empty is taken as
    0.0 to avoid division by zero.

    Returns
    -------
    tuple of float
        ``(mean TPR, mean FPR)`` across the given classes.
    """
    per_class_tpr = []
    per_class_fpr = []

    for cls in classes:
        is_cls = (y_true == cls)
        not_cls = (y_true != cls)
        predicted_cls = (y_pred == cls)

        n_pos = np.sum(is_cls)
        n_neg = np.sum(not_cls)

        # true positives / positives, and false positives / negatives
        per_class_tpr.append(np.sum(predicted_cls & is_cls) / n_pos if n_pos != 0 else 0.0)
        per_class_fpr.append(np.sum(predicted_cls & not_cls) / n_neg if n_neg != 0 else 0.0)

    return float(np.mean(per_class_tpr)), float(np.mean(per_class_fpr))


def equalized_odds_metric(y_test, y_pred, attribute, normalize=True, return_group_rates=True):
    """Worst-case pairwise equalized-odds gap between groups.

    For every unique value of ``attribute`` the macro-averaged TPR and FPR
    (over the unique values of ``y_test``) are computed with
    ``compute_group_rates_multiclass``. The gap between two groups is
    ``|TPR_a - TPR_b| + |FPR_a - FPR_b|``; the score is the largest gap over
    all unordered group pairs, or 0.0 when there are fewer than two groups.

    Parameters
    ----------
    normalize : bool
        Divide the score by 2 (the largest possible gap) to map it to [0, 1].
    return_group_rates : bool
        If True, also return ``{group: {"TPR": ..., "FPR": ...}}``.
    """
    groups = np.unique(attribute)
    classes = np.unique(y_test)

    group_rates = {}
    for group in groups:
        members = (attribute == group)
        macro_tpr, macro_fpr = compute_group_rates_multiclass(
            y_test[members], y_pred[members], classes
        )
        group_rates[group] = {"TPR": macro_tpr, "FPR": macro_fpr}

    gaps = [
        abs(group_rates[first]["TPR"] - group_rates[second]["TPR"])
        + abs(group_rates[first]["FPR"] - group_rates[second]["FPR"])
        for first, second in combinations(groups, 2)
    ]

    eo_score = max(gaps) if gaps else 0.0
    if normalize:
        eo_score /= 2.0

    if not return_group_rates:
        return eo_score
    return eo_score, group_rates


def _positive_class_rates(y_true, y_pred, positive=1):
    """Return ``(TPR, FPR)`` of the ``positive`` class for one group's samples.

    A rate whose denominator is empty (no positives / no negatives in the
    group) is reported as 0.0 to avoid division by zero.
    """
    is_pos = (y_true == positive)
    is_neg = ~is_pos
    predicted_pos = (y_pred == positive)

    n_pos = np.sum(is_pos)
    n_neg = np.sum(is_neg)

    tpr = float(np.sum(predicted_pos & is_pos) / n_pos) if n_pos else 0.0
    fpr = float(np.sum(predicted_pos & is_neg) / n_neg) if n_neg else 0.0
    return tpr, fpr


def equalized_odds_metric_multilabel( y_test, y_pred, attribute, labels, normalize=True, return_group_rates=True):
    """Per-label equalized-odds gap across the groups of a sensitive attribute.

    Column ``x`` of ``y_test`` / ``y_pred`` holds the 0/1 ground truth and 0/1
    decisions for ``labels[x]``. For each label and each group the TPR and FPR
    of the positive class (value 1) are computed; the label's score is the
    largest ``|TPR_a - TPR_b| + |FPR_a - FPR_b|`` over all unordered group
    pairs (0.0 with fewer than two groups).

    Fix: the rates used to be macro-averaged over every value found in
    ``y_test`` (0 and 1). For a binary label that turns the "TPR" column into
    (TPR + TNR) / 2 and the "FPR" column into its complement, so neither column
    was the rate it is named after. The positive-class rates are now reported.

    Parameters
    ----------
    y_test, y_pred : 2-D arrays, one column per entry of ``labels``.
    attribute : 1-D array giving the group of each row.
    labels : sequence of label names used in the ``Label`` column.
    normalize : bool
        Divide the score by 2 (the largest possible gap) to map it to [0, 1].
    return_group_rates : bool
        If True, emit one row per (label, group) with ``TPR`` and ``FPR``;
        otherwise one row per label with only the score.

    Returns
    -------
    pandas.DataFrame in long format.
    """
    groups = np.unique(attribute)
    # Group membership does not depend on the label, so build the masks once.
    group_masks = {g: (attribute == g) for g in groups}

    rows = []

    for x, label_name in enumerate(labels):
        y_test_label = y_test[:, x]
        y_pred_label = y_pred[:, x]

        group_rates = {
            g: _positive_class_rates(y_test_label[mask], y_pred_label[mask])
            for g, mask in group_masks.items()
        }

        # Equalized-odds distance for this label: worst pair of groups.
        distances = [
            abs(group_rates[g1][0] - group_rates[g2][0])
            + abs(group_rates[g1][1] - group_rates[g2][1])
            for g1, g2 in combinations(groups, 2)
        ]

        eo_score = max(distances) if distances else 0.0
        if normalize:
            eo_score /= 2.0

        if return_group_rates:
            for g in groups:
                rows.append({
                    "Label": label_name,
                    "Group": g,
                    "TPR": group_rates[g][0],
                    "FPR": group_rates[g][1],
                    "Equalized Odds": eo_score
                })
        else:
            rows.append({
                "Label": label_name,
                "Equalized Odds": eo_score
            })

    df = pd.DataFrame(rows).reset_index(drop=True)
    return df



In [13]:
# Custom equalized-odds tables for each model (baseline, FIS-sex, FIS-age) and
# each sensitive attribute, computed on the binarized predictions. Variable
# names read df_group_eo_scores_<model>_<attribute>.
df_group_eo_scores_baseline_sex = equalized_odds_metric_multilabel(y_true, y_pred_binary, sex_attr_baseline, labels_abbr_small)
df_group_eo_scores_baseline_age = equalized_odds_metric_multilabel(y_true, y_pred_binary, age_attr_baseline, labels_abbr_small)

df_group_eo_scores_sex_sex = equalized_odds_metric_multilabel(y_true_sex, y_pred_sex_binary, sex_attr_sex, labels_abbr_small)
df_group_eo_scores_sex_age = equalized_odds_metric_multilabel(y_true_sex, y_pred_sex_binary, age_attr_sex, labels_abbr_small)

df_group_eo_scores_age_sex = equalized_odds_metric_multilabel(y_true_age, y_pred_age_binary, sex_attr_age, labels_abbr_small)
df_group_eo_scores_age_age = equalized_odds_metric_multilabel(y_true_age, y_pred_age_binary, age_attr_age, labels_abbr_small)

output_path = f"{results_path}results_group_odds.xlsx"

# Sheet names read <attribute>_<model>; insertion order fixes the sheet order.
group_eo_sheets = {
    "Sex_baseline": df_group_eo_scores_baseline_sex,
    "Age_baseline": df_group_eo_scores_baseline_age,
    "Sex_sex": df_group_eo_scores_sex_sex,
    "Age_sex": df_group_eo_scores_sex_age,
    "Sex_age": df_group_eo_scores_age_sex,
    "Age_age": df_group_eo_scores_age_age,
}

with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    for sheet_title, sheet_frame in group_eo_sheets.items():
        sheet_frame.to_excel(writer, sheet_name=sheet_title, index=False)

In [14]:
# Equalized Odds Metric, Fairlearn (Microsoft)

from fairlearn.metrics import (equalized_odds_difference, equalized_odds_ratio)

# Fairlearn equalized-odds difference and ratio per label, with sex as the
# sensitive feature. The three models share one loop instead of three
# copy-pasted ones; the printed output is unchanged.
# Each entry: (section header, ground truth, binarized predictions, sex attribute).
sex_eo_runs = [
    ('Baseline model----------------------------------------------------', y_true, y_pred_binary, sex_attr_baseline),
    ('FIS Sex model----------------------------------------------------', y_true_sex, y_pred_sex_binary, sex_attr_sex),
    ('FIS Age model----------------------------------------------------', y_true_age, y_pred_age_binary, sex_attr_age),
]

print('SEX attribute')
for section_header, y_true_model, y_pred_model, group_attr in sex_eo_runs:
    print(section_header)
    for x, label_abbr in enumerate(labels_abbr_small):

        # Column x holds the ground truth / decisions of one label.
        y_true_val = y_true_model[:, x]
        y_pred_val = y_pred_model[:, x]

        eo_diff = equalized_odds_difference(y_true_val, y_pred_val, sensitive_features=group_attr)
        eo_ratio = equalized_odds_ratio(y_true_val, y_pred_val, sensitive_features=group_attr)

        print(label_abbr+",", "Equalized Odds Difference:", eo_diff)
        print("Odds Ratio:", eo_ratio)

SEX attribute
Baseline model----------------------------------------------------
Cd, Equalized Odds Difference: 0.07516483516483516
Odds Ratio: 0.6868131868131868
Pa, Equalized Odds Difference: 0.25
Odds Ratio: 0.75
Ef, Equalized Odds Difference: 0.09193245778611625
Odds Ratio: 0.8700265251989391
Fr, Equalized Odds Difference: 0.04613797169811318
Odds Ratio: nan
NF, Equalized Odds Difference: 0.08864468864468865
Odds Ratio: 0.5769230769230769
FIS Sex model----------------------------------------------------
Cd, Equalized Odds Difference: 0.17010989010989014
Odds Ratio: 0.39246467817896385
Pa, Equalized Odds Difference: 0.03874130297280201
Odds Ratio: 0.696029776674938
Ef, Equalized Odds Difference: 0.17260787992495308
Odds Ratio: 0.6372549019607843
Fr, Equalized Odds Difference: 0.009581367924528295
Odds Ratio: nan
NF, Equalized Odds Difference: 0.062271062271062265
Odds Ratio: 0.3461538461538462
FIS Age model----------------------------------------------------
Cd, Equalized Odds Diffe

In [15]:
# Equalized Odds Metric, Fairlearn (Microsoft)

# Fairlearn equalized-odds difference and ratio per label, with age group as
# the sensitive feature. The three models share one loop instead of three
# copy-pasted ones; the printed output is unchanged.
# Each entry: (section header, ground truth, binarized predictions, age-group attribute).
age_eo_runs = [
    ('Baseline model----------------------------------------------------', y_true, y_pred_binary, age_attr_baseline),
    ('FIS Sex model----------------------------------------------------', y_true_sex, y_pred_sex_binary, age_attr_sex),
    ('FIS Age model----------------------------------------------------', y_true_age, y_pred_age_binary, age_attr_age),
]

print('AGE group attribute')
for section_header, y_true_model, y_pred_model, group_attr in age_eo_runs:
    print(section_header)
    for x, label_abbr in enumerate(labels_abbr_small):

        # Column x holds the ground truth / decisions of one label.
        y_true_val = y_true_model[:, x]
        y_pred_val = y_pred_model[:, x]

        eo_diff = equalized_odds_difference(y_true_val, y_pred_val, sensitive_features=group_attr)
        eo_ratio = equalized_odds_ratio(y_true_val, y_pred_val, sensitive_features=group_attr)

        print(label_abbr+",", "Equalized Odds Difference:", eo_diff)
        print("Odds Ratio:", eo_ratio)

AGE group attribute
Baseline model----------------------------------------------------
Cd, Equalized Odds Difference: 1.0
Odds Ratio: 0.0
Pa, Equalized Odds Difference: 1.0
Odds Ratio: 0.0
Ef, Equalized Odds Difference: 1.0
Odds Ratio: 0.0
Fr, Equalized Odds Difference: 0.44871794871794873
Odds Ratio: nan
NF, Equalized Odds Difference: 1.0
Odds Ratio: 0.0
FIS Sex model----------------------------------------------------
Cd, Equalized Odds Difference: 1.0
Odds Ratio: 0.0
Pa, Equalized Odds Difference: 1.0
Odds Ratio: 0.0
Ef, Equalized Odds Difference: 1.0
Odds Ratio: 0.0
Fr, Equalized Odds Difference: 0.5
Odds Ratio: nan
NF, Equalized Odds Difference: 1.0
Odds Ratio: 0.0
FIS Age model----------------------------------------------------
Cd, Equalized Odds Difference: 0.75
Odds Ratio: 0.0
Pa, Equalized Odds Difference: 1.0
Odds Ratio: 0.0
Ef, Equalized Odds Difference: 1.0
Odds Ratio: 0.0
Fr, Equalized Odds Difference: 0.5555555555555556
Odds Ratio: nan
NF, Equalized Odds Difference: 1.0


In [16]:
# Demographic Parity Difference, Fairlearn (Microsoft)

from fairlearn.metrics import demographic_parity_difference, selection_rate, MetricFrame

# Y_test: true labels (not actually needed for SPD)
# Y_pred: predicted labels (0/1 recommended)
# sex_attr_test: sensitive attribute array (e.g., 0 = male, 1 = female)

# spd = demographic_parity_difference(
#     y_true=Y_test,             # accepted but not used in SPD calculation
#     y_pred=Y_pred,             # predicted labels
#     sensitive_features=sex_attr_test
# )

# print("Statistical Parity Difference:", spd)

# mf = MetricFrame(
#     metrics=selection_rate,      # P(Y_pred = 1)
#     y_true=Y_test,
#     y_pred=Y_pred,
#     sensitive_features=sex_attr_test
# )

# print("Selection rate per group:")
# print(mf.by_group)

# print("Statistical Parity Difference:", mf.difference())

# Fairlearn demographic (statistical) parity difference per label, with sex as
# the sensitive feature, plus the per-group selection rates behind it. The
# three models share one loop instead of three copy-pasted ones; the printed
# output is unchanged.
# Each entry: (section header, ground truth, binarized predictions, sex attribute).
sex_spd_runs = [
    ('Baseline model----------------------------------------------------', y_true, y_pred_binary, sex_attr_baseline),
    ('FIS Sex model----------------------------------------------------', y_true_sex, y_pred_sex_binary, sex_attr_sex),
    ('FIS Age model----------------------------------------------------', y_true_age, y_pred_age_binary, sex_attr_age),
]

print('SEX attribute')
for section_header, y_true_model, y_pred_model, group_attr in sex_spd_runs:
    print(section_header)
    for x, label_abbr in enumerate(labels_abbr_small):

        # Column x holds the ground truth / decisions of one label.
        y_true_val = y_true_model[:, x]
        y_pred_val = y_pred_model[:, x]

        spd = demographic_parity_difference(y_true_val, y_pred_val, sensitive_features=group_attr)

        mf = MetricFrame(
            metrics=selection_rate,      # P(Y_pred = 1)
            y_true=y_true_val,
            y_pred=y_pred_val,
            sensitive_features=group_attr
        )

        print(label_abbr+",", "Statistical Parity Difference:", spd)
        print("Selection rate per group:")
        print(mf.by_group)
        # Same quantity again, derived from the MetricFrame as a cross-check.
        print(label_abbr+",", "Statistical Parity Difference:", mf.difference())


SEX attribute
Baseline model----------------------------------------------------
Cd, Statistical Parity Difference: 0.033313679245283
Selection rate per group:
sensitive_feature_0
Female    0.330189
Male      0.296875
Name: selection_rate, dtype: float64
Cd, Statistical Parity Difference: 0.033313679245283
Pa, Statistical Parity Difference: 0.038325471698113234
Selection rate per group:
sensitive_feature_0
Female    0.367925
Male      0.406250
Name: selection_rate, dtype: float64
Pa, Statistical Parity Difference: 0.038325471698113234
Ef, Statistical Parity Difference: 0.13782429245283018
Selection rate per group:
sensitive_feature_0
Female    0.301887
Male      0.164062
Name: selection_rate, dtype: float64
Ef, Statistical Parity Difference: 0.13782429245283018
Fr, Statistical Parity Difference: 0.04613797169811318
Selection rate per group:
sensitive_feature_0
Female    0.632075
Male      0.585938
Name: selection_rate, dtype: float64
Fr, Statistical Parity Difference: 0.046137971698113

In [17]:
# Demographic Parity Difference, Fairlearn (Microsoft)

# Fairlearn demographic (statistical) parity difference per label, with age
# group as the sensitive feature. The three models share one loop instead of
# three copy-pasted ones; the printed output is unchanged.
# Each entry: (section header, ground truth, binarized predictions, age-group attribute).
age_spd_runs = [
    ('Baseline model----------------------------------------------------', y_true, y_pred_binary, age_attr_baseline),
    ('FIS Sex model----------------------------------------------------', y_true_sex, y_pred_sex_binary, age_attr_sex),
    ('FIS Age model----------------------------------------------------', y_true_age, y_pred_age_binary, age_attr_age),
]

print('AGE attribute')
for section_header, y_true_model, y_pred_model, group_attr in age_spd_runs:
    print(section_header)
    for x, label_abbr in enumerate(labels_abbr_small):

        # Column x holds the ground truth / decisions of one label.
        y_true_val = y_true_model[:, x]
        y_pred_val = y_pred_model[:, x]

        spd = demographic_parity_difference(y_true_val, y_pred_val, sensitive_features=group_attr)

        print(label_abbr+",", "Statistical Parity Difference:", spd)
    

AGE attribute
Baseline model----------------------------------------------------
Cd, Statistical Parity Difference: 0.6923076923076923
Pa, Statistical Parity Difference: 0.5555555555555556
Ef, Statistical Parity Difference: 0.5384615384615384
Fr, Statistical Parity Difference: 0.44871794871794873
NF, Statistical Parity Difference: 0.6
FIS Sex model----------------------------------------------------
Cd, Statistical Parity Difference: 0.5
Pa, Statistical Parity Difference: 0.23076923076923078
Ef, Statistical Parity Difference: 0.6153846153846154
Fr, Statistical Parity Difference: 0.5
NF, Statistical Parity Difference: 0.6
FIS Age model----------------------------------------------------
Cd, Statistical Parity Difference: 0.5714285714285714
Pa, Statistical Parity Difference: 0.3076923076923077
Ef, Statistical Parity Difference: 0.5384615384615384
Fr, Statistical Parity Difference: 0.5555555555555556
NF, Statistical Parity Difference: 0.6
