# Task 3: Improving Fairness

###  **__ATTRIBUTE CHOSEN__: Gender

## Import libraries and functions

In [1]:
import pandas as pd
import plotly.graph_objs as go

# model: choose RF for its overall good performance
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# constants
from const import *

# udfs
from utils import *

## Load and Split Dataset

In [2]:
# load data
labeled_df_raw = pd.read_csv('../' + LABELED_DATA_PATH).drop('Unnamed: 0', axis=1)
labeled_y, labeled_X = labeled_df_raw['Risk'], labeled_df_raw.drop('Risk', axis=1)

# split data into training and validation subsets
train_X, val_X, train_y, val_y = train_test_split(labeled_X, labeled_y, test_size=VAL_RATIO, random_state=RANDOM_SEED)

# merge inputs and labels together for validation sets to better access within one dataframe
val_full = pd.merge(val_X, val_y, left_index=True, right_index=True)

## Baseline Model: with All Attributes Present

In [3]:
# Similarly using the anti-classification function with no attribute protected
val_full = anti_classification(
    'None', RandomForestClassifier(), 'RF', train_X, train_y, val_full, True
)

# obtain needed column names
pred_col = 'Risk_pred(protected=None, model=RF)'
true_col = 'Risk'

# obtain the confusion matrix on full results
print("\n\nConfusion Matrix: Nothing Protected (Baseline)\n")
_ = print_confusion_matrix(confusion_matrix(val_full[true_col], val_full[[pred_col]]))



Confusion Matrix: Nothing Protected (Baseline)

------------- Confusion Matrix (Count) --------------
                  Predictably Good    Predictably Bad
--------------  ------------------  -----------------
Factually Good                 117                 17
Factually Bad                   37                 29
-----------------------------------------------------


------------- Confusion Matrix (Ratio) --------------
                  Predictably Good    Predictably Bad
--------------  ------------------  -----------------
Factually Good               0.585              0.085
Factually Bad                0.185              0.145
-----------------------------------------------------


### Baseline Results

Accuracy, precision, recall and f1 score.

In [4]:
# print out the original results
baseline_acc = accuracy_score(val_full[true_col], val_full[pred_col])

baseline_precision, baseline_recall, baseline_f1, _ = precision_recall_fscore_support(
    val_full[true_col], val_full[pred_col], average='macro'
)

print(f"\nBaseline Results:")
print(f"Accuracy:  {baseline_acc:.4f}")
print(f"Precision: {baseline_precision:.4f}")
print(f"Recall:    {baseline_recall:.4f}")
print(f"F1 Score:  {baseline_f1:.4f}\n")


Baseline Results:
Accuracy:  0.7300
Precision: 0.6951
Recall:    0.6563
F1 Score:  0.6652



In [5]:
# obtain the original confusion matrix elements per-subgroup
tpr_sex, fpr_sex, fnr_sex, tnr_sex  = dict(), dict(), dict(), dict()
for gender in ('female', 'male'):
    print(f"SUBGROUP - {gender.upper()}:")
    tpr_sex[gender], fpr_sex[gender], fnr_sex[gender], tnr_sex[gender] = print_confusion_matrix(confusion_matrix(
        val_full[val_full['Sex']==gender][true_col], 
        val_full[val_full['Sex']==gender][pred_col]
    ))
    print("\n\n")

SUBGROUP - FEMALE:
------------- Confusion Matrix (Count) --------------
                  Predictably Good    Predictably Bad
--------------  ------------------  -----------------
Factually Good                  28                  5
Factually Bad                   12                 15
-----------------------------------------------------


------------- Confusion Matrix (Ratio) --------------
                  Predictably Good    Predictably Bad
--------------  ------------------  -----------------
Factually Good              0.4667             0.0833
Factually Bad               0.2                0.25
-----------------------------------------------------



SUBGROUP - MALE:
------------- Confusion Matrix (Count) --------------
                  Predictably Good    Predictably Bad
--------------  ------------------  -----------------
Factually Good                  89                 12
Factually Bad                   25                 14
-------------------------------------------

In [6]:
# visualize the results
fig = go.Figure()
fig.add_trace(go.Bar(
    x=list(tpr_sex.keys()),
    y=list(tpr_sex.values()),
    marker_color='#F3A291',
    opacity=0.75,
    name='true positive rate'
))
fig.add_trace(go.Bar(
    x=list(fpr_sex.keys()),
    y=list(fpr_sex.values()),
    marker_color='#F3CA91',
    opacity=0.75,
    name='false positive rate'
))
fig.add_trace(go.Bar(
    x=list(fnr_sex.keys()),
    y=list(fnr_sex.values()),
    marker_color='#CAF391',
    opacity=0.75,
    name='false negative rate'
))
fig.add_trace(go.Bar(
    x=list(tnr_sex.keys()),
    y=list(tnr_sex.values()),
    marker_color='#91F3D1',
    opacity=0.75,
    name='true negative rate'
))
fig.update_layout(
    title_text='Metrics within Confusion Matrix w.r.t. Gender Differences (Baseline)',
    xaxis_title='Sex',
    yaxis_title='Rate',
    width=1200,
    height=500,
)
fig.write_image('../results/separation-1-before.png', scale=2)
fig.show()

##  Attempts on Improvement

### 1. Excluding the Attribute (as is in anti-classification)

In [7]:
protected_attr = 'Sex'
val_full = anti_classification(
    protected_attr, RandomForestClassifier(), 'RF', train_X, train_y, val_full, True
)

### Results

In [8]:
# print out the new results
pred_col_m1 = 'Risk_pred(protected=Sex, model=RF)'

m1_acc = accuracy_score(val_full[true_col], val_full[pred_col_m1])

m1_precision, m1_recall, m1_f1, _ = precision_recall_fscore_support(
    val_full[true_col], val_full[pred_col_m1], average='macro'
)

print(f"\nResults on Method 1 - Anti-Classification:")
print(f"Accuracy:  {m1_acc:.4f}")
print(f"Precision: {m1_precision:.4f}")
print(f"Recall:    {m1_recall:.4f}")
print(f"F1 Score:  {m1_f1:.4f}\n")


Results on Method 1 - Anti-Classification:
Accuracy:  0.7150
Precision: 0.6734
Recall:    0.6528
F1 Score:  0.6591



### 2. Group Fairness: Revise Thresholds for Similar Predictably Positive Rates

In [9]:
# set the positive rate to be 0.6 for both genders
threshold = 0.6
thre_dict = dict()
prob_col = 'Risk_prob(protected=None, model=RF)'

for gender in ('female', 'male'):
    thre_dict[gender] = get_threshold('Sex', gender, val_full, prob_col, threshold)


print(f"\nProbability threshold to ensure the same probability of being predicted positive at {threshold * 100:.2f}%:")
print(thre_dict)


Probability threshold to ensure the same probability of being predicted positive at 60.00%:
{'female': 0.56, 'male': 0.66}


#### Build a udf to obtain the predicted label for different sub groups

In [10]:
pred_col_m2 = 'Risk_pred(protected=Sex, model=RF)_Group_Fairness'

def pred_with_diff_thre(thre_dict, df, attr_col, prob_col):
    return 'good' if df[prob_col] >= thre_dict[df[attr_col]] else 'bad'

val_full[pred_col_m2] = val_full.apply(lambda x: pred_with_diff_thre(thre_dict, x, 'Sex', prob_col), axis=1)

### Results

In [11]:
# print out the results
m2_acc = accuracy_score(val_full[true_col], val_full[pred_col_m2])

m2_precision, m2_recall, m2_f1, _ = precision_recall_fscore_support(
    val_full[true_col], val_full[pred_col_m2], average='macro'
)

print(f"\nResults on Method 2 - Group Fairness:")
print(f"Accuracy:  {m2_acc:.4f}")
print(f"Precision: {m2_precision:.4f}")
print(f"Recall:    {m2_recall:.4f}")
print(f"F1 Score:  {m2_f1:.4f}\n")


Results on Method 2 - Group Fairness:
Accuracy:  0.6750
Precision: 0.6457
Recall:    0.6575
F1 Score:  0.6484



We can also break the results down to subgroups and see which part is mal-performing.

In [12]:
# break down results
print(f"\nResults on Method 2 - Group Fairness (BREAKDOWN):\n")

for gender in thre_dict.keys():

    m2_acc_sub = accuracy_score(val_full[val_full['Sex'] == gender][true_col], val_full[val_full['Sex'] == gender][pred_col_m2])

    m2_precision_sub, m2_recall_sub, m2_f1_sub, _ = precision_recall_fscore_support(
        val_full[val_full['Sex'] == gender][true_col], val_full[val_full['Sex'] == gender][pred_col_m2], average='macro'
    )

    print(f"SUBGROUP - {gender.upper()}:")
    print(f"Accuracy:  {m2_acc_sub:.4f}")
    print(f"Precision: {m2_precision_sub:.4f}")
    print(f"Recall:    {m2_recall_sub:.4f}")
    print(f"F1 Score:  {m2_f1_sub:.4f}\n")


Results on Method 2 - Group Fairness (BREAKDOWN):

SUBGROUP - FEMALE:
Accuracy:  0.7000
Precision: 0.6992
Recall:    0.6902
F1 Score:  0.6914

SUBGROUP - MALE:
Accuracy:  0.6643
Precision: 0.6250
Recall:    0.6493
F1 Score:  0.6256



Lastly, we check if the thresholds really ensure close rates of positive predictions.

In [13]:
def obtain_pos_pred_rate(attr, attr_val, df, pred_col):
    return (
        len(df[(df[pred_col]=='good') & (df[attr]==attr_val)]) / 
        len(df[df[attr]==attr_val])
    )

print("Original baseline model:")
for gender in ('female', 'male'):
    ppr = obtain_pos_pred_rate(
        'Sex', gender, val_full, pred_col
    )
    print(f"The rate of positive predictions for subgroup {gender.upper()} is {ppr:.4f}.")

print("\n\nModified Model:")
for gender in ('female', 'male'):
    ppr = obtain_pos_pred_rate(
        'Sex', gender, val_full, pred_col_m2
    )
    print(f"The rate of positive predictions for subgroup {gender.upper()} is {ppr:.4f}.")

Original baseline model:
The rate of positive predictions for subgroup FEMALE is 0.6667.
The rate of positive predictions for subgroup MALE is 0.8143.


Modified Model:
The rate of positive predictions for subgroup FEMALE is 0.6167.
The rate of positive predictions for subgroup MALE is 0.6000.


### 3. Others: Dataset Augmentation

A naive way to augment the dataset is to duplicate the entire dataset with all the gender labels reseversed, based on the assumption that gender is not, or shall not be, impacting the credit scoring results.

In [14]:
# augment data
AUGMENTED_DATA_PATH = boost_dataset('../' + LABELED_DATA_PATH)

# load data
augmented_labeled_df_raw = pd.read_csv(AUGMENTED_DATA_PATH).drop('Unnamed: 0', axis=1) # already included '../' as the prefix
augmented_labeled_y, augmented_labeled_X = augmented_labeled_df_raw['Risk'], augmented_labeled_df_raw.drop('Risk', axis=1)

# split data into training and validation subsets
augmented_train_X, augmented_val_X, augmented_train_y, augmented_val_y = train_test_split(augmented_labeled_X, augmented_labeled_y, test_size=VAL_RATIO, random_state=RANDOM_SEED)

# merge inputs and labels together for validation sets to better access within one dataframe
augmented_val_full = pd.merge(augmented_val_X, augmented_val_y, left_index=True, right_index=True)

In [15]:
# visualize the current distribution of gender
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=labeled_df_raw[labeled_df_raw['Sex']=='female']['Risk'],
    marker_color='#F5DDD3',
    opacity=0.75,
    name='female_original'
))

fig.add_trace(go.Histogram(
    x=augmented_labeled_df_raw[augmented_labeled_df_raw['Sex']=='female']['Risk'],
    marker_color='#F8C6AB',
    opacity=0.75,
    name='female_augmented'
))

fig.add_trace(go.Histogram(
    x=labeled_df_raw[labeled_df_raw['Sex']=='male']['Risk'],
    marker_color='#C9DDE4',
    opacity=0.75,
    name='male_original'
))

fig.add_trace(go.Histogram(
    x=augmented_labeled_df_raw[augmented_labeled_df_raw['Sex']=='male']['Risk'],
    marker_color='#ABE3F8',
    opacity=0.75,
    name='male_augmented'
))


fig.update_layout(
    title_text='Distribution of Good/Bad Credit across Gender (Entire Dataset)',
    xaxis_title='Credit',
    yaxis_title='Count'
)

fig.write_image('../results/new-distribution-gender.png', scale=2)
fig.show()

In [16]:
# train the model
augmented_val_full = anti_classification(
    'None', RandomForestClassifier(), 'RF', augmented_train_X, augmented_train_y, augmented_val_full, True
)

### Results

In [17]:
# obtain needed column names
pred_col_m3 = 'Risk_pred(protected=None, model=RF)'
true_col = 'Risk'

# obtain the confusion matrix on full results
print("\n\nConfusion Matrix on Method 3 - Data Augmentation:\n")
_ = print_confusion_matrix(confusion_matrix(augmented_val_full[true_col], augmented_val_full[pred_col_m3]))

print("\n\nConfusion Matrix on Method 3: BREAKDOWN")
tpr_sex, fpr_sex, fnr_sex, tnr_sex  = dict(), dict(), dict(), dict()
for gender in ('female', 'male'):
    print(f"SUBGROUP - {gender.upper()}:")
    tpr_sex[gender], fpr_sex[gender], fnr_sex[gender], tnr_sex[gender] = print_confusion_matrix(confusion_matrix(
        augmented_val_full[augmented_val_full['Sex']==gender][true_col], 
        augmented_val_full[augmented_val_full['Sex']==gender][pred_col_m3]
    ))
    print("\n\n")



Confusion Matrix on Method 3 - Data Augmentation:

------------- Confusion Matrix (Count) --------------
                  Predictably Good    Predictably Bad
--------------  ------------------  -----------------
Factually Good                 266                 17
Factually Bad                   41                 76
-----------------------------------------------------


------------- Confusion Matrix (Ratio) --------------
                  Predictably Good    Predictably Bad
--------------  ------------------  -----------------
Factually Good              0.665              0.0425
Factually Bad               0.1025             0.19
-----------------------------------------------------


Confusion Matrix on Method 3: BREAKDOWN
SUBGROUP - FEMALE:
------------- Confusion Matrix (Count) --------------
                  Predictably Good    Predictably Bad
--------------  ------------------  -----------------
Factually Good                 133                  7
Factually Bad         

In [18]:
# print out the results
m3_acc = accuracy_score(augmented_val_full[true_col], augmented_val_full[pred_col_m3])

m3_precision, m3_recall, m3_f1, _ = precision_recall_fscore_support(
    augmented_val_full[true_col], augmented_val_full[pred_col_m3], average='macro'
)

print(f"\nResults on Method 3 - Data Augmentation:")
print(f"Accuracy:  {m3_acc:.4f}")
print(f"Precision: {m3_precision:.4f}")
print(f"Recall:    {m3_recall:.4f}")
print(f"F1 Score:  {m3_f1:.4f}\n")

# break down results
print(f"\nResults on Method 3 - Data Augmentation (BREAKDOWN):")

for gender in thre_dict.keys():

    m3_acc_sub = accuracy_score(
        augmented_val_full[augmented_val_full['Sex'] == gender][true_col], 
        augmented_val_full[augmented_val_full['Sex'] == gender][pred_col_m3]
    )

    m3_precision_sub, m3_recall_sub, m3_f1_sub, _ = precision_recall_fscore_support(
        augmented_val_full[augmented_val_full['Sex'] == gender][true_col], 
        augmented_val_full[augmented_val_full['Sex'] == gender][pred_col_m3], average='macro'
    )

    print(f"SUBGROUP - {gender.upper()}:")
    print(f"Accuracy:  {m3_acc_sub:.4f}")
    print(f"Precision: {m3_precision_sub:.4f}")
    print(f"Recall:    {m3_recall_sub:.4f}")
    print(f"F1 Score:  {m3_f1_sub:.4f}\n")


Results on Method 3 - Data Augmentation:
Accuracy:  0.8550
Precision: 0.8418
Recall:    0.7948
F1 Score:  0.8128


Results on Method 3 - Data Augmentation (BREAKDOWN):
SUBGROUP - FEMALE:
Accuracy:  0.8700
Precision: 0.8646
Recall:    0.8167
F1 Score:  0.8351

SUBGROUP - MALE:
Accuracy:  0.8400
Precision: 0.8179
Recall:    0.7721
F1 Score:  0.7894



#### Fairness Measure

In [19]:
# visualize the results
fig = go.Figure()
fig.add_trace(go.Bar(
    x=list(tpr_sex.keys()),
    y=list(tpr_sex.values()),
    marker_color='#F3A291',
    opacity=0.75,
    name='true positive rate'
))
fig.add_trace(go.Bar(
    x=list(fpr_sex.keys()),
    y=list(fpr_sex.values()),
    marker_color='#F3CA91',
    opacity=0.75,
    name='false positive rate'
))
fig.add_trace(go.Bar(
    x=list(fnr_sex.keys()),
    y=list(fnr_sex.values()),
    marker_color='#CAF391',
    opacity=0.75,
    name='false negative rate'
))
fig.add_trace(go.Bar(
    x=list(tnr_sex.keys()),
    y=list(tnr_sex.values()),
    marker_color='#91F3D1',
    opacity=0.75,
    name='true negative rate'
))
fig.update_layout(
    title_text='Metrics within Confusion Matrix w.r.t. Gender Differences (Data Augmented)',
    xaxis_title='Sex',
    yaxis_title='Rate',
    width=1200,
    height=500,
)
fig.write_image('../results/separation-2-after.png', scale=2)
fig.show()