# Step 4

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


1) Randomly split your original dataset into training and testing datasets

In [2]:
raw_data = pd.read_csv("Data/bank.csv", delimiter=";")
raw_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [3]:
fairness_data = raw_data[['age', 'marital', 'y']]
fairness_data['age'] = fairness_data['age'].apply(lambda x: 'old' if x > 45 else 'young')
fairness_data['marital'] = fairness_data['marital'].apply(lambda x: 'married' if x == 'married' else 'not married')
print(fairness_data.head())

#combine the priviledge/unpriviledge group into raw data to expedite later calculation
raw_data['age_p'] = fairness_data['age'].map({'old':0,'young':1})
raw_data['marital_p'] = fairness_data['marital'].map({'married':0,'not married':1})
print(raw_data.head())


     age      marital   y
0  young      married  no
1  young      married  no
2  young  not married  no
3  young      married  no
4    old      married  no
   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  \
0  cellular   19   oct        79         1     -1         0  unknown  no   
1  cellular   11   may       220         1    339         4  failure  no   
2  cellular   16   apr       185         1    330         1  failure  no   
3   unknown    3   jun       199         4     -1         0  unknown  no   
4   unknown    5   may   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fairness_data['age'] = fairness_data['age'].apply(lambda x: 'old' if x > 45 else 'young')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fairness_data['marital'] = fairness_data['marital'].apply(lambda x: 'married' if x == 'married' else 'not married')


In [4]:
orig_x = raw_data[['age_p','marital_p','balance','day','duration','campaign','pdays','previous']]
orig_y = raw_data['y'].map({'no':0,'yes':1})
orig_x_train, orig_x_test, orig_y_train, orig_y_test = train_test_split(orig_x, orig_y,test_size=0.5)

2) Randomly split your transformed dataset into training and testing datasets (from Step 3.3)

In [5]:
#recreate the transformed dataset from step 3.3
# calculate the base rate of the privileged and unprivileged groups
base_rate_priv = fairness_data.loc[fairness_data['age'] == 'old', 'y'].value_counts(normalize=True)['yes']
base_rate_unpriv = fairness_data.loc[fairness_data['age'] == 'young', 'y'].value_counts(normalize=True)['yes']

# calculate the ratio of the base rates
base_rate_ratio = base_rate_priv / base_rate_unpriv

# calculate the desired rate for the unprivileged group
unpriv_desired_rate = fairness_data.loc[fairness_data['age'] == 'young', 'y'].value_counts(normalize=True)['yes'] * base_rate_ratio

# adjust the y values for the unprivileged group
fairness_data.loc[fairness_data['age'] == 'young', 'y'] = fairness_data.loc[fairness_data['age'] == 'young', 'y'].apply(
    lambda x: 'yes' if x == 'yes' and pd.np.random.rand() < unpriv_desired_rate else 'no'
)

tran_x = raw_data[['age_p','marital_p','balance','day','duration','campaign','pdays','previous']]
tran_y = fairness_data['y'].map({'no':0,'yes':1})
tran_x_train, tran_x_test, tran_y_train, tran_y_test = train_test_split(tran_x, tran_y,test_size=0.5)

#check the independent variable
tran_x_test.head()

  lambda x: 'yes' if x == 'yes' and pd.np.random.rand() < unpriv_desired_rate else 'no'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,age_p,marital_p,balance,day,duration,campaign,pdays,previous
1073,1,1,726,16,39,1,342,3
2145,1,0,-363,12,340,2,-1,0
2139,1,0,0,9,236,1,-1,0
2213,1,1,822,27,184,1,91,2
714,0,1,15311,29,157,6,-1,0


3) Train a classifier using the original training dataset from Step 4.1; select one of your dependent variables as the output label to train your classifier.

In [6]:
clf = LogisticRegression(solver='liblinear', random_state=0)
# Train Decision Tree Classifer
clf = clf.fit(orig_x_train, orig_y_train)

#Predict the response for test dataset
orig_y_pred = clf.predict(orig_x_test)


4) Train a classifier using the transformed training dataset from Step 4.2; select one of your dependent variables as the output label to train your classifier.

In [7]:
#train the model using the transformed dataset
model_tran = LogisticRegression(solver='liblinear', random_state=0)

#exclude the new added columns from the training process
model_tran_fit = model_tran.fit(tran_x_train, tran_y_train)
tran_y_pred= model_tran_fit.predict(tran_x_test)

5) Select the privileged/unprivileged groups associated with one of your protected class variables (from Step 3.1); Use the two fairness metrics identified in Step 3.2 and and compute the fairness metrics for the classifier output associated with the original testing dataset and the transformed testing dataset

In [8]:
#choose age for this question
confusion_matrix_age_orig = pd.crosstab(orig_x_test['age_p'].map({0:'old',1:'young'}), pd.Series(orig_y_pred).map({0:'no',1:'yes'}))
print('Classifier output:')
print(confusion_matrix_age_orig)
# calculate the number of individuals in each group
n_old = confusion_matrix_age_orig.loc['old'].sum()
n_young = confusion_matrix_age_orig.loc['young'].sum()

n_old_yes = confusion_matrix_age_orig.loc['old', 'yes']
n_young_yes = confusion_matrix_age_orig.loc['young', 'yes']

n_old_no = confusion_matrix_age_orig.loc['old', 'no']
n_young_no = confusion_matrix_age_orig.loc['young', 'no']

# calculate the overall proportion of individuals that were classified as 'yes'
p_yes = (n_old_yes + n_young_yes) / (n_old + n_young)

# calculate the proportion of individuals in each group that were classified as 'yes'
p_old_yes = n_old_yes / n_old
p_young_yes = n_young_yes / n_young

# calculate metrics
disparate_impact_age_orig = p_old_yes / p_young_yes
equal_opportunity_difference_age_orig = abs(p_old_yes - p_young_yes)

print('Age group Disparate Impact:', disparate_impact_age_orig)
print('Age group Equal Opportunity Difference:', equal_opportunity_difference_age_orig)

Classifier output:
col_0   no  yes
age_p          
old    347    8
young  745   27
Age group Disparate Impact: 0.6443401147626501
Age group Equal Opportunity Difference: 0.012438881996643069


In [9]:
confusion_matrix_age_trans = pd.crosstab(tran_x_test['age_p'].map({0:'old',1:'young'}), pd.Series(tran_y_pred).map({0:'no',1:'yes'}))
print('Classifier output:')
print(confusion_matrix_age_trans)
# calculate the number of individuals in each group
n_old = confusion_matrix_age_trans.loc['old'].sum()
n_young = confusion_matrix_age_trans.loc['young'].sum()

n_old_yes = confusion_matrix_age_trans.loc['old', 'yes']
n_young_yes = confusion_matrix_age_trans.loc['young', 'yes']

n_old_no = confusion_matrix_age_trans.loc['old', 'no']
n_young_no = confusion_matrix_age_trans.loc['young', 'no']

# calculate the overall proportion of individuals that were classified as 'yes'
p_yes = (n_old_yes + n_young_yes) / (n_old + n_young)

# calculate the proportion of individuals in each group that were classified as 'yes'
p_old_yes = n_old_yes / n_old
p_young_yes = n_young_yes / n_young

# calculate metrics
disparate_impact_age_tran = p_old_yes / p_young_yes
equal_opportunity_difference_age_tran = abs(p_old_yes - p_young_yes)

print('Age group Disparate Impact:', disparate_impact_age_tran)
print('Age group Equal Opportunity Difference:', equal_opportunity_difference_age_tran)

Classifier output:
col_0   no  yes
age_p          
old    351    4
young  756    8
Age group Disparate Impact: 1.076056338028169
Age group Equal Opportunity Difference: 0.0007964014453211415


6) For each fairness metric, in table format, indicate if there were any differences in the outcomes for the privileged versus unprivileged group. Was there a positive change, negative change, or no change on that fairness metric after transforming the dataset (from Step 3.4)? Was there a positive change, negative change, or no change on that fairness metric after training the classifier - with respect to the original testing dataset and the transformed testing dataset? [Note: Use your subjective opinion]

In [10]:
print('Privileged groups: age -> old (age>45)')
print('Unprivileged groups: age -> young (age<=45)')

table = pd.DataFrame({'Protected Class':['Age'], 'Privileged Group':['Old'], 'Unprivileged Group':['Young'], 'Disparate Impact':[disparate_impact_age_orig], 
                      'Equal Opportunity Difference':[equal_opportunity_difference_age_orig], 'Disparate Impact after Transform':[disparate_impact_age_tran], 
                      'Equal Opportunity Difference after Transform':[equal_opportunity_difference_age_tran]})
print(table)

Privileged groups: age -> old (age>45)
Unprivileged groups: age -> young (age<=45)
  Protected Class Privileged Group Unprivileged Group  Disparate Impact  \
0             Age              Old              Young           0.64434   

   Equal Opportunity Difference  Disparate Impact after Transform  \
0                      0.012439                          1.076056   

   Equal Opportunity Difference after Transform  
0                                      0.000796  


There is differences in the outcome for the priviledged versus unprivileged group. After transforming the dataset, there are negative change on the fairness metrics with respect to the original testing and the transformed testing dataset. As you can observe for the output table above, the disparate impact increased by a lot after transformation. However, the equal oppotunity difference is reduced after tranformation.