In [None]:
# (Re)load IPython's autoreload extension and select mode 2, which re-imports
# all modules before each cell executes, so edits to the local `utils` package
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from utils.custom_classes.data_loader import ACSEmploymentDataset
from utils.custom_classes.generic_pipeline import GenericPipeline
from utils.analyzers.bias_analyzer import BiasAnalyzer

In [2]:
# Build the project dataset wrapper for state=['GA'], year=2018, with_nulls=False.
dataset = ACSEmploymentDataset(state=['GA'], year=2018, with_nulls=False)
# Bare last expression: renders the feature frame as the cell output
# (pandas truncates the middle rows of a long frame in the display).
dataset.X_data

Unnamed: 0,MAR,MIL,ESP,MIG,DREM,NATIVITY,DIS,DEAR,DEYE,SEX,RAC1P,RELP,CIT,ANC,SCHL,AGEP
0,5,4,0,3,2,1,2,2,2,1,2,16,1,1,13,51
1,3,4,0,1,2,1,1,2,1,2,1,16,1,4,16,56
2,5,4,0,1,1,1,1,2,2,2,2,17,1,4,20,23
3,1,4,0,1,2,1,2,2,2,1,2,16,1,1,17,43
4,5,4,0,1,2,1,2,2,2,2,1,16,1,1,19,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100850,1,4,0,1,2,1,2,2,2,2,1,1,1,3,21,51
100851,5,4,0,1,2,1,2,2,2,2,1,2,1,3,16,18
100852,5,4,0,3,2,1,2,2,2,2,2,0,1,3,21,46
100853,4,4,0,1,2,1,1,2,2,2,2,0,1,1,20,48


In [3]:
# Run-level constants: the seed and sample size are passed to the split below.
SEED = 111
n_samples = 100_000

# Both sensitive attributes use the string '1' as their `*_priv` value
# (presumably the privileged-group code -- confirm against GenericPipeline).
SEX_priv = '1'
RACE_priv = '1'

# Wrap the dataset together with the sensitive attribute names and their
# `*_priv` values, given in the same order.
base_pipeline = GenericPipeline(
    dataset,
    ['SEX', 'RAC1P'],
    [SEX_priv, RACE_priv],
)

In [4]:
# Create the train/test/validation split on the pipeline object; the return
# value is bound to `_` so that nothing is rendered as cell output.
# NOTE(review): the recorded test-group shapes below (SEX_priv + SEX_dis =
# 20171 rows) look like ~20% of the full frame rather than of n_samples --
# confirm how `sample_size` is applied inside create_train_test_val_split.
_ = base_pipeline.create_train_test_val_split(SEED=SEED, sample_size=n_samples)

In [5]:
# Report the shape of every entry in test_groups, one line per group name.
for group_name, group_frame in base_pipeline.test_groups.items():
    print(group_name, group_frame.shape)

SEX_RAC1P_priv (6490, 16)
SEX_RAC1P_dis (3606, 16)
SEX_priv (9676, 16)
SEX_dis (10495, 16)
RAC1P_priv (13379, 16)
RAC1P_dis (6792, 16)


In [6]:
# Base learner: an entropy-criterion decision tree, depth-limited to 10, that
# considers 60% of the features at each split. `random_state` fixes the feature
# sub-sampling, so results are reproducible only if the encoded column order
# is itself stable across runs.
base_model = DecisionTreeClassifier(criterion='entropy', max_depth=10, max_features=0.6, random_state=SEED)

# Explicit category list per categorical column, taken from the training split.
# The values are sorted because iterating a bare `set` yields an arbitrary
# order: scikit-learn requires numeric categories to be sorted, and an unstable
# order would permute the one-hot columns and, through `max_features`, change
# the fitted tree from run to run.
# NOTE(review): categories come from X_train only and OneHotEncoder defaults to
# handle_unknown='error', so a value seen only in the test split would raise at
# predict time -- confirm this is intended.
train_categories = [
    sorted(set(base_pipeline.X_train[col]))
    for col in base_pipeline.categorical_columns
]

# One-hot encode the categorical columns (dense output) and standardise the
# numerical ones.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in newer scikit-learn
# releases; update the keyword if the environment is upgraded.
encoder = ColumnTransformer(transformers=[
    ('categorical_features', OneHotEncoder(categories=train_categories, sparse=False),
     base_pipeline.categorical_columns),
    ('numerical_features', StandardScaler(), base_pipeline.numerical_columns)
])

# Full estimator: feature encoding followed by the decision tree.
model = Pipeline([
    ('features', encoder),
    ('learner', base_model)
])

In [7]:
# Fit the encoder and the tree together on the training split.
model.fit(base_pipeline.X_train, base_pipeline.y_train)

In [8]:
# Predict labels for the held-out test split with the fitted pipeline.
y_preds = model.predict(base_pipeline.X_test)

In [9]:
# Build the analyzer over the test features/labels, passing the same sensitive
# attribute names and `*_priv` values that were given to GenericPipeline.
Analyzer = BiasAnalyzer(base_pipeline.X_test, base_pipeline.y_test, ['SEX','RAC1P'], [SEX_priv, RACE_priv])
# Compute metrics for the tree's predictions; 'Decision_Tree' is passed as the
# second argument (presumably a model label -- confirm in BiasAnalyzer).
dtc_res = Analyzer.compute_metrics(y_preds, 'Decision_Tree')

In [10]:
# Render the computed metrics as a table (rich display of the last expression).
# NOTE(review): the recorded Positive-Rate row has values above 1, so it is not
# a plain proportion -- confirm its definition in BiasAnalyzer.
pd.DataFrame(dtc_res)

Unnamed: 0,overall,SEX_RAC1P_priv,SEX_RAC1P_dis,SEX_priv,SEX_dis,RAC1P_priv,RAC1P_dis
TPR,0.858496,0.87636,0.827657,0.886092,0.828638,0.854754,0.866739
TNR,0.799947,0.831552,0.787184,0.829485,0.775942,0.79695,0.805369
PPV,0.771038,0.843959,0.727545,0.825384,0.71646,0.779139,0.754006
FNR,0.141504,0.12364,0.172343,0.113908,0.171362,0.145246,0.133261
FPR,0.200053,0.168448,0.212816,0.170515,0.224058,0.20305,0.194631
Accuracy,0.82569,0.854391,0.803661,0.856449,0.797332,0.823305,0.830389
F1,0.81242,0.859855,0.774379,0.854662,0.768477,0.815197,0.806452
Selection-Rate,0.489564,0.529276,0.463117,0.511368,0.469462,0.500187,0.46864
Positive-Rate,1.113429,1.038392,1.137602,1.073552,1.156573,1.097049,1.149512
