In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import copy
import numpy as np
import scipy as sp
from sklearn.tree import DecisionTreeClassifier

from utils.data_loader import *
from utils.common_helpers import *
from utils.generic_pipeline import *
from utils.generic_analyzer import *

In [2]:
# Build the ACS employment dataset for state 'GA', survey year 2018,
# asking the loader for the variant without nulls.
dataset = ACSEmploymentDataset(
    state=['GA'],
    year=2018,
    with_nulls=False,
)
# Bare last expression so the notebook renders the feature frame
# (pandas truncates the middle rows of a large frame on display).
dataset.X_data

Unnamed: 0,MAR,MIL,ESP,MIG,DREM,NATIVITY,DIS,DEAR,DEYE,SEX,RAC1P,RELP,CIT,ANC,SCHL,AGEP
0,5,4,0,3,2,1,2,2,2,1,2,16,1,1,13,51
1,3,4,0,1,2,1,1,2,1,2,1,16,1,4,16,56
2,5,4,0,1,1,1,1,2,2,2,2,17,1,4,20,23
3,1,4,0,1,2,1,2,2,2,1,2,16,1,1,17,43
4,5,4,0,1,2,1,2,2,2,2,1,16,1,1,19,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100850,1,4,0,1,2,1,2,2,2,2,1,1,1,3,21,51
100851,5,4,0,1,2,1,2,2,2,2,1,2,1,3,16,18
100852,5,4,0,3,2,1,2,2,2,2,2,0,1,3,21,46
100853,4,4,0,1,2,1,1,2,2,2,2,0,1,1,20,48


In [3]:
# Privileged value for each protected attribute; both are the string '1'.
# NOTE(review): these are strings while the displayed X_data shows
# integer-looking codes — confirm GenericPipeline compares them as strings.
SEX_priv = '1'
RACE_priv = '1'

# Seed reused for the data split and for the decision tree.
SEED = 111
# Passed as sample_size when the train/test/val split is created.
n_samples = 100000

# Pipeline over the dataset, with the protected attribute names and their
# privileged values given as parallel lists (same order in both).
base_pipeline = GenericPipeline(
    dataset,
    ['SEX', 'RAC1P'],
    [SEX_priv, RACE_priv],
)

In [4]:
# Create the split on the pipeline using the notebook-wide seed and sample
# size. The return value is bound to `_` so the cell displays nothing; later
# cells read X_train / y_train / X_test / y_test / test_groups from the
# pipeline object instead.
_ = base_pipeline.create_train_test_val_split(
    SEED=SEED,
    sample_size=n_samples,
)

In [5]:
# Report the (rows, columns) shape of every test-set subgroup held by the pipeline.
for group_name, group_frame in base_pipeline.test_groups.items():
    print(group_name, group_frame.shape)

SEX_RAC1P_priv (6490, 16)
SEX_RAC1P_dis (3606, 16)
SEX_priv (9676, 16)
SEX_dis (10495, 16)
RAC1P_priv (13379, 16)
RAC1P_dis (6792, 16)


In [6]:
# Entropy-based decision tree capped at depth 10. With max_features=0.6 each
# split considers a random 60% subset of the features (scikit-learn
# semantics), so random_state=SEED is what keeps the fit reproducible.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    max_features=0.6,
    random_state=SEED,
)
# Fit on the training portion of the split prepared by the pipeline.
model.fit(base_pipeline.X_train, base_pipeline.y_train)

In [7]:
# Predict labels for the pipeline's test features with the fitted tree;
# y_preds is what the analyzer's compute_metrics call consumes.
y_preds = model.predict(base_pipeline.X_test)

In [8]:
# Analyzer over the test split. The protected attribute names and privileged
# values repeat the ones given to GenericPipeline, so the two must stay in sync.
Analyzer = GenericAnalyzer(
    base_pipeline.X_test,
    base_pipeline.y_test,
    ['SEX', 'RAC1P'],
    [SEX_priv, RACE_priv],
)
# Compute the metrics for the tree's predictions, labelled 'Decision_Tree'.
dtc_res = Analyzer.compute_metrics(
    y_preds,
    'Decision_Tree',
)

In [9]:
# Render the computed metrics as a table (bare last expression -> rich display).
# NOTE(review): in the recorded output the Positive-Rate row is above 1 for
# every group and numerically matches TPR / PPV (predicted positives divided
# by actual positives) rather than a plain proportion — confirm against
# GenericAnalyzer's definition before interpreting it.
pd.DataFrame(dtc_res)

Unnamed: 0,overall,SEX_RAC1P_priv,SEX_RAC1P_dis,SEX_priv,SEX_dis,RAC1P_priv,RAC1P_dis
TPR,0.859962,0.885127,0.852861,0.893903,0.823239,0.849672,0.882629
TNR,0.803663,0.830924,0.782507,0.82988,0.782358,0.803544,0.803878
PPV,0.774629,0.844778,0.729179,0.826977,0.721012,0.783759,0.755954
FNR,0.140038,0.114873,0.147139,0.106097,0.176761,0.150328,0.117371
FPR,0.196337,0.169076,0.217493,0.17012,0.217642,0.196456,0.196122
Accuracy,0.828417,0.858552,0.811148,0.860376,0.798952,0.824576,0.835984
F1,0.815068,0.864482,0.786185,0.859139,0.768742,0.815386,0.814395
Selection-Rate,0.488127,0.534052,0.476151,0.514882,0.463459,0.494282,0.476001
Positive-Rate,1.110159,1.047763,1.169619,1.080929,1.141784,1.084098,1.16757
