In [1]:
import pandas as pd
import numpy as np
from doubleml import DoubleMLData, DoubleMLAPOS
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier

banerji_raw = pd.read_csv('Banerji-Berry-Shotland_2017_AEJ.csv')
print(banerji_raw)

       treatment  age  state  bl_caser_total_norm  caser_total_norm  boy  \
0              2    8      1             1.954451          1.881699    1   
1              2    6      1             0.310986          1.389131    1   
2              2    4      1            -0.759177         -0.545960    1   
3              2    7      1            -0.759177          0.333626    1   
4              2    8      1             0.502087          0.650278    1   
...          ...  ...    ...                  ...               ...  ...   
14569          3    8      2             0.119886          0.298443    0   
14570          3    5      2            -0.712464         -0.862612    1   
14571          3    8      2            -0.653010         -0.721878    0   
14572          3    6      2            -0.912057         -0.897795    0   
14573          3    6      2            -0.720957          0.333626    1   

       number_of_kids  mother_educ  factor_educ  mother_age30  farmingIncome  
0       

In [2]:
y = banerji_raw['caser_total_norm']
d = banerji_raw['treatment']
X = banerji_raw.drop(['caser_total_norm', 'treatment'], axis=1)
dml_data = DoubleMLData(
    data=banerji_raw, y_col='caser_total_norm', d_cols='treatment', x_cols=list(X.columns)
)
treatment_levels = banerji_raw['treatment'].unique().tolist()
treatment_levels.sort()

In [13]:
ml_g = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
ml_m = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
dml_apos = DoubleMLAPOS(
    obj_dml_data=dml_data,
    ml_g=ml_g,
    ml_m=ml_m,
    treatment_levels=treatment_levels,
    n_folds=5,
    n_rep=1,
    score='APO',
    trimming_rule='truncate',
    trimming_threshold=0.01,
    draw_sample_splitting=True
)
dml_apos.fit()
dml_apos.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
1,0.223573,0.010607,21.077801,0.0,0.202784,0.244362
2,0.234998,0.010409,22.576103,0.0,0.214597,0.2554
3,0.242763,0.010339,23.480665,0.0,0.222499,0.263026
4,0.279409,0.010602,26.354288,0.0,0.25863,0.300189


In [14]:
dml_apos.sensitivity_analysis()
print(dml_apos.sensitivity_summary)


------------------ Scenario          ------------------
Significance Level: level=0.95
Sensitivity parameters: cf_y=0.03; cf_d=0.03, rho=1.0

------------------ Bounds with CI    ------------------
   CI lower  theta lower     theta  theta upper  CI upper
0  0.178932     0.196448  0.223573     0.250698  0.268091
1  0.190876     0.208077  0.234998     0.261919  0.278977
2  0.197984     0.215052  0.242763     0.270473  0.287436
3  0.234774     0.252283  0.279409     0.306535  0.323919

------------------ Robustness Values ------------------
   H_0     RV (%)    RVa (%)
0  0.0  22.152128  20.543326
1  0.0  23.288598  21.705411
2  0.0  23.361153  21.841950
3  0.0  26.837333  25.298645


In [15]:
causal_effects = dml_apos.causal_contrast(reference_levels=1)
causal_effects.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
2 vs 1,0.011425,0.010472,1.091025,0.2752621,-0.0091,0.03195
3 vs 1,0.01919,0.010422,1.841305,0.06557681,-0.001237,0.039616
4 vs 1,0.055836,0.010741,5.198599,2.007956e-07,0.034785,0.076887


In [None]:
from sklearn.model_selection import GridSearchCV

ml_g = MLPRegressor(
    activation='relu',
    solver='adam', max_iter=1000, random_state=42,
)
ml_m = MLPClassifier(
    activation='relu',
    solver='adam', max_iter=1000, random_state=42
)

param_grid_dnn = {
    'mlp__hidden_layer_sizes': [
        (128, 64, 32),
        (64, 64, 64, 64),
        (32, 32, 32, 32),
        (32, 32, 32),
        (64, 32, 16),
        (128, 64, 32, 16)
    ],
    'mlp__alpha': [0.001, 0.01, 0.1, 0.5, 1],
    'mlp__learning_rate': ['constant', 'adaptive']
}

grid_search_m = GridSearchCV(
    estimator=ml_m,
    param_grid=param_grid_dnn,
    cv=5,
    scoring='balanced_accuracy',
    n_jobs=-1
)
grid_search_g = GridSearchCV(
    estimator=ml_g,
    param_grid=param_grid_dnn,
    cv=5,
    scoring='balanced_accuracy',
    n_jobs=-1
)


dml_apos = DoubleMLAPOS(
    obj_dml_data=dml_data,
    ml_g=ml_g,
    ml_m=ml_m,
    treatment_levels=treatment_levels,
    n_folds=5,
    n_rep=1,
    score='APO',
    trimming_rule='truncate',
    trimming_threshold=0.01,
    draw_sample_splitting=True
)
dml_apos.fit()
dml_apos.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
1,0.223232,0.011121,20.072907,0.0,0.201435,0.245029
2,0.235384,0.010752,21.892959,0.0,0.214311,0.256457
3,0.245785,0.010708,22.954221,0.0,0.224798,0.266771
4,0.277079,0.011171,24.804375,0.0,0.255185,0.298973


In [6]:
causal_effects = dml_apos.causal_contrast(reference_levels=1)
causal_effects.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
2 vs 1,0.012152,0.011308,1.074611,0.282549,-0.010012,0.034316
3 vs 1,0.022553,0.011278,1.999667,0.045536,0.000448,0.044658
4 vs 1,0.053847,0.011785,4.569267,5e-06,0.030749,0.076944
