In [11]:
import pandas as pd

q3 = pd.read_csv("data_for_HW4.csv")
print(q3)

            x.1       x.2       x.3       x.4       x.5  t         y  e
0     -0.629086 -1.395086 -2.709882  0.769207  0.103222  0  2.756883  1
1      1.837422  0.453198  0.833936 -1.173575  0.720166  0  0.000057  1
2      0.343682  1.331708  0.814470  0.691395 -1.037305  1 -3.498816  1
3     -1.360096  0.015638  0.371858 -1.481852 -0.608062  0 -3.303751  1
4      0.535815 -1.759484 -0.280708 -1.277767 -2.212238  0  3.931033  1
...         ...       ...       ...       ...       ... ..       ... ..
39995  0.800978  0.336906  0.498468  0.206997 -0.624796  0 -0.826653  2
39996  0.061215 -1.341895 -0.229908 -0.680406  0.159504  0  0.445027  2
39997 -1.129852 -0.712578  1.055300  0.556854 -0.246316  0 -0.992614  2
39998  0.369920  0.171134 -1.097407 -1.910119  1.341962  1  3.691988  2
39999 -0.330893 -0.707505 -1.276876  1.607192  1.391644  0  0.769911  2

[40000 rows x 8 columns]


In [21]:
import pandas as pd
import numpy as np
from doubleml import DoubleMLData, DoubleMLPLR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Load the dataset
q3 = pd.read_csv("data_for_HW4.csv")

# Split the data based on the 'e' column
data_e1 = q3[q3['e'] == 1].copy()
data_e2 = q3[q3['e'] == 2].copy()

# Define outcome, treatment, and covariate columns
y_col = 'y'
d_col = 't'
x_cols = ['x.1','x.2','x.3','x.4','x.5']

def run_dml_plr(data, ml_l, ml_m, n_folds=5, score='partialling out'):
    """
    Fits a DoubleMLPLR model and returns the summary.
    """
    obj_dml_data = DoubleMLData(data, y_col, d_col, x_cols)
    dml_plr = DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds, score=score)
    dml_plr.fit()
    return dml_plr.summary

# Define parameter grids for each method's ml_l and ml_m
methods = {}

# RandomForest Method
methods['RandomForest'] = {
    'ml_l': GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid={
            'n_estimators': [100, 200],
            'max_depth': [5, 10],
            'max_features': ['sqrt', 'log2']
        },
        cv=5,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    ),
    'ml_m': GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid={
            'n_estimators': [100, 200],
            'max_depth': [5, 10],
            'max_features': ['sqrt', 'log2']
        },
        cv=5,
        n_jobs=-1,
        scoring='roc_auc'
    )
}

# GradientBoosting Method
methods['GradientBoosting'] = {
    'ml_l': GridSearchCV(
        estimator=GradientBoostingRegressor(random_state=42),
        param_grid={
            'n_estimators': [100, 200],
            'max_depth': [3, 5],
            'learning_rate': [0.01, 0.1]
        },
        cv=5,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    ),
    'ml_m': GridSearchCV(
        estimator=GradientBoostingClassifier(random_state=42),
        param_grid={
            'n_estimators': [100, 200],
            'max_depth': [3, 5],
            'learning_rate': [0.01, 0.1]
        },
        cv=5,
        n_jobs=-1,
        scoring='roc_auc'
    )
}

# Deep Neural Network Method
methods['DeepNN'] = {
    'ml_l': GridSearchCV(
        estimator=Pipeline([
            ('scaler', StandardScaler()),
            ('mlp', MLPRegressor(random_state=42, max_iter=2000))
        ]),
        param_grid={
            'mlp__hidden_layer_sizes': [(64,32), (128,64)],
            'mlp__alpha': [0.0001, 0.001],
            'mlp__learning_rate_init': [0.001, 0.01]
        },
        cv=5,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    ),
    'ml_m': GridSearchCV(
        estimator=Pipeline([
            ('scaler', StandardScaler()),
            ('mlp', MLPClassifier(random_state=42, max_iter=2000))
        ]),
        param_grid={
            'mlp__hidden_layer_sizes': [(64,32), (128,64)],
            'mlp__alpha': [0.0001, 0.001],
            'mlp__learning_rate_init': [0.001, 0.01]
        },
        cv=5,
        n_jobs=-1,
        scoring='roc_auc'
    )
}

# LASSO Method
methods['LASSO'] = {
    'ml_l': GridSearchCV(
        estimator=Lasso(random_state=42),
        param_grid={
            'alpha': [0.01, 0.1, 1.0],
        },
        cv=5,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    ),
    'ml_m': GridSearchCV(
        estimator=LogisticRegression(penalty='l2', solver='lbfgs', random_state=42, max_iter=2000),
        param_grid={
            'C': [0.1, 1.0, 10.0],
        },
        cv=5,
        n_jobs=-1,
        scoring='roc_auc'
    )
}

# Collect results
results = []

# Iterate over each data source and method
for src_name, src_data in [('e1', data_e1), ('e2', data_e2)]:
    for method_name, learners in methods.items():
        print(f"Running DoubleMLPLR for data source: {src_name}, method: {method_name}")
        summary = run_dml_plr(src_data, learners['ml_l'], learners['ml_m'])
        results.append({
            'data_source': src_name,
            'method': method_name,
            'summary': summary
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
results_df = pd.concat([(
    result['summary']
    .assign(data_source=result['data_source'], method=result['method'])
    .pipe(lambda df: df[['method', 'coef', 'std err', 't', 'P>|t|', '2.5 %', '97.5 %', 'data_source']])
) for result in results])
results_df.to_csv('q3e-results/grid_search.csv', index=False)

Running DoubleMLPLR for data source: e1, method: RandomForest
Running DoubleMLPLR for data source: e1, method: GradientBoosting
Running DoubleMLPLR for data source: e1, method: DeepNN


             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('mlp',
                                        MLPClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'mlp__alpha': [0.0001, 0.001],
                         'mlp__hidden_layer_sizes': [(64, 32), (128, 64)],
                         'mlp__learning_rate_init': [0.001, 0.01]},
             scoring='roc_auc') for ml_m are close to zero or one (eps=1e-12).


Running DoubleMLPLR for data source: e1, method: LASSO
Running DoubleMLPLR for data source: e2, method: RandomForest
Running DoubleMLPLR for data source: e2, method: GradientBoosting
Running DoubleMLPLR for data source: e2, method: DeepNN




Running DoubleMLPLR for data source: e2, method: LASSO


In [22]:
results_df

Unnamed: 0,method,coef,std err,t,P>|t|,2.5 %,97.5 %,data_source
t,RandomForest,0.502296,0.032106,15.644891,3.5998730000000004e-55,0.439369,0.565223,e1
t,GradientBoosting,0.518262,0.032348,16.021308,9.072225e-58,0.45486,0.581663,e1
t,DeepNN,0.500243,0.032388,15.44514,8.135033e-54,0.436763,0.563723,e1
t,LASSO,0.443051,0.027604,16.05038,5.681481e-58,0.388948,0.497153,e1
t,RandomForest,2.00042,0.021877,91.438273,0.0,1.957541,2.043299,e2
t,GradientBoosting,1.990438,0.021741,91.550747,0.0,1.947826,2.033051,e2
t,DeepNN,1.907454,0.021751,87.694617,0.0,1.864823,1.950085,e2
t,LASSO,1.99379,0.022269,89.53036,0.0,1.950142,2.037437,e2


In [None]:
import pandas as pd
import numpy as np
from doubleml import DoubleMLData, DoubleMLPLR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

q3 = pd.read_csv("data_for_HW4.csv")

data_e1 = q3[q3['e'] == 1].copy()
data_e2 = q3[q3['e'] == 2].copy()

y_col = 'y'
d_col = 't'
x_cols = ['x.1','x.2','x.3','x.4','x.5']


def run_dml_plr(data, ml_l, ml_m, n_folds=5, score='partialling out'):

    obj_dml_data = DoubleMLData(data, y_col, d_col, x_cols)
    dml_plr = DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds, score=score)
    dml_plr.fit()
    return dml_plr.summary


methods = {}

methods['RandomForest'] = {
    'ml_l': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
    'ml_m': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
}

methods['GradientBoosting'] = {
    'ml_l': GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42),
    'ml_m': GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)
}

methods['DeepNN'] = {
    'ml_l': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPRegressor(hidden_layer_sizes=(64,32), max_iter=500, random_state=42))
    ]),
    'ml_m': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500, random_state=42))
    ])
}

methods['LASSO'] = {
    'ml_l': Lasso(alpha=0.1, random_state=42),
    'ml_m': LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000, random_state=42)
}


results = []

for src_name, src_data in [('e1', data_e1), ('e2', data_e2)]:
    for method_name, learners in methods.items():
        summary = run_dml_plr(src_data, learners['ml_l'], learners['ml_m'])
        results.append({
            'data_source': src_name,
            'method': method_name,
            'summary': summary
        })

results_df = pd.DataFrame(results)
results_df = pd.concat([(
    result['summary']
    .assign(data_source=result['data_source'], method=result['method'])
    .pipe(lambda df: df[['method', 'coef', 'std err', 't', 'P>|t|', '2.5 %', '97.5 %', 'data_source']])
) for result in results])
results_df.to_csv('q3e-results/no_grid_search.csv', index=False)

                ('mlp',
                 MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500,
                               random_state=42))]) for ml_m are close to zero or one (eps=1e-12).


  data_source            method  \
0          e1      RandomForest   
1          e1  GradientBoosting   
2          e1            DeepNN   
3          e1             LASSO   
4          e2      RandomForest   
5          e2  GradientBoosting   
6          e2            DeepNN   
7          e2             LASSO   

                                             summary  
0         coef  std err          t         P>|t| ...  
1         coef   std err          t         P>|t|...  
2         coef  std err          t         P>|t| ...  
3         coef   std err          t         P>|t|...  
4         coef  std err         t  P>|t|     2.5 ...  
5         coef   std err          t  P>|t|     2....  
6         coef   std err          t  P>|t|     2....  
7         coef  std err          t  P>|t|     2.5...  


In [19]:
results_df = pd.concat([(
    result['summary']
    .assign(data_source=result['data_source'], method=result['method'])
    .pipe(lambda df: df[['method', 'coef', 'std err', 't', 'P>|t|', '2.5 %', '97.5 %', 'data_source']])
) for result in results])
print(results_df)

             method      coef   std err          t         P>|t|     2.5 %  \
t      RandomForest  0.484574  0.031210  15.526405  2.299137e-54  0.423405   
t  GradientBoosting  0.521255  0.032215  16.180598  6.911977e-59  0.458116   
t            DeepNN  0.496879  0.032290  15.388224  1.963492e-53  0.433593   
t             LASSO  0.443827  0.027538  16.116643  1.949143e-58  0.389852   
t      RandomForest  1.997878  0.022700  88.013960  0.000000e+00  1.953388   
t  GradientBoosting  1.996379  0.021839  91.415537  0.000000e+00  1.953576   
t            DeepNN  1.927452  0.022653  85.085649  0.000000e+00  1.883053   
t             LASSO  1.993349  0.022390  89.029704  0.000000e+00  1.949466   

     97.5 % data_source  
t  0.545744          e1  
t  0.584395          e1  
t  0.560166          e1  
t  0.497801          e1  
t  2.042369          e2  
t  2.039182          e2  
t  1.971852          e2  
t  2.037232          e2  


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import doubleml as dml

q3 = pd.read_csv("data_for_HW4.csv")

data_source_1 = q3[q3["e"] == 1]
data_source_2 = q3[q3["e"] == 2]

# Prepare variables for DoubleML
x_cols = ["x.1", "x.2", "x.3", "x.4", "x.5"]
y_col = "y"
t_col = "t"

# Data preprocessing for DoubleML
dml_data = dml.DoubleMLData(data_source_2, y_col, t_col, x_cols)

# Define nonparametric machine learning models for nuisance functions
# Experiment with Random Forest and Gradient Boosting
rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
gb_regressor = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)

# Create and fit the DoubleMLPLR model
dml_plr_rf = dml.DoubleMLPLR(dml_data, ml_l=rf_regressor, ml_m=rf_classifier)
dml_plr_gb = dml.DoubleMLPLR(dml_data, ml_l=gb_regressor, ml_m=gb_classifier)

# Fit the models
dml_plr_rf.fit()
dml_plr_gb.fit()

# Display summaries
print("Random Forest Results:")
print(dml_plr_rf.summary)

print("\nGradient Boosting Results:")
print(dml_plr_gb.summary)

# Optional: Try other configurations like tuning hyperparameters using dml_plr_rf.tune() or dml_plr_gb.tune()

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
import doubleml as dml

# Load the dataset
q3 = pd.read_csv("data_for_HW4.csv")

# Separate data sources
data_source_1 = q3[q3["e"] == 1]
data_source_2 = q3[q3["e"] == 2]

# Prepare variables for DoubleML
x_cols = ["x.1", "x.2", "x.3", "x.4", "x.5"]
y_col = "y"
t_col = "t"

# Data preprocessing for DoubleML
dml_data = dml.DoubleMLData(data_source_2, y_col, t_col, x_cols)

# Define models for nuisance function estimation
# Deep Neural Network
dnn_regressor = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=500, random_state=42)

# Lasso Regression
lasso_regressor = Lasso(alpha=0.1)

# Elastic Net
elastic_net_regressor = ElasticNet(alpha=0.1, l1_ratio=0.5)

# Random Forest Classifier for propensity score estimation (binary treatment)
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# Initialize DoubleML objects for each model
# Deep Neural Network
dml_dnn = dml.DoubleMLPLR(dml_data, ml_l=dnn_regressor, ml_m=rf_classifier)

# Lasso
dml_lasso = dml.DoubleMLPLR(dml_data, ml_l=lasso_regressor, ml_m=rf_classifier)

# Elastic Net
dml_elastic_net = dml.DoubleMLPLR(dml_data, ml_l=elastic_net_regressor, ml_m=rf_classifier)

# Fit the models
dml_dnn.fit()
dml_lasso.fit()
dml_elastic_net.fit()

# Display summaries
print("Deep Neural Network Results:")
print(dml_dnn.summary)

print("\nLASSO Results:")
print(dml_lasso.summary)

print("\nElastic Net Results:")
print(dml_elastic_net.summary)

# Optional: You can also explore tuning hyperparameters for DNN, LASSO, or Elastic Net

In [None]:
import pandas as pd
import numpy as np
from doubleml import DoubleMLData, DoubleMLPLR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ----------------------------------------------------
# Load the data
q3 = pd.read_csv("data_for_HW4.csv")

# Separate data by data source
data_e1 = q3[q3['e'] == 1].copy()
data_e2 = q3[q3['e'] == 2].copy()

# Define variable names
y_col = 'y'
d_col = 't'
x_cols = ['x.1','x.2','x.3','x.4','x.5']

# ----------------------------------------------------
# Function to run DoubleML PLR and return ATE estimates
def run_dml_plr(data, ml_l, ml_m, score='partialling out', n_folds=5):
    """
    data: pd.DataFrame containing y, d, X's
    ml_l: estimator for E[Y|X]
    ml_m: estimator for E[D|X]
    score: 'partialling out' or a callable score function
    n_folds: number of folds for sample splitting
    """
    obj_dml_data = DoubleMLData(data, y_col, d_col, x_cols)
    dml_plr = DoubleMLPLR(
        obj_dml_data, 
        ml_l=ml_l, 
        ml_m=ml_m, 
        n_folds=n_folds, 
        score=score
    )
    dml_plr.fit()
    return dml_plr.coef, dml_plr.se

# ----------------------------------------------------
# Define a dictionary of methods to try
# We'll try four different sets of learners:
# 1) Random Forest
# 2) Gradient Boosting
# 3) Deep NN (MLP)
# 4) LASSO for Y|X and LogisticRegression for D|X

methods = {}

# 1) Random Forest
methods['RandomForest'] = {
    'ml_l': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
    'ml_m': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
}

# 2) Gradient Boosting
methods['GradientBoosting'] = {
    'ml_l': GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42),
    'ml_m': GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)
}

# 3) Deep NN (using MLP)
# We use pipelines to scale the data before MLP
methods['DeepNN'] = {
    'ml_l': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPRegressor(hidden_layer_sizes=(64,32), max_iter=500, random_state=42))
    ]),
    'ml_m': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500, random_state=42))
    ])
}

# 4) LASSO for l(x) and LogisticRegression for m(x)
methods['LASSO'] = {
    'ml_l': Lasso(alpha=0.1, random_state=42),
    'ml_m': LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000, random_state=42)
}

# ----------------------------------------------------
# Run all methods for both data sources
results = []

for src_name, src_data in [('e1', data_e1), ('e2', data_e2)]:
    for method_name, learners in methods.items():
        coef, se = run_dml_plr(src_data, learners['ml_l'], learners['ml_m'])
        # Extract ATE and its standard error (assuming single treatment variable)
        ate_est = coef[0]
        ate_se = se[0]
        results.append({
            'data_source': src_name,
            'method': method_name,
            'ATE_estimate': ate_est,
            'ATE_std_error': ate_se
        })

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(results)
print(results_df)

                ('mlp',
                 MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500,
                               random_state=42))]) for ml_m are close to zero or one (eps=1e-12).


  data_source            method  ATE_estimate  ATE_std_error
0          e1      RandomForest      0.480545       0.031234
1          e1  GradientBoosting      0.519807       0.032165
2          e1            DeepNN      0.481113       0.031812
3          e1             LASSO      0.442431       0.027542
4          e2      RandomForest      1.997475       0.022608
5          e2  GradientBoosting      2.002289       0.022061
6          e2            DeepNN      1.926935       0.022798
7          e2             LASSO      1.994355       0.022378


In [None]:
results_df.to_excel("q3d.xlsx", index=False)

In [None]:
import pandas as pd
import numpy as np
from doubleml import DoubleMLData, DoubleMLPLR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ----------------------------------------------------
# Load data
q3 = pd.read_csv("data_for_HW4.csv")

# We assume t is binary {0,1}, and e indicates the data source {1 or 2}.
data_e1 = q3[q3['e'] == 1].copy()
data_e2 = q3[q3['e'] == 2].copy()

# Define variable names
y_col = 'y'
d_col = 't'
x_cols = ['x.1','x.2','x.3','x.4','x.5']

# ----------------------------------------------------
# Function to run DML-PLR with given ML learners
def run_dml_plr(data, ml_l, ml_m, n_folds=5, score='partialling out'):
    """
    data: pandas DataFrame containing columns y, d, and x's.
    ml_l: estimator for E[Y|X], must implement fit() and predict().
    ml_m: estimator for E[D|X]; if D is binary, should be classifier implementing fit() and predict_proba().
    n_folds: number of folds for cross-fitting.
    score: score function, default 'partialling out'.
    """
    obj_dml_data = DoubleMLData(data, y_col, d_col, x_cols)
    dml_plr = DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds, score=score)
    dml_plr.fit()
    return dml_plr.summary

# ----------------------------------------------------
# Create a dictionary of methods:
# We'll try four different approaches for the nuisance functions:
# 1) Random Forest
# 2) Gradient Boosting
# 3) Deep Neural Network (MLP)
# 4) LASSO for Y-model and LogisticRegression for D-model

# Note: For binary treatment, ml_m should be a classifier.
methods = {}

# 1) Random Forest
methods['RandomForest'] = {
    'ml_l': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
    'ml_m': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
}

# 2) Gradient Boosting
methods['GradientBoosting'] = {
    'ml_l': GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42),
    'ml_m': GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)
}

# 3) Deep NN (using MLP); we add scaling for better performance
methods['DeepNN'] = {
    'ml_l': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPRegressor(hidden_layer_sizes=(64,32), max_iter=500, random_state=42))
    ]),
    'ml_m': Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500, random_state=42))
    ])
}

# 4) LASSO for the Y-model and LogisticRegression for the D-model
methods['LASSO'] = {
    'ml_l': Lasso(alpha=0.1, random_state=42),
    'ml_m': LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000, random_state=42)
}

# ----------------------------------------------------
# Run all methods for both data sources and store results
results = []

for src_name, src_data in [('e1', data_e1), ('e2', data_e2)]:
    for method_name, learners in methods.items():
        summary = run_dml_plr(src_data, learners['ml_l'], learners['ml_m'])
        # Extract the ATE and standard error
        results.append({
            'data_source': src_name,
            'method': method_name,
            'summary': summary
        })

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(results)
print(results_df)

                ('mlp',
                 MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500,
                               random_state=42))]) for ml_m are close to zero or one (eps=1e-12).


  data_source            method  ATE_estimate  ATE_std_error
0          e1      RandomForest      0.483731       0.031268
1          e1  GradientBoosting      0.516744       0.032110
2          e1            DeepNN      0.479756       0.031844
3          e1             LASSO      0.443138       0.027548
4          e2      RandomForest      1.991636       0.022801
5          e2  GradientBoosting      2.001202       0.021719
6          e2            DeepNN      1.928837       0.022379
7          e2             LASSO      1.994241       0.022375
