In [4]:
import os

import doubleml as dml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from doubleml._utils_resampling import DoubleMLResampling
from flaml import AutoML
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

from ..dgp.make_BCH import DGP_BCH2014
from ..doubleml_flaml_api.doubleml_flaml_api import FlamlRegressorDoubleML, FlamlClassifierDoubleML


## Boosting

In [None]:
## PLR simulation on data drawn from DGP_BCH2014, using default XGBoost learners.
## (This cell used to be labelled "with ACIC data", but no ACIC data is loaded here.)
n_folds = 4
no_iter = 100

# The output folder does not depend on n_obs, so it is set once and created up
# front: DataFrame.to_csv does not create missing directories, and failing on
# the first save would throw away an already completed model fit.
res_path = "../results/bch/XGBoost/"
os.makedirs(res_path, exist_ok=True)
res_columns = ["coef","2.5%","97.5%","loss_Y","fs_loss_mll","fs_loss_mlm"]

for n_obs in [20,50,100,250,500,1000,2000,5000,10000,200000]:
    # one row per repetition; restarted for every sample size
    res_fullsample = []

    for k in range(no_iter):
        # repetition k always starts from the same NumPy seed
        np.random.seed(k)

        # draw the data set for this repetition and wrap it in a DataFrame
        X,y,d = DGP_BCH2014(theta=0.5, n_obs=n_obs, dim_x=200)
        x_cols = [f'X{i + 1}' for i in np.arange(X.shape[1])]
        df = pd.DataFrame(np.column_stack((X, y, d)),
                            columns=x_cols + ['y', 'd'])

        # untuned learners: ml_l is fitted for y, ml_m for d
        ml_l = XGBRegressor()
        ml_m = XGBRegressor()

        # re-seed right before the DoubleML fit
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df,y_col='y',d_cols='d')
        dml_plr_automl = dml.DoubleMLPLR(obj_dml_data, ml_l = ml_l, ml_m = ml_m, n_folds = n_folds)
        dml_plr_automl.fit(store_predictions=True)

        # read the summary table and the stored predictions once instead of
        # once per metric; [:,0,0] takes the first slice of both trailing axes
        plr_summary = dml_plr_automl.summary
        theta_hat = plr_summary["coef"].values[0]
        l_hat = dml_plr_automl.predictions["ml_l"][:,0,0]
        m_hat = dml_plr_automl.predictions["ml_m"][:,0,0]

        # NOTE(review): loss_Y adds theta_hat * d to the ml_l prediction. If ml_l
        # already targets y given X (as in DoubleML's PLR), this is not the PLR
        # fitted value theta_hat * (d - m_hat) + l_hat - confirm this is intended.
        _res_full = [
            theta_hat,
            plr_summary["2.5 %"].values[0],
            plr_summary["97.5 %"].values[0],
            mean_squared_error(y, theta_hat * d + l_hat),
            mean_squared_error(y, l_hat),
            mean_squared_error(d, m_hat),
        ]

        # add this iteration to overall results
        res_fullsample.append(_res_full)

        # checkpoint after every repetition so an interrupted run keeps its rows
        pd.DataFrame(res_fullsample, columns = res_columns).to_csv(res_path + f"{n_obs}_default.csv")

## Random Forest

In [68]:
# Search spaces for the random-forest learners: grid_l is used for the ml_l
# learner (fitted on y) and grid_m for the ml_m learner (fitted on d).
# Only the tree depth varies; forest size and n_jobs are single-valued.
grid_l = dict(
    max_depth=[4, 5, 6],
    n_estimators=[100],
    n_jobs=[-1],
)
# Same search space for the second learner, built as an independent copy so
# the two grids never share list objects.
grid_m = {param: list(candidates) for param, candidates in grid_l.items()}

In [None]:
## PLR simulation on data drawn from DGP_BCH2014, using random forests tuned three ways:
##   full sample  - grid search on all rows, DoubleML on the same rows
##   split sample - grid search on one half, DoubleML on the other half
##   on folds     - DoubleML's own tune(..., tune_on_folds=True)
## (This cell used to be labelled "with ACIC data", but no ACIC data is loaded here.)
n_folds = 4
no_iter = 100

# The output folder does not depend on n_obs, so it is set once and created up
# front: DataFrame.to_csv does not create missing directories, and failing on
# the first save would throw away an already completed repetition.
res_path = "../results/bch/RandomForest/"
os.makedirs(res_path, exist_ok=True)
res_columns = ["coef","2.5%","97.5%","tune_loss_mll","tune_loss_mlm","loss_Y","fs_loss_mll","fs_loss_mlm"]


def _rf_result_row(plr, y_true, d_true, tune_loss_l, tune_loss_m):
    """Build one result row (ordered like ``res_columns``) from a fitted PLR model.

    Parameters
    ----------
    plr : fitted ``dml.DoubleMLPLR`` object, fitted with ``store_predictions=True``.
    y_true, d_true : outcome and treatment values of the rows ``plr`` was fitted on.
    tune_loss_l, tune_loss_m : tuning losses to record for ml_l / ml_m
        (``np.nan`` when no such loss is available).

    Returns
    -------
    list with coefficient, both confidence bounds, the two tuning losses and
    three mean squared errors computed from the stored predictions.

    NOTE(review): loss_Y adds coef * d to the ml_l prediction. If ml_l already
    targets y given X (as in DoubleML's PLR), this is not the PLR fitted value
    coef * (d - m_hat) + l_hat - confirm this is intended.
    """
    plr_summary = plr.summary
    theta_hat = plr_summary["coef"].values[0]
    # [:,0,0] takes the first slice of both trailing axes of the stored predictions
    l_hat = plr.predictions["ml_l"][:,0,0]
    m_hat = plr.predictions["ml_m"][:,0,0]
    return [
        theta_hat,
        plr_summary["2.5 %"].values[0],
        plr_summary["97.5 %"].values[0],
        tune_loss_l,
        tune_loss_m,
        mean_squared_error(y_true, theta_hat * d_true + l_hat),
        mean_squared_error(y_true, l_hat),
        mean_squared_error(d_true, m_hat),
    ]


for n_obs in [10,50,100,250,500,1000,2000,5000]:
    # one list of rows per tuning scheme; restarted for every sample size
    res_fullsample = []
    res_splitsample = []
    res_onfolds = []
    for k in range(no_iter):
        # repetition k always starts from the same NumPy seed
        np.random.seed(k)

        # draw the data set for this repetition and wrap it in a DataFrame
        X,y,d = DGP_BCH2014(theta=0.5, n_obs=n_obs, dim_x=200)
        x_cols = [f'X{i + 1}' for i in np.arange(X.shape[1])]
        df = pd.DataFrame(np.column_stack((X, y, d)),
                            columns=x_cols + ['y', 'd'])

        # full sample, tuning
        gs_l = GridSearchCV(RandomForestRegressor(), grid_l, n_jobs=-1, scoring="neg_mean_squared_error")
        gs_l.fit(X,y)
        gs_m = GridSearchCV(RandomForestRegressor(), grid_m, n_jobs=-1, scoring="neg_mean_squared_error")
        gs_m.fit(X,d)

        ml_l = gs_l.best_estimator_
        ml_m = gs_m.best_estimator_

        # full sample, doubleml
        obj_dml_data = dml.DoubleMLData(df,y_col='y',d_cols='d')
        dml_plr = dml.DoubleMLPLR(obj_dml_data, ml_l = ml_l, ml_m = ml_m, n_folds = n_folds)
        dml_plr.fit(store_predictions=True)

        # best_score_ is a negated MSE (scoring="neg_mean_squared_error"), so flip the sign
        _res_full = _rf_result_row(dml_plr, y, d, -1 * gs_l.best_score_, -1 * gs_m.best_score_)

        # split sample, tuning: one half for the grid search, the other half for DoubleML
        df_tune, df_test = train_test_split(df, test_size= 0.5, random_state = 42*k)
        y_tune, d_tune, X_tune = df_tune["y"], df_tune["d"], df_tune.drop(columns=["y","d"])

        gs_l = GridSearchCV(RandomForestRegressor(), grid_l, n_jobs=-1, scoring="neg_mean_squared_error")
        gs_l.fit(X_tune,y_tune)
        gs_m = GridSearchCV(RandomForestRegressor(), grid_m, n_jobs=-1, scoring="neg_mean_squared_error")
        gs_m.fit(X_tune,d_tune)

        ml_l = gs_l.best_estimator_
        ml_m = gs_m.best_estimator_

        # split sample, doubleml
        obj_dml_data = dml.DoubleMLData(df_test, y_col='y', d_cols='d')
        dml_plr_split = dml.DoubleMLPLR(obj_dml_data, ml_l = ml_l, ml_m = ml_m, n_folds = n_folds)
        dml_plr_split.fit(store_predictions = True)

        _res_split = _rf_result_row(dml_plr_split, df_test["y"], df_test["d"],
                                    -1 * gs_l.best_score_, -1 * gs_m.best_score_)

        # on folds: untuned forests, tuned by DoubleML itself
        ml_l = RandomForestRegressor()
        ml_m = RandomForestRegressor()

        obj_dml_data = dml.DoubleMLData(df,y_col='y',d_cols='d')

        dml_plr_onfolds = dml.DoubleMLPLR(obj_dml_data, ml_l = ml_l, ml_m = ml_m, n_folds = n_folds)
        dml_plr_onfolds.tune({"ml_l": grid_l, "ml_m": grid_m}, tune_on_folds=True)
        dml_plr_onfolds.fit(store_predictions=True, store_models = True)

        # no tuning loss is recorded for this scheme, hence the NaN placeholders
        _res_of = _rf_result_row(dml_plr_onfolds, y, d, np.nan, np.nan)

        # add this iteration to overall results
        res_fullsample.append(_res_full)
        res_splitsample.append(_res_split)
        res_onfolds.append(_res_of)

        # checkpoint after every repetition so an interrupted run keeps its rows
        pd.DataFrame(res_fullsample, columns = res_columns).to_csv(res_path + f"{n_obs}_fullsample.csv")
        pd.DataFrame(res_splitsample, columns = res_columns).to_csv(res_path + f"{n_obs}_splitsample.csv")
        pd.DataFrame(res_onfolds, columns = res_columns).to_csv(res_path + f"{n_obs}_onfolds.csv")

## AutoML

In [None]:
## PLR simulation on data drawn from DGP_BCH2014, using FLAML (AutoML) learners tuned three ways:
##   full sample  - AutoML on all rows, DoubleML on the same rows
##   split sample - AutoML on one half, DoubleML on the other half
##   on folds     - FlamlRegressorDoubleML learners passed directly to DoubleML
## (This cell used to be labelled "with ACIC data", but no ACIC data is loaded here.)
n_folds = 4
no_iter = 100
t = 60              # FLAML time_budget handed to every AutoML.fit call
max_attempts = 10   # upper bound on retries after an AttributeError (was unbounded)

# NOTE(review): this path has a different root than the "../results/bch/..."
# folders used by the XGBoost and RandomForest cells - confirm the intended
# working directory. The folder is created up front because DataFrame.to_csv
# does not create missing directories.
res_path = "simulations/results/bch/FLAML/"
os.makedirs(res_path, exist_ok=True)
res_columns = ["coef","2.5%","97.5%","tune_loss_mll","tune_loss_mlm","loss_Y","fs_loss_mll","fs_loss_mlm"]


def _fit_flaml_pair(X_fit, y_fit, d_fit, time_budget, attempts):
    """Run FLAML once for y and once for d, retrying on AttributeError.

    The original code retried forever and silently; here every failure is
    printed and the loop gives up after ``attempts`` tries.
    Presumably the AttributeError comes from ``automl.model`` being unset when
    FLAML finds no model within the budget - TODO confirm.

    Returns
    -------
    (automl_y, automl_d, ml_l, ml_m) : the two fitted AutoML objects and the
    estimators read from their ``model.estimator`` attribute.

    Raises
    ------
    RuntimeError if every attempt ended in an AttributeError.
    """
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            automl_y = AutoML()
            automl_y.fit(X_fit, y_fit, task="regression", time_budget=time_budget, metric="mse", verbose=False, estimator_list = None)
            automl_d = AutoML()
            automl_d.fit(X_fit, d_fit, task="regression", time_budget=time_budget, metric="mse", verbose=False, estimator_list = None)
            return automl_y, automl_d, automl_y.model.estimator, automl_d.model.estimator
        except AttributeError as err:
            last_error = err
            print(f"FLAML tuning attempt {attempt}/{attempts} failed: {err}")
    raise RuntimeError(f"FLAML tuning failed {attempts} times in a row") from last_error


def _flaml_result_row(plr, y_true, d_true, tune_loss_l, tune_loss_m):
    """Build one result row (ordered like ``res_columns``) from a fitted PLR model.

    ``plr`` must have been fitted with ``store_predictions=True``; ``y_true`` and
    ``d_true`` are the outcome and treatment values of the rows it was fitted on.

    NOTE(review): loss_Y adds coef * d to the ml_l prediction. If ml_l already
    targets y given X (as in DoubleML's PLR), this is not the PLR fitted value
    coef * (d - m_hat) + l_hat - confirm this is intended.
    """
    plr_summary = plr.summary
    theta_hat = plr_summary["coef"].values[0]
    # [:,0,0] takes the first slice of both trailing axes of the stored predictions
    l_hat = plr.predictions["ml_l"][:,0,0]
    m_hat = plr.predictions["ml_m"][:,0,0]
    return [
        theta_hat,
        plr_summary["2.5 %"].values[0],
        plr_summary["97.5 %"].values[0],
        tune_loss_l,
        tune_loss_m,
        mean_squared_error(y_true, theta_hat * d_true + l_hat),
        mean_squared_error(y_true, l_hat),
        mean_squared_error(d_true, m_hat),
    ]


for n_obs in [10,50,100,250,500,1000,2000,5000,10000,200000]:
    # one list of rows per tuning scheme; restarted for every sample size
    res_fullsample = []
    res_splitsample = []
    res_onfolds = []
    for k in range(no_iter):
        # Seed before drawing the data, as the XGBoost and RandomForest cells do;
        # this was missing here. Presumably DGP_BCH2014 draws from NumPy's
        # global RNG - TODO confirm.
        np.random.seed(k)

        # draw the data set for this repetition and wrap it in a DataFrame
        X,y,d = DGP_BCH2014(theta=0.5, n_obs=n_obs, dim_x=200)
        x_cols = [f'X{i + 1}' for i in np.arange(X.shape[1])]
        df = pd.DataFrame(np.column_stack((X, y, d)),
                        columns=x_cols + ['y', 'd'])

        # full sample, tuning
        automl_y, automl_d, ml_l, ml_m = _fit_flaml_pair(X, y, d, t, max_attempts)

        # full sample, doubleml
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df,y_col='y',d_cols='d')
        dml_plr_automl = dml.DoubleMLPLR(obj_dml_data, ml_l = ml_l, ml_m = ml_m, n_folds = n_folds)
        dml_plr_automl.fit(store_predictions=True)

        _res_full = _flaml_result_row(dml_plr_automl, y, d, automl_y.best_loss, automl_d.best_loss)

        # split sample, tuning: one half for AutoML, the other half for DoubleML
        df_tune, df_test = train_test_split(df, test_size= 0.5, random_state = 42*k)
        y_tune, d_tune, X_tune = df_tune["y"], df_tune["d"], df_tune.drop(columns=["y","d"])

        automl_y, automl_d, ml_l, ml_m = _fit_flaml_pair(X_tune, y_tune, d_tune, t, max_attempts)

        # split sample, doubleml
        np.random.seed(2*k)
        obj_dml_data = dml.DoubleMLData(df_test, y_col='y', d_cols='d')
        dml_plr_automl_split = dml.DoubleMLPLR(obj_dml_data, ml_l = ml_l, ml_m = ml_m, n_folds = n_folds)
        dml_plr_automl_split.fit(store_predictions = True)

        _res_split = _flaml_result_row(dml_plr_automl_split, df_test["y"], df_test["d"],
                                       automl_y.best_loss, automl_d.best_loss)

        # on folds: same bounded retry as the tuning helper, around the whole fit
        last_error = None
        for attempt in range(1, max_attempts + 1):
            try:
                # presumably t/4 spreads the budget t over the 4 folds - TODO confirm
                ml_l = FlamlRegressorDoubleML(time = (t/4), metric="mse", estimator_list = None)
                ml_m = FlamlRegressorDoubleML(time = (t/4), metric="mse", estimator_list = None)

                obj_dml_data = dml.DoubleMLData(df,y_col='y',d_cols='d')

                np.random.seed(3*k)
                dml_plr_automl_onfolds = dml.DoubleMLPLR(obj_dml_data, ml_l = ml_l, ml_m = ml_m, n_folds = n_folds)
                dml_plr_automl_onfolds.fit(store_predictions=True, store_models = True)
                break
            except AttributeError as err:
                last_error = err
                print(f"FLAML on-folds attempt {attempt}/{max_attempts} failed: {err}")
        else:
            raise RuntimeError(f"FLAML on-folds fit failed {max_attempts} times in a row") from last_error

        # tuning loss for this scheme: mean best_loss over the stored models of each learner
        _res_of = _flaml_result_row(
            dml_plr_automl_onfolds, y, d,
            np.mean([i.auto_ml.best_loss for i in dml_plr_automl_onfolds.models['ml_l']["d"][0]]),
            np.mean([i.auto_ml.best_loss for i in dml_plr_automl_onfolds.models['ml_m']["d"][0]]),
        )

        # add this iteration to overall results
        res_fullsample.append(_res_full)
        res_splitsample.append(_res_split)
        res_onfolds.append(_res_of)

        # checkpoint after every repetition so an interrupted run keeps its rows
        pd.DataFrame(res_fullsample, columns = res_columns).to_csv(res_path + f"{n_obs}_fullsample.csv")
        pd.DataFrame(res_splitsample, columns = res_columns).to_csv(res_path + f"{n_obs}_splitsample.csv")
        pd.DataFrame(res_onfolds, columns = res_columns).to_csv(res_path + f"{n_obs}_onfolds.csv")
