In [1]:
import doubleml as dml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.linear_model import LassoCV, Lasso
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier

## Lasso, tune on the full sample

In [10]:
## with ACIC data
# Lasso tuned on the full sample: for each ACIC scenario and each simulated
# dataset, LassoCV picks the penalty on all observations, then a plain Lasso
# with that fixed penalty is used as the nuisance learner inside DoubleMLPLR.
n_folds = 4
no_iter = 100

# Output location and CSV layout are the same for every scenario.
res_path = "../results/acic/plr/LassoCV/"
_columns = ["coef","2.5%","97.5%","loss_Y","fs_loss_mll","fs_loss_mlm"]

for sc in range(1, 17):
    # one row per processed dataset of this scenario
    res_fullsample = []
    for k in range(no_iter):
        # load data (dataset files are numbered from 1)
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        y = df["Y"]
        d = df["A"]
        X = df.drop(columns=["Y","A"])

        # tune the penalty for the outcome model on the full sample
        lasso_l = LassoCV().fit(X, y)
        ml_l = Lasso(alpha=lasso_l.alpha_)

        # tune the penalty for the treatment model on the full sample
        lasso_m = LassoCV().fit(X, d)
        ml_m = Lasso(alpha=lasso_m.alpha_)

        # estimate the PLR on the full sample; seeding with k makes the
        # DoubleML sample splitting repeatable for each dataset
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df, y_col='Y', d_cols='A')
        dml_plr_automl = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds)
        dml_plr_automl.fit(store_predictions=True)

        # read the fitted quantities once
        _summary = dml_plr_automl.summary
        _theta = _summary["coef"].values[0]
        # presumably the stored predictions are indexed (obs, repetition, treatment) - TODO confirm
        _l_hat = dml_plr_automl.predictions["ml_l"][:,0,0]
        _m_hat = dml_plr_automl.predictions["ml_m"][:,0,0]

        # NOTE(review): ml_m is a Lasso regressor, so _m_hat is not guaranteed
        # to lie in [0, 1]; confirm log_loss is the intended first-stage metric.
        _res_full = [
            _theta,
            _summary["2.5 %"].values[0],
            _summary["97.5 %"].values[0],
            mean_squared_error(y, _theta * d + _l_hat),
            mean_squared_error(y, _l_hat),
            log_loss(d, _m_hat),
        ]

        # add this iteration to overall results
        res_fullsample.append(_res_full)

        # checkpoint: rewrite the scenario file after every dataset
        pd.DataFrame(res_fullsample, columns=_columns).to_csv(res_path + f"Scenario{sc}_LassoCV_fullsample.csv")

## Lasso, tune on the folds

In [4]:
## with ACIC data
# Lasso tuned on the folds: the LassoCV learners are handed to DoubleMLPLR
# without a pre-selected penalty, so no tuning step runs on the full sample
# before the DoubleML fit.
n_folds = 4
no_iter = 100

# Output location and CSV layout are the same for every scenario.
res_path = "../results/acic/plr/LassoCV/"
_columns = ["coef","2.5%","97.5%","loss_Y","fs_loss_mll","fs_loss_mlm"]

for sc in range(1, 17):
    # one row per processed dataset of this scenario
    res_fullsample = []
    for k in range(no_iter):
        # load data (dataset files are numbered from 1)
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        y = df["Y"]
        d = df["A"]
        X = df.drop(columns=["Y","A"])

        # cross-validated learners passed to DoubleML as they are
        ml_l = LassoCV()
        ml_m = LassoCV()

        # estimate the PLR; seeding with k makes the DoubleML sample
        # splitting repeatable for each dataset
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df, y_col='Y', d_cols='A')
        dml_plr_automl = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds)
        dml_plr_automl.fit(store_predictions=True)

        # read the fitted quantities once
        _summary = dml_plr_automl.summary
        _theta = _summary["coef"].values[0]
        # presumably the stored predictions are indexed (obs, repetition, treatment) - TODO confirm
        _l_hat = dml_plr_automl.predictions["ml_l"][:,0,0]
        _m_hat = dml_plr_automl.predictions["ml_m"][:,0,0]

        # NOTE(review): ml_m is a LassoCV regressor, so _m_hat is not guaranteed
        # to lie in [0, 1]; confirm log_loss is the intended first-stage metric.
        _res_full = [
            _theta,
            _summary["2.5 %"].values[0],
            _summary["97.5 %"].values[0],
            mean_squared_error(y, _theta * d + _l_hat),
            mean_squared_error(y, _l_hat),
            log_loss(d, _m_hat),
        ]

        # add this iteration to overall results
        res_fullsample.append(_res_full)

        # checkpoint: rewrite the scenario file after every dataset
        pd.DataFrame(res_fullsample, columns=_columns).to_csv(res_path + f"Scenario{sc}_LassoCV_onfolds.csv")

## Lasso, split the sample

In [5]:
## with ACIC data
# Lasso with sample splitting: each dataset is cut in half; LassoCV picks the
# penalties on one half and DoubleMLPLR is estimated on the other half with
# those penalties held fixed.
n_folds = 4
no_iter = 100

# Output location and CSV layout are the same for every scenario.
res_path = "../results/acic/plr/LassoCV/"
_columns = ["coef","2.5%","97.5%","loss_Y","fs_loss_mll","fs_loss_mlm"]

for sc in range(1, 17):
    # one row per processed dataset of this scenario
    res_fullsample = []
    for k in range(no_iter):
        # load data (dataset files are numbered from 1)
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")

        # 50/50 split into a tuning half and an estimation half; the split
        # seed depends on k so every dataset gets its own repeatable split
        df_tune, df_test = train_test_split(df, test_size=0.5, random_state=42*k)
        y_tune = df_tune["Y"]
        d_tune = df_tune["A"]
        X_tune = df_tune.drop(columns=["Y","A"])

        # tune the penalty for the outcome model on the tuning half only
        lasso_l = LassoCV().fit(X_tune, y_tune)
        ml_l = Lasso(alpha=lasso_l.alpha_)

        # tune the penalty for the treatment model on the tuning half only
        lasso_m = LassoCV().fit(X_tune, d_tune)
        ml_m = Lasso(alpha=lasso_m.alpha_)

        # estimate the PLR on the estimation half; seeding with k makes the
        # DoubleML sample splitting repeatable for each dataset
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df_test, y_col='Y', d_cols='A')
        dml_plr_automl = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds)
        dml_plr_automl.fit(store_predictions=True)

        # read the fitted quantities once
        _summary = dml_plr_automl.summary
        _theta = _summary["coef"].values[0]
        # presumably the stored predictions are indexed (obs, repetition, treatment) - TODO confirm
        _l_hat = dml_plr_automl.predictions["ml_l"][:,0,0]
        _m_hat = dml_plr_automl.predictions["ml_m"][:,0,0]

        # losses are evaluated against the estimation half, which is the data
        # the predictions were produced for
        # NOTE(review): ml_m is a Lasso regressor, so _m_hat is not guaranteed
        # to lie in [0, 1]; confirm log_loss is the intended first-stage metric.
        _res_full = [
            _theta,
            _summary["2.5 %"].values[0],
            _summary["97.5 %"].values[0],
            mean_squared_error(df_test["Y"], _theta * df_test["A"] + _l_hat),
            mean_squared_error(df_test["Y"], _l_hat),
            log_loss(df_test["A"], _m_hat),
        ]

        # add this iteration to overall results
        res_fullsample.append(_res_full)

        # checkpoint: rewrite the scenario file after every dataset
        pd.DataFrame(res_fullsample, columns=_columns).to_csv(res_path + f"Scenario{sc}_LassoCV_splitsample.csv")

## Random Forest, default parameters

In [4]:
## with ACIC data
# Random forest baseline: DoubleMLPLR with scikit-learn random forests at
# their default hyperparameters, no tuning step.
n_folds = 4
no_iter = 100

# Output location and CSV layout are the same for every scenario.
res_path = "../results/acic/plr/RFdef/"
_columns = ["coef","2.5%","97.5%","fs_loss_mll","fs_loss_mlm"]

for sc in range(1, 17):
    # one row per processed dataset of this scenario
    res_fullsample = []
    for k in range(no_iter):
        # load data (dataset files are numbered from 1)
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        y = df["Y"]
        d = df["A"]
        X = df.drop(columns=["Y","A"])

        # default learners: regressor for the outcome, classifier for the treatment
        ml_l = RandomForestRegressor()
        ml_m = RandomForestClassifier()

        # estimate the PLR on the full sample; the global NumPy seed is set
        # to k right before the model is built and fitted
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df, y_col='Y', d_cols='A')
        dml_plr_automl = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds)
        dml_plr_automl.fit(store_predictions=True)

        # read the fitted quantities once
        _summary = dml_plr_automl.summary
        # presumably the stored predictions are indexed (obs, repetition, treatment) - TODO confirm
        _l_hat = dml_plr_automl.predictions["ml_l"][:,0,0]
        _m_hat = dml_plr_automl.predictions["ml_m"][:,0,0]

        # estimate, confidence bounds, then the two first-stage losses
        _res_full = [
            _summary["coef"].values[0],
            _summary["2.5 %"].values[0],
            _summary["97.5 %"].values[0],
            mean_squared_error(y, _l_hat),
            log_loss(d, _m_hat),
        ]

        # add this iteration to overall results
        res_fullsample.append(_res_full)

        # checkpoint: rewrite the scenario file after every dataset
        pd.DataFrame(res_fullsample, columns=_columns).to_csv(res_path + f"Scenario{sc}_RandomForestdefault.csv")

## Boosting, default parameters

In [5]:
## with ACIC data
# Boosting baseline: DoubleMLPLR with XGBoost learners at their default
# hyperparameters, no tuning step.
n_folds = 4
no_iter = 100

# Output location and CSV layout are the same for every scenario.
res_path = "../results/acic/plr/XGBdef/"
_columns = ["coef","2.5%","97.5%","fs_loss_mll","fs_loss_mlm"]

for sc in range(1, 17):
    # one row per processed dataset of this scenario
    res_fullsample = []
    for k in range(no_iter):
        # load data (dataset files are numbered from 1)
        # The path previously read "..dgp/acic/...", missing the separator
        # after "..", so it pointed at a directory literally named "..dgp".
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        y = df["Y"]
        d = df["A"]
        X = df.drop(columns=["Y","A"])

        # default learners: regressor for the outcome, classifier for the treatment
        ml_l = XGBRegressor()
        ml_m = XGBClassifier()

        # estimate the PLR on the full sample; seeding with k makes the
        # DoubleML sample splitting repeatable for each dataset
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df, y_col='Y', d_cols='A')
        dml_plr_automl = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds)
        dml_plr_automl.fit(store_predictions=True)

        # read the fitted quantities once
        _summary = dml_plr_automl.summary
        # presumably the stored predictions are indexed (obs, repetition, treatment) - TODO confirm
        _l_hat = dml_plr_automl.predictions["ml_l"][:,0,0]
        _m_hat = dml_plr_automl.predictions["ml_m"][:,0,0]

        # estimate, confidence bounds, then the two first-stage losses
        _res_full = [
            _summary["coef"].values[0],
            _summary["2.5 %"].values[0],
            _summary["97.5 %"].values[0],
            mean_squared_error(y, _l_hat),
            log_loss(d, _m_hat),
        ]

        # add this iteration to overall results
        res_fullsample.append(_res_full)

        # checkpoint: rewrite the scenario file after every dataset
        pd.DataFrame(res_fullsample, columns=_columns).to_csv(res_path + f"Scenario{sc}_XGBoostdefault.csv")