In [1]:
import os

import doubleml as dml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.linear_model import LassoCV, LogisticRegressionCV, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier

In [8]:
from sklearn.utils.multiclass import unique_labels
class LassoCVClassifier:
    """Linear-probability "classifier" backed by a cross-validated Lasso.

    The wrapped ``LassoCV`` regresses the class label on the features and its
    fitted values are reported as the probability of the second class.  Only
    the methods defined below are provided (``fit``, ``predict_proba``,
    ``get_params`` and ``set_params``); there is no ``predict``.

    All constructor arguments are forwarded unchanged to ``LassoCV``.

    NOTE(review): the column layout of ``predict_proba`` presumes a binary
    target coded 0/1 -- nothing here enforces that; confirm against callers.
    NOTE(review): recent scikit-learn releases may identify classifiers via
    estimator tags rather than ``_estimator_type`` -- confirm with the
    installed version.
    """

    # Marks the wrapper as a classifier for code that inspects this attribute.
    _estimator_type = 'classifier'

    def __init__(self, *args, **kwargs):
        self.lasso = LassoCV(*args, **kwargs)

    def set_params(self, **params):
        """Forward ``params`` to the wrapped ``LassoCV`` and return ``self``."""
        self.lasso.set_params(**params)
        return self

    def get_params(self, deep=True):
        """Return the parameters of the wrapped ``LassoCV``."""
        return self.lasso.get_params(deep)

    def fit(self, X, y):
        """Fit the wrapped Lasso on ``(X, y)`` and record the labels seen.

        Returns ``self``.
        """
        self.classes_ = unique_labels(y)
        self.lasso.fit(X, y)
        return self

    def predict_proba(self, x):
        """Return an ``(n_samples, 2)`` array ``[1 - p, p]``.

        ``p`` is the Lasso prediction clipped to ``[0, 1]``: a linear model
        can produce fitted values outside the unit interval, which are not
        valid probabilities.
        """
        preds = np.clip(self.lasso.predict(x), 0.0, 1.0)
        return np.c_[1 - preds, preds]
    

class LassoClassifier:
    """Linear-probability "classifier" backed by a fixed-penalty Lasso.

    The wrapped ``Lasso`` regresses the class label on the features and its
    fitted values are reported as the probability of the second class.  Only
    the methods defined below are provided (``fit``, ``predict_proba``,
    ``get_params`` and ``set_params``); there is no ``predict``.

    All constructor arguments (e.g. ``alpha``) are forwarded unchanged to
    ``Lasso``.

    NOTE(review): the column layout of ``predict_proba`` presumes a binary
    target coded 0/1 -- nothing here enforces that; confirm against callers.
    NOTE(review): recent scikit-learn releases may identify classifiers via
    estimator tags rather than ``_estimator_type`` -- confirm with the
    installed version.
    """

    # Marks the wrapper as a classifier for code that inspects this attribute.
    _estimator_type = 'classifier'

    def __init__(self, *args, **kwargs):
        self.lasso = Lasso(*args, **kwargs)

    def set_params(self, **params):
        """Forward ``params`` to the wrapped ``Lasso`` and return ``self``."""
        self.lasso.set_params(**params)
        return self

    def get_params(self, deep=True):
        """Return the parameters of the wrapped ``Lasso``."""
        return self.lasso.get_params(deep)

    def fit(self, X, y):
        """Fit the wrapped Lasso on ``(X, y)`` and record the labels seen.

        Returns ``self``.
        """
        self.classes_ = unique_labels(y)
        self.lasso.fit(X, y)
        return self

    def predict_proba(self, x):
        """Return an ``(n_samples, 2)`` array ``[1 - p, p]``.

        ``p`` is the Lasso prediction clipped to ``[0, 1]``: a linear model
        can produce fitted values outside the unit interval, which are not
        valid probabilities.
        """
        preds = np.clip(self.lasso.predict(x), 0.0, 1.0)
        return np.c_[1 - preds, preds]

## Lasso, Full Sample

In [40]:
# Lasso with penalties tuned once on the full sample of each dataset; the
# tuned penalties are then used as fixed learners in DoubleMLIRM.
n_folds = 4
no_iter = 100
res_columns = ["coef","2.5%","97.5%","fs_loss_mll","fs_loss_mlm"]

# The output directory is the same for every scenario; create it up front so
# the first checkpoint write cannot fail when the folder does not exist yet.
res_path = "../results/acic/irm/LassoCV/"
os.makedirs(res_path, exist_ok=True)

for sc in range(1, 17):
    res_fullsample = []
    for k in range(no_iter):
        # load data (dataset files are numbered from 1)
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        y, d, X = df["Y"], df["A"], df.drop(columns=["Y","A"])

        # full sample, tuning: cross-validate each penalty on all rows, then
        # build plain Lasso learners with that penalty fixed.
        # NOTE(review): the outcome penalty is tuned on the covariates plus
        # the treatment column A, the propensity penalty on the covariates
        # only -- confirm this matches the design DoubleML fits ml_g on.
        lasso_g = LassoCV(n_jobs=-1)
        lasso_g.fit(df.drop(columns=["Y"]), y)
        ml_g = Lasso(alpha=lasso_g.alpha_)
        lasso_m = LassoCV(n_jobs=-1)
        lasso_m.fit(X, d)
        ml_m = LassoClassifier(alpha=lasso_m.alpha_)

        # full sample, doubleml
        # Seed NumPy's global RNG per dataset before DoubleML is built
        # (presumably what makes the sample splitting reproducible).
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df,y_col='Y',d_cols='A')
        dml_irm_automl = dml.DoubleMLIRM(obj_dml_data, ml_g = ml_g, ml_m = ml_m, n_folds = n_folds, trimming_threshold=0.025)
        dml_irm_automl.fit(store_predictions=True)

        summary = dml_irm_automl.summary
        preds = dml_irm_automl.predictions

        # Outcome prediction for the arm each unit was actually observed in.
        treat_ind = (df["A"] == 1)
        ml_g_pred = treat_ind * preds["ml_g1"][:,0,0] + (1 - treat_ind) * preds["ml_g0"][:,0,0]

        # One row per dataset: estimate, CI bounds, outcome MSE, propensity log loss.
        res_fullsample.append([
            summary["coef"].values[0],
            summary["2.5 %"].values[0],
            summary["97.5 %"].values[0],
            mean_squared_error(y, ml_g_pred),
            log_loss(d, preds["ml_m"][:,0,0]),
        ])

        # save current result (rewritten after every dataset as a checkpoint)
        pd.DataFrame(res_fullsample, columns = res_columns).to_csv(res_path + f"Scenario{sc}_LassoCV_fullsample.csv")

## Random Forest, default values

In [6]:
## with ACIC data
# Random forests with their default hyperparameters (no tuning step) as
# nuisance learners in DoubleMLIRM.
n_folds = 4
no_iter = 100
res_columns = ["coef","2.5%","97.5%","fs_loss_mll","fs_loss_mlm"]

# The output directory is the same for every scenario; create it up front so
# the first checkpoint write cannot fail when the folder does not exist yet.
res_path = "../results/acic/irm/RFdef/"
os.makedirs(res_path, exist_ok=True)

for sc in range(1, 17):
    res_fullsample = []
    for k in range(no_iter):
        # load data (dataset files are numbered from 1)
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        y, d = df["Y"], df["A"]

        # default learners, no tuning
        ml_g = RandomForestRegressor(n_jobs=-1)
        ml_m = RandomForestClassifier(n_jobs=-1)

        # full sample, doubleml
        # Seed NumPy's global RNG per dataset before DoubleML is built
        # (presumably what makes the sample splitting reproducible).
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df,y_col='Y',d_cols='A')
        dml_irm_automl = dml.DoubleMLIRM(obj_dml_data, ml_g = ml_g, ml_m = ml_m, n_folds = n_folds, trimming_threshold = 0.025)
        dml_irm_automl.fit(store_predictions=True)

        summary = dml_irm_automl.summary
        preds = dml_irm_automl.predictions

        # Outcome prediction for the arm each unit was actually observed in.
        treat_ind = (df["A"] == 1)
        ml_g_pred = treat_ind * preds["ml_g1"][:,0,0] + (1 - treat_ind) * preds["ml_g0"][:,0,0]

        # One row per dataset: estimate, CI bounds, outcome MSE, propensity log loss.
        res_fullsample.append([
            summary["coef"].values[0],
            summary["2.5 %"].values[0],
            summary["97.5 %"].values[0],
            mean_squared_error(y, ml_g_pred),
            log_loss(d, preds["ml_m"][:,0,0]),
        ])

        # save current result (rewritten after every dataset as a checkpoint)
        pd.DataFrame(res_fullsample, columns = res_columns).to_csv(res_path + f"Scenario{sc}_RandomForestdefault.csv")

## Boosting, default values

In [5]:
## with ACIC data
# XGBoost with its default hyperparameters (no tuning step) as nuisance
# learners in DoubleMLIRM.
n_folds = 4
no_iter = 100
res_columns = ["coef","2.5%","97.5%","fs_loss_mll","fs_loss_mlm"]

# The output directory is the same for every scenario; create it up front so
# the first checkpoint write cannot fail when the folder does not exist yet.
res_path = "../results/acic/irm/XGBdef/"
os.makedirs(res_path, exist_ok=True)

for sc in range(1, 17):
    res_fullsample = []
    for k in range(no_iter):
        # load data (dataset files are numbered from 1)
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        y, d = df["Y"], df["A"]

        # default learners, no tuning
        ml_g = XGBRegressor()
        ml_m = XGBClassifier()

        # full sample, doubleml
        # Seed NumPy's global RNG per dataset before DoubleML is built
        # (presumably what makes the sample splitting reproducible).
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df,y_col='Y',d_cols='A')
        dml_irm_automl = dml.DoubleMLIRM(obj_dml_data, ml_g = ml_g, ml_m = ml_m, n_folds = n_folds, trimming_threshold = 0.025)
        dml_irm_automl.fit(store_predictions=True)

        summary = dml_irm_automl.summary
        preds = dml_irm_automl.predictions

        # Outcome prediction for the arm each unit was actually observed in.
        treat_ind = (df["A"] == 1)
        ml_g_pred = treat_ind * preds["ml_g1"][:,0,0] + (1 - treat_ind) * preds["ml_g0"][:,0,0]

        # One row per dataset: estimate, CI bounds, outcome MSE, propensity log loss.
        res_fullsample.append([
            summary["coef"].values[0],
            summary["2.5 %"].values[0],
            summary["97.5 %"].values[0],
            mean_squared_error(y, ml_g_pred),
            log_loss(d, preds["ml_m"][:,0,0]),
        ])

        # save current result (rewritten after every dataset as a checkpoint)
        pd.DataFrame(res_fullsample, columns = res_columns).to_csv(res_path + f"Scenario{sc}_XGBoostdefault.csv")

## Lasso, Tune on the folds

In [None]:
## with ACIC data
# Cross-validated Lasso learners are handed to DoubleMLIRM directly, so the
# penalty is chosen by LassoCV's own cross-validation whenever DoubleML fits
# a learner (presumably once per training fold -- the "on folds" variant).
n_folds = 4
no_iter = 100
res_columns = ["coef","2.5%","97.5%","fs_loss_mll","fs_loss_mlm"]

# The output directory is the same for every scenario; create it up front so
# the first checkpoint write cannot fail when the folder does not exist yet.
res_path = "../results/acic/irm/LassoCV/"
os.makedirs(res_path, exist_ok=True)

for sc in range(1, 17):
    res_fullsample = []
    for k in range(no_iter):
        # load data (dataset files are numbered from 1)
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        y, d = df["Y"], df["A"]

        # self-tuning learners; no separate tuning step before DoubleML
        ml_g = LassoCV(n_jobs=-1)
        ml_m = LassoCVClassifier(n_jobs=-1)

        # full sample, doubleml
        # Seed NumPy's global RNG per dataset before DoubleML is built
        # (presumably what makes the sample splitting reproducible).
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df,y_col='Y',d_cols='A')
        dml_irm_automl = dml.DoubleMLIRM(obj_dml_data, ml_g = ml_g, ml_m = ml_m, n_folds = n_folds, trimming_threshold=0.025)
        dml_irm_automl.fit(store_predictions=True, store_models=True)

        summary = dml_irm_automl.summary
        preds = dml_irm_automl.predictions

        # Outcome prediction for the arm each unit was actually observed in.
        treat_ind = (df["A"] == 1)
        ml_g_pred = treat_ind * preds["ml_g1"][:,0,0] + (1 - treat_ind) * preds["ml_g0"][:,0,0]

        # One row per dataset: estimate, CI bounds, outcome MSE, propensity log loss.
        res_fullsample.append([
            summary["coef"].values[0],
            summary["2.5 %"].values[0],
            summary["97.5 %"].values[0],
            mean_squared_error(y, ml_g_pred),
            log_loss(d, preds["ml_m"][:,0,0]),
        ])

        # save current result (rewritten after every dataset as a checkpoint)
        pd.DataFrame(res_fullsample, columns = res_columns).to_csv(res_path + f"Scenario{sc}_LassoCV_onfolds.csv")

## Lasso, Split the sample

In [30]:
## with ACIC data
# Sample splitting: the Lasso penalties are tuned on one half of each
# dataset and DoubleMLIRM is then estimated on the other half only.
n_folds = 4
no_iter = 100
res_columns = ["coef","2.5%","97.5%","fs_loss_mll","fs_loss_mlm"]

# The output directory is the same for every scenario; create it up front so
# the first checkpoint write cannot fail when the folder does not exist yet.
res_path = "../results/acic/irm/LassoCV/"
os.makedirs(res_path, exist_ok=True)

for sc in range(1, 17):
    res_fullsample = []
    for k in range(no_iter):
        # load data (dataset files are numbered from 1) and split it 50/50;
        # the split seed is fixed at 42 for every dataset
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        df_tune, df_test = train_test_split(df, test_size= 0.5, random_state = 42)
        y_tune, d_tune, X_tune = df_tune["Y"], df_tune["A"], df_tune.drop(columns=["Y","A"])

        # tuning half: cross-validate each penalty, then build plain Lasso
        # learners with that penalty fixed.
        # NOTE(review): the outcome penalty is tuned on the covariates plus
        # the treatment column A, the propensity penalty on the covariates
        # only -- confirm this matches the design DoubleML fits ml_g on.
        lasso_g = LassoCV(n_jobs=-1)
        lasso_g.fit(df_tune.drop(columns=["Y"]), y_tune)
        ml_g = Lasso(alpha=lasso_g.alpha_)
        log_m = LassoCV(n_jobs=-1)
        log_m.fit(X_tune, d_tune)
        ml_m = LassoClassifier(alpha=log_m.alpha_)

        # estimation half, doubleml
        # Seed NumPy's global RNG per dataset before DoubleML is built
        # (presumably what makes the sample splitting reproducible).
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df_test,y_col='Y',d_cols='A')
        dml_irm_automl = dml.DoubleMLIRM(obj_dml_data, ml_g = ml_g, ml_m = ml_m, n_folds = n_folds, trimming_threshold=0.025)
        dml_irm_automl.fit(store_predictions=True, store_models=True)

        summary = dml_irm_automl.summary
        preds = dml_irm_automl.predictions

        # Outcome prediction for the arm each unit was actually observed in;
        # all metrics are computed on the estimation half only.
        treat_ind = (df_test["A"] == 1)
        ml_g_pred = treat_ind * preds["ml_g1"][:,0,0] + (1 - treat_ind) * preds["ml_g0"][:,0,0]

        # One row per dataset: estimate, CI bounds, outcome MSE, propensity log loss.
        res_fullsample.append([
            summary["coef"].values[0],
            summary["2.5 %"].values[0],
            summary["97.5 %"].values[0],
            mean_squared_error(df_test["Y"], ml_g_pred),
            log_loss(df_test["A"], preds["ml_m"][:,0,0]),
        ])

        # save current result (rewritten after every dataset as a checkpoint)
        pd.DataFrame(res_fullsample, columns = res_columns).to_csv(res_path + f"Scenario{sc}_LassoCV_splitsample.csv")