In [1]:
import os

import doubleml as dml
import numpy as np
from flaml import AutoML
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from ..doubleml_flaml_api.doubleml_flaml_api import FlamlRegressorDoubleML, FlamlClassifierDoubleML

In [10]:
n_folds = 4
no_iter = 100
scenarios = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]
for sc in scenarios:
    res_path = f"../results/acic/irm/Scenario{sc}/"
    for t in [1,4,7,10,20,40,60]:
        res_fullsample = []
        res_splitsample = []
        res_onfolds = []
        for k in range(no_iter):
            # for saving each iteration
            _res_full = []
            _res_split = []
            _res_of = []
            
            #load data
            df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
            y, d, X = df["Y"], df["A"], df.drop(columns=["Y","A"])
            
            
            # full sample, tuning
            while True:
                try:
                    automl_y = AutoML()
                    automl_y.fit(df.drop(columns=["Y"]), y, task="regression", time_budget=t, metric="mse", verbose=False, estimator_list = None)
                    automl_d = AutoML()
                    automl_d.fit(X, d, task="classification", time_budget=t, metric="log_loss", verbose=False, estimator_list = None)
            
                    ml_g = automl_y.model.estimator
                    ml_m = automl_d.model.estimator
                    break
                except AttributeError:
                    pass
                    
            # full sample, doubleml
            np.random.seed(k)
            obj_dml_data = dml.DoubleMLData(df,y_col='Y',d_cols='A')
            dml_irm_automl = dml.DoubleMLIRM(obj_dml_data, ml_g = ml_g, ml_m = ml_m, n_folds = n_folds, trimming_threshold=0.025)
            dml_irm_automl.fit(store_predictions=True)
        
            _res_full.append(dml_irm_automl.summary["coef"].values[0])
            _res_full.append(dml_irm_automl.summary["2.5 %"].values[0])
            _res_full.append(dml_irm_automl.summary["97.5 %"].values[0])
            _res_full.append(automl_y.best_loss)
            _res_full.append(automl_d.best_loss)
            treat_ind = (df["A"] == 1)
            ml_g_pred = treat_ind * dml_irm_automl.predictions["ml_g1"][:,0,0] + (1 - treat_ind) * dml_irm_automl.predictions["ml_g0"][:,0,0]
            _res_full.append(mean_squared_error(y, ml_g_pred))
            _res_full.append(log_loss(d, dml_irm_automl.predictions["ml_m"][:,0,0]))
            
            
            # split sample, tuning
            df_tune, df_test = train_test_split(df, test_size= 0.5, random_state = 42)
            y_tune, d_tune, X_tune = df_tune["Y"], df_tune["A"], df_tune.drop(columns=["Y","A"])
            
            while True:
                try:
                    automl_y = AutoML()
                    automl_y.fit(df_tune.drop(columns=["Y"]), y_tune, task="regression", time_budget=t, metric= "mse", verbose=False)
                    automl_d = AutoML()
                    automl_d.fit(X_tune, d_tune, task="classification", time_budget=t, metric= "log_loss", verbose=False)
            
                    ml_g = automl_y.model.estimator
                    ml_m = automl_d.model.estimator
                    break
                except AttributeError:
                    pass

            # split sample, doubleml
            np.random.seed(2*k)
            obj_dml_data = dml.DoubleMLData(df_test, y_col='Y', d_cols='A')
            dml_irm_automl_split = dml.DoubleMLIRM(obj_dml_data, ml_g = ml_g, ml_m = ml_m, n_folds = n_folds, trimming_threshold=0.025)
            dml_irm_automl_split.fit(store_predictions = True)

            _res_split.append(dml_irm_automl_split.summary["coef"].values[0])
            _res_split.append(dml_irm_automl_split.summary["2.5 %"].values[0])
            _res_split.append(dml_irm_automl_split.summary["97.5 %"].values[0])
            _res_split.append(automl_y.best_loss)
            _res_split.append(automl_d.best_loss)
            treat_ind = (df_test["A"] == 1)
            ml_g_pred = treat_ind * dml_irm_automl_split.predictions["ml_g1"][:,0,0] + (1 - treat_ind) * dml_irm_automl_split.predictions["ml_g0"][:,0,0]
            _res_split.append(mean_squared_error(df_test["Y"], ml_g_pred))
            _res_split.append(log_loss(df_test["A"], dml_irm_automl_split.predictions["ml_m"][:,0,0]))
            
            # onfolds, tuning
            while True:
                try:
                    ml_g = FlamlRegressorDoubleML(time = (t/4), metric="mse", estimator_list = None)
                    ml_m = FlamlClassifierDoubleML(time = (t/4), metric="log_loss", estimator_list = None)

                    obj_dml_data = dml.DoubleMLData(df,y_col='Y',d_cols='A')
            
                    np.random.seed(3*k)
                    dml_irm_automl_onfolds = dml.DoubleMLIRM(obj_dml_data, ml_g = ml_g, ml_m = ml_m, n_folds = n_folds, trimming_threshold=0.025)
                    dml_irm_automl_onfolds.fit(store_predictions=True, store_models=True)
                    break
                except AttributeError:
                    pass
            
            _res_of.append(dml_irm_automl_onfolds.summary["coef"].values[0])
            _res_of.append(dml_irm_automl_onfolds.summary["2.5 %"].values[0])
            _res_of.append(dml_irm_automl_onfolds.summary["97.5 %"].values[0])
            
            treat_ind = (df["A"] == 1)
            fs_of_mlm, fs_of_mlg = 0,0
            for i in range(n_folds):
                fs_of_mlg += np.mean(treat_ind * dml_irm_automl_onfolds.models['ml_g1']["A"][0][i].auto_ml.best_loss + (1 - treat_ind) * dml_irm_automl_onfolds.models['ml_g0']["A"][0][i].auto_ml.best_loss)
                fs_of_mlm += dml_irm_automl_onfolds.models['ml_m']["A"][0][i].auto_ml.best_loss
            _res_of.append(fs_of_mlg / n_folds)
            _res_of.append(fs_of_mlm / n_folds)

            ml_g_pred = treat_ind * dml_irm_automl_onfolds.predictions["ml_g1"][:,0,0] + (1 - treat_ind) * dml_irm_automl_onfolds.predictions["ml_g0"][:,0,0]
            _res_of.append(mean_squared_error(y, ml_g_pred))
            _res_of.append(log_loss(d, dml_irm_automl_onfolds.predictions["ml_m"][:,0,0]))

            # add this iteration to overall results
            res_fullsample.append(_res_full)
            res_splitsample.append(_res_split)
            res_onfolds.append(_res_of)

            # save current result
            pd.DataFrame(res_fullsample, columns = ["coef","2.5%","97.5%","tune_loss_mll","tune_loss_mlm","fs_loss_mll","fs_loss_mlm"]).to_csv(res_path + f"{t}_fullsample.csv")
            pd.DataFrame(res_splitsample, columns=["coef","2.5%","97.5%","tune_loss_mll","tune_loss_mlm","fs_loss_mll","fs_loss_mlm"]).to_csv(res_path + f"{t}_splitsample.csv")
            pd.DataFrame(res_onfolds, columns=["coef","2.5%","97.5%","tune_loss_mll","tune_loss_mlm","fs_loss_mll","fs_loss_mlm"]).to_csv(res_path + f"{t}_onfolds.csv")  
