In [1]:
import doubleml as dml
import numpy as np
from flaml import AutoML
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.preprocessing import StandardScaler

In [4]:
def load_and_format_covariates(file_path, delimiter=','):
    """Load the covariate block of a delimited numeric file and reorder its columns.

    The first five file columns are skipped; everything from file column 5
    onward is treated as the covariate block. Within that block, positions
    6-24 are moved to the front and positions 0-5 are placed after them
    (the lists are called ``binfeats`` / ``contfeats``, presumably binary and
    continuous features -- confirm against the dataset description).

    Parameters
    ----------
    file_path : str or path-like
        File readable by ``numpy.loadtxt``.
    delimiter : str, default ','
        Column separator passed to ``numpy.loadtxt``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, 25)`` holding the reordered covariates.

    Raises
    ------
    IndexError
        If the file has fewer than 30 columns (5 skipped + 25 covariates).
    """
    # ndmin=2 keeps a single-row file two-dimensional; without it loadtxt
    # returns a 1-D array and the column slicing below raises IndexError.
    data = np.loadtxt(file_path, delimiter=delimiter, ndmin=2)

    # Indices are relative to the covariate block, not to the raw file.
    binfeats = list(range(6, 25))
    contfeats = [i for i in range(25) if i not in binfeats]

    x = data[:, 5:]
    return x[:, binfeats + contfeats]


def load_other_stuff(file_path, delimiter=','):
    """Load the first five columns of a delimited numeric file as column vectors.

    File columns 0-4 are returned, in order, as ``t, y, y_cf, mu_0, mu_1``.
    Going by the names these are presumably the treatment indicator, the
    observed outcome, the counterfactual outcome and the two noiseless
    potential outcomes -- confirm against the dataset description.

    Parameters
    ----------
    file_path : str or path-like
        File readable by ``numpy.loadtxt``.
    delimiter : str, default ','
        Column separator passed to ``numpy.loadtxt``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(t, y, y_cf, mu_0, mu_1)``, each of shape ``(n_rows, 1)``.

    Raises
    ------
    IndexError
        If the file has fewer than five columns.
    """
    # ndmin=2 keeps a single-row file two-dimensional; without it loadtxt
    # returns a 1-D array and the column indexing below raises IndexError.
    data = np.loadtxt(file_path, delimiter=delimiter, ndmin=2)

    t = data[:, 0].reshape(-1, 1)
    y, y_cf, mu_0, mu_1 = (data[:, col][:, None] for col in range(1, 5))
    return t, y, y_cf, mu_0, mu_1

In [22]:
res_path = "../results/ihdp/"
mae_aml = []
res_fullsample = []

# Upper bound on AutoML tuning attempts per dataset. A persistent failure then
# surfaces as an error instead of retrying forever.
MAX_TUNING_ATTEMPTS = 10

for i in range(100):
    _res_full = []
    path = f"../dgp/ihdp/ihdp_{i+1}.csv"
    x = load_and_format_covariates(path, delimiter=' ')
    t, y, y_cf, mu_0, mu_1 = load_other_stuff(path, delimiter=' ')
    # The outcome learner is tuned on treatment + covariates, the propensity
    # learner on covariates only.
    X = np.c_[t, x]
    true_ATE = np.mean(mu_1 - mu_0)
    # Standardize the outcome for tuning/estimation; estimates are mapped back
    # to the original scale further down by multiplying with scale_.
    y_scaler = StandardScaler(with_mean=True).fit(y)
    y = y_scaler.transform(y)

    # Retry tuning on failure, but only a bounded number of times and only for
    # ordinary exceptions, so that interrupting the kernel still works and the
    # underlying error is visible.
    for attempt in range(1, MAX_TUNING_ATTEMPTS + 1):
        try:
            automl_y = AutoML()
            automl_y.fit(X, y, task="regression", time_budget=60, metric="mse", verbose=False, estimator_list = None)
            automl_d = AutoML()
            automl_d.fit(x, t, task="classification", time_budget=60, metric='log_loss', verbose=False, estimator_list = None)

            ml_g = automl_y.model.estimator
            ml_m = automl_d.model.estimator
            break
        except Exception as err:
            print(f"AutoML tuning failed for {path} (attempt {attempt}/{MAX_TUNING_ATTEMPTS}): {err!r}")
            if attempt == MAX_TUNING_ATTEMPTS:
                raise

    obj_dml_data = dml.DoubleMLData.from_arrays(x, y, t)
    dml_irm_automl = dml.DoubleMLIRM(obj_dml_data, ml_g = ml_g, ml_m = ml_m, n_folds = 4, trimming_threshold=0.025)
    dml_irm_automl.fit(store_predictions = True)

    # Coefficient and confidence bounds, rescaled to the original outcome units.
    summary = dml_irm_automl.summary
    y_scale = y_scaler.scale_[0]
    _res_full.append(y_scale * summary["coef"].values[0])
    _res_full.append(y_scale * summary["2.5 %"].values[0])
    _res_full.append(y_scale * summary["97.5 %"].values[0])

    # Best validation losses reported by the two AutoML searches.
    _res_full.append(automl_y.best_loss)
    _res_full.append(automl_d.best_loss)

    # Full-sample nuisance losses: pick the ml_g1 prediction for rows with
    # t == 1 and the ml_g0 prediction otherwise. t has shape (n, 1), so the
    # indicator is transposed to (1, n) to broadcast against the 1-D predictions.
    treat_ind = (t == 1)
    ml_g_pred = treat_ind.T * dml_irm_automl.predictions["ml_g1"][:,0,0] + (1 - treat_ind).T * dml_irm_automl.predictions["ml_g0"][:,0,0]
    _res_full.append(mean_squared_error(y, ml_g_pred.T))
    _res_full.append(log_loss(t, dml_irm_automl.predictions["ml_m"][:,0,0]))
    _res_full.append(true_ATE)

    res_fullsample.append(_res_full)
    # Rewrite the results file after every dataset so finished runs are kept
    # even if a later iteration fails.
    pd.DataFrame(res_fullsample, columns = ["coef","2.5%","97.5%","tune_loss_mll","tune_loss_mlm","fs_loss_mll","fs_loss_mlm","true_ATE"]).to_csv(res_path + "ihdp_flaml.csv")
