In [1]:
import numpy as np
import pandas as pd
import doubleml as dml
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.model_selection import train_test_split

In [2]:
# Hyperparameter grids searched for the random-forest nuisance learners.
# grid_l is handed to the RandomForestRegressor searches and grid_m to the
# RandomForestClassifier searches; both vary only the tree depth, with the
# number of trees and n_jobs pinned to a single value.
grid_l = dict(max_depth=[4, 5, 6], n_estimators=[100], n_jobs=[-1])
# Independent copy (separate dict and separate lists) with the same content.
grid_m = {param: list(candidates) for param, candidates in grid_l.items()}

## PLR Model

In [None]:
## with ACIC data
# PLR simulation on the ACIC scenarios. For every scenario 1..16 and every
# dataset DS1..DS{no_iter}, a DoubleMLPLR model is fitted under three
# hyperparameter-tuning schemes and the accumulated results are re-written to
# CSV after each dataset:
#   * full sample  - GridSearchCV on all rows, DoubleML fitted on the same rows
#   * split sample - GridSearchCV on one half, DoubleML fitted on the other half
#   * on folds     - DoubleML's own tune(..., tune_on_folds=True)
# The order of the seeding / fitting calls is deliberate; do not reorder them.
n_folds = 4
no_iter = 100

# Column layout shared by the three result files.
plr_result_columns = ["coef", "2.5%", "97.5%", "tune_loss_mll", "tune_loss_mlm",
                      "loss_Y", "fs_loss_mll", "fs_loss_mlm"]

for sc in range(1, 17):
    res_path = f"../results/acic/plr/RandomForest/Scenario{sc}_"
    res_fullsample, res_splitsample, res_onfolds = [], [], []

    for k in range(no_iter):
        print(k)

        # load data: outcome "Y", treatment "A", all remaining columns as covariates
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        y = df["Y"]
        d = df["A"]
        X = df.drop(columns=["Y", "A"])

        # ---- full sample, tuning ----
        gs_l = GridSearchCV(RandomForestRegressor(), grid_l, n_jobs=-1, scoring="neg_mean_squared_error")
        gs_l.fit(X, y)
        gs_m = GridSearchCV(RandomForestClassifier(), grid_m, n_jobs=-1, scoring="neg_log_loss")
        gs_m.fit(X, d)
        ml_l, ml_m = gs_l.best_estimator_, gs_m.best_estimator_

        # ---- full sample, doubleml ----
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df, y_col='Y', d_cols='A')
        dml_plr = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds)
        dml_plr.fit(store_predictions=True)

        summary_full = dml_plr.summary
        theta_full = summary_full["coef"].values[0]
        l_hat_full = dml_plr.predictions["ml_l"][:, 0, 0]
        m_hat_full = dml_plr.predictions["ml_m"][:, 0, 0]
        # Scoring is negated in GridSearchCV, so flip the sign to report losses.
        # NOTE(review): loss_Y scores theta * A + l_hat against Y - confirm this
        # is the intended predictor of Y for the PLR model.
        _res_full = [
            theta_full,
            summary_full["2.5 %"].values[0],
            summary_full["97.5 %"].values[0],
            -gs_l.best_score_,
            -gs_m.best_score_,
            mean_squared_error(y, theta_full * d + l_hat_full),
            mean_squared_error(y, l_hat_full),
            log_loss(d, m_hat_full),
        ]

        # ---- split sample, tuning (on the df_tune half only) ----
        df_tune, df_test = train_test_split(df, test_size=0.5, random_state=42*k)
        y_tune = df_tune["Y"]
        d_tune = df_tune["A"]
        X_tune = df_tune.drop(columns=["Y", "A"])

        gs_l = GridSearchCV(RandomForestRegressor(), grid_l, n_jobs=-1, scoring="neg_mean_squared_error")
        gs_l.fit(X_tune, y_tune)
        gs_m = GridSearchCV(RandomForestClassifier(), grid_m, n_jobs=-1, scoring="neg_log_loss")
        gs_m.fit(X_tune, d_tune)
        ml_l, ml_m = gs_l.best_estimator_, gs_m.best_estimator_

        # ---- split sample, doubleml (on the df_test half only) ----
        np.random.seed(2*k)
        obj_dml_data = dml.DoubleMLData(df_test, y_col='Y', d_cols='A')
        dml_plr_split = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds)
        dml_plr_split.fit(store_predictions=True)

        summary_split = dml_plr_split.summary
        theta_split = summary_split["coef"].values[0]
        l_hat_split = dml_plr_split.predictions["ml_l"][:, 0, 0]
        m_hat_split = dml_plr_split.predictions["ml_m"][:, 0, 0]
        _res_split = [
            theta_split,
            summary_split["2.5 %"].values[0],
            summary_split["97.5 %"].values[0],
            -gs_l.best_score_,
            -gs_m.best_score_,
            mean_squared_error(df_test["Y"], theta_split * df_test["A"] + l_hat_split),
            mean_squared_error(df_test["Y"], l_hat_split),
            log_loss(df_test["A"], m_hat_split),
        ]

        # ---- on folds: untuned learners, tuned by DoubleML itself ----
        ml_l = RandomForestRegressor()
        ml_m = RandomForestClassifier()

        obj_dml_data = dml.DoubleMLData(df, y_col='Y', d_cols='A')

        np.random.seed(3*k)
        dml_plr_onfolds = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds)
        dml_plr_onfolds.tune({"ml_l": grid_l, "ml_m": grid_m}, tune_on_folds=True)
        dml_plr_onfolds.fit(store_predictions=True, store_models=True)

        summary_of = dml_plr_onfolds.summary
        theta_of = summary_of["coef"].values[0]
        l_hat_of = dml_plr_onfolds.predictions["ml_l"][:, 0, 0]
        m_hat_of = dml_plr_onfolds.predictions["ml_m"][:, 0, 0]
        # No tuning loss is recorded for this scheme; NaN keeps the columns aligned.
        _res_of = [
            theta_of,
            summary_of["2.5 %"].values[0],
            summary_of["97.5 %"].values[0],
            np.nan,
            np.nan,
            mean_squared_error(y, theta_of * d + l_hat_of),
            mean_squared_error(y, l_hat_of),
            log_loss(d, m_hat_of),
        ]

        # add this iteration to overall results
        res_fullsample.append(_res_full)
        res_splitsample.append(_res_split)
        res_onfolds.append(_res_of)

        # save current result (files are overwritten with all rows collected so far)
        for rows, file_suffix in (
            (res_fullsample, "fullsample_v3.csv"),
            (res_splitsample, "splitsample_v3.csv"),
            (res_onfolds, "onfolds_v3.csv"),
        ):
            pd.DataFrame(rows, columns=plr_result_columns).to_csv(res_path + file_suffix)


## IRM Model

In [6]:
## with ACIC data
# IRM simulation on the ACIC scenarios. For every scenario 1..16 and every
# dataset DS1..DS{no_iter}, a DoubleMLIRM model is fitted under three
# hyperparameter-tuning schemes and the accumulated results are re-written to
# CSV after each dataset:
#   * full sample  - GridSearchCV on all rows, DoubleML fitted on the same rows
#   * split sample - GridSearchCV on one half, DoubleML fitted on the other half
#   * on folds     - DoubleML's own tune(..., tune_on_folds=True)
# The order of the seeding / fitting calls is deliberate; do not reorder them.
n_folds = 4
no_iter = 100

# Column layout shared by the three result files.
irm_result_columns = ["coef", "2.5%", "97.5%", "tune_loss_mlg", "tune_loss_mlm",
                      "fs_loss_mlg", "fs_loss_mlm"]

for sc in range(1, 17):
    res_path = f"../results/acic/irm/RandomForest/Scenario{sc}_"
    res_fullsample = []
    res_splitsample = []
    res_onfolds = []
    for k in range(no_iter):
        # for saving each iteration
        _res_full = []
        _res_split = []
        _res_of = []

        # load data: outcome "Y", treatment "A", all remaining columns as covariates
        df = pd.read_csv(f"../dgp/acic/Scenario{sc}/CHDScenario{sc}DS{k+1}.csv")
        y, d, X = df["Y"], df["A"], df.drop(columns=["Y", "A"])

        # full sample, tuning
        # The outcome regressor is tuned with the treatment appended as an extra feature.
        gs_g = GridSearchCV(RandomForestRegressor(), grid_l, n_jobs=-1, scoring="neg_mean_squared_error")
        gs_g.fit(np.c_[X, d], y)
        gs_m = GridSearchCV(RandomForestClassifier(), grid_m, n_jobs=-1, scoring="neg_log_loss")
        gs_m.fit(X, d)

        ml_g = gs_g.best_estimator_
        ml_m = gs_m.best_estimator_

        # full sample, doubleml
        np.random.seed(k)
        obj_dml_data = dml.DoubleMLData(df, y_col='Y', d_cols='A')
        dml_irm = dml.DoubleMLIRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, n_folds=n_folds)
        dml_irm.fit(store_predictions=True)

        _res_full.append(dml_irm.summary["coef"].values[0])
        _res_full.append(dml_irm.summary["2.5 %"].values[0])
        _res_full.append(dml_irm.summary["97.5 %"].values[0])
        # Scoring is negated in GridSearchCV, so flip the sign to report losses.
        _res_full.append(-1 * gs_g.best_score_)
        _res_full.append(-1 * gs_m.best_score_)
        # Outcome prediction for each row: ml_g1 where A == 1, ml_g0 otherwise.
        treat_ind = (df["A"] == 1)
        ml_g_pred = treat_ind * dml_irm.predictions["ml_g1"][:, 0, 0] + (1 - treat_ind) * dml_irm.predictions["ml_g0"][:, 0, 0]
        _res_full.append(mean_squared_error(y, ml_g_pred))
        _res_full.append(log_loss(d, dml_irm.predictions["ml_m"][:, 0, 0]))

        # split sample, tuning (on the df_tune half only)
        df_tune, df_test = train_test_split(df, test_size=0.5, random_state=42*k)
        y_tune, d_tune, X_tune = df_tune["Y"], df_tune["A"], df_tune.drop(columns=["Y", "A"])

        gs_g = GridSearchCV(RandomForestRegressor(), grid_l, n_jobs=-1, scoring="neg_mean_squared_error")
        gs_g.fit(np.c_[X_tune, d_tune], y_tune)
        gs_m = GridSearchCV(RandomForestClassifier(), grid_m, n_jobs=-1, scoring="neg_log_loss")
        gs_m.fit(X_tune, d_tune)

        ml_g = gs_g.best_estimator_
        ml_m = gs_m.best_estimator_

        # split sample, doubleml (on the df_test half only)
        np.random.seed(2*k)
        obj_dml_data = dml.DoubleMLData(df_test, y_col='Y', d_cols='A')
        dml_irm_split = dml.DoubleMLIRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, n_folds=n_folds)
        dml_irm_split.fit(store_predictions=True)

        _res_split.append(dml_irm_split.summary["coef"].values[0])
        _res_split.append(dml_irm_split.summary["2.5 %"].values[0])
        _res_split.append(dml_irm_split.summary["97.5 %"].values[0])
        _res_split.append(-1 * gs_g.best_score_)
        _res_split.append(-1 * gs_m.best_score_)
        treat_ind = (df_test["A"] == 1)
        ml_g_pred = treat_ind * dml_irm_split.predictions["ml_g1"][:, 0, 0] + (1 - treat_ind) * dml_irm_split.predictions["ml_g0"][:, 0, 0]
        _res_split.append(mean_squared_error(df_test["Y"], ml_g_pred))
        _res_split.append(log_loss(df_test["A"], dml_irm_split.predictions["ml_m"][:, 0, 0]))

        # on folds
        # Fresh, untuned learners: tune() below picks the hyperparameters, so this
        # scheme must not inherit the split-sample tuned ml_g from the section above.
        ml_g = RandomForestRegressor()
        ml_m = RandomForestClassifier()

        obj_dml_data = dml.DoubleMLData(df, y_col='Y', d_cols='A')

        np.random.seed(3*k)
        dml_irm_onfolds = dml.DoubleMLIRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, n_folds=n_folds)
        dml_irm_onfolds.tune({"ml_g": grid_l, "ml_m": grid_m}, tune_on_folds=True)
        dml_irm_onfolds.fit(store_predictions=True, store_models=True)

        _res_of.append(dml_irm_onfolds.summary["coef"].values[0])
        _res_of.append(dml_irm_onfolds.summary["2.5 %"].values[0])
        _res_of.append(dml_irm_onfolds.summary["97.5 %"].values[0])
        # No tuning loss is recorded for this scheme; NaN keeps the columns aligned.
        _res_of.append(np.nan)
        _res_of.append(np.nan)
        treat_ind = (df["A"] == 1)
        ml_g_pred = treat_ind * dml_irm_onfolds.predictions["ml_g1"][:, 0, 0] + (1 - treat_ind) * dml_irm_onfolds.predictions["ml_g0"][:, 0, 0]
        _res_of.append(mean_squared_error(y, ml_g_pred))
        _res_of.append(log_loss(d, dml_irm_onfolds.predictions["ml_m"][:, 0, 0]))

        # add this iteration to overall results
        res_fullsample.append(_res_full)
        res_splitsample.append(_res_split)
        res_onfolds.append(_res_of)

        # save current result (files are overwritten with all rows collected so far)
        pd.DataFrame(res_fullsample, columns=irm_result_columns).to_csv(res_path + "fullsample.csv")
        pd.DataFrame(res_splitsample, columns=irm_result_columns).to_csv(res_path + "splitsample.csv")
        pd.DataFrame(res_onfolds, columns=irm_result_columns).to_csv(res_path + "onfolds.csv")
