In [175]:
from itertools import product
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import (MaxAbsScaler, MinMaxScaler,
                                   PolynomialFeatures, RobustScaler,
                                   StandardScaler)
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import (BaseCrossValidator, GridSearchCV, KFold,
                                     RandomizedSearchCV, StratifiedKFold,
                                     check_cv, train_test_split)
from sklearn.linear_model import (ARDRegression, BayesianRidge, ElasticNet,
                                  ElasticNetCV, Lars, Lasso, LassoLars,
                                  LinearRegression, LogisticRegression,
                                  LogisticRegressionCV,
                                  OrthogonalMatchingPursuit, Ridge)
from sklearn.ensemble import (GradientBoostingClassifier,
                              GradientBoostingRegressor,
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.base import BaseEstimator, is_regressor
import econml
from econml.orf import DMLOrthoForest
# from econml.metalearners import SLearner, TLearner, XLearner
from econml.grf import CausalForest
from econml.dr import DRLearner
from econml.dml import CausalForestDML, KernelDML, LinearDML, SparseLinearDML
import sklearn.preprocessing
import sklearn.neural_network
import sklearn.linear_model
import sklearn.ensemble
import sklearn
import argparse
import logging
import os
import pdb
import numpy as np
import pandas as pd
import pickle
import utils
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
import time

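During cross-fitting: fit each causal estimator with a different pair of nuisance models on every fold and compare the results.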

In [187]:
def get_estimators(estimation_model, model_y, model_t):
    """Construct an unfitted EconML estimator selected by key.

    Parameters
    ----------
    estimation_model : str
        ``'dml'`` (LinearDML), ``'dr'`` (DRLearner) or ``'kernel_dml'``
        (KernelDML).
    model_y : object
        Outcome nuisance model. Passed as ``model_y`` to the DML estimators
        and as ``model_regression`` to DRLearner.
    model_t : object
        Treatment nuisance model. Passed as ``model_t`` to the DML estimators
        and as ``model_propensity`` to DRLearner.

    Returns
    -------
    The newly constructed estimator.

    Raises
    ------
    ValueError
        If ``estimation_model`` is not one of the supported keys.
    """
    # Each constructor is wrapped in a lambda so that only the requested
    # estimator is actually instantiated.
    # CausalForestDML is imported by the notebook but not wired in here.
    factories = (
        ('dml', lambda: LinearDML(model_y=model_y, model_t=model_t)),
        ('dr', lambda: DRLearner(model_propensity=model_t, model_regression=model_y)),
        ('kernel_dml', lambda: KernelDML(model_y=model_y, model_t=model_t)),
    )
    for key, build in factories:
        if estimation_model == key:
            return build()
    raise ValueError("Unrecognized 'estimation_model' key.")

In [208]:
# Divide the data into k folds. Each fold uses a different nuisance-model pair
# and records score, MSE and fit time for every causal estimator.
def get_models_during_k_folds(X, T, Y, ci_estimator_list, model_y, model_t,
                              n_splits=4, random_state=123):
    """Fit every causal estimator on each fold with that fold's nuisance models.

    Fold ``i`` of a shuffled ``KFold`` uses ``model_y[i]`` and ``model_t[i]``.
    For the ``'dr'`` estimator the two keys are first resolved through
    ``utils.select_continuous_estimator`` / ``utils.select_discrete_estimator``;
    for every other estimator they are passed to ``get_estimators`` unchanged.

    Parameters
    ----------
    X, T, Y
        Features, treatment and outcome. They are sliced with ``.iloc``, so
        pandas objects are expected.
    ci_estimator_list : list of str
        Estimator keys understood by ``get_estimators``.
    model_y, model_t : sequence
        Nuisance-model specifications, one per fold (indexed by fold number).
    n_splits : int, default 4
        Number of folds; previously hard-coded to 4.
    random_state : int, default 123
        Seed for the shuffled ``KFold`` split.

    Returns
    -------
    fold_models : dict
        Maps ``'fold <i>'`` to a dict with keys ``'model_y'``, ``'model_t'``,
        ``'Scores'``, ``'Mse'`` and ``'Runtime'``; the last three are dicts
        keyed by estimator name. ``'model_y'`` / ``'model_t'`` hold the models
        as they were passed to the *last* estimator in ``ci_estimator_list``
        (or the raw fold entries if that list is empty).
    total_run_time : float
        Wall-clock seconds spent over all folds.

    Raises
    ------
    ValueError
        If ``model_y`` or ``model_t`` has fewer than ``n_splits`` entries.

    Notes
    -----
    TODO(review): the original header comment said "Set cv=0 for estimator",
    but no ``cv`` argument is passed to the estimators here.
    """
    # Fail fast: without this check a short model list only surfaces as an
    # IndexError after the earlier folds have already been fitted.
    if len(model_y) < n_splits or len(model_t) < n_splits:
        raise ValueError(
            f"Need at least {n_splits} entries in model_y and model_t "
            f"(one per fold); got {len(model_y)} and {len(model_t)}."
        )

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_models = {}
    total_start_time = time.time()
    for i, (train_index, test_index) in enumerate(cv.split(X)):
        X_train, T_train, Y_train = X.iloc[train_index], T.iloc[train_index], Y.iloc[train_index]
        X_test, T_test, Y_test = X.iloc[test_index], T.iloc[test_index], Y.iloc[test_index]

        # Default to the raw fold entries so the record below is well-defined
        # even when ci_estimator_list is empty.
        current_model_y, current_model_t = model_y[i], model_t[i]

        causal_model_score = {}
        causal_model_mse = {}
        causal_model_time = {}
        for ci in ci_estimator_list:
            if ci == 'dr':
                # DRLearner receives models resolved by the project helpers.
                current_model_y = utils.select_continuous_estimator(model_y[i])
                current_model_t = utils.select_discrete_estimator(model_t[i])
            else:
                current_model_y, current_model_t = model_y[i], model_t[i]
            causal_model = get_estimators(ci, current_model_y, current_model_t)
            print(causal_model)

            # Only the fit itself is timed.
            start_time = time.time()
            causal_model.fit(Y_train, T_train, X=X_train, W=None)
            run_time = time.time() - start_time

            te_pred = causal_model.effect(X_test)
            causal_model_score[ci] = causal_model.score(Y_test, T_test, X_test)
            # NOTE(review): this compares observed outcomes with predicted
            # treatment effects, not with ground-truth effects; confirm that
            # this is the intended metric.
            causal_model_mse[ci] = np.mean((Y_test - te_pred)**2)
            causal_model_time[ci] = run_time

        fold_models[f'fold {i}'] = {
            'model_y': current_model_y,
            'model_t': current_model_t,
            'Scores': causal_model_score,
            'Mse': causal_model_mse,
            'Runtime': causal_model_time,
        }

    total_run_time = time.time() - total_start_time
    return fold_models, total_run_time

In [209]:
# Load the dataset through the project helper; it returns eight values, unpacked below.
# NOTE(review): presumably the IHDP benchmark, with true_ITE / true_ATE holding
# ground-truth effects. Confirm against utils.load_ihdp.
data, X, T, Y, true_ITE, true_ATE, true_ATE_stderr, is_discrete = utils.load_ihdp()
# Disabled 60/20/20 train/validation/test split, kept for reference:
# X, X_val, T, T_val, Y, Y_val = train_test_split(X, T, Y, train_size=0.6,shuffle=True, random_state=123)
# X_val, X_test, T_val, T_test, Y_val, Y_test = train_test_split(X_val, T_val, Y_val, train_size=.5, shuffle=True, random_state=123)

In [210]:
# Causal estimators fitted on every fold; the keys are resolved by get_estimators.
ci_estimator_list = ['dml', 'kernel_dml', 'dr']

# Nuisance-model keys, one per fold: fold i uses model_y[i] and model_t[i].
# Outcome and treatment share the same keys, so the treatment list is built
# as an independent copy of the outcome list.
model_y = ['linear', 'forest', 'gbf', 'nnet']
model_t = list(model_y)

In [None]:
# Run the fold-wise comparison: every estimator in ci_estimator_list is fitted
# on each fold with that fold's nuisance-model pair.
fold_models, total_run_time = get_models_during_k_folds(X, T, Y, ci_estimator_list, model_y, model_t)
# Dump the per-fold records followed by the total wall-clock time in seconds.
print(fold_models, total_run_time)

{'fold 0': {'model_y': ElasticNetCV(), 'model_t': LogisticRegressionCV(), 'Scores': {'dml': 1.7103529828413817, 'kernel_dml': 1.577551579860745, 'dr': 11.359973879492188}, 'Mse': {'dml': 8.011912516177423, 'kernel_dml': 5.155760687808942, 'dr': 7.376126355036506}, 'Runtime': {'dml': 123.22834086418152, 'kernel_dml': 0.3741190433502197, 'dr': 1.1277921199798584}},

'fold 1': {'model_y': RandomForestRegressor(), 'model_t': RandomForestClassifier(), 'Scores': {'dml': 2.055036463528363, 'kernel_dml': 1.7453353741116, 'dr': 1209493.4363039904}, 'Mse': {'dml': 10.254619261904727, 'kernel_dml': 5.098802756361791, 'dr': 12.194402571204995}, 'Runtime': {'dml': 10.461857318878174, 'kernel_dml': 1.359987497329712, 'dr': 1.3761255741119385}},

'fold 2': {'model_y': GradientBoostingRegressor(), 'model_t': GradientBoostingClassifier(), 'Scores': {'dml': 1.9569946763096768, 'kernel_dml': 1.7221695833959687, 'dr': 44.611287276961534}, 'Mse': {'dml': 8.236809987477494, 'kernel_dml': 4.699993764276994, 'dr': 12.13563589456706}, 'Runtime': {'dml': 3.644969940185547, 'kernel_dml': 0.5272073745727539, 'dr': 0.6685647964477539}},

'fold 3': {'model_y': MLPRegressor(), 'model_t': MLPClassifier(), 'Scores': {'dml': 19.139325080508872, 'kernel_dml': 5.032037366032984, 'dr': 38.62779089389492}, 'Mse': {'dml': 7.7025028770708275, 'kernel_dml': 7.951781325018747, 'dr': 32.05839213054398}, 'Runtime': {'dml': 5.253737449645996, 'kernel_dml': 3.1807219982147217, 'dr': 3.172316551208496}}}

154.7059407234192


In [272]:
# For each causal estimator, find the fold (and hence the nuisance-model pair)
# with the lowest MSE and the fold with the shortest fit time.

# Per-estimator metric lists, in the iteration order of fold_models.
mse_all_estimators = {ci: [] for ci in ci_estimator_list}
runtime_all_estimators = {ci: [] for ci in ci_estimator_list}
for fold_record in fold_models.values():
    for ci in ci_estimator_list:
        mse_all_estimators[ci].append(fold_record['Mse'][ci])
        runtime_all_estimators[ci].append(fold_record['Runtime'][ci])

# The position of the minimum in each list is used as the winning fold's index.
best_mse_fold = {ci: np.argmin(mse_all_estimators[ci]) for ci in ci_estimator_list}
best_runtime_fold = {ci: np.argmin(runtime_all_estimators[ci]) for ci in ci_estimator_list}

# Look up the winning folds' records and keep their models and metric value.
best_models_mse = {}
best_models_time = {}
for ci in ci_estimator_list:
    fold_mse = best_mse_fold[ci]
    fold_time = best_runtime_fold[ci]
    lowest_mse_record = fold_models[f'fold {fold_mse}']
    fastest_record = fold_models[f'fold {fold_time}']
    best_models_mse[ci] = {
        'best_model_y': lowest_mse_record['model_y'],
        'best_model_t': lowest_mse_record['model_t'],
        'Mse': lowest_mse_record['Mse'][ci],
    }
    best_models_time[ci] = {
        'best_model_y': fastest_record['model_y'],
        'best_model_t': fastest_record['model_t'],
        'Runtime': fastest_record['Runtime'][ci],
    }

mse_all_estimators, best_mse_fold, best_models_mse

({'dml': [8.011912516177423,
   10.254619261904727,
   8.236809987477494,
   7.7025028770708275],
  'kernel_dml': [5.155760687808942,
   5.098802756361791,
   4.699993764276994,
   7.951781325018747],
  'dr': [7.376126355036506,
   12.194402571204995,
   12.13563589456706,
   32.05839213054398]},
 {'dml': 3, 'kernel_dml': 2, 'dr': 0},
 {'dml': {'best_model_y': MLPRegressor(),
   'best_model_t': MLPClassifier(),
   'Mse': 7.7025028770708275},
  'kernel_dml': {'best_model_y': GradientBoostingRegressor(),
   'best_model_t': GradientBoostingClassifier(),
   'Mse': 4.699993764276994},
  'dr': {'best_model_y': ElasticNetCV(),
   'best_model_t': LogisticRegressionCV(),
   'Mse': 7.376126355036506}})

In [273]:
# Display the runtime summary: per-fold fit times, index of the fastest fold
# per estimator, and that fold's models. All three names are computed in the
# preceding cell, which must be run first.
runtime_all_estimators, best_runtime_fold, best_models_time

({'dml': [123.22834086418152,
   10.461857318878174,
   3.644969940185547,
   5.253737449645996],
  'kernel_dml': [0.3741190433502197,
   1.359987497329712,
   0.5272073745727539,
   3.1807219982147217],
  'dr': [1.1277921199798584,
   1.3761255741119385,
   0.6685647964477539,
   3.172316551208496]},
 {'dml': 2, 'kernel_dml': 0, 'dr': 2},
 {'dml': {'best_model_y': GradientBoostingRegressor(),
   'best_model_t': GradientBoostingClassifier(),
   'Runtime': 3.644969940185547},
  'kernel_dml': {'best_model_y': ElasticNetCV(),
   'best_model_t': LogisticRegressionCV(),
   'Runtime': 0.3741190433502197},
  'dr': {'best_model_y': GradientBoostingRegressor(),
   'best_model_t': GradientBoostingClassifier(),
   'Runtime': 0.6685647964477539}})