In [1]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
from statsmodels.api import OLS
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from functools import partial
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GB, 
    GradientBoostingClassifier as GC)
from ISLP.bart import BART
import sklearn.model_selection as skm
import seaborn as sns
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import networkx as nx
import graphviz
from networkx.drawing.nx_agraph import graphviz_layout
import dowhy
from dowhy import CausalModel
from sklearn.linear_model import LogisticRegression
from econml.metalearners import SLearner
from econml.metalearners import TLearner
from econml.metalearners import XLearner
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
data = pd.read_pickle("data.pkl")

# META LEARNERS

In [3]:
covariates = ["public_facing", "ba_quality", "language_skills", 
              "exp_highquality", "ma", "certificate",
              'occ_Administrative', 'occ_Biotech and Pharmacy', 'occ_Civil Engineer',
              'occ_Clerical', 'occ_Ecommerce', 'occ_Education',
              'occ_Electrical Engineer', 'occ_Executive Assistant', 'occ_Finance',
              'occ_Food Services Managers', 'occ_Human Resources Payroll',
              'occ_Insurance', 'occ_Maintenance Technician',
              'occ_Marketing and Sales', 'occ_Media and Arts', 'occ_Production',
              'occ_Programmer', 'occ_Retail', 'occ_Social Worker', 'occ_Technology']

X_cov = data[covariates]
Y = data['callback']
treatments = ["female_type1", "female_type2", "female_type3", "female_type4"]

## S-LEARNER

In [4]:
# Storage
from econml.sklearn_extensions.model_selection import GridSearchCVList
from econml.inference import BootstrapInference

ate_results_SL = {}
ci_results_SL = []

# Set bootstrap for CI calculation
inf = BootstrapInference(n_bootstrap_samples=100, bootstrap_type='normal')

for tvar in treatments:
    print(f"\nProcessing {tvar} ...")
    T = data[tvar].values

    s_learner = SLearner(
        overall_model=RF(n_estimators=200,
                         min_samples_leaf=10,
                         random_state=123)
    )

    # Fit with inference
    s_learner.fit(Y, T, X=X_cov, inference=inf)

    # Store ATE and CI
    ate = s_learner.ate(X=X_cov)
    low, high = s_learner.ate_interval(X=X_cov)

    ate_results_SL[tvar] = ate
    ci_results_SL.append((low, high))

# Print results
print("\nATEs and 95% CIs using SLearner with Random Forest:")
for i, tvar in enumerate(treatments):
    l, u = ci_results_SL[i]
    print(f"{tvar}: ATE = {ate_results_SL[tvar]:.4f}, CI = [{l:.4f}, {u:.4f}]")



Processing female_type1 ...

Processing female_type2 ...

Processing female_type3 ...

Processing female_type4 ...

ATEs and 95% CIs using SLearner with Random Forest:
female_type1: ATE = 0.0113, CI = [-0.0942, 0.1169]
female_type2: ATE = 0.0054, CI = [-0.0944, 0.1053]
female_type3: ATE = -0.0108, CI = [-0.1038, 0.0821]
female_type4: ATE = -0.0325, CI = [-0.1188, 0.0538]


## T-Learner

In [5]:
# Storage
ate_results_TL = {}
ci_results_TL = []

inf = BootstrapInference(n_bootstrap_samples=100, bootstrap_type='normal')

for tvar in treatments:
    print(f"\nProcessing {tvar} ...")
    T = data[tvar].values
    
    # T-Learner
    t_learner = TLearner(models=RF(n_estimators=200, min_samples_leaf=10))
    
    # Fit model
    t_learner.fit(Y, T, X=X_cov,inference=inf)
    
    # Point estimate
    ate = t_learner.ate(X=X_cov)
    low, high = t_learner.ate_interval(X=X_cov)

    ate_results_TL[tvar] = ate
    ci_results_TL.append((low, high))

# Print results
print("\nATEs and 95% CIs using SLearner with Random Forest:")
for i, tvar in enumerate(treatments):
    lt, ut = ci_results_TL[i]
    print(f"{tvar}: ATE = {ate_results_TL[tvar]:.4f}, CI = [{lt:.4f}, {ut:.4f}]")



Processing female_type1 ...

Processing female_type2 ...

Processing female_type3 ...

Processing female_type4 ...

ATEs and 95% CIs using SLearner with Random Forest:
female_type1: ATE = 0.0100, CI = [-0.1144, 0.1343]
female_type2: ATE = 0.0058, CI = [-0.1271, 0.1386]
female_type3: ATE = -0.0185, CI = [-0.1362, 0.0993]
female_type4: ATE = -0.0395, CI = [-0.1462, 0.0671]


## X LEARNER

In [8]:
# Storage
ate_results_XL = {}
ci_results_XL = []

for tvar in treatments:
    print(f"\nProcessing {tvar} ...")
    T = data[tvar].values
    
    # Set X Learner
    x_learner = XLearner(models=RF(n_estimators=200, min_samples_leaf=10), propensity_model=LogisticRegression(solver='liblinear'))
    
    # Fit model
    x_learner.fit(Y, T, X=X_cov)
    
    # Point estimate
    ate = x_learner.ate(X=X_cov)
    
    ate_results_XL[tvar] = ate

# Print results
print("\nATEs XLearner with Random Forest:")
for i, tvar in enumerate(treatments):
    print(f"{tvar}: ATE = {ate_results_XL[tvar]:.4f}")


Processing female_type1 ...

Processing female_type2 ...

Processing female_type3 ...

Processing female_type4 ...

ATEs and 95% normal-theory CIs using SLearner with Random Forest:
female_type1: ATE = 0.0106
female_type2: ATE = 0.0065
female_type3: ATE = -0.0156
female_type4: ATE = -0.0396


# Table

In [9]:
import pandas as pd

rows = []

for i, tvar in enumerate(treatments):
    sl_ate = ate_results_SL[tvar]
    sl_l, sl_u = ci_results_SL[i]
    
    tl_ate = ate_results_TL[tvar]
    tl_l, tl_u = ci_results_TL[i]
    
    xl_ate = ate_results_XL[tvar]

    rows.append({
        "Treatment": tvar,
        "S Learner ATE": f"{sl_ate:.4f}",
        "S Learner 95% CI": f"[{sl_l:.4f}, {sl_u:.4f}]",
        "T Learner ATE": f"{tl_ate:.4f}",
        "T Learner 95% CI": f"[{tl_l:.4f}, {tl_u:.4f}]",
        "X Learner ATE": f"{xl_ate:.4f}"
    })

df_results = pd.DataFrame(rows)
print(df_results)


      Treatment S Learner ATE   S Learner 95% CI T Learner ATE  \
0  female_type1        0.0113  [-0.0942, 0.1169]        0.0100   
1  female_type2        0.0054  [-0.0944, 0.1053]        0.0058   
2  female_type3       -0.0108  [-0.1038, 0.0821]       -0.0185   
3  female_type4       -0.0325  [-0.1188, 0.0538]       -0.0395   

    T Learner 95% CI X Learner ATE  
0  [-0.1144, 0.1343]        0.0106  
1  [-0.1271, 0.1386]        0.0065  
2  [-0.1362, 0.0993]       -0.0156  
3  [-0.1462, 0.0671]       -0.0396  


In [10]:
print(df_results.to_latex(index=False,
                          caption="ATE Estimates from S, T and X learner",
                          label="tab:ate_results",
                          escape=False,
                          longtable=False,
                          column_format="lccccc"))


\begin{table}
\caption{ATE Estimates from S, T and X learner}
\label{tab:ate_results}
\begin{tabular}{lccccc}
\toprule
Treatment & S Learner ATE & S Learner 95% CI & T Learner ATE & T Learner 95% CI & X Learner ATE \\
\midrule
female_type1 & 0.0113 & [-0.0942, 0.1169] & 0.0100 & [-0.1144, 0.1343] & 0.0106 \\
female_type2 & 0.0054 & [-0.0944, 0.1053] & 0.0058 & [-0.1271, 0.1386] & 0.0065 \\
female_type3 & -0.0108 & [-0.1038, 0.0821] & -0.0185 & [-0.1362, 0.0993] & -0.0156 \\
female_type4 & -0.0325 & [-0.1188, 0.0538] & -0.0395 & [-0.1462, 0.0671] & -0.0396 \\
\bottomrule
\end{tabular}
\end{table}

