# Appendix C: ACIC Challenge Results

Note: As we do not have permission of all participants, we do not publish the file with full ACIC results.
Please contact us for replication.

In [1]:
import numpy as np
import pandas as pd

Load results of our simulations

In [2]:
schemes = ["fullsample", "splitsample", "onfolds"]
models = ["plr", "irm"]
learners = ["Lasso", "Random Forest", "XGBoost", "FLAML"]
scenarios = range(16)
oracle_df = pd.read_csv("../results/acic/oracle.csv", delimiter=";")
res = []
for sc in scenarios:
    true_theta = oracle_df.loc[oracle_df.DGPid==(sc+1),"psi0.ATE"].values[0]
    oracle_mse = oracle_df.loc[oracle_df.DGPid==(sc+1),"MSE"].values[0]
    oracle_bias = oracle_df.loc[oracle_df.DGPid==(sc+1),"bias"].values[0]
    oracle_std = oracle_df.loc[oracle_df.DGPid==(sc+1),"sd"].values[0]
    for model in models:
        for learner in learners:
            if learner == "LinearModels":
                schemes = ["fullsample"]
            else: 
                schemes = ["fullsample", "splitsample", "onfolds"]
            for scheme in schemes:
                paths = {
                    "Lasso": f"../results/acic/{model}/LassoCV/Scenario{sc+1}_LassoCV_{scheme}.csv",
                    "Random Forest":  f"../results/acic/{model}/RandomForest/Scenario{sc+1}_{scheme}.csv",
                    "XGBoost": f"../results/acic/{model}/XGBoost/Scenario{sc+1}_{scheme}.csv",
                    "FLAML": f"../results/acic/{model}/Scenario{sc+1}/60_{scheme}.csv",
                    "LinearModels": f"../results/acic/{model}/LinearModels/Scenario{sc+1}_LinearModels_{scheme}.csv",
                        }
                load_df = pd.read_csv(paths[learner])
                rep = load_df["Unnamed: 0"]
                coef = load_df["coef"].values
                confint = ((load_df["2.5%"] < true_theta) & (load_df["97.5%"] > true_theta)).astype("int").values
                bias = coef - true_theta
                rrmse = np.sqrt(np.mean(np.square(bias))) / np.sqrt(oracle_mse)
                rbias = np.abs(np.mean(coef - true_theta)/oracle_bias)
                rstd = np.std(coef)/oracle_std
                if model == "irm":
                    try:
                        predict_y = load_df["fs_loss_mlg"]
                        comb_loss = load_df["fs_loss_mlg"] * load_df["fs_loss_mlm"]
                    except:
                        predict_y = load_df["fs_loss_mll"]
                        comb_loss = load_df["fs_loss_mll"] * load_df["fs_loss_mlm"]
                if model == "plr":
                    comb_loss = (load_df["fs_loss_mll"] + load_df["fs_loss_mlm"]) * load_df["fs_loss_mlm"]
                    predict_y = load_df["loss_Y"]

                res.append(pd.DataFrame([np.repeat(sc+1,100), rep, np.repeat(model,100),np.repeat(learner,100),np.repeat(scheme,100),
                                         coef, confint, bias, np.repeat(rbias,100), np.repeat(rstd,100), np.repeat(rrmse,100), predict_y, comb_loss], 
                                         index=["sc", "rep", "model", "learner", "scheme", "coef", "confint", "bias", "rbias", "rstd", "rrmse", "predict_y", "comb_loss"]).transpose())
df = pd.concat((res))
df["rrmse-1"] = df.rrmse.values -1
df.head()

Unnamed: 0,sc,rep,model,learner,scheme,coef,confint,bias,rbias,rstd,rrmse,predict_y,comb_loss,rrmse-1
0,1,0,plr,Lasso,fullsample,0.286102,1,0.086102,1.000627,1.090694,1.095024,4.722745,3.709074,0.095024
1,1,1,plr,Lasso,fullsample,0.33644,1,0.13644,1.000627,1.090694,1.095024,4.892763,4.057695,0.095024
2,1,2,plr,Lasso,fullsample,0.090525,1,-0.109475,1.000627,1.090694,1.095024,5.130473,3.955124,0.095024
3,1,3,plr,Lasso,fullsample,0.370382,1,0.170382,1.000627,1.090694,1.095024,5.114704,3.96432,0.095024
4,1,4,plr,Lasso,fullsample,0.233294,1,0.033294,1.000627,1.090694,1.095024,5.559756,4.272133,0.095024


Load the results of the default tuned learners

In [3]:
models = ["plr", "irm"]
learners = ["RF untuned", "XGB untuned"]
scenarios = range(16)
oracle_df = pd.read_csv("../results/acic/oracle.csv", delimiter=";")
res = []
for sc in scenarios:
    true_theta = oracle_df.loc[oracle_df.DGPid==(sc+1),"psi0.ATE"].values[0]
    oracle_mse = oracle_df.loc[oracle_df.DGPid==(sc+1),"MSE"].values[0]
    oracle_bias = oracle_df.loc[oracle_df.DGPid==(sc+1),"bias"].values[0]
    oracle_std = oracle_df.loc[oracle_df.DGPid==(sc+1),"sd"].values[0]
    for model in models:
        for learner in learners:
            paths = {
                "RF untuned": f"../results/acic/{model}/RFdef/Scenario{sc+1}_RandomForestdefault.csv",
                "XGB untuned": f"../results/acic/{model}/XGBdef/Scenario{sc+1}_XGBoostdefault.csv",
                    }
            load_df = pd.read_csv(paths[learner])
            rep = load_df["Unnamed: 0"]
            coef = load_df["coef"].values
            confint = ((load_df["2.5%"] < true_theta) & (load_df["97.5%"] > true_theta)).astype("int").values
            bias = coef - true_theta
            rrmse = np.sqrt(np.mean(np.square(bias))) / np.sqrt(oracle_mse)
            rbias = np.abs(np.mean(coef - true_theta)/oracle_bias)
            rstd = np.std(coef)/oracle_std
            if model == "irm":
                try:
                    predict_y = load_df["fs_loss_mlg"]
                    comb_loss = load_df["fs_loss_mlg"] * load_df["fs_loss_mlm"]
                except:
                    predict_y = load_df["fs_loss_mll"]
                    comb_loss = load_df["fs_loss_mll"] * load_df["fs_loss_mlm"]
            if model == "plr":
                comb_loss = (load_df["fs_loss_mll"] + load_df["fs_loss_mlm"]) * load_df["fs_loss_mlm"]
                try:
                    predict_y = load_df["loss_Y"]
                except:
                    predict_y = np.repeat(np.nan,100)

            res.append(pd.DataFrame([np.repeat(sc+1,100), rep, np.repeat(model,100),np.repeat(learner,100),
                                        coef, confint, bias, np.repeat(rbias,100), np.repeat(rstd,100), np.repeat(rrmse,100), predict_y, comb_loss], 
                                        index=["sc", "rep", "model", "learner", "coef", "confint", "bias", "rbias", "rstd", "rrmse", "predict_y", "comb_loss"]).transpose())
default_df = pd.concat((res))
default_df["rrmse-1"] = default_df.rrmse.values -1
default_df.head()

Unnamed: 0,sc,rep,model,learner,coef,confint,bias,rbias,rstd,rrmse,predict_y,comb_loss,rrmse-1
0,1,0,plr,RF untuned,0.275805,1,0.075805,0.328216,1.095087,1.094198,,3.98525,0.094198
1,1,1,plr,RF untuned,0.344077,1,0.144077,0.328216,1.095087,1.094198,,4.238201,0.094198
2,1,2,plr,RF untuned,0.077935,1,-0.122065,0.328216,1.095087,1.094198,,4.196909,0.094198
3,1,3,plr,RF untuned,0.44003,1,0.24003,0.328216,1.095087,1.094198,,4.353869,0.094198
4,1,4,plr,RF untuned,0.162769,1,-0.037231,0.328216,1.095087,1.094198,,4.472103,0.094198


Load the results of all ACIC 2019 teams

In [None]:
scenarios=range(16)
challenge_results = pd.read_csv("../results/acic/results_challenge.csv").drop(columns=["Unnamed: 0"])
challenge_results = challenge_results.loc[challenge_results["Y.cont"]==1]
for sc in scenarios:
    challenge_results.loc[challenge_results.DGPid==sc+1,"RRMSE"] = challenge_results.loc[challenge_results.DGPid==sc+1,"rMSE"] / np.sqrt(oracle_df.loc[oracle_df.DGPid==(sc+1),"MSE"].values[0])
    challenge_results.loc[challenge_results.DGPid==sc+1,"rel.bias"] = np.abs(challenge_results.loc[challenge_results.DGPid==sc+1,"bias"] / oracle_df.loc[oracle_df.DGPid==(sc+1),"bias"].values[0])
    challenge_results.loc[challenge_results.DGPid==sc+1,"rel.sd"] = np.abs(challenge_results.loc[challenge_results.DGPid==sc+1,"sd"] / oracle_df.loc[oracle_df.DGPid==(sc+1),"sd"].values[0])
challenge_results.head()

Now we introduce different aggregations

In [5]:
def evaluate_strategy(strategy_df, name):
    scenarios=range(16)
    res = []
    for sc in scenarios:
        true_theta = oracle_df.loc[oracle_df.DGPid==(sc+1),"psi0.ATE"].values[0]
        oracle_mse = oracle_df.loc[oracle_df.DGPid==(sc+1),"MSE"].values[0]
        oracle_bias = oracle_df.loc[oracle_df.DGPid==(sc+1),"bias"].values[0]
        oracle_std = oracle_df.loc[oracle_df.DGPid==(sc+1),"sd"].values[0]
        sub_df = strategy_df.loc[strategy_df.sc==(sc+1)]
        bias = sub_df.coef - true_theta
        res.append([np.sqrt(np.mean(np.square(bias)))/np.sqrt(oracle_mse), np.abs(np.mean(bias)/oracle_bias), sub_df.confint.mean(),sub_df.coef.std()/oracle_std])
    res_strat = pd.DataFrame(res, columns=["RRMSE","rel.bias","coverage","rel.sd"])
    res_strat["DGPid"] = res_strat.index + 1
    res_strat["fname"] = np.repeat(name, 16)
    return res_strat

Strategy 1: Select Model by lower median of `predict_y` (MSE on Y), then select learner / scheme by lowest `comb_loss`

In [6]:
grouped_df1 = df.groupby(['sc', 'rep', 'model']).agg({'predict_y': 'median'}).reset_index()
idx = grouped_df1.groupby(['sc', 'rep'])['predict_y'].idxmin()
filtered_df1 = grouped_df1.loc[idx]
filtered_df1 = filtered_df1.merge(df, on=["sc","rep","model"], validate="one_to_many")
grouped_df2 = filtered_df1.groupby(['sc', 'rep', 'learner', 'scheme','model']).agg({'comb_loss': 'min'}).reset_index()
idx = grouped_df2.groupby(['sc', 'rep'])['comb_loss'].idxmin()
filtered_df2 = grouped_df2.loc[idx]
final_df = pd.merge(filtered_df2.drop(columns=["comb_loss"]), df, on=["sc", "rep", "model", "scheme", "learner"], how="left", validate="one_to_one")
res_strat1 = evaluate_strategy(final_df, "DoubleML")

Always use FLAML & Fullsample

In [7]:
grouped_df1 = df.loc[(df.scheme=="fullsample") & (df.learner=="FLAML")].groupby(['sc', 'rep', 'model']).agg({'predict_y': 'median'}).reset_index()
idx = grouped_df1.groupby(['sc', 'rep'])['predict_y'].idxmin()
filtered_df1 = grouped_df1.loc[idx]
filtered_df1 = filtered_df1.merge(df, on=["sc","rep","model"], validate="one_to_many")
grouped_df2 = filtered_df1.groupby(['sc', 'rep', 'learner', 'scheme','model']).agg({'comb_loss': 'min'}).reset_index()
idx = grouped_df2.groupby(['sc', 'rep'])['comb_loss'].idxmin()
filtered_df2 = grouped_df2.loc[idx]
final_df = pd.merge(filtered_df2.drop(columns=["comb_loss"]), df, on=["sc", "rep", "model", "scheme", "learner"], how="left", validate="one_to_one")
res_strat2 = evaluate_strategy(final_df, "Only FLAML FS")

Use IRM and default RF

In [8]:
final_df2 = default_df.loc[(default_df.learner=="RF untuned") & (default_df.model=="irm")]
res_strat3 = evaluate_strategy(final_df2, "RF default")

Aggregate Absolute Results

In [9]:
extended_res = pd.concat((challenge_results, res_strat1, res_strat2, res_strat3))
print(extended_res.loc[extended_res.fname.isin(["DoubleML","Only FLAML FS", "RF default", "BART", "Std", "BART_TMLE", "eb"])].groupby(["fname"])[["RRMSE", "rel.bias","coverage", "rel.sd"]].mean())

                  RRMSE     rel.bias  coverage    rel.sd
fname                                                   
BART           2.257287  1628.814601  0.750000  1.244940
BART_TMLE      2.192713  1301.510278  0.811875  1.356422
DoubleML       2.932048  1372.111140  0.813750  2.407602
Only FLAML FS  2.971526  1457.126124  0.815000  2.441941
RF default     9.256483  1735.088988  0.665000  2.559791
Std            3.428350  1105.027786  0.821875  2.509488
eb             3.567365   988.042630  0.850625  2.748774


Scores for RRMSE, Bias and Standard Deviations

In [10]:
score = np.array([1/i for i in np.arange(1,13,1)] + [0] + [1/i for i in np.arange(-12,0,1)])
extended_res = pd.concat((challenge_results, res_strat1, res_strat2, res_strat3))
extended_res["rel.coverage"] = np.abs(extended_res.coverage - 0.95)
scores_df = pd.DataFrame()
for metric in ["RRMSE","rel.bias","rel.sd"]:
    sorted_res = extended_res.sort_values(by=["DGPid", metric])
    sorted_res["score"] = np.tile(score, 16)
    scores_df[metric] = sorted_res.groupby("fname")["score"].sum()
scores_df.rank(ascending=False, method="min").loc[scores_df.index.isin(["DoubleML","Only FLAML FS", "RF default", "BART", "Std", "BART_TMLE", "eb"])]

Unnamed: 0_level_0,RRMSE,rel.bias,rel.sd
fname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BART,3.0,11.0,1.0
BART_TMLE,1.0,2.0,4.0
DoubleML,11.0,10.0,16.0
Only FLAML FS,13.0,15.0,17.0
RF default,16.0,17.0,12.0
Std,9.0,9.0,11.0
eb,15.0,1.0,21.0


Scores for Coverage

In [11]:
scores_df["rel.coverage"] = np.zeros_like(scores_df.RRMSE)
score_helper = pd.DataFrame(score)
score_helper["index"] = np.arange(1,26)
extended_res.set_index("fname").groupby(["DGPid"])["rel.coverage"].rank("min").reset_index().merge(score_helper, left_on="rel.coverage", right_on="index").groupby("fname")[0].sum().rank(ascending=False, method="min")[["DoubleML","Only FLAML FS", "RF default", "BART", "Std", "BART_TMLE", "eb"]]

fname
DoubleML         14.0
Only FLAML FS     8.0
RF default       20.0
BART             11.0
Std               4.0
BART_TMLE         2.0
eb                7.0
Name: 0, dtype: float64