# Analyzing RCT with Precision by Adjusting for Baseline Covariates

In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Jonathan Roth's DGP

Here we set up a DGP with heterogeneous effects. In this example, which is due to Jonathan Roth, we have
$$
E [Y(0) | Z] = - Z, \quad E [Y(1) |Z] = Z, \quad Z \sim N(0,1).
$$
The CATE is
$$
E [Y(1) - Y(0) | Z ]= 2 Z.
$$
and the ATE is
$$
2 E Z = 0.
$$

We would like to estimate ATE as precisely as possible.

An economic motivation for this example could be provided as follows: Let D be the treatment of going to college, and let $Z$ be academic skills.  Suppose that academic skills cause lower earnings Y(0) in jobs that don't require a college degree, and cause higher earnings  Y(1) in jobs that require college degrees. This type of scenario is reflected in the DGP set-up above.



In [None]:
def gen_data(random_seed, n=1000, p_treat=0.2):
    """Simulate one RCT sample from the heterogeneous-effect DGP.

    The potential outcomes satisfy E[Y(0) | Z] = -Z and E[Y(1) | Z] = Z,
    so the conditional average treatment effect is 2 * Z. Treatment is
    assigned by an independent Bernoulli(``p_treat``) draw.

    Parameters
    ----------
    random_seed : int
        Seed passed to ``np.random.seed``. NOTE: this resets NumPy's global
        random state as a side effect.
    n : int, default 1000
        Sample size.
    p_treat : float, default 0.2
        Probability that a unit is treated.

    Returns
    -------
    pandas.DataFrame
        Columns ``Y`` (observed outcome), ``D`` (0/1 treatment indicator)
        and ``Z`` (the covariate, stored as ``1 + Z``).
    """
    np.random.seed(random_seed)
    # The draw order below is kept fixed so that a given seed always
    # reproduces the same sample.
    Z = np.random.normal(size=n)                  # covariate Z ~ N(0, 1)
    Y0 = -Z + np.random.normal(0, 0.1, size=n)    # E[Y(0) | Z] = -Z
    Y1 = Z + np.random.normal(0, 0.1, size=n)     # E[Y(1) | Z] = +Z, hence CATE = 2Z
    D = np.random.binomial(1, p_treat, size=n)    # treatment indicator
    Y = Y1 * D + Y0 * (1 - D)                     # observed outcome
    # The covariate is shifted by 1 so that its mean is nonzero and
    # demeaning it later actually matters.
    return pd.DataFrame({"Y": Y, "D": D, "Z": 1 + Z})

# Analyze the RCT data with Precision Adjustment

Consider

*  classical 2-sample approach, no adjustment (CL)
*  classical linear regression adjustment (CRA)
*  interactive regression adjustment (IRA)

Carry out robust inference using the sandwich formulas (Eicker-Huber-White).  

Observe that CRA delivers estimates that are less efficient than CL (pointed out by Freedman), whereas IRA delivers estimates that are more efficient (pointed out by Lin). In order for CRA to be more efficient than CL, we need the linear model to be a correct model of the conditional expectation function of Y given D and Z, which is not the case here.

In [None]:
# Draw one simulated sample; the seed is fixed so the results are reproducible.
data = gen_data(random_seed=123)

In [None]:
# Classical two-sample comparison: regress the outcome on the treatment dummy only.
CL = smf.ols(formula="Y ~ D", data=data).fit()
# The coefficient of interest is the one on "D"; request HC0 robust errors.
CL_robust = CL.get_robustcov_results(cov_type="HC0")
CL_robust.summary()

In [None]:
# Classical regression adjustment: the covariate Z enters additively.
CRA = smf.ols(formula="Y ~ D + Z", data=data).fit()
CRA_robust = CRA.get_robustcov_results(cov_type="HC0")
CRA_robust.summary()

In [None]:
# Centering the covariate lets the intercept be read as an estimate of the
# expected outcome under control.
z_bar = data["Z"].mean()
data["Zdemean"] = data["Z"] - z_bar
CRA = smf.ols(formula="Y ~ D + Zdemean", data=data).fit()
CRA.get_robustcov_results(cov_type="HC0").summary()

In [None]:
# After demeaning, the standard error of the intercept has to be corrected
# for the variance that comes from estimating the covariate mean.
# The standard error of D needs no such correction.
J = np.mean(1 - data["D"])  # sample share of control units
# The HC0 standard error of the intercept is the second moment of this score...
control_part = CRA.resid * (1 - data["D"]) / J
# ...to which we add a term accounting for the error in the estimated mean.
mean_part = data[["Zdemean"]] @ CRA.params[["Zdemean"]]
score = control_part + mean_part
corrected_se = np.sqrt(np.mean(score**2) / len(data))
print(f"Corrected stderr['Intercept']: {corrected_se:.4f}")

In [None]:
# For the interactive approach the covariate Z must be demeaned so that the
# coefficient on D can be interpreted as the ATE.
IRA = smf.ols("Y ~ D + Zdemean + Zdemean*D", data=data).fit()  # interactive approach
# Report HC0 (Eicker-Huber-White) errors, the same covariance type used for the
# CL and CRA fits and the one that IRA.HC0_se refers to.
IRA.get_robustcov_results(cov_type="HC0").summary()

In [None]:
# In the interactive approach the standard error of D must also be corrected
# to account for the estimation of the covariate mean.
interaction_fit = data[["Zdemean"]].values @ IRA.params[["Zdemean:D"]]
correction = np.var(interaction_fit) / len(data)
corrected_se = np.sqrt(IRA.HC0_se["D"] ** 2 + correction)
print(f"Corrected stderr['D']: {corrected_se:.4f}")

In [None]:
# The standard error of the intercept again needs a correction for the
# variance that comes from estimating the covariate mean.
J = np.mean(1 - data["D"])  # sample share of control units
control_part = IRA.resid * (1 - data["D"])
mean_part = J * data[["Zdemean"]] @ IRA.params[["Zdemean"]]
score = (control_part + mean_part) / J
corrected_se = np.sqrt(np.mean(score**2) / len(data))
print(f"Corrected stderr['Intercept']: {corrected_se:.4f}")

# Using classical standard errors (non-robust) is misleading here.

We don't teach non-robust standard errors in econometrics courses, but the default inference reported by `smf.ols(...).fit()` in Python still relies on 100-year-old concepts, perhaps in part due to historical legacy.  

Here the non-robust standard errors suggest that there is not much difference between the different approaches, contrary to the conclusions reached using the robust standard errors.


In [None]:
# Two-sample fit summarized with the default (non-robust) standard errors.
CL_classical = smf.ols(formula="Y ~ D", data=data).fit()
CL_classical.summary()

In [None]:
# Regression-adjusted fit summarized with the default (non-robust) standard errors.
CRA_classical = smf.ols(formula="Y ~ D + Z", data=data).fit()
CRA_classical.summary()

# Verify Asymptotic Approximations Hold in Finite-Sample Simulation Experiment

In [None]:
from joblib import Parallel, delayed


def exp(it):
    """Run one simulation replication and collect the point estimates.

    A fresh sample is generated with seed ``it``, the covariate is demeaned,
    and the two-sample (CL), additive (CRA) and interactive (IRA)
    regressions are fitted.

    Returns
    -------
    tuple
        ``(CLcoef, CLint, CRAcoef, CRAint, IRAcoef, IRAint)``: for each
        model, the coefficient on ``D`` followed by the intercept.
    """
    sample = gen_data(it)
    sample["Zdemean"] = sample["Z"] - sample["Z"].mean(axis=0)
    formulas = (
        "Y ~ D",                        # CL
        "Y ~ D + Zdemean",              # CRA
        "Y ~ D + Zdemean+ Zdemean*D",   # IRA
    )
    estimates = []
    for formula in formulas:
        params = smf.ols(formula, sample).fit().params
        estimates.extend((params["D"], params["Intercept"]))
    return tuple(estimates)


# Number of simulation replications; replication i uses seed i.
B = 1000
tasks = (delayed(exp)(seed) for seed in range(B))
res = Parallel(n_jobs=-1, verbose=3)(tasks)

In [None]:
# Transpose the per-replication tuples into one array per estimated quantity.
CLcoefs, CLints, CRAcoefs, CRAints, IRAcoefs, IRAints = (
    np.array(column) for column in zip(*res)
)

In [None]:
# Spread of the ATE estimates across simulation replications.
print("Standard deviations for ATE based on different estimators")
for label, draws in (
    ("Two means ATE std: ", CLcoefs),
    ("Non-interactive ATE std: ", CRAcoefs),
    ("Interactive ATE std: ", IRAcoefs),
):
    print(label, np.std(draws))

In [None]:
# Spread of the intercept (baseline) estimates across simulation replications.
print("Standard deviations for Baseline based on different estimators")
for label, draws in (
    ("Two means Baseline std: ", CLints),
    ("Non-interactive Baseline std: ", CRAints),
    ("Interactive Baseline std: ", IRAints),
):
    print(label, np.std(draws))