In [69]:
from statsmodels.sandbox.regression.gmm import IV2SLS 
from statsmodels.api import OLS, Logit
import numpy as np
import pandas as pd

In [92]:
N = 5000

u = np.random.normal(size=N)

z = np.random.normal(size=N)

p_d = 1. / (1. + np.exp(-(u+z)))
d = np.random.binomial(1, p=p_d)

y0 = np.random.normal(size=N)
y1 = np.random.normal(u)
y = (d==1)* y1 + (d==0) * y0
X = pd.DataFrame({'d': d, 'y': y, 'z': z, 'y0': y0, 'y1': y1})


In [93]:
X['intercept'] = 1.

In [96]:
X[X['d'] == 1]['y'].mean() - X[X['d'] == 0]['y'].mean()

0.34807908945587035

In [97]:
(X['y1'] - X['y0']).mean()

-0.007102631189169498

#### Let's make sure our instrument is reasonably strong...

In [110]:
X[['z', 'd']].corr() 

Unnamed: 0,z,d
z,1.0,0.253473
d,0.253473,1.0


#### First, let's do it in two stages manually, so we can see the process

In [98]:
instrument_model = Logit(X['d'], X[['z', 'intercept']])
instrument_result = instrument_model.fit()

X['d_expected'] = instrument_result.predict(X[['z', 'intercept']])
causal_model = OLS(X['y'], X[['d_expected', 'intercept']])
result = causal_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.614431
         Iterations 5


0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.4417
Date:,"Thu, 01 Mar 2018",Prob (F-statistic):,0.506
Time:,12:40:30,Log-Likelihood:,-8034.4
No. Observations:,5000,AIC:,16070.0
Df Residuals:,4998,BIC:,16090.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
d_expected,0.0591,0.089,0.665,0.506,-0.115,0.233
intercept,0.1666,0.048,3.493,0.000,0.073,0.260

0,1,2,3
Omnibus:,14.783,Durbin-Watson:,2.046
Prob(Omnibus):,0.001,Jarque-Bera (JB):,15.132
Skew:,0.115,Prob(JB):,0.000518
Kurtosis:,3.142,Cond. No.,6.56


#### Now, using statsmodel's implementation for 2sls. 
Note: their result.summary method is broken!!

In [108]:
model = IV2SLS(X['y'], X[['d', 'intercept']], instrument=X[['z', 'intercept']])
result = model.fit()
result.conf_int()

Unnamed: 0,0,1
d,-0.084862,0.08305
intercept,0.134165,0.22065


#### This all works even if Z doesn't cause D! As long as they're associated (and D doesn't cause Z) then you're okay!

In [113]:
N = 5000

u = np.random.normal(size=N)

uz = np.random.normal(size=N)

z = np.random.normal(uz, size=N)

p_d = 1. / (1. + np.exp(-(u+uz)))
d = np.random.binomial(1, p=p_d)

y0 = np.random.normal(size=N)
y1 = np.random.normal(u)
y = (d==1)* y1 + (d==0) * y0
X = pd.DataFrame({'d': d, 'y': y, 'z': z, 'y0': y0, 'y1': y1})

#### Let's make sure the instrument is reasonably strong!

In [114]:
X[['z', 'd']].corr()

Unnamed: 0,z,d
z,1.0,0.260994
d,0.260994,1.0


In [115]:
X['intercept'] = 1.

In [116]:
model = IV2SLS(X['y'], X[['d', 'intercept']], instrument=X[['z', 'intercept']])
result = model.fit()
result.conf_int()

Unnamed: 0,0,1
d,-0.29458,0.209399
intercept,0.060953,0.318652


#### Looks good!

#### What if the assumption that Z only causes Y through D is violated?



In [145]:
N = 5000

u = np.random.normal(size=N)

z = np.random.normal(size=N)

a = z + np.random.normal(size=N)

p_d = 1. / (1. + np.exp(-(u+z)))
d = np.random.binomial(1, p=p_d)

y0 = np.random.normal(size=N)
y1 = a + np.random.normal(u)
y = (d==1)* y1 + (d==0) * y0
X = pd.DataFrame({'d': d, 'y': y, 'z': z, 'a' :a, 'y0': y0, 'y1': y1})

In [146]:
X.corr()

Unnamed: 0,a,d,y,y0,y1,z
a,1.0,0.255894,0.474881,-0.021884,0.711883,0.704982
d,0.255894,1.0,0.245038,-0.014144,0.373323,0.363579
y,0.474881,0.245038,1.0,0.30826,0.65639,0.342872
y0,-0.021884,-0.014144,0.30826,1.0,-0.011003,-0.031003
y1,0.711883,0.373323,0.65639,-0.011003,1.0,0.514429
z,0.704982,0.363579,0.342872,-0.031003,0.514429,1.0


In [147]:
X['intercept'] = 1

In [148]:
model = IV2SLS(X['y'], X[['d', 'intercept']], instrument=X[['z', 'intercept']])
result = model.fit()
result.conf_int()

Unnamed: 0,0,1
d,2.618198,3.178242
intercept,-1.168116,-0.875108


#### Lot's of bias!!!

We can fix it!!!

In [149]:
# here, we have to include 'a' in the instrument definition as well, 
# even though it's a control variable for the y regression
model = IV2SLS(X['y'], X[['d', 'intercept', 'a']], 
               instrument=X[['z', 'intercept', 'a']])
result = model.fit()
result.conf_int()

Unnamed: 0,0,1
d,-0.152977,0.424428
intercept,0.178235,0.47021
a,0.470047,0.544819


In [150]:
instrument_model = Logit(X['d'], X[['z', 'intercept']])
instrument_result = instrument_model.fit()

X['d_expected'] = instrument_result.predict(X[['z', 'intercept']])
causal_model = OLS(X['y'], X[['d_expected', 'a', 'intercept']])
result = causal_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.622092
         Iterations 5


0,1,2,3
Dep. Variable:,y,R-squared:,0.226
Model:,OLS,Adj. R-squared:,0.225
Method:,Least Squares,F-statistic:,727.9
Date:,"Thu, 01 Mar 2018",Prob (F-statistic):,3.63e-278
Time:,12:56:02,Log-Likelihood:,-8602.5
No. Observations:,5000,AIC:,17210.0
Df Residuals:,4997,BIC:,17230.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
d_expected,0.1193,0.147,0.812,0.417,-0.169,0.407
a,0.5089,0.019,26.639,0.000,0.471,0.546
intercept,0.3323,0.074,4.473,0.000,0.187,0.478

0,1,2,3
Omnibus:,12.644,Durbin-Watson:,2.056
Prob(Omnibus):,0.002,Jarque-Bera (JB):,12.644
Skew:,0.121,Prob(JB):,0.0018
Kurtosis:,3.045,Cond. No.,12.1
