In [1]:
from statsmodels.sandbox.regression.gmm import IV2SLS 
from statsmodels.api import OLS, Logit
import numpy as np
import pandas as pd

In [2]:
N = 5000

u = np.random.normal(size=N)

z = np.random.normal(size=N)

p_d = 1. / (1. + np.exp(-(u+z)))
d = np.random.binomial(1, p=p_d)

y0 = np.random.normal(size=N)
y1 = np.random.normal(u)
y = (d==1)* y1 + (d==0) * y0
X = pd.DataFrame({'d': d, 'y': y, 'z': z, 'y0': y0, 'y1': y1})


In [3]:
X['intercept'] = 1.

In [4]:
X[X['d'] == 1]['y'].mean() - X[X['d'] == 0]['y'].mean()

0.3781502927687042

In [5]:
(X['y1'] - X['y0']).mean()

0.017696338574287915

#### Let's make sure our instrument is reasonably strong...

In [6]:
X[['z', 'd']].corr() 

Unnamed: 0,z,d
z,1.0,0.372853
d,0.372853,1.0


#### First, let's do it in two stages manually, so we can see the process

In [7]:
instrument_model = Logit(X['d'], X[['z', 'intercept']])
instrument_result = instrument_model.fit()

X['d_expected'] = instrument_result.predict(X[['z', 'intercept']])
causal_model = OLS(X['y'], X[['d_expected', 'intercept']])
result = causal_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.618227
         Iterations 5


0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.3505
Date:,"Sun, 16 Dec 2018",Prob (F-statistic):,0.554
Time:,17:38:15,Log-Likelihood:,-8023.1
No. Observations:,5000,AIC:,16050.0
Df Residuals:,4998,BIC:,16060.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
d_expected,0.0538,0.091,0.592,0.554,-0.124,0.232
intercept,0.1447,0.049,2.966,0.003,0.049,0.240

0,1,2,3
Omnibus:,26.575,Durbin-Watson:,1.996
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.196
Skew:,0.148,Prob(JB):,7.54e-07
Kurtosis:,3.218,Cond. No.,6.73


#### Now, using statsmodel's implementation for 2sls. 
Note: their result.summary method is broken!!

In [8]:
model = IV2SLS(X['y'], X[['d', 'intercept']], instrument=X[['z', 'intercept']])
result = model.fit()
result.conf_int()

Unnamed: 0,0,1
d,-0.122317,0.234678
intercept,0.047724,0.239295


#### This all works even if Z doesn't cause D! As long as they're associated (and D doesn't cause Z) then you're okay!

In [9]:
N = 5000

u = np.random.normal(size=N)

uz = np.random.normal(size=N)

z = np.random.normal(uz, size=N)

p_d = 1. / (1. + np.exp(-(u+uz)))
d = np.random.binomial(1, p=p_d)

y0 = np.random.normal(size=N)
y1 = np.random.normal(u)
y = (d==1)* y1 + (d==0) * y0
X = pd.DataFrame({'d': d, 'y': y, 'z': z, 'y0': y0, 'y1': y1})

#### Let's make sure the instrument is reasonably strong!

In [10]:
X[['z', 'd']].corr()

Unnamed: 0,z,d
z,1.0,0.26619
d,0.26619,1.0


In [11]:
X['intercept'] = 1.

In [12]:
model = IV2SLS(X['y'], X[['d', 'intercept']], instrument=X[['z', 'intercept']])
result = model.fit()
result.conf_int()

Unnamed: 0,0,1
d,-0.426694,0.079613
intercept,0.157059,0.417365


#### Looks good!

#### What if the assumption that Z only causes Y through D is violated?



In [13]:
N = 5000

u = np.random.normal(size=N)

z = np.random.normal(size=N)

a = z + np.random.normal(size=N)

p_d = 1. / (1. + np.exp(-(u+z)))
d = np.random.binomial(1, p=p_d)

y0 = np.random.normal(size=N)
y1 = a + np.random.normal(u)
y = (d==1)* y1 + (d==0) * y0
X = pd.DataFrame({'d': d, 'y': y, 'z': z, 'a' :a, 'y0': y0, 'y1': y1})

In [14]:
X.corr()

Unnamed: 0,d,y,z,a,y0,y1
d,1.0,0.247074,0.379669,0.273664,-0.010638,0.386919
y,0.247074,1.0,0.352677,0.476923,0.321049,0.670205
z,0.379669,0.352677,1.0,0.714752,0.003129,0.512113
a,0.273664,0.476923,0.714752,1.0,0.001102,0.713514
y0,-0.010638,0.321049,0.003129,0.001102,1.0,0.006791
y1,0.386919,0.670205,0.512113,0.713514,0.006791,1.0


In [15]:
X['intercept'] = 1

In [16]:
model = IV2SLS(X['y'], X[['d', 'intercept']], instrument=X[['z', 'intercept']])
result = model.fit()
result.conf_int()

Unnamed: 0,0,1
d,2.641166,3.183834
intercept,-1.191711,-0.903204


#### Lot's of bias!!!

We can fix it!!!

In [17]:
# here, we have to include 'a' in the instrument definition as well, 
# even though it's a control variable for the y regression
model = IV2SLS(X['y'], X[['d', 'intercept', 'a']], 
               instrument=X[['z', 'intercept', 'a']])
result = model.fit()
result.conf_int()

Unnamed: 0,0,1
d,-0.087369,0.489219
intercept,0.1526,0.448994
a,0.463688,0.53996


In [18]:
instrument_model = Logit(X['d'], X[['z', 'intercept']])
instrument_result = instrument_model.fit()

X['d_expected'] = instrument_result.predict(X[['z', 'intercept']])
causal_model = OLS(X['y'], X[['d_expected', 'a', 'intercept']])
result = causal_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.615328
         Iterations 5


0,1,2,3
Dep. Variable:,y,R-squared:,0.228
Model:,OLS,Adj. R-squared:,0.227
Method:,Least Squares,F-statistic:,736.5
Date:,"Sun, 16 Dec 2018",Prob (F-statistic):,4.97e-281
Time:,17:38:16,Log-Likelihood:,-8696.8
No. Observations:,5000,AIC:,17400.0
Df Residuals:,4997,BIC:,17420.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
d_expected,0.1650,0.145,1.136,0.256,-0.120,0.450
a,0.5054,0.019,26.174,0.000,0.468,0.543
intercept,0.3187,0.075,4.265,0.000,0.172,0.465

0,1,2,3
Omnibus:,8.806,Durbin-Watson:,1.956
Prob(Omnibus):,0.012,Jarque-Bera (JB):,9.054
Skew:,0.08,Prob(JB):,0.0108
Kurtosis:,3.134,Cond. No.,12.1
