In [5]:
import numpy as np
import statsmodels.api as sm
import linearmodels.iv.model as lm
from scipy import stats

# Set random seed for reproducibility
np.random.seed(42)

# Generate data
n = 10000
X1 = np.random.randn(n)  # Exogenous variable
instrument = np.random.randn(n)  # Instrument for X2
epsilon = np.random.randn(n)  # Error term

# Assume X2 is endogenous, correlated with epsilon
X2 = 0.5 * instrument + 0.5 * epsilon   #reduced form of the equation 

# Generate outcome variable Y
beta0, beta1, beta2 = 1, 2, 3   #parameter estimates 
Y = beta0 + beta1 * X1 + beta2 * X2 + epsilon     #equation after adding the instrumental variable to the sructural form 


In [6]:
# Original Regression
X_orig = sm.add_constant(np.column_stack((X1, X2)))  #Add a column of ones to an array (in this case adding columns X1 and X2)
model_original = sm.OLS(Y, X_orig).fit()   #estimating regression parameters using OLS using endog and exog varaibles 
orig_coef = model_original.params[2]
model_original.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.962
Model:,OLS,Adj. R-squared:,0.962
Method:,Least Squares,F-statistic:,128000.0
Date:,"Fri, 01 Sep 2023",Prob (F-statistic):,0.0
Time:,15:42:00,Log-Likelihood:,-10546.0
No. Observations:,10000,AIC:,21100.0
Df Residuals:,9997,BIC:,21120.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.9870,0.007,142.072,0.000,0.973,1.001
x1,2.0176,0.007,291.370,0.000,2.004,2.031
x2,3.9903,0.010,409.933,0.000,3.971,4.009

0,1,2,3
Omnibus:,0.152,Durbin-Watson:,1.999
Prob(Omnibus):,0.927,Jarque-Bera (JB):,0.131
Skew:,0.006,Prob(JB):,0.937
Kurtosis:,3.014,Cond. No.,1.41


In [11]:
# Or you can use linearmodels package to do 2SLS and generate test result as well
mlr2 = lm.IV2SLS(dependent=Y, exog=X1, endog=X2, instruments=instrument).fit(cov_type="homoskedastic", debiased=True) #Estimation of IV models using two-stage least squares
print(mlr2.wu_hausman())   #using hausman test to test for endogeneity

"""Test statistic is difference between sum of squared OLS and sum of
squared IV residuals where each set of residuals has been projected
onto the set of instruments in the IV model"""

Wu-Hausman test of exogeneity
H0: All endogenous variables are exogenous
Statistic: 4571.5444
P-value: 0.0000
Distributed: F(1,9997)


'Test statistic is difference between sum of squared OLS and sum of\nsquared IV residuals where each set of residuals has been projected\nonto the set of instruments in the IV model'

If the p-value from the chi-squared test is small (typically below a chosen significance level like 0.05), it suggests that the null hypothesis of no systematic difference between the OLS and IV coefficient estimates should be rejected. In this case, there is evidence of endogeneity, indicating that the OLS estimates are biased and inconsistent.

In [8]:
# First stage
X = sm.add_constant(np.column_stack((X1, instrument))) #Add a column of ones to an array (in this case adding columns X1 and IV)
model_first_stage = sm.OLS(X2, X).fit()
X2_hat = model_first_stage.predict(X)  # Predicted values of X2


In [9]:
# Second stage
X_main = sm.add_constant(np.column_stack((X1, X2_hat)))  #you obtain the predicted values of the endogenous variable based on the instrumental variables.
model_second_stage = sm.OLS(Y, X_main).fit()
second_stage_coef = model_second_stage.params[2]
model_second_stage.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.522
Model:,OLS,Adj. R-squared:,0.522
Method:,Least Squares,F-statistic:,5464.0
Date:,"Fri, 01 Sep 2023",Prob (F-statistic):,0.0
Time:,15:42:00,Log-Likelihood:,-23258.0
No. Observations:,10000,AIC:,46520.0
Df Residuals:,9997,BIC:,46540.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.9876,0.025,39.869,0.000,0.939,1.036
x1,2.0259,0.025,82.057,0.000,1.978,2.074
x2,3.0522,0.048,63.319,0.000,2.958,3.147

0,1,2,3
Omnibus:,3.316,Durbin-Watson:,1.995
Prob(Omnibus):,0.191,Jarque-Bera (JB):,3.159
Skew:,0.007,Prob(JB):,0.206
Kurtosis:,2.914,Cond. No.,1.95


For more information and how we should do Hausman Test, please refer to this Wikipedia page: https://en.wikipedia.org/wiki/Durbin%E2%80%93Wu%E2%80%93Hausman_test. After we calculated the test statistics, we can go to the Chi-square distribution table and find out the p-value. We can reject the null hypothesis if the test result is statistically significant and conclude that there is endogeneity problem exists in this model. We can also do the test using another package linearmodels.iv.model, for more information, please check this page: https://www.datascienceconcepts.com/tutorials/python-programming-language/exogeneity-wu-hausman-and-sargan-tests-in-python/.

In [10]:
# # Compute test statistic
# test_statistic = (orig_coef - second_stage_coef) * (1 / (model_second_stage.bse[2]**2) - (model_original.bse[2]**2)) * (orig_coef - second_stage_coef)

# # Compute the p-value
# p_value = 1 - stats.chi2.cdf(test_statistic, 1)
# print("Test Statistic:", test_statistic)
# print("P-value:", p_value)