In [78]:
# install your library

import os
import random
import warnings

# Silence warnings before the heavier third-party imports run.
warnings.filterwarnings('ignore')

import graphviz as gr
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

In [79]:
# Directory that holds the handbook CSV files.
# Set the CAUSAL_DATA_DIR environment variable to point the notebook at another
# location; when it is unset, the original local directory is used unchanged.
# Joining with "" guarantees a trailing separator, because other cells build
# file names by plain string concatenation (path + '<name>.csv').
path = os.path.join(
    os.environ.get(
        "CAUSAL_DATA_DIR",
        "C:/Users/zjy97/Downloads/python-causality-handbook-v1.0/matheusfacure-python-causality-handbook-f666303/causal-inference-for-the-brave-and-true/data/",
    ),
    "",
)

In [80]:
# 1. Use linear regression to get rid of confounding concerns.
# Try to include more features that are not correlated with your treatment.

# Read the wage data and discard every row that contains a missing value.
wage = (
    pd.read_csv(path + 'wage.csv')
    .dropna()
)

In [81]:
# NOTE(review): `data` was not defined by any earlier visible cell, so this cell
# raised NameError on a fresh kernel (Restart & Run All). The columns used below
# (`falsexam`, `format_ol`) look like the handbook's online-classroom dataset —
# TODO confirm the file name and whether a row filter was applied originally.
data = pd.read_csv(path + 'online_classroom.csv')

# Regress the exam score on the online-format indicator.
result = smf.ols('falsexam ~ format_ol', data=data).fit()
result.summary().tables[1]  # the format_ol coefficient is the ATE estimate (-4.2203 in the recorded output)

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,77.8555,0.762,102.235,0.000,76.357,79.354
format_ol,-4.2203,1.412,-2.990,0.003,-6.998,-1.443


In [None]:
# 2. Use grouped regression and dummy-variable regression for causal inference

In [84]:
# We keep working with the wage data.
# 2.1 Fit a regression to see how education is associated with log hourly wages.
model_1 = smf.ols(
    formula='lhwage ~ educ',
    data=wage,
).fit()

# Show only the coefficient table of the fitted model.
model_1.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.3071,0.104,22.089,0.000,2.102,2.512
educ,0.0536,0.008,7.114,0.000,0.039,0.068


In [94]:
# 2.2 Confounding still has to be dealt with; one option is grouped regression.
# Collapse the data to one row per education level: the mean log hourly wage
# plus the number of rows that fell into the group.
group_wage = (
    wage
    .assign(count=1)
    .groupby('educ')
    .agg(
        lhwage=('lhwage', 'mean'),
        count=('count', 'count'),
    )
    .reset_index()
)

# Weighted least squares on the group means, weighting each group by its size.
model_2 = smf.wls(
    formula='lhwage ~ educ',
    data=group_wage,
    weights=group_wage['count'],
).fit()
model_2.summary().tables[1]

# You can also group on more columns; that is equivalent to running a dummy regression.

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.3071,0.108,21.321,0.000,2.058,2.557
educ,0.0536,0.008,6.867,0.000,0.036,0.072


In [95]:
# 2.3 Dummy regression
# This lets you bring more columns into the regression and account for the
# impact of confounders.

# Example: the impact of graduating 12th grade on hourly wage.
# hwage is wage divided by hours; T is 1 for rows with educ > 12 and 0 otherwise.
new_wage = wage.assign(
    hwage=lambda d: d['wage'] / d['hours'],
    T=lambda d: (d['educ'] > 12).astype(int),
)

In [98]:
# `T*IQ` includes the interaction between T and IQ (the T:IQ row of the table);
# `T + IQ` would not.
(
    smf.ols(formula='hwage ~ T*IQ', data=new_wage)
    .fit()
    .summary()
    .tables[1]
)

# More dummy variables can be added to the formula in the same way.

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.9891,3.986,2.004,0.045,0.162,15.816
T,3.9215,6.027,0.651,0.515,-7.913,15.756
IQ,0.1302,0.042,3.135,0.002,0.049,0.212
T:IQ,-0.0065,0.058,-0.112,0.911,-0.121,0.108
