In [None]:
%matplotlib inline

# Importing libraries 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib
import matplotlib.pyplot as plt


# Fix the seed of NumPy's global random generator so repeated runs start
# from the same random state.
np.random.seed(987)

# Record the library versions this notebook was run with.
for label, module in (
    ("numpy      ", np),
    ("pandas     ", pd),
    ("statsmodels", sm),
    ("matplotlib ", matplotlib),
):
    print(label, module.__version__)


In [None]:
'''
Read the preprocessed data; no train/test split is needed.
Read the entire dataset for all individuals, after missing-value imputation and standardization have been done.
'''

# Load the preprocessed table and expose the outcome column "y_tot" under
# the shorter name "y", which the regression formulas below refer to.
data = (
    pd.read_csv("data/jhs_gcomputation.csv")
    .rename(columns={"y_tot": "y"})
)
data.head()

In [None]:
'''
Select the columns used in the analysis. If additional columns (e.g. lifestyle changes or behaviors) need to be included, add them here.
For now we restrict to the columns used in the logistic regression case.
'''
# Keep only the treatment (nSES), the outcome (y) and the covariates that
# appear in the regression formula; .copy() detaches the result from the
# originally loaded frame.
data = data.loc[:, [
    'nSES',
    'y',
    'gender',
    'currentSmoker',
    'Diabetes',
    'age',
    'sbp',
    'hdl',
    'totchol',
]].copy()
data.head()

In [None]:
def get_causal_effect(treatment, formula, data):
    '''
    Estimate the average causal effect of a binary treatment.

    Input: treatment - name of the treatment column in `data`; rows are
                       split on the values 1 and 0 of this column
           formula - regression formula for the outcome model
           data - dataframe holding the treatment, outcome and covariates
    Output: ate - mean over all rows of `data` of the difference between
                  the prediction of the treated model and the prediction
                  of the untreated model

    One binomial GLM is fitted on the rows with treatment == 1 and another
    on the rows with treatment == 0. The summary of each fitted model is
    printed (treated first). Both models are then used to predict the
    outcome for every row of `data`, and the average difference of the two
    predictions is returned.
    '''
    binomial = sm.families.family.Binomial()

    # Predictions on the full data, keyed by the treatment level whose rows
    # were used to fit the model. Level 1 is handled first so that its
    # summary is printed before the level-0 summary.
    predicted = {}
    for level in (1, 0):
        arm_rows = data.loc[data[treatment] == level]
        arm_model = smf.glm(formula, arm_rows, family=binomial).fit()
        print(arm_model.summary())
        predicted[level] = arm_model.predict(data)

    return np.mean(predicted[1] - predicted[0])


In [None]:
'''
This formula is based on the columns selected above. Note that nSES is not included as a covariate, because the data are subset by nSES (the treatment) when the models are fitted.
'''
# Column used to split the data into the treated (1) and untreated (0) groups.
treatment = "nSES"

# Outcome model: continuous covariates enter as-is, the remaining ones are
# wrapped in C(...) so they are treated as categorical.
formula = "y ~  age + sbp + hdl + totchol + C(currentSmoker) + C(Diabetes) + C(gender)"

In [None]:
'''
Here we obtain the causal effect.
'''
# Point estimate of the effect of `treatment` under the outcome model `formula`.
ate = get_causal_effect(treatment=treatment, formula=formula, data=data)
print("ATE", np.round(ate, decimals=10))


In [None]:
'''
Bootstrapping to get confidence intervals for the causal effect
'''
def bootstrap_ci(treatment, formula, data, nb, ate):
    '''
    Bootstrap confidence limits for the average causal effect.

    Input: treatment - name of the treatment column; rows are split on the
                       values 1 and 0 of this column
           formula - regression formula for the outcome model
           data - dataframe to resample from
           nb - number of bootstrap replicates
           ate - point estimate used as the centre of the normal
                 approximation interval
    Output: (ci_perc, ci_approx)
            ci_perc - 2.5th and 97.5th percentiles of the replicate estimates
            ci_approx - ate -/+ 1.96 * (sample standard deviation of the
                        replicate estimates), rounded to 6 decimals

    Each replicate draws, with replacement, a sample of the same size as
    `data`, fits one binomial GLM per treatment level on that sample,
    predicts the outcome for every resampled row with both models and
    stores the mean difference of the two predictions. No explicit
    random_state is passed to the resampling call.
    '''
    n_rows = len(data)
    replicate_ates = []

    for _ in range(nb):
        # Resample rows with replacement, keeping the original sample size.
        resample = data.sample(n=n_rows, replace=True)

        binomial = sm.families.family.Binomial()
        predicted = {}
        # Treated model first, then untreated, each fitted on its own rows
        # of the resample and evaluated on the whole resample.
        for level in (1, 0):
            arm_rows = resample.loc[resample[treatment] == level]
            arm_model = smf.glm(formula, arm_rows, family=binomial).fit()
            predicted[level] = arm_model.predict(resample)

        replicate_ates.append(np.mean(predicted[1] - predicted[0]))

    print("95% Confidence limits for the ATE")

    # Percentile interval taken directly from the replicate distribution.
    percentile_limits = np.percentile(replicate_ates, q=[2.5, 97.5])

    # Normal approximation: the standard error is the sample standard
    # deviation (ddof=1) of the replicate estimates.
    half_width = 1.96 * np.std(replicate_ates, ddof=1)
    normal_limits = np.round([ate - half_width,
                              ate + half_width], 6)

    return percentile_limits, normal_limits

In [None]:
# Centre the normal-approximation interval on the ATE computed above rather
# than on a hand-copied number, so the interval cannot go stale when the
# data, the formula or the model changes.
ci_perc, ci_approx = bootstrap_ci(treatment, formula, data, nb=1000, ate=ate)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)

In [None]:
# Repeat the analysis with more features

In [None]:
# Load the extended table and expose the outcome column "y_tot" under the
# shorter name "y", which the regression formulas refer to.
g_data = (
    pd.read_csv("data/jhs_gcomputation_more.csv")
    .rename(columns={"y_tot": "y"})
)
g_data.head()

In [None]:
## PA3cat
# Keep the three PA3cat_* columns together with the treatment, the outcome
# and the covariates used in the outcome model.
PA3cat_data = g_data.loc[:, [
    'PA3cat_0', 'PA3cat_1', 'PA3cat_2',
    'nSES', 'y',
    'gender', 'currentSmoker', 'Diabetes',
    'age', 'sbp', 'hdl', 'totchol',
]].copy()

# Same treatment column; the outcome model additionally includes the
# PA3cat_* columns as categorical terms.
treatment = 'nSES'
formula = "y ~  C(PA3cat_0)+C(PA3cat_1)+C(PA3cat_2) +age + sbp + hdl + totchol + C(currentSmoker) + C(Diabetes) + C(gender)"

# Point estimate.
ate = get_causal_effect(treatment=treatment, formula=formula, data=PA3cat_data)
print("ATE", np.round(ate, decimals=10))

# Bootstrap limits, with the normal approximation centred on the estimate above.
ci_perc, ci_approx = bootstrap_ci(
    treatment=treatment,
    formula=formula,
    data=PA3cat_data,
    nb=1000,
    ate=ate,
)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)

In [None]:
# Plot the causal-effect estimates with error bars

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Horizontal positions and point estimates for the four neighborhood
# features (nSES, nFood, nFac, nRes, in that order).
x = [0.1, 0.3, 0.5, 0.7]
y = [0.01391, 0.01348, 0.00567, -0.00647]

# Lower and upper interval limits for each estimate (the same values are
# listed again in the next cell).
ci_lower = [-0.00771, -0.00874, -0.01511, -0.02728]
ci_upper = [0.03553, 0.03571, 0.02646, 0.01491]

# Asymmetric error bars: row 0 is the distance from each point down to its
# lower limit, row 1 the distance up to its upper limit. Deriving both
# whiskers from the upper limit alone would draw lower whiskers that do not
# end at the listed lower limits (e.g. nRes).
errors = np.array([np.subtract(y, ci_lower),
                   np.subtract(ci_upper, y)])

plt.errorbar(x, y, yerr=errors, fmt='o')
plt.title('Neighborhood Feature Causal Effects')
plt.ylabel('Estimated causal effect')
# Reference line at zero (no effect).
plt.axhline(y = 0, color = 'black', linestyle = '--')
# The unlabeled ticks at 0 and 0.8 pad the x-axis on both sides of the points.
plt.xticks((0,0.1, 0.3, 0.5, 0.7, 0.8), ('', 'nSES', 'nFood', 'nFac','nRes',''))
plt.show()

In [None]:
# Lower and upper interval limits, one entry per plotted estimate; the upper
# values are the ones used for the error bars in the plot above.
interval = [
    [-0.00771, -0.00874, -0.01511, -0.02728],  # lower limits
    [0.03553, 0.03571, 0.02646, 0.01491],      # upper limits
]
lower, upper = interval