In [1]:
%matplotlib inline

# Importing libraries 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib
import matplotlib.pyplot as plt


# Seed NumPy's global random number generator so draws made through it are repeatable.
np.random.seed(987)

# Record the library versions this notebook was executed with.
print("numpy      ", np.__version__)
print("pandas     ", pd.__version__)
print("statsmodels", sm.__version__)
print("matplotlib ", matplotlib.__version__)
# NOTE(review): this silences every warning for the rest of the session, which
# presumably includes statsmodels convergence warnings -- consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

numpy       1.22.4
pandas      1.3.4
statsmodels 0.12.1
matplotlib  3.4.3


In [2]:
# Read the preprocessed data; no train/test split is needed.
# The file holds all individuals after missing-value imputation and
# standardization have been done. The outcome column y_tot is exposed as y.
data = (
    pd.read_csv("data/jhs_gcomputation.csv")
    .rename(columns={"y_tot": "y"})
)
data.head()

Unnamed: 0,subjid,nSES,nbSESpc2score,currentSmoker,Diabetes,gender,age,sbp,hdl,totchol,y1,y2,y3,y
0,100079,0,-0.095714,0,1,0,0.640241,-1.485405,0.950038,2.16866,0,0,0,0
1,100180,1,1.548708,0,0,0,1.726966,-0.215153,1.3853,1.192832,0,0,0,0
2,100291,0,-0.572135,0,0,0,1.643372,1.748479,-1.226273,-2.736854,1,0,0,1
3,100953,0,0.211655,1,0,0,0.473052,-1.023152,-0.57338,-1.233551,0,0,0,0
4,101030,1,0.334602,0,1,1,0.473052,-0.330402,-0.065574,-0.284097,0,0,0,0


In [3]:
# Keep only the columns used by the analysis. If additional columns (e.g.
# lifestyle changes or behaviors) are needed, add them to this list.
# For now this is restricted to the columns used for the logistic regression.
data = data.loc[
    :,
    [
        'nSES', 'y', 'gender', 'currentSmoker', 'Diabetes',
        'age', 'sbp', 'hdl', 'totchol',
    ],
].copy()
data.head()

Unnamed: 0,nSES,y,gender,currentSmoker,Diabetes,age,sbp,hdl,totchol
0,0,0,0,0,1,0.640241,-1.485405,0.950038,2.16866
1,1,0,0,0,0,1.726966,-0.215153,1.3853,1.192832
2,0,1,0,0,0,1.643372,1.748479,-1.226273,-2.736854
3,0,0,0,1,0,0.473052,-1.023152,-0.57338,-1.233551
4,1,0,1,0,1,0.473052,-0.330402,-0.065574,-0.284097


In [2]:
def get_causal_effect(treatment, formula, data, verbose=True):
    '''
    Estimate the average causal effect of a binary treatment with one outcome model per treatment arm.

    Input: treatment - name of the treatment column in data; rows are split on the values 1 and 0
           formula - regression formula of the outcome model (passed to smf.mixedlm)
           data - dataframe holding the treatment column, a 'gender' column (used as the
                  mixed-model grouping variable) and every variable named in formula
           verbose - when True (the default) print the summary of each fitted model
    Output: ate - mean, over all rows of data, of the treatment == 1 model's prediction
                  minus the treatment == 0 model's prediction

    The main idea is to have two separate models, one for those with treatment = 1 and one
    for those with treatment = 0. Each model is fitted on its own subsample, then both are
    used to predict the outcome for the entire data; the expected difference between the
    two sets of predictions is the causal effect.
    '''
    def _fit_arm(level):
        # Fit the outcome model only on the rows that received this treatment level.
        arm = data.loc[data[treatment] == level]
        return smf.mixedlm(formula, arm, groups=arm['gender']).fit()

    fm_a1 = _fit_arm(1)
    if verbose:
        print(fm_a1.summary())
    fm_a0 = _fit_arm(0)
    if verbose:
        print(fm_a0.summary())

    # Predict both outcomes for every individual, whatever treatment they actually received.
    y_a1 = fm_a1.predict(data)
    y_a0 = fm_a0.predict(data)
    return np.mean(y_a1 - y_a0)


In [37]:
# The formula is based on the columns selected above. nSES is not included
# because the data is subsampled on nSES in the analysis.
treatment = 'nSES'
formula = "y ~  age + sbp + hdl + totchol + C(currentSmoker) + C(Diabetes)"

In [38]:
# Obtain the causal effect for the configured treatment and print it
# rounded to 10 decimal places.
ate = get_causal_effect(treatment=treatment, formula=formula, data=data)
print("ATE", np.round(ate, 10))


             Mixed Linear Model Regression Results
Model:               MixedLM    Dependent Variable:    y        
No. Observations:    1579       Method:                REML     
No. Groups:          2          Scale:                 0.1168   
Min. group size:     539        Log-Likelihood:        -567.3635
Max. group size:     1040       Converged:             Yes      
Mean group size:     789.5                                      
----------------------------------------------------------------
                      Coef.  Std.Err.   z    P>|z| [0.025 0.975]
----------------------------------------------------------------
Intercept              0.116    0.020  5.833 0.000  0.077  0.154
C(currentSmoker)[T.1]  0.071    0.027  2.622 0.009  0.018  0.124
C(Diabetes)[T.1]       0.090    0.021  4.242 0.000  0.049  0.132
age                    0.060    0.009  6.409 0.000  0.042  0.079
sbp                    0.023    0.009  2.532 0.011  0.005  0.040
hdl                   -0.003    0.009 -

In [48]:
'''
Bootstrapping to get confidence intervals for the causal effect
'''
def bootstrap_ci(treatment, formula, data, nb, ate):
    '''
    Bootstrap 95% confidence limits for the average causal effect.

    Input: treatment - name of the treatment column in data; rows are split on the values 1 and 0
           formula - regression formula of the outcome model (passed to smf.mixedlm)
           data - dataframe holding the treatment column, a 'gender' column (used as the
                  mixed-model grouping variable) and every variable named in formula
           nb - number of bootstrap replicates (at least 1; can simply start with 10)
           ate - point estimate the normal-approximation interval is centred on
    Output: ci_perc - 2.5th and 97.5th percentiles of the bootstrap estimates
            ci_approx - [ate - 1.96*se, ate + 1.96*se] rounded to 6 decimals, where se is
                        the sample standard deviation (ddof=1) of the bootstrap estimates

    Raises ValueError when nb is smaller than 1.

    Each replicate resamples the rows of data with replacement, refits both arm models
    on that resample, and recomputes the effect on the same resample.
    '''
    if nb < 1:
        raise ValueError("nb must be at least 1 bootstrap replicate")

    def _fit_arm(sample, level):
        # Fit the outcome model only on the resampled rows with this treatment level.
        arm = sample.loc[sample[treatment] == level]
        return smf.mixedlm(formula, arm, groups=arm['gender']).fit()

    ate_rs = []
    for _ in range(nb):
        # Same size as the input data, drawn with replacement. The index is reset
        # because resampling with replacement repeats row labels.
        d_star = data.sample(n=data.shape[0], replace=True).reset_index(drop=True)
        # The models must be refitted on the resample itself; fitting them on the
        # original data would leave the estimation variability out of the interval.
        fm_a1 = _fit_arm(d_star, 1)
        fm_a0 = _fit_arm(d_star, 0)
        y_a1 = fm_a1.predict(d_star)
        y_a0 = fm_a0.predict(d_star)
        ate_rs.append(np.mean(y_a1 - y_a0))

    print("95% Confidence limits for the ATE")
    ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])
    ate_se = np.std(ate_rs, ddof=1)
    ci_approx = np.round([ate - 1.96*ate_se,
                          ate + 1.96*ate_se], 6)
    return ci_perc, ci_approx

In [49]:
# Bootstrap limits for the ATE computed above, using 10 replicates.
# This cell reads the notebook-level `treatment`, `formula`, `data` and `ate`.
# NOTE(review): the recorded "KeyError: 'nRes'" output suggests it was last run after
# `treatment` had been reassigned further down -- re-run the notebook top to bottom.
ci_perc, ci_approx = bootstrap_ci(
    treatment=treatment,
    formula=formula,
    data=data,
    nb=10,
    ate=ate,
)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)

KeyError: 'nRes'

In [3]:
# Load the file with more features; the outcome column y_tot is exposed as y.
g_data = pd.read_csv("data/jhs_gcomputation_vm.csv").rename(columns={"y_tot": "y"})
g_data.head()

Unnamed: 0,subjid,nSES,PA3cat,idealHealthPA,idealHealthNutrition,nbSESpc2score,nbK3FavorFoodstore,nbK3paFacilities,nbpctResiden1mi,currentSmoker,...,y1,y2,y3,y,nFood,nFac,nRes,PA3cat_0,PA3cat_1,PA3cat_2
0,100079,0,1,0,0,-0.095714,0.46,0.58,0.52,0,...,0,0,0,0,1,1,1,0,1,0
1,100180,1,1,0,0,1.548708,0.45,0.29,0.4,0,...,0,0,0,0,1,0,1,0,1,0
2,100291,0,1,0,0,-0.572135,0.43,0.55,0.33,0,...,1,0,0,1,1,1,1,0,1,0
3,100953,0,0,0,0,0.211655,0.49,0.48,0.38,1,...,0,0,0,0,1,1,1,1,0,0
4,101030,1,0,0,0,0.334602,0.37,0.47,0.22,0,...,0,0,0,0,1,1,0,1,0,0


In [4]:
# List every column available in the extended dataset.
list(g_data.columns)

['subjid',
 'nSES',
 'PA3cat',
 'idealHealthPA',
 'idealHealthNutrition',
 'nbSESpc2score',
 'nbK3FavorFoodstore',
 'nbK3paFacilities',
 'nbpctResiden1mi',
 'currentSmoker',
 'Diabetes',
 'gender',
 'age',
 'sbp',
 'hdl',
 'totchol',
 'y1',
 'y2',
 'y3',
 'y',
 'nFood',
 'nFac',
 'nRes',
 'PA3cat_0',
 'PA3cat_1',
 'PA3cat_2']

In [51]:
# Treatment: nSES. idealHealthPA and idealHealthNutrition are added to the outcome
# model as categorical covariates; the PA3cat_* dummy columns are not used.
treatment = 'nSES'
formula = "y ~  C(idealHealthPA)+C(idealHealthNutrition)+age + sbp + hdl + totchol + C(currentSmoker) + C(Diabetes)"

PA3cat_data = g_data.loc[
    :,
    [
        'idealHealthPA', 'idealHealthNutrition', 'nSES', 'y', 'gender',
        'currentSmoker', 'Diabetes', 'age', 'sbp', 'hdl', 'totchol',
    ],
].copy()

# Point estimate of the effect.
ate = get_causal_effect(treatment, formula, PA3cat_data)
print("ATE", np.round(ate, 10))

# Bootstrap limits around that estimate (10 replicates).
ci_perc, ci_approx = bootstrap_ci(treatment, formula, PA3cat_data, nb=10, ate=ate)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)

                 Mixed Linear Model Regression Results
Model:                  MixedLM      Dependent Variable:      y        
No. Observations:       1579         Method:                  REML     
No. Groups:             2            Scale:                   0.1170   
Min. group size:        539          Log-Likelihood:          -571.7944
Max. group size:        1040         Converged:               Yes      
Mean group size:        789.5                                          
-----------------------------------------------------------------------
                             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------------------------
Intercept                     0.116    0.020  5.728 0.000  0.076  0.155
C(idealHealthPA)[T.1]         0.001    0.023  0.053 0.958 -0.045  0.047
C(idealHealthNutrition)[T.1] -0.037    0.071 -0.530 0.596 -0.176  0.101
C(currentSmoker)[T.1]         0.071    0.027  2.609 0.009  0.018  0.124
C(Diabete

In [52]:
# Treatment: nFood. nSES now enters the outcome model as a plain (non-C()) term;
# the PA3cat_* dummy columns are not used.
treatment = 'nFood'
formula = "y ~  C(idealHealthPA)+C(idealHealthNutrition)+age + nSES+sbp + hdl + totchol + C(currentSmoker) + C(Diabetes) "

PA3cat_data = g_data.loc[
    :,
    [
        'idealHealthPA', 'idealHealthNutrition', 'nSES', 'nFood', 'y', 'gender',
        'currentSmoker', 'Diabetes', 'age', 'sbp', 'hdl', 'totchol',
    ],
].copy()

# Point estimate of the effect.
ate = get_causal_effect(treatment, formula, PA3cat_data)
print("ATE", np.round(ate, 10))

# Bootstrap limits around that estimate (10 replicates).
ci_perc, ci_approx = bootstrap_ci(treatment, formula, PA3cat_data, nb=10, ate=ate)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)

                 Mixed Linear Model Regression Results
Model:                  MixedLM      Dependent Variable:      y        
No. Observations:       1821         Method:                  REML     
No. Groups:             2            Scale:                   0.1219   
Min. group size:        606          Log-Likelihood:          -696.3650
Max. group size:        1215         Converged:               No       
Mean group size:        910.5                                          
-----------------------------------------------------------------------
                             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------------------------
Intercept                     0.107    0.017  6.321 0.000  0.074  0.140
C(idealHealthPA)[T.1]         0.003    0.022  0.125 0.901 -0.040  0.045
C(idealHealthNutrition)[T.1] -0.017    0.068 -0.247 0.805 -0.150  0.117
C(currentSmoker)[T.1]         0.076    0.025  2.995 0.003  0.026  0.125
C(Diabete

In [53]:
# Treatment: nFac. nSES enters the outcome model as a plain (non-C()) term;
# the PA3cat_* dummy columns are not used.
treatment = 'nFac'
formula = "y ~  C(idealHealthPA)+C(idealHealthNutrition)+age +sbp +nSES+ hdl + totchol + C(currentSmoker) + C(Diabetes) "

PA3cat_data = g_data.loc[
    :,
    [
        'idealHealthPA', 'idealHealthNutrition', 'nSES', 'nFac', 'y', 'gender',
        'currentSmoker', 'Diabetes', 'age', 'sbp', 'hdl', 'totchol',
    ],
].copy()

# Point estimate of the effect.
ate = get_causal_effect(treatment, formula, PA3cat_data)
print("ATE", np.round(ate, 10))

# Bootstrap limits around that estimate (10 replicates).
ci_perc, ci_approx = bootstrap_ci(treatment, formula, PA3cat_data, nb=10, ate=ate)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)

                 Mixed Linear Model Regression Results
Model:                  MixedLM      Dependent Variable:      y        
No. Observations:       1810         Method:                  REML     
No. Groups:             2            Scale:                   0.1104   
Min. group size:        618          Log-Likelihood:          -603.0535
Max. group size:        1192         Converged:               No       
Mean group size:        905.0                                          
-----------------------------------------------------------------------
                             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------------------------
Intercept                     0.097    0.016  6.042 0.000  0.065  0.128
C(idealHealthPA)[T.1]         0.017    0.020  0.834 0.404 -0.023  0.056
C(idealHealthNutrition)[T.1] -0.012    0.066 -0.178 0.859 -0.141  0.118
C(currentSmoker)[T.1]         0.080    0.025  3.239 0.001  0.032  0.129
C(Diabete

In [54]:
# Treatment: nRes. nSES, nFood and nFac enter the outcome model as plain (non-C())
# terms; the PA3cat_* dummy columns are not used.
treatment = 'nRes'
formula = "y ~  C(idealHealthPA)+C(idealHealthNutrition)+age +sbp +nSES+ nFood+nFac+hdl + totchol + C(currentSmoker) + C(Diabetes) "

PA3cat_data = g_data.loc[
    :,
    [
        'idealHealthPA', 'idealHealthNutrition', 'nSES', 'nFood', 'nFac', 'nRes',
        'y', 'gender', 'currentSmoker', 'Diabetes', 'age', 'sbp', 'hdl', 'totchol',
    ],
].copy()

# Point estimate of the effect.
ate = get_causal_effect(treatment, formula, PA3cat_data)
print("ATE", np.round(ate, 10))

# Bootstrap limits around that estimate (10 replicates).
ci_perc, ci_approx = bootstrap_ci(treatment, formula, PA3cat_data, nb=10, ate=ate)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)

                 Mixed Linear Model Regression Results
Model:                  MixedLM      Dependent Variable:      y        
No. Observations:       2045         Method:                  REML     
No. Groups:             2            Scale:                   0.0967   
Min. group size:        736          Log-Likelihood:          -550.7881
Max. group size:        1309         Converged:               Yes      
Mean group size:        1022.5                                         
-----------------------------------------------------------------------
                             Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------------------------
Intercept                     0.089    0.111  0.802 0.423 -0.128  0.306
C(idealHealthPA)[T.1]         0.024    0.017  1.419 0.156 -0.009  0.058
C(idealHealthNutrition)[T.1]  0.027    0.064  0.421 0.674 -0.099  0.153
C(currentSmoker)[T.1]         0.093    0.022  4.183 0.000  0.050  0.137
C(Diabete

In [None]:
# plot errorbar

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One point per neighborhood feature, in the order of the tick labels below.
# NOTE(review): the values are hard-coded, presumably transcribed from the runs
# above -- confirm they match the latest results.
x = [0.1, 0.3, 0.5, 0.7]
y = [0.01391, 0.01348, 0.00567, -0.00647]

# Error-bar half-width: distance from each point up to its upper limit.
errors = [
    top - centre
    for top, centre in zip([0.03553, 0.03571, 0.02646, 0.01491], y)
]

plt.errorbar(x, y, yerr=errors, fmt='o')
plt.axhline(y=0, color='black', linestyle='--')  # reference line at zero effect
plt.title('Neighborhood Feature Causal Effects')
# Blank ticks at 0 and 0.8 pad the axis on both sides of the four labelled points.
plt.xticks(
    ticks=[0, 0.1, 0.3, 0.5, 0.7, 0.8],
    labels=['', 'nSES', 'nFood', 'nFac', 'nRes', ''],
)
plt.show()

In [None]:
# Lower and upper interval limits, four values each.
# NOTE(review): presumably in the order nSES, nFood, nFac, nRes -- confirm.
lower, upper = (
    [-0.00771, -0.00874, -0.01511, -0.02728],
    [0.03553, 0.03571, 0.02646, 0.01491],
)
interval = [lower, upper]