In [None]:
%matplotlib inline

# Importing libraries 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib
import matplotlib.pyplot as plt
import warnings
import random



# Seed NumPy's global random number generator.
np.random.seed(987)

# Print the version of each core library next to its (padded) name.
for _label, _lib in (
    ("numpy      ", np),
    ("pandas     ", pd),
    ("statsmodels", sm),
    ("matplotlib ", matplotlib),
):
    print(_label, _lib.__version__)

## Functions

In [None]:
def get_causal_effect(treatment, formula, data, var):
    '''
    Estimate the average causal effect of a binary treatment on the outcome.

    Input: treatment - name of the treatment column; rows are split on the values 1 and 0
           formula - regression formula of the outcome model fitted in each arm
           data - dataframe holding the treatment column, the formula columns and `var`
           var - name of the column passed as the grouping variable of the mixed models
    Output: ate - average causal effect

    The main idea is to have two separate models for those with treatment=1 and treatment=0.
    According to this we split our data on the treatment and fit one mixed linear model per arm.

    We then predict the outcome for the entire data with both fitted models and
    take the mean difference of the two predictions, which is our causal effect.

    The summary of each fitted model is printed as a side effect.
    '''
    # Subset each arm once so the model frame and its grouping labels
    # are guaranteed to come from the same rows.
    treated = data.loc[data[treatment] == 1]
    control = data.loc[data[treatment] == 0]

    fm_a1 = smf.mixedlm(formula, treated, groups=treated[var]).fit()
    print(fm_a1.summary())
    fm_a0 = smf.mixedlm(formula, control, groups=control[var]).fit()
    print(fm_a0.summary())

    # Both models predict for every row, whatever treatment it actually received.
    # NOTE(review): mixed-model predictions presumably use the fixed effects only - confirm.
    y_a1 = fm_a1.predict(data)
    y_a0 = fm_a0.predict(data)
    ate = np.mean(y_a1 - y_a0)
    return ate

In [None]:
def bootstrap_ci(treatment, formula, data, nb, ate, var):
    '''
    Bootstrapping to get confidence intervals for the causal effect.

    Input: treatment - name of the treatment column; rows are split on the values 1 and 0
           formula - regression formula of the outcome model fitted in each arm
           data - dataframe to resample
           nb - number of bootstrap resamples to draw (can simply start with 10)
           ate - point estimate of the causal effect; centre of the normal-approximation interval
           var - name of the column passed as the grouping variable of the mixed models
    Output: ci_perc - 2.5th and 97.5th percentiles of the bootstrapped effects
            ci_approx - ate -/+ 1.96 * bootstrap standard error, rounded to 6 decimals

    Resample i is drawn with random_state = 29*i, so repeated calls on the
    same data produce the same intervals.
    '''
    ate_rs = []

    for i in range(nb):
        # Same size as the input data, drawn with replacement.
        d_star = data.sample(n=data.shape[0], replace=True, random_state=29 * i)

        # Subset each arm once so the model frame and its grouping labels match.
        treated = d_star.loc[d_star[treatment] == 1]
        control = d_star.loc[d_star[treatment] == 0]

        # Multilevel model using mixedlm, one per treatment arm.
        fm_a1 = smf.mixedlm(formula, treated, groups=treated[var]).fit()
        fm_a0 = smf.mixedlm(formula, control, groups=control[var]).fit()

        # Effect for this resample: mean difference of the two predictions
        # over every resampled row.
        y_a1 = fm_a1.predict(d_star)
        y_a0 = fm_a0.predict(d_star)
        ate_rs.append(np.mean(y_a1 - y_a0))

    print("95% Confidence limits for the ATE")
    ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])
    # Sample standard deviation of the resampled effects serves as the standard error.
    ate_se = np.std(ate_rs, ddof=1)
    ci_approx = np.round([ate - 1.96*ate_se,
                          ate + 1.96*ate_se], 6)
    return ci_perc, ci_approx

## Data Import

In [None]:
# Read the preprocessed MESA table from disk.
mesa = pd.read_csv("../../code/thesis_code/mesa_preprocess_entire.csv")

# Select appropriate columns. If additional columns such as lifestyle changes
# or behaviors need to be added, update here. For now we restrict to the
# columns used in the logistic regression.
data = mesa.loc[:, [
    'site1c', 'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2', 'gender_race',
]].copy()
data.head()

## By Gender

### Overall 

In [None]:
# Ignore all warnings from here on.
warnings.filterwarnings('ignore')

# The formula is based on the columns selected above. nSES is not included
# because the data is subsampled on nSES (the treatment) in the analysis.
treatment, var = 'nSES', 'gender1'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(race_2)"

# Point estimate of the causal effect.
ate = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc, ci_approx = bootstrap_ci(treatment, formula, data, nb=10, ate=ate, var=var)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)


### Site 3

In [None]:
# Keep only the rows recorded at site 3, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(3)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2',
]].copy()

treatment, var = 'nSES', 'gender1'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(race_2)"

# Point estimate of the causal effect.
ate3 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate3, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_3, ci_approx_3 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate3, var=var)
print("Percentile method:   ", ci_perc_3)
print("Normal approx method:", ci_approx_3)

### Site 4

In [None]:
# Keep only the rows recorded at site 4, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(4)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2',
]].copy()

treatment, var = 'nSES', 'gender1'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(race_2)"

# Point estimate of the causal effect.
ate4 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate4, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_4, ci_approx_4 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate4, var=var)
print("Percentile method:   ", ci_perc_4)
print("Normal approx method:", ci_approx_4)

### Site 5

In [None]:
# Keep only the rows recorded at site 5, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(5)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2',
]].copy()

treatment, var = 'nSES', 'gender1'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(race_2)"

# Point estimate of the causal effect.
ate5 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate5, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_5, ci_approx_5 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate5, var=var)
print("Percentile method:   ", ci_perc_5)
print("Normal approx method:", ci_approx_5)

### Site 7

In [None]:
# Keep only the rows recorded at site 7, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(7)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2',
]].copy()

treatment, var = 'nSES', 'gender1'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(race_2)"

# Point estimate of the causal effect.
ate7 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate7, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_7, ci_approx_7 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate7, var=var)
print("Percentile method:   ", ci_perc_7)
print("Normal approx method:", ci_approx_7)

### Site 8

In [None]:
# Keep only the rows recorded at site 8, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(8)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2',
]].copy()

treatment, var = 'nSES', 'gender1'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(race_2)"

# Point estimate of the causal effect.
ate8 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate8, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_8, ci_approx_8 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate8, var=var)
print("Percentile method:   ", ci_perc_8)
print("Normal approx method:", ci_approx_8)

## By Race

### Overall 

In [None]:
# Rebuild the full-cohort frame. Without this, `data` still holds the
# single-site subset assigned by the preceding per-site cell, so this
# "Overall" estimate would silently be computed on that one site only.
data = mesa[['site1c','nSES','y','cig1c','diabet1','age1c','sbp1c','hdl1','chol1','gender1','race_2','gender_race']].copy()

# The formula is based on the columns selected above. nSES is not included
# because the data is subsampled on nSES (the treatment) in the analysis.
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1)"
treatment = 'nSES'

# Here we obtain the causal effect, with race_2 as the grouping variable.
warnings.filterwarnings('ignore')
var = 'race_2'
ate = get_causal_effect(treatment,formula,data,var)
print("ATE", np.round(ate, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc, ci_approx = bootstrap_ci(treatment, formula, data, 10, ate, var)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)


### Site 3

In [None]:
# Keep only the rows recorded at site 3, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(3)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2',
]].copy()

treatment, var = 'nSES', 'race_2'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1)"

# Point estimate of the causal effect.
ate3 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate3, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_3, ci_approx_3 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate3, var=var)
print("Percentile method:   ", ci_perc_3)
print("Normal approx method:", ci_approx_3)

### Site 4

In [None]:
# Keep only the rows recorded at site 4, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(4)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2',
]].copy()

treatment, var = 'nSES', 'race_2'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1)"

# Point estimate of the causal effect.
ate4 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate4, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_4, ci_approx_4 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate4, var=var)
print("Percentile method:   ", ci_perc_4)
print("Normal approx method:", ci_approx_4)

### Site 5

In [None]:
# Keep only the rows recorded at site 5, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(5)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2',
]].copy()

treatment, var = 'nSES', 'race_2'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1)"

# Point estimate of the causal effect.
ate5 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate5, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_5, ci_approx_5 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate5, var=var)
print("Percentile method:   ", ci_perc_5)
print("Normal approx method:", ci_approx_5)

### Site 7

In [None]:
# Keep only the rows recorded at site 7, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(7)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2',
]].copy()

treatment, var = 'nSES', 'race_2'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1)"

# Point estimate of the causal effect.
ate7 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate7, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_7, ci_approx_7 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate7, var=var)
print("Percentile method:   ", ci_perc_7)
print("Normal approx method:", ci_approx_7)

### Site 8

In [None]:
# Keep only the rows recorded at site 8, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(8)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2',
]].copy()

treatment, var = 'nSES', 'race_2'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1)"

# Point estimate of the causal effect.
ate8 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate8, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_8, ci_approx_8 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate8, var=var)
print("Percentile method:   ", ci_perc_8)
print("Normal approx method:", ci_approx_8)

## By Site

### Overall

In [None]:
# Ignore all warnings from here on.
warnings.filterwarnings('ignore')

# Start again from the full table with every analysis column.
data = mesa.loc[:, [
    'site1c', 'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2', 'gender_race',
]].copy()

# The formula is based on the columns selected above. nSES is not included
# because the data is subsampled on nSES (the treatment) in the analysis.
treatment, var = 'nSES', 'site1c'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(race_2) + C(gender1)"

# Point estimate of the causal effect.
ate = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc, ci_approx = bootstrap_ci(treatment, formula, data, nb=10, ate=ate, var=var)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)


## By gender_race interaction term

### Overall

In [None]:
# Ignore all warnings from here on.
warnings.filterwarnings('ignore')

# Start again from the full table with every analysis column.
data = mesa.loc[:, [
    'site1c', 'nSES', 'y', 'cig1c', 'diabet1', 'age1c',
    'sbp1c', 'hdl1', 'chol1', 'gender1', 'race_2', 'gender_race',
]].copy()

# The formula is based on the columns selected above. nSES is not included
# because the data is subsampled on nSES (the treatment) in the analysis.
treatment, var = 'nSES', 'gender_race'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(race_2) + C(gender1)"

# Point estimate of the causal effect.
ate = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc, ci_approx = bootstrap_ci(treatment, formula, data, nb=10, ate=ate, var=var)
print("Percentile method:   ", ci_perc)
print("Normal approx method:", ci_approx)


### Site 3

In [None]:
# Keep only the rows recorded at site 3, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(3)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c', 'sbp1c',
    'hdl1', 'chol1', 'gender1', 'race_2', 'gender_race',
]].copy()

treatment, var = 'nSES', 'gender_race'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) +C(race_2)"

# Point estimate of the causal effect.
ate3 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate3, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_3, ci_approx_3 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate3, var=var)
print("Percentile method:   ", ci_perc_3)
print("Normal approx method:", ci_approx_3)

### Site 4

In [None]:
# Keep only the rows recorded at site 4, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(4)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c', 'sbp1c',
    'hdl1', 'chol1', 'gender1', 'race_2', 'gender_race',
]].copy()

treatment, var = 'nSES', 'gender_race'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) +C(race_2)"

# Point estimate of the causal effect.
ate4 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate4, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_4, ci_approx_4 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate4, var=var)
print("Percentile method:   ", ci_perc_4)
print("Normal approx method:", ci_approx_4)


### Site 5

In [None]:
# Keep only the rows recorded at site 5, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(5)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c', 'sbp1c',
    'hdl1', 'chol1', 'gender1', 'race_2', 'gender_race',
]].copy()

treatment, var = 'nSES', 'gender_race'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) + C(race_2)"

# Point estimate of the causal effect.
ate5 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate5, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_5, ci_approx_5 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate5, var=var)
print("Percentile method:   ", ci_perc_5)
print("Normal approx method:", ci_approx_5)

### Site 7

In [None]:
# Keep only the rows recorded at site 7, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(7)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c', 'sbp1c',
    'hdl1', 'chol1', 'gender1', 'race_2', 'gender_race',
]].copy()

treatment, var = 'nSES', 'gender_race'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) + C(race_2)"

# Point estimate of the causal effect.
ate7 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate7, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_7, ci_approx_7 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate7, var=var)
print("Percentile method:   ", ci_perc_7)
print("Normal approx method:", ci_approx_7)

### Site 8

In [None]:
# Keep only the rows recorded at site 8, then the modelling columns.
select_exam_2 = mesa[mesa['site1c'].eq(8)]
data = select_exam_2.loc[:, [
    'nSES', 'y', 'cig1c', 'diabet1', 'age1c', 'sbp1c',
    'hdl1', 'chol1', 'gender1', 'race_2', 'gender_race',
]].copy()

treatment, var = 'nSES', 'gender_race'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) + C(race_2)"

# Point estimate of the causal effect.
ate8 = get_causal_effect(treatment=treatment, formula=formula, data=data, var=var)
print("ATE", np.round(ate8, 10))

# Bootstrap confidence limits from 10 resamples.
ci_perc_8, ci_approx_8 = bootstrap_ci(treatment, formula, data, nb=10, ate=ate8, var=var)
print("Percentile method:   ", ci_perc_8)
print("Normal approx method:", ci_approx_8)