## Prep

In [1]:
%matplotlib inline

# Importing libraries 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib
import matplotlib.pyplot as plt


import warnings

# Seed NumPy's global generator so that random draws later in the notebook
# are repeatable from a fresh kernel.
np.random.seed(987)

# Record the library versions this notebook was executed with.
for label, module in (
    ("numpy      ", np),
    ("pandas     ", pd),
    ("statsmodels", sm),
    ("matplotlib ", matplotlib),
):
    print(label, module.__version__)

# NOTE(review): this hides every warning for the rest of the session,
# including any emitted while fitting models; consider narrowing the filter.
warnings.filterwarnings('ignore')

numpy       1.22.4
pandas      1.3.4
statsmodels 0.12.1
matplotlib  3.4.3


In [2]:
## Read preprocessed data

dat = pd.read_csv("data/jhs_complete_processed.csv")

# Visit 1 keeps every column; visits 2 and 3 contribute only their outcome,
# renamed so the three outcomes can sit side by side after merging.
dat_v1 = dat.loc[dat['visit'] == 1]
dat_v2 = dat.loc[dat['visit'] == 2, ['subjid', 'y']].rename(columns={'y': 'y2'})
dat_v3 = dat.loc[dat['visit'] == 3, ['subjid', 'y']].rename(columns={'y': 'y3'})

# Default (inner) joins on subjid: only subjects with a row at all three
# visits remain in merged_df.
merged_df = dat_v1.merge(dat_v2, on='subjid').merge(dat_v3, on='subjid')

# calculate Y
def get_Y_tot(df):
    '''
    Add a binary column 'y_tot' that is 1 when any of 'y', 'y2' or 'y3'
    equals 1 and 0 otherwise.

    Input: df - dataframe with columns 'y', 'y2' and 'y3'
    Output: the same dataframe object, modified in place, with 'y_tot' added
    '''
    event_at_any_visit = (df['y'] == 1) | (df['y2'] == 1) | (df['y3'] == 1)
    df['y_tot'] = 0  # default: no event recorded at any visit
    df.loc[event_at_any_visit, 'y_tot'] = 1
    return df
# Attach the pooled outcome column; `data` and `merged_df` name the same frame.
data = merged_df = get_Y_tot(merged_df)

In [3]:
## Select appropriate columns.
# Select from `merged_df` (the frame `data` was aliased to above) instead of
# from `data` itself: the original form overwrote its own input, so re-running
# this cell failed with a KeyError on 'y_tot' once the rename had happened.
data = merged_df[['y_tot','nbSESpc2score', 'nbK3paFacilities','N_UNFAV_CT00',
                  'sportIndex','hyIndex','activeIndex','darkgrnVeg', 'eggs','fish',
                  'age','gender', 'currentSmoker', 'Diabetes','sbp','hdl','totchol']].copy()
# The pooled outcome is exposed as `y`, the name used by the model formulas.
data = data.rename(columns={"y_tot": "y"})
data.head()

Unnamed: 0,y,nbSESpc2score,nbK3paFacilities,N_UNFAV_CT00,sportIndex,hyIndex,activeIndex,darkgrnVeg,eggs,fish,age,gender,currentSmoker,Diabetes,sbp,hdl,totchol
0,0,2.0,3.0,4.0,3.0,3.0,3.0,1.0,1.0,1.0,62,0,0.0,1.0,-1.485405,0.950038,2.16866
1,0,4.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,75,0,0.0,0.0,-0.215153,1.3853,1.192832
2,1,1.0,3.0,4.0,4.0,1.0,4.0,3.0,4.0,4.0,74,0,0.0,0.0,1.748479,-1.226273,-2.736854
3,0,2.0,3.0,1.0,1.0,2.0,3.0,1.0,1.0,4.0,60,0,1.0,0.0,-1.023152,-0.57338,-1.233551
4,0,3.0,3.0,3.0,1.0,1.0,2.0,4.0,3.0,3.0,60,1,0.0,1.0,-0.330402,-0.065574,-0.284097


In [4]:
'''
Bootstrapping to get confidence intervals for the causal effect
'''
def bootstrap_ci(treatment, formula_ps, formula_outcome, data, nb, ate, method,multilevel):
    '''
    Percentile-bootstrap 95% confidence interval for the causal effect.

    Input:
    treatment - column_name of the treatment variable
    formula_ps - Regression formula for the propensity score model (only used when method is "ipw")
    formula_outcome - Regression formula for the outcome model
    data - dataframe
    nb - number of bootstrapped samples (must be at least 1)
    ate - ate obtained prior to obtaining the confidence intervals; kept for
          backward compatibility, it is not used in the computation
    method - "ipw" or "gcomputation"
    multilevel - passed through to the chosen estimator
    Output: array [lower, upper] holding the 2.5th and 97.5th percentiles of
            the nb bootstrap estimates
    Raises: ValueError if method is not recognised or nb is smaller than 1

    Each replicate resamples the rows of data with replacement (same size as
    data) and re-estimates the effect with the chosen estimator.
    '''
    if method not in ("gcomputation", "ipw"):
        # Previously an unknown method silently produced zero estimates and
        # only failed later inside np.percentile.
        raise ValueError(
            "method must be 'gcomputation' or 'ipw', got {!r}".format(method))
    if nb < 1:
        raise ValueError("nb must be at least 1, got {!r}".format(nb))

    ate_rs = []
    for _ in range(nb):
        # Same size as the input, drawn with replacement. The index is reset
        # because resampling repeats row labels, and duplicate labels can
        # break label-based alignment/reindexing in the estimators.
        d_star = data.sample(n=data.shape[0], replace=True).reset_index(drop=True)

        if method == "gcomputation":
            ate_b = get_causal_effect_gcomputation(treatment,formula_outcome,d_star,multilevel)
        else:
            ate_b = get_causal_effect_ipw(treatment,d_star,formula_ps,formula_outcome,multilevel)
        ate_rs.append(ate_b)

    return np.percentile(ate_rs, q=[2.5, 97.5])


## G-computation

In [5]:
def get_causal_effect_gcomputation(treatment, formula, data,multilevel=False,
                                   treated_level=4, control_level=1):
    '''
    Input: treatment - column_name
           formula - Regression formula
           data - dataframe
           multilevel - specify if to use multilevel model for gcomputation
           treated_level - value of the treatment column defining the "treated" arm
                           (default 4, the value previously hard-coded)
           control_level - value of the treatment column defining the "control" arm
                           (default 1, the value previously hard-coded)
    Output: ate - average causal effect
    Raises: ValueError if either arm has no rows in data

    The main idea is to have two separate outcome models, one fitted on the rows
    with treatment == treated_level and one on the rows with
    treatment == control_level. Rows with any other treatment value are not
    used for fitting.

    We then predict the outcome for the entire data with both fitted models and
    take the mean difference of the two predictions as the causal effect.

    With multilevel=True each arm is fitted with smf.mixedlm grouped by the
    'gender' column; otherwise a binomial GLM is used.
    '''
    treated_rows = data.loc[data[treatment] == treated_level]
    control_rows = data.loc[data[treatment] == control_level]
    if treated_rows.empty or control_rows.empty:
        # Fail with an explicit message instead of an opaque fitting error.
        raise ValueError(
            "no rows with {} == {!r} and/or {} == {!r}".format(
                treatment, treated_level, treatment, control_level))

    if multilevel:
        fm_a1 = smf.mixedlm(formula, treated_rows, groups=treated_rows['gender']).fit()
        fm_a0 = smf.mixedlm(formula, control_rows, groups=control_rows['gender']).fit()
    else:
        f = sm.families.family.Binomial()
        fm_a1 = smf.glm(formula, treated_rows, family=f).fit()
        fm_a0 = smf.glm(formula, control_rows, family=f).fit()

    # Predict both potential outcomes for every row, whatever arm it was in.
    y_a1 = fm_a1.predict(data)
    y_a0 = fm_a0.predict(data)
    ate = np.mean(y_a1 - y_a0)
    return ate

In [6]:
print("Obtaining causal effect using gcomputation multilevel")

treatment = "nbSESpc2score"

# Adjustment set written once and shared by the outcome and propensity
# formulas, so the two cannot drift apart when a covariate is added or removed.
covariate_terms = [
    "C(nbK3paFacilities)", "C(N_UNFAV_CT00)",
    "C(sportIndex)", "C(hyIndex)", "C(activeIndex)",
    "C(darkgrnVeg)", "C(eggs)", "C(fish)",
    "age", "sbp", "hdl", "totchol",
    "C(currentSmoker)", "C(Diabetes)",
]
covariates_rhs = " + ".join(covariate_terms)
formula_g = "y ~ " + covariates_rhs
formula_ps = treatment + " ~ " + covariates_rhs

effect_g = get_causal_effect_gcomputation(treatment, formula_g, data, multilevel=True)
effect_g
# Optional (slow): percentile-bootstrap confidence interval for the estimate above.
#ci_g = bootstrap_ci(treatment,formula_ps,formula_g,data,100,effect_g,"gcomputation",multilevel=True)
#print("ATE from multilevel g-computation is {} with confidence interval {}".format(np.round(effect_g,10),ci_g))


Obtaining causal effect using gcomputation multilevel


0.028270539572825434

## IPW

In [None]:
def get_causal_effect_ipw(treatment,data,formula_ps,formula_outcome,multilevel=False):
    '''
    Estimate the average causal effect with normalised inverse-probability weights.

    Input:
    treatment - column_name
    data - dataframe
    formula_ps - Regression formula for the propensity score model
    formula_outcome - Regression formula for the outcome model
    multilevel - if True the propensity model is smf.mixedlm grouped by the
                 'gender' column, otherwise a binomial GLM
    Output: ate - average causal effect

    Steps: (1) fit the propensity model and predict a score for every row;
    (2) fit a binomial GLM outcome model and predict an outcome for every row;
    (3) weight the predicted outcomes by treatment/score and by
    (1-treatment)/(1-score), each normalised by the sum of its weights, and
    return the difference of the two weighted sums.
    '''
    binomial = sm.families.family.Binomial()

    # (1) Propensity scores.
    # NOTE(review): in the multilevel branch the scores come from a linear
    # mixed model, so nothing here keeps them inside (0, 1) - confirm intended.
    if not multilevel:
        propensity_fit = smf.glm(formula_ps, data, family=binomial).fit()
    else:
        propensity_fit = smf.mixedlm(formula_ps, data, groups=data['gender']).fit()
    propensity = propensity_fit.predict(data)

    # (2) Model-based outcome predictions (not the observed outcomes).
    y_hat = smf.glm(formula_outcome, data, family=binomial).fit().predict(data)

    # (3) Normalised weighted means for each arm.
    # NOTE(review): the arithmetic below treats the treatment column as a
    # 0/1 indicator - verify the coding of the column passed in.
    received = data[treatment]
    weighted_treated = received * y_hat / propensity / sum(received / propensity)
    weighted_control = (1 - received) * y_hat / (1 - propensity) / sum((1 - received) / (1 - propensity))
    return weighted_treated.sum() - weighted_control.sum()

## Summary

In [None]:
# `data` was narrowed earlier in the notebook to the nbSESpc2score analysis
# columns, so it no longer has 'idealHealthPA', 'nbProblems', etc. and indexing
# it here raised a KeyError. Select from the full merged frame instead and expose
# the pooled outcome 'y_tot' as 'y', matching what `data` calls 'y'.
# NOTE(review): presumably merged_df carries all of these columns - confirm
# against data/jhs_complete_processed.csv.
PA3cat_data = merged_df[['idealHealthPA','idealHealthNutrition',
                         'nbProblems',
                         'nbCohesion',
                         'nbViolence','nbSESanascore','nbK3paFacilities','y_tot','gender','currentSmoker','Diabetes','age','sbp','hdl','totchol']].copy()
PA3cat_data = PA3cat_data.rename(columns={'y_tot': 'y'})

# NOTE(review): get_causal_effect_ipw multiplies by the treatment column and by
# (1 - treatment), i.e. it expects a 0/1 coding - verify idealHealthPA is binary.
treatment = 'idealHealthPA'
outcome = 'y'
formula_ps = "idealHealthPA ~  C(nbK3paFacilities)+C(idealHealthNutrition)+C(nbProblems)+C(nbCohesion)+C(nbViolence)+ age + sbp + hdl + totchol + C(currentSmoker) + C(Diabetes)"
formula_outcome = "y ~ 1+ C(idealHealthPA)"

print("Obtaining causal effect using IPW multilevel")
effect_ipw= get_causal_effect_ipw(treatment,PA3cat_data,formula_ps,formula_outcome,multilevel=True)
# Bootstrap on the same frame the point estimate was computed from; the original
# passed `data`, which lacks the columns these formulas reference.
ci_ipw = bootstrap_ci(treatment,formula_ps,formula_outcome,PA3cat_data,100,effect_ipw,"ipw",multilevel=True)
print("ATE from multilevel IPW is {} with confidence interval {}".format(np.round(effect_ipw,10),ci_ipw))