In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm 
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from scipy import stats
import warnings

# Silence deprecation chatter for the rest of the session.
# The filters are installed at module level on purpose: a
# `with warnings.catch_warnings():` block that contains nothing but the filter
# call restores the previous filters as soon as the block exits, so it never
# suppresses anything.
# Only deprecation categories are ignored, so that model warnings raised during
# fitting (e.g. convergence problems or ignored keyword arguments) stay visible.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

## data

In [None]:
# Columns retained for the MESA analysis (outcome, exposures, covariates,
# plus the 'site' and 'race' columns used further down).
analysis_feature = [
    'Y_10y', 'nSES', 'nFavFood', 'nPhysFac', 'nRS',
    'FamIncome', 'nutrition', 'PhysAct', 'currentSmoker', 'alc',
    'age', 'gender', 'Diabetes', 'hdl', 'totchol', 'sbp',
    'site', 'race',
]

# Read the CSV, keep complete cases only, then narrow to the analysis columns.
# Note that dropna() runs BEFORE the column selection, so a missing value in any
# CSV column (not just the analysis columns) removes the row.
mesa_std = (
    pd.read_csv('../data_processed/MESA/mesa_std.csv', index_col=False)
    .dropna()
    .loc[:, analysis_feature]
)

# Subset of rows whose 'race' code equals 1.
# NOTE(review): presumably the Black participants ("bla") -- confirm against the codebook.
mesa_bla_std = mesa_std.loc[mesa_std['race'] == 1]


In [None]:
# Columns retained for the JHS analysis.  This rebinds `analysis_feature`; the
# list here has no 'site' or 'race' entries, unlike the MESA one.
analysis_feature = [
    'Y_10y', 'nSES', 'nFavFood', 'nPhysFac', 'nRS',
    'FamIncome', 'nutrition', 'PhysAct', 'currentSmoker', 'alc',
    'age', 'gender', 'Diabetes', 'hdl', 'totchol', 'sbp',
]

# Read the CSV, keep complete cases only (dropna() runs over every CSV column),
# then narrow to the analysis columns.
jhs_std = (
    pd.read_csv('../data_processed/JHS/jhs_std.csv', index_col=False)
    .dropna()
    .loc[:, analysis_feature]
)

In [None]:
# check nSES distribution (kernel density estimate of the JHS nSES column)

nses_ax = jhs_std['nSES'].plot.kde()

# Label the axes object returned by pandas directly.
nses_ax.set_title('Density Plot for nSES')
nses_ax.set_xlabel('nSES Values')
nses_ax.set_ylabel('Density')

plt.show()

## Generalized IPW for single estimate

In [None]:
# Step 1: estimate propensity density

def conditional_densities(data, formula_ps_no_con, formula_ps_con, use_confounders=True):
    """Estimate the Normal density of each unit's observed treatment value.

    An OLS model of the treatment is fitted with one of the two formulas, and
    the treatment is then assumed to be Normally distributed around each fitted
    value with a common standard deviation (the sample std of the residuals).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to fit on; must contain every variable named in the chosen formula.
    formula_ps_no_con : str
        Treatment formula without confounders (e.g. ``"treatment ~ 1"``), used
        when ``use_confounders`` is False.
    formula_ps_con : str
        Treatment formula including confounders, used when ``use_confounders``
        is True.
    use_confounders : bool, default True
        Selects which of the two formulas is fitted.

    Returns
    -------
    pandas.Series
        Density of the observed treatment value for every row used in the fit,
        indexed like the model's fitted values.
    """
    formula = formula_ps_con if use_confounders else formula_ps_no_con

    model = sm.formula.ols(formula, data=data).fit()

    # Evaluate the density at the response variable of the fitted formula rather
    # than at a hard-coded column: the treatment is whatever sits on the
    # left-hand side of `formula`, and `endog` holds exactly the rows that the
    # fitted values refer to.
    treatment = np.asarray(model.model.endog)

    # One Normal distribution per row: mean = fitted value, shared scale.
    densities = stats.norm.pdf(
        treatment,
        loc=np.asarray(model.fittedvalues),
        scale=model.resid.std(),
    )
    return pd.Series(densities, index=model.fittedvalues.index)


In [None]:
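# TODO: check covariate balance after weighting.

# A Love plot displays standardized mean differences, but it only works for a binary treatment;
# for a continuous treatment, compare weighted treatment-covariate correlations instead.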


In [None]:
# Step 2: Weighted logistic regression
#
# NOTE(review): `data` and `generalized_ipw` must already exist when this cell
# runs.  In the cells visible in this notebook, `data` is only assigned in a
# later cell and `generalized_ipw` only inside bootstrap_G_IPW, so this cell
# fails on a fresh kernel unless they are defined first -- confirm the intended
# order.

formula_outcome = 'Y_10y ~ 1 + nSES'

# statsmodels' Logit has no `weights` argument, so passing the IPW weights to
# smf.logit leaves the fit unweighted.  A Binomial GLM (logit link by default)
# is the same model and does accept per-row weights via `freq_weights`.
# The weights are passed as a plain array so they are matched to rows by
# position, not by index label.
model = smf.glm(
    formula=formula_outcome,
    data=data,
    family=sm.families.Binomial(),
    freq_weights=np.asarray(generalized_ipw),
).fit()
print(model.summary())

In [None]:
# visualize the predictions

# Hypothesized dosage values for plotting: the integers -10 .. 9, used both as
# the nSES column and as the frame's index (which becomes the x-axis).
dose_grid = list(range(-10, 10))
dosage = pd.DataFrame({"nSES": dose_grid}, index=dose_grid)

# Model prediction at every grid point, drawn as a line.
response = model.predict(dosage)
ax = response.plot.line(
    xlabel="nSES value",
    ylabel="Prediction of 10-year CVD risk",
)


In [None]:
# check logistic regression assumption

import pygam
from pygam import LogisticGAM, s

X = data['nSES']   # predictor
y = data['Y_10y']  # response

# Logistic GAM with a single spline term on feature 0.
gam = LogisticGAM(s(0)).fit(X, y)

# One partial-dependence figure per non-intercept term; a roughly straight
# curve supports modelling the predictor linearly.
for i, term in enumerate(gam.terms):
    if not term.isintercept:
        XX = gam.generate_X_grid(term=i)
        pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)
        grid = XX[:, i]
        plt.figure()
        plt.plot(grid, pdep)
        plt.plot(grid, confi, c='r', ls='--')  # 95% interval bounds
        plt.title('Partial Dependence')
        plt.show()

## The partial dependence is roughly linear, so the linearity assumption holds

### bootstrap for CI

In [None]:
# return CI for the coefficient of treatment in the outcome function

def bootstrap_G_IPW(data, formula_ps_no_con, formula_ps_con, formula_outcome, iteration,
                    random_state=None):
    """Bootstrap the treatment coefficient of an IPW-weighted logistic outcome model.

    Each iteration resamples the rows of ``data`` with replacement, re-estimates
    the stabilised weights (density from the no-confounder treatment model
    divided by density from the confounder-adjusted one), fits the weighted
    outcome model, and records the coefficient at position 1 of the parameter
    vector, i.e. the first term after the intercept.

    Parameters
    ----------
    data : pandas.DataFrame
        Analysis data set.
    formula_ps_no_con, formula_ps_con : str
        Treatment-model formulas without / with confounders, passed through to
        ``conditional_densities``.
    formula_outcome : str
        Outcome-model formula; the treatment must be the first term after the
        intercept.
    iteration : int
        Number of bootstrap replicates (must be at least 1).
    random_state : int or None, default None
        Seed for reproducible resampling.  ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    str
        Sentence reporting the mean bootstrap coefficient and its 95%
        percentile interval, both rounded to 5 decimals.

    Raises
    ------
    ValueError
        If ``iteration`` is smaller than 1.
    """
    if iteration < 1:
        raise ValueError("iteration must be a positive integer")

    # A single generator shared across iterations, so every replicate differs
    # while the whole run is reproducible from one seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    coef_treatment = []

    for _ in range(iteration):
        # Resampling with replacement duplicates index labels; a fresh index
        # keeps everything downstream aligned purely by position.
        d_star = data.sample(
            n=data.shape[0], replace=True, random_state=rng
        ).reset_index(drop=True)

        # step 1: stabilised generalized inverse probability weights
        denominator = conditional_densities(
            d_star, formula_ps_no_con, formula_ps_con, use_confounders=True
        )
        numerator = conditional_densities(
            d_star, formula_ps_no_con, formula_ps_con, use_confounders=False
        )
        generalized_ipw = np.asarray(numerator) / np.asarray(denominator)

        # step 2: weighted logistic regression.
        # statsmodels' Logit has no `weights` argument, so the weights never
        # reached the fit; a Binomial GLM is the same model and accepts per-row
        # weights.  The former `with warnings.catch_warnings()` block closed
        # before the fit and therefore suppressed nothing, so it is dropped and
        # fit warnings stay visible.
        model = smf.glm(
            formula=formula_outcome,
            data=d_star,
            family=sm.families.Binomial(),
            freq_weights=generalized_ipw,
        ).fit()
        # Positional access: index 0 is the intercept, index 1 the treatment.
        coef_treatment.append(model.params.iloc[1])

    # Calculate mean and 95% CI (percentile method)
    mean_coef = np.round(np.mean(coef_treatment), 5)
    ci = np.round(np.percentile(coef_treatment, q=[2.5, 97.5]), 5)

    # Adjacent literals instead of a backslash continuation, which used to
    # inject the next line's indentation into the message.
    result_template = (
        "Using bootstrap method, the estimated coefficient of treatment "
        "in the outcome function is {mean_coef}, and its 95% CI is {ci}."
    )
    result = result_template.format(mean_coef=mean_coef, ci=ci)

    return result


In [None]:
# Treatment (nRS) models: intercept-only, and adjusted for the listed covariates.
formula_ps_no_con = "nRS ~ 1"
# Adjacent literals are concatenated by the parser; the leading spaces inside
# each piece keep the resulting string identical to the continued-line form.
formula_ps_con = (
    "nRS ~ 1 + "
    "        nSES + nFavFood + nPhysFac + "
    "        C(FamIncome)+C(nutrition)+C(PhysAct)+C(currentSmoker)+C(alc)+ "
    "        age + sbp + hdl + totchol + C(Diabetes) +"
    "        C(site)"
)

# Outcome model: 10-year outcome on the treatment alone.
formula_outcome = 'Y_10y ~ 1 + nRS'

# Work on an independent (deep) copy so the MESA subset itself is never modified.
data = mesa_bla_std.copy(deep=True)

In [None]:
# 1000 bootstrap replicates; the returned summary sentence is the cell's output.
bootstrap_G_IPW(
    data=data,
    formula_ps_no_con=formula_ps_no_con,
    formula_ps_con=formula_ps_con,
    formula_outcome=formula_outcome,
    iteration=1000,
)