## Prep

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns

import statsmodels.api as sm
from scipy import stats
import statsmodels.tools as tools
from scipy.stats import bootstrap


import warnings
# NOTE(review): this silences every warning, including statsmodels convergence
# warnings from the models fitted below.
warnings.filterwarnings('ignore')

# Use the fully qualified option names: since pandas 1.4 the short patterns
# 'max_columns' / 'max_rows' also match 'styler.render.max_columns' /
# 'styler.render.max_rows', so set_option raises OptionError on them.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set a seed for NumPy
np.random.seed(0)

# Set a seed for Python's built-in random module
random.seed(0)

### Raw data

In [None]:
# Load the raw MESA baseline extract and rename its columns to the shared
# variable names used throughout this notebook.
mesa_raw = (
    pd.read_csv('../mesa/data_processed/Y_BaselineX_raw_full.csv')
    .rename(columns={
        'cvda': 'Y_tot',
        'F1_PC2': 'nSES',
        'S1FAV': 'nFavFood',
        'S1PAI': 'nPhysFac',
        'G_bla_rk': 'nRS',
        'chdiet': 'nutrition',
        'chphysact': 'PhysAct',
        'income': 'FamIncome',
        'cig': 'currentSmoker',
        'cural': 'alc',
        'diabet': 'Diabetes',
        'chol': 'totchol',
    })
)

# Shift the 0/1/2 coding of the diet and physical-activity scores to 1/2/3.
diet_phys_map = {0: 1, 1: 2, 2: 3}
mesa_raw = mesa_raw.assign(
    nutrition=mesa_raw['nutrition'].replace(diet_phys_map),
    PhysAct=mesa_raw['PhysAct'].replace(diet_phys_map),
)

# Complete-case version of the raw data (dropna returns a new frame).
mesa = mesa_raw.dropna()

# Subset with race code 3 (presumably the Black-participant code, going by
# the variable name — confirm against the MESA codebook).
mesa_bla = mesa.loc[mesa['race'] == 3]


In [None]:
# Outcome table: keep only the participant id, event indicator and follow-up time.
jhs_outcome_full = pd.read_csv('../jhs/data/processed/jhs_cox_base.csv')
jhs_outcome = jhs_outcome_full.loc[:, ['subjid', 'event', 'time']].copy()

# Covariate table restricted to rows with visit == 1.
jhs_covar = pd.read_csv('../jhs/data/processed/jhs_raw_full.csv')
jhs_covar = jhs_covar.loc[jhs_covar['visit'] == 1]

# Left join so every participant in the outcome table is retained.
jhs_raw = jhs_outcome.merge(jhs_covar, on=['subjid'], how='left')

# NOTE(review): the fifth column is dropped by position; which column that is
# cannot be seen from this notebook — dropping it by name would be safer.
jhs_raw = jhs_raw.drop(columns=jhs_raw.columns[4])

# Rename to the shared variable names used throughout this notebook.
jhs_raw = jhs_raw.rename(columns={
    'event': 'Y_tot',
    'nbSESpc2score': 'nSES',
    'S1FAV': 'nFavFood',
    'S1PAI': 'nPhysFac',
    'G_bla_rk': 'nRS',
    'nutrition3cat': 'nutrition',
    'PA3cat': 'PhysAct',
    'fmlyinc': 'FamIncome',
})

# Recode diet / physical activity with the mapping defined in the MESA cell.
jhs_raw = jhs_raw.assign(
    nutrition=jhs_raw['nutrition'].replace(diet_phys_map),
    PhysAct=jhs_raw['PhysAct'].replace(diet_phys_map),
)

# Complete-case version of the raw data (dropna returns a new frame).
jhs = jhs_raw.dropna()

### Raw data with missing values imputed

In [None]:
def fillna_cat(df,cat_feat):
    """Fill missing values in categorical columns with each column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    cat_feat : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with NaNs in ``cat_feat`` replaced by the
        first mode of each column. A column with no non-missing values has
        no mode and is left unchanged.
    """
    for feat in cat_feat:
        modes = df[feat].mode()
        if modes.empty:
            # All values are missing: there is nothing to impute with.
            continue
        # Assign the column back instead of calling fillna(inplace=True) on
        # the df[feat] selection, which may act on a temporary copy and is
        # not guaranteed to update df.
        df[feat] = df[feat].fillna(modes.iloc[0])
    return df

def fillna_cont(df,cont_feat):
    """Return a new frame with NaNs in ``cont_feat`` replaced by column means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is not modified.
    cont_feat : list of str
        Names of the continuous columns whose means are used as fill values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns mean-imputed. Columns not in
        ``cont_feat`` have no fill value and keep their NaNs.
    """
    column_means = df[cont_feat].mean()
    return df.fillna(column_means)

# Columns imputed as categorical (mode) features.
cat_feat = [
    'FamIncome',
    'nutrition',
    'PhysAct',
    'currentSmoker',
    'alc',
    'Diabetes',
]

# Columns imputed as continuous (mean) features.
cont_feat = [
    'nSES',
    'nFavFood',
    'nPhysFac',
    'nRS',
    'hdl',
    'totchol',
    'sbp',
]

In [None]:
# Impute on a copy so mesa_raw keeps its missing values and this cell gives
# the same result when re-run.
mesa = fillna_cat(mesa_raw.copy(), cat_feat)
# Continuous features are mean-imputed with fillna_cont; the earlier version
# passed them to fillna_cat, which filled them with the mode instead.
mesa = fillna_cont(mesa, cont_feat)

mesa_bla = mesa[mesa['race'] == 3]

In [None]:
# Impute on a copy so jhs_raw keeps its missing values and this cell gives
# the same result when re-run.
jhs = fillna_cat(jhs_raw.copy(), cat_feat)
# Continuous features are mean-imputed with fillna_cont; the earlier version
# passed them to fillna_cat, which filled them with the mode instead.
jhs = fillna_cont(jhs, cont_feat)

### Processed data

In [None]:
# Load the processed MESA baseline file (this replaces the `mesa` built in
# the cells above) and rename its columns to the shared variable names.
mesa = (
    pd.read_csv('../mesa/data_processed/Y_BaselineX_processed_full.csv')
    .rename(columns={
        'cvda': 'Y_tot',
        'F1_PC2': 'nSES',
        'S1FAV': 'nFavFood',
        'S1PAI': 'nPhysFac',
        'G_bla_rk': 'nRS',
        'chdiet': 'nutrition',
        'chphysact': 'PhysAct',
        'income': 'FamIncome',
        'cig': 'currentSmoker',
        'cural': 'alc',
        'diabet': 'Diabetes',
        'chol': 'totchol',
    })
)

# Shift the 0/1/2 coding of the diet and physical-activity scores to 1/2/3.
diet_phys_map = {0: 1, 1: 2, 2: 3}
mesa = mesa.assign(
    nutrition=mesa['nutrition'].replace(diet_phys_map),
    PhysAct=mesa['PhysAct'].replace(diet_phys_map),
)

# Subset with race code 3 (presumably the Black-participant code, going by
# the variable name — confirm against the MESA codebook).
mesa_bla = mesa.loc[mesa['race'] == 3]

In [None]:
# Load the processed JHS file (this replaces the `jhs` built in the cells
# above). The existing 'nSES' column is dropped first so that
# 'nbSESpc2score' can take that name in the rename below.
jhs = (
    pd.read_csv('../jhs/data/processed/jhs_cox_base.csv')
    .drop(columns="nSES")
    .rename(columns={
        'event': 'Y_tot',
        'nbSESpc2score': 'nSES',
        'S1FAV': 'nFavFood',
        'S1PAI': 'nPhysFac',
        'G_bla_rk': 'nRS',
        'nutrition3cat': 'nutrition',
        'PA3cat': 'PhysAct',
        'fmlyinc': 'FamIncome',
    })
)

# Recode diet / physical activity with the mapping defined in the MESA cell.
jhs = jhs.assign(
    nutrition=jhs['nutrition'].replace(diet_phys_map),
    PhysAct=jhs['PhysAct'].replace(diet_phys_map),
)

## Moderation test

### MESA

In [None]:
# Outcome and covariates for the full MESA cohort.
y = mesa['Y_tot']

X = mesa[['nSES','nFavFood','nPhysFac', 'nRS',
          'FamIncome','nutrition', 'PhysAct',
          'age','gender','race','currentSmoker','alc','Diabetes','hdl','totchol','sbp']]

# dtype=float: pandas >= 2.0 emits bool dummy columns by default, which turns
# the design matrix into object dtype and makes sm.Logit raise. Float dummies
# give the same fit as the former uint8 default.
X = pd.get_dummies(X, columns=['gender','race','currentSmoker','alc','Diabetes'],
                    drop_first=True, dtype=float)

# One design matrix per moderation hypothesis: the base covariates plus a
# single product (interaction) term. assign() returns a new frame, so X
# itself is left unchanged.
X1 = X.assign(nSES_FamIncome=X['nSES'] * X['FamIncome'])
X2 = X.assign(nFavFood_FamIncome=X['nFavFood'] * X['FamIncome'])
X3 = X.assign(nPhysFac_FamIncome=X['nPhysFac'] * X['FamIncome'])
X4 = X.assign(nRS_FamIncome=X['nRS'] * X['FamIncome'])
X5 = X.assign(nFavFood_nutrition=X['nFavFood'] * X['nutrition'])
X6 = X.assign(nPhysFac_PhysAct=X['nPhysFac'] * X['PhysAct'])

In [None]:
# Fit the logistic model on design matrix X6 (nPhysFac x PhysAct interaction);
# substitute X1..X5 here to test the other interaction terms.
moderation_exog = sm.add_constant(X6)
logistic_model_with_moderation = sm.Logit(y, moderation_exog).fit()
print(logistic_model_with_moderation.summary())

### MESA BLACK

In [None]:
# Outcome and covariates for the race == 3 MESA subset.
y = mesa_bla['Y_tot']

X = mesa_bla[['nSES','nFavFood','nPhysFac', 'nRS',
          'FamIncome','nutrition', 'PhysAct',
          'age','gender','race','currentSmoker','alc','Diabetes','hdl','totchol','sbp']]

# dtype=float: pandas >= 2.0 emits bool dummy columns by default, which turns
# the design matrix into object dtype and makes sm.Logit raise. Float dummies
# give the same fit as the former uint8 default.
# 'race' has a single level in this subset, so with drop_first=True it
# contributes no dummy column.
X = pd.get_dummies(X, columns=['gender','race','currentSmoker','alc','Diabetes'],
                    drop_first=True, dtype=float)

# One design matrix per moderation hypothesis: the base covariates plus a
# single product (interaction) term. assign() returns a new frame, so X
# itself is left unchanged.
X1 = X.assign(nSES_FamIncome=X['nSES'] * X['FamIncome'])
X2 = X.assign(nFavFood_FamIncome=X['nFavFood'] * X['FamIncome'])
X3 = X.assign(nPhysFac_FamIncome=X['nPhysFac'] * X['FamIncome'])
X4 = X.assign(nRS_FamIncome=X['nRS'] * X['FamIncome'])
X5 = X.assign(nFavFood_nutrition=X['nFavFood'] * X['nutrition'])
X6 = X.assign(nPhysFac_PhysAct=X['nPhysFac'] * X['PhysAct'])

In [None]:
# Fit the logistic model on design matrix X6 (nPhysFac x PhysAct interaction);
# substitute X1..X5 here to test the other interaction terms.
moderation_exog = sm.add_constant(X6)
logistic_model_with_moderation = sm.Logit(y, moderation_exog).fit()
print(logistic_model_with_moderation.summary())

### JHS

In [None]:
# Outcome and covariates for the JHS cohort (no 'race' covariate here).
y = jhs['Y_tot']

X = jhs[['nSES','nFavFood','nPhysFac', 'nRS',
          'FamIncome','nutrition', 'PhysAct',
          'age','gender','currentSmoker','alc','Diabetes','hdl','totchol','sbp']]

# dtype=float: pandas >= 2.0 emits bool dummy columns by default, which turns
# the design matrix into object dtype and makes sm.Logit raise. Float dummies
# give the same fit as the former uint8 default.
X = pd.get_dummies(X, columns=['gender','currentSmoker','alc','Diabetes'],
                    drop_first=True, dtype=float)

# One design matrix per moderation hypothesis: the base covariates plus a
# single product (interaction) term. assign() returns a new frame, so X
# itself is left unchanged.
X1 = X.assign(nSES_FamIncome=X['nSES'] * X['FamIncome'])
X2 = X.assign(nFavFood_FamIncome=X['nFavFood'] * X['FamIncome'])
X3 = X.assign(nPhysFac_FamIncome=X['nPhysFac'] * X['FamIncome'])
X4 = X.assign(nRS_FamIncome=X['nRS'] * X['FamIncome'])
X5 = X.assign(nFavFood_nutrition=X['nFavFood'] * X['nutrition'])
X6 = X.assign(nPhysFac_PhysAct=X['nPhysFac'] * X['PhysAct'])

In [None]:
# Fit the logistic model on design matrix X6 (nPhysFac x PhysAct interaction);
# substitute X1..X5 here to test the other interaction terms.
moderation_exog = sm.add_constant(X6)
logistic_model_with_moderation = sm.Logit(y, moderation_exog).fit()
print(logistic_model_with_moderation.summary())

## Mediation Test

In [None]:
# Inputs for the mediation test: the analysis frame plus the column names of
# the outcome (Y), exposure (X) and mediator (M).
# Note: X is rebound here from the moderation design matrix to a column name.
df = mesa_bla
Y, X, M = 'Y_tot', 'nSES', 'FamIncome'

In [None]:
### estimate the indirect effect on original sample

# Step 1: Regression of M on X (path a)
model_mediator = sm.OLS(df[M], sm.add_constant(df[X])).fit()

# Step 2: Regression of Y on X; total effect (path c)
model_outcome = sm.Logit(df[Y], sm.add_constant(df[X])).fit(disp=0)

# Step 3: Regression of Y on both X and M (path b = coefficient on M)
model_combined = sm.Logit(df[Y], sm.add_constant(df[[X, M]])).fit(disp=0)

# Indirect effect for the original sample = a * b: the effect of X on M times
# the effect of M on Y controlling for X. The earlier version multiplied b by
# the total effect c from Step 2, which is not the indirect effect.
indirect_effect_original = model_mediator.params[X] * model_combined.params[M]


### estimate CI using bootstrapping

num_bootstrap_samples = 1000
bootstrap_results = []

for _ in range(num_bootstrap_samples):
    # Draw a bootstrap sample of the same size as df, with replacement.
    # No random_state is passed, so the draw uses NumPy's global RNG.
    bootstrap_sample = df.sample(frac=1, replace=True)

    # The bootstrap fits use their own names so the original-sample models
    # above are not overwritten and the report below describes the original
    # sample rather than the last bootstrap draw.

    # Path a on the bootstrap sample: regression of M on X
    boot_mediator = sm.OLS(bootstrap_sample[M], sm.add_constant(bootstrap_sample[X])).fit()

    # Path b on the bootstrap sample: regression of Y on both X and M
    boot_combined = sm.Logit(bootstrap_sample[Y], sm.add_constant(bootstrap_sample[[X, M]])).fit(disp=0)

    # Store the indirect effect (a * b) for this bootstrap sample
    bootstrap_results.append(boot_mediator.params[X] * boot_combined.params[M])

# Percentile 95% confidence interval of the bootstrapped indirect effect
ci_lower = round(np.percentile(bootstrap_results, 2.5),4)
ci_upper = round(np.percentile(bootstrap_results, 97.5),4)


### report result

# Print results (all three steps refer to the original-sample models)
print("Step 1 - Coefficient for X in M model:", round(model_mediator.params[X],4), 
      "p-value:", round(model_mediator.pvalues[X],4))

print("Step 2 - Coefficient for X in Y model:", round(model_outcome.params[X],4), 
      "p-value:", round(model_outcome.pvalues[X],4))

print("Step 3 - Coefficient for M in Y model (controlling for X):", round(model_combined.params[M],4), 
      "p-value:", round(model_combined.pvalues[M],4))

print("\nIndirect Effect (original sample):", round(indirect_effect_original,4))
print("Bootstrapped 95% Confidence Interval:", (ci_lower, ci_upper))

In [None]:
# model_mediator = sm.Logit(df[Y], sm.add_constant(df[X])).fit()
