## Prep

In [None]:
import statsmodels.api as sm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

import warnings
# Silence every warning so cell output stays compact.
# NOTE(review): a blanket 'ignore' also hides any warnings emitted while the
# models below are being fitted — consider narrowing it.
warnings.simplefilter('ignore')

# Never truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Read the processed baseline table and remove the 'idno' and 'EXAM' columns
# in a single chained expression.
dat = (
    pd.read_csv('../data_processed/Y_BaselineX_processed_full.csv')
    .drop(columns=['idno', 'EXAM'])
)

### dummy coding

In [None]:
# Categorical predictors that are expanded into indicator (dummy) columns.
cols_to_convert = ['F1_PC2','A_S1FAV','A_S1PAI','G_bla_rk',
                   'chdiet','chphysact', 'income',
                  'site','race','gender', 'cig', 'diabet','cural']

# drop_first=True omits the first level of each variable (reference category).
# dtype=int keeps the indicators numeric: pandas >= 2.0 returns bool columns by
# default, and a frame mixing bool and float columns is converted to an object
# array, which statsmodels rejects ("Pandas data cast to numpy dtype of object").
dat = pd.get_dummies(dat, columns=cols_to_convert, drop_first=True, dtype=int)

### outcome 

In [None]:
# check outcome distribution
#
# Only the event flag ('cvda') and the follow-up time ('cvdatt') are needed.
# 'idno' must not be selected here: it was dropped from `dat` right after the
# CSV was loaded, so asking for it raises a KeyError on a fresh kernel.
# NOTE(review): the thresholds suggest 'cvdatt' is measured in days — confirm.
# NOTE(review): this cell uses `< 3650`, while the modelling frame built later
# uses `<= 3650` for the 10-year event; confirm which boundary is intended.

outcome_dis = dat[['cvda', 'cvdatt']].copy()
outcome_dis['10y'] = (outcome_dis['cvdatt'] < 3650).astype(int)
outcome_dis['15y'] = (outcome_dis['cvdatt'] < 365*15).astype(int)

# Number of events among rows whose follow-up time falls under each cut-off,
# plus the total number of events.
print("y within 10 years =",outcome_dis.loc[outcome_dis['10y'] == 1, 'cvda'].sum(),"\n"
     "y within 15 years =",outcome_dis.loc[outcome_dis['15y'] == 1, 'cvda'].sum(),"\n"
     "tot Y (up to 18.5 yrs) =",outcome_dis['cvda'].sum())

In [None]:
# Partition the rows by follow-up time ('cvdatt'):
#   below 3650, from 3650 up to (not including) 365 * 15, and 365 * 15 or more.
df_less_10 = outcome_dis.loc[outcome_dis['cvdatt'] < 3650]
df_between_10_and_15 = outcome_dis.loc[
    (outcome_dis['cvdatt'] >= 3650) & (outcome_dis['cvdatt'] < 365 * 15)
]
df_greater_than_15 = outcome_dis.loc[outcome_dis['cvdatt'] >= 365 * 15]

# Report, for each band, the row count and the sum of the 'cvda' event flag.
print(
    "df_10y has {} rows and {} rows w CVD outcome\n".format(
        len(df_less_10), df_less_10['cvda'].sum()),
    "df_10to15y has {} rows and {} rows w CVD outcome\n".format(
        len(df_between_10_and_15), df_between_10_and_15['cvda'].sum()),
    "df_over_15y has {} rows and {} rows w CVD outcome\n".format(
        len(df_greater_than_15), df_greater_than_15['cvda'].sum()),
)


In [None]:
# Modelling frames for the different outcome periods

# Whole follow-up: 'cvda' stays as the outcome, the time column is removed.
dat_tot = dat.drop(columns=['cvdatt'])

# First 10 years: event_10y is 1 only when cvda == 1 and cvdatt <= 3650,
# otherwise 0; the original outcome and time columns are then removed.
dat_10y = dat.drop(columns=['cvda', 'cvdatt']).assign(
    event_10y=np.where(dat['cvda'].eq(1) & dat['cvdatt'].le(3650), 1, 0)
)

# Beyond 10 years: keep only rows with cvdatt > 3650 and expose 'cvda' under
# the name 'event_over_10y'.
dat_over_10y = (
    dat.loc[dat['cvdatt'] > 3650]
    .rename(columns={'cvda': 'event_over_10y'})
    .drop(columns=['cvdatt'])
)

## Logit Stepwise

In [None]:
def forward_stepwise_selection_with_lock(X, Y, locked_features, threshold=0.2):
    """Forward stepwise feature selection for a logit model, driven by p-values.

    The search starts from ``locked_features`` (kept in every model). In each
    round one ``sm.Logit`` is fitted per remaining candidate column (locked +
    already selected + candidate, plus an intercept), and the candidate with
    the smallest p-value is added, provided that p-value is below
    ``threshold``. The search stops when no candidate qualifies or none remain.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns; must contain every locked feature.
    Y : array-like
        Outcome passed unchanged to ``sm.Logit``.
    locked_features : list
        Column names forced into the model. The list itself is not modified.
    threshold : float, default 0.2
        Entry significance level a candidate's p-value must beat.

    Returns
    -------
    list
        The locked features followed by the selected features in entry order.
    """
    # Work on a private copy seeded from the argument. The function must not
    # depend on (or mutate) a notebook-level `selected_features` variable,
    # otherwise re-running the call cell continues from the previous result.
    selected_features = list(locked_features)
    remaining_features = [col for col in X.columns if col not in selected_features]

    while remaining_features:
        best_pvalue = 1.0
        best_feature = None

        for feature in remaining_features:
            model_features = selected_features + [feature]
            X_subset = sm.add_constant(X[model_features])  # intercept term
            try:
                model = sm.Logit(Y, X_subset).fit(disp=0)
            except np.linalg.LinAlgError:
                # Singular design (e.g. a candidate collinear with the current
                # model): this candidate cannot be evaluated, try the next one.
                continue
            p_value = model.pvalues[feature]

            # A NaN p-value compares False and is therefore never chosen.
            if p_value < best_pvalue:
                best_pvalue = p_value
                best_feature = feature

        if best_feature is not None and best_pvalue < threshold:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
        else:
            break

    return selected_features

In [None]:
# Model components for the stepwise run.
# Frame -> outcome column:
#   dat_tot -> cvda; dat_10y -> event_10y; dat_over_10y -> event_over_10y

## Y: outcome column of the chosen frame
Y = dat_tot['cvda']

## X: names of all columns of the chosen frame except the outcome
X = dat_tot.columns.drop('cvda').tolist()

## locked_features
# column_names / columns_to_drop are only consumed by the commented-out
# alternative below, which would lock every column not listed in columns_to_drop.
column_names = list(dat_tot.columns)
columns_to_drop = [
    'cvda',
    'F1_PC2_2.0', 'N_UNFAV_CT00_2.0', 'G_bla_rk_2.0',
    'F1_PC2_3.0', 'N_UNFAV_CT00_3.0', 'G_bla_rk_3.0',
    'F1_PC2_4.0', 'N_UNFAV_CT00_4.0', 'G_bla_rk_4.0',
]
#locked_features = [col for col in column_names if col not in columns_to_drop]
locked_features = []

## Starting point for the selection: a separate list holding the locked features
selected_features = list(locked_features)

In [None]:
# Run the forward stepwise search on the predictor columns named in X and
# report which features ended up in the model.
selected_features = forward_stepwise_selection_with_lock(
    X=dat_tot[X],
    Y=Y,
    locked_features=locked_features,
)
print("Selected features:", selected_features)

In [None]:
# Check the logit model result: fit on every column of dat_tot except the
# 'cvda' outcome (plus an intercept) and display the summary table.
# NOTE(review): this fits on all predictors, not on `selected_features`
# returned by the stepwise search — confirm the full model is the intended one.
X_subset = sm.add_constant(dat_tot.drop(columns=['cvda']))
model = sm.Logit(Y, X_subset).fit(disp=0)
model.summary()