## Prep

In [None]:
import statsmodels.api as sm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

import warnings
# Session-wide notebook settings: suppress every warning and make pandas
# print all columns of a DataFrame instead of truncating wide frames.
# NOTE(review): the blanket 'ignore' filter also hides any warnings raised
# while fitting the models below -- confirm that is intended.
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None)

In [None]:
# Read the processed baseline table and discard three columns that are not
# used as predictors (presumably identifiers/bookkeeping -- verify).
dat = (
    pd.read_csv('../data_processed/Y_BaselineX_processed_full.csv')
    .drop(columns=['idno', 'EXAM', 'cvdatt'])
)

In [None]:
# dummy coding
# Expand each categorical column listed below into 0/1 indicator columns.

cols_to_convert = ['F1_PC2','A_S1FAV','A_S1PAI','G_bla_rk',
                   'chdiet','chphysact', 'income',
                  'site','race','gender', 'cig', 'diabet','cural']

# drop_first=True omits the first level of every variable so it acts as the
# reference category (the models below add their own intercept).
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 produces bool
# columns by default, and a frame mixing bool with other numeric columns is
# converted to an object array, which sm.Logit refuses to fit.
dat = pd.get_dummies(dat, columns=cols_to_convert, drop_first=True, dtype=int)

## Logit Stepwise

In [None]:
def forward_stepwise_selection_with_lock(X, Y, locked_features,
                                         significance_level=0.2):
    """Forward stepwise feature selection for a logit model.

    Starting from ``locked_features`` (which are always kept), each round
    fits one ``sm.Logit`` model per remaining candidate, consisting of the
    currently selected features plus that candidate and an intercept. The
    candidate whose coefficient has the smallest p-value is added, provided
    that p-value is below ``significance_level``; otherwise the search stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors; columns are addressed by name.
    Y : array-like
        Response passed unchanged to ``sm.Logit``.
    locked_features : list of str or None
        Column names of ``X`` that are in the model from the start and are
        never removed. ``None`` is treated as an empty list.
    significance_level : float, optional
        Entry threshold for a candidate's p-value (default 0.2).

    Returns
    -------
    list of str
        A new list: the locked features followed by the added features in
        the order they entered. The caller's list is not modified.
    """
    # Seed the selection from the argument. Previously this name was never
    # assigned here, so the function silently read (and mutated) a
    # module-level ``selected_features`` and ignored ``locked_features``.
    selected_features = [] if locked_features is None else list(locked_features)
    remaining_features = [col for col in X.columns if col not in selected_features]

    while remaining_features:
        best_pvalue = 1.0
        best_feature = None

        for feature in remaining_features:
            model_features = selected_features + [feature]
            X_subset = sm.add_constant(X[model_features])  # intercept term
            model = sm.Logit(Y, X_subset).fit(disp=0)
            p_value = model.pvalues[feature]

            # NaN p-values compare False and are therefore never chosen.
            if p_value < best_pvalue:
                best_pvalue = p_value
                best_feature = feature

        # Stop when no candidate improved on the initial 1.0 or the best
        # one does not meet the entry threshold.
        if best_feature is None or not best_pvalue < significance_level:
            break

        selected_features.append(best_feature)
        remaining_features.remove(best_feature)

    return selected_features

In [None]:
# Assemble the inputs for the stepwise search.
# dat_tot - y_tot; dat_less_5y - y2; dat_over_5y - y3

## Y: the outcome column
Y = dat['cvda']

## X: names of every column except the outcome
X = dat.drop(columns='cvda').columns.tolist()

## locked_features
column_names = dat.columns.tolist()
# 'cvda' followed by the 2.0/3.0/4.0 level dummies of three variables,
# in level-major order.
columns_to_drop = ['cvda'] + [
    stem + '_' + level
    for level in ('2.0', '3.0', '4.0')
    for stem in ('F1_PC2', 'N_UNFAV_CT00', 'G_bla_rk')
]
# Alternative kept for reference: lock everything not in columns_to_drop.
#locked_features = [col for col in column_names if col not in columns_to_drop]
locked_features = []

## Start the selection list as an independent copy of the locked features
selected_features = list(locked_features)

In [None]:
# Run the forward search on the predictor columns and report the outcome.
selected_features = forward_stepwise_selection_with_lock(
    X=dat[X],
    Y=Y,
    locked_features=locked_features,
)
print("Selected features:", selected_features)

In [None]:
# check logit model result
# Fit a logit on every predictor in `dat` (all columns except 'cvda') plus
# an intercept, then display the summary table.
# NOTE(review): this uses all columns, not `selected_features` -- confirm
# that the full model is what should be inspected here.
X_subset = sm.add_constant(dat.drop(columns='cvda'))
model = sm.Logit(Y, X_subset).fit(disp=0)
model.summary()