## Prep

In [None]:
import statsmodels.api as sm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): with no category given, this also hides any warnings emitted
# while the models below are being fitted - confirm that this is intended.
warnings.filterwarnings('ignore')

### prep covariates

In [None]:
# Read the processed base table, keep only the columns this notebook works
# with (in the order listed), and give the two generically named outcome
# columns explicit names.
columns_to_keep = [
    # outcome indicators and time columns
    'event', 'y', 'y2', 'y3', 'time_12', 'time_13',
    # covariates
    'nbSESpc2score', 'nbK3paFacilities', 'N_UNFAV_CT00', 'G_bla_rk',
    'PA3cat', 'nutrition3cat',
    'age', 'gender', 'currentSmoker', 'Diabetes', 'sbp', 'hdl', 'totchol',
    'alc', 'fmlyinc',
]

dat = (
    pd.read_csv('data/processed/jhs_cox_base.csv')
    .loc[:, columns_to_keep]
    .rename(columns={'event': 'y_tot', 'y': 'y_base'})
)

In [None]:
# dummy coding
#
# One-hot encode the categorical covariates. drop_first=True removes the first
# level of every variable, so k levels become k-1 indicator columns and the
# dropped level acts as the reference category.
#
# dtype=int is requested explicitly: since pandas 2.0 the indicator columns
# default to bool, and a frame that mixes bool and float columns is converted
# to an object array, which statsmodels' Logit rejects.

cols_to_convert = ['nbSESpc2score', 'nbK3paFacilities', 'N_UNFAV_CT00', 'G_bla_rk',
                   'PA3cat', 'nutrition3cat',
                   'gender', 'currentSmoker', 'Diabetes', 'alc', 'fmlyinc']

dat = pd.get_dummies(dat, columns=cols_to_convert, drop_first=True, dtype=int)

### outcome at different time points

In [None]:
# Y as outcome during the whole study period
#
# y_tot stays as the only outcome column; the other outcome indicators and the
# time columns are removed so that everything left besides y_tot is a predictor.

not_needed_for_tot = ['y_base', 'y2', 'y3', 'time_12', 'time_13']
dat_tot = dat.drop(columns=not_needed_for_tot)

In [None]:
# print(dat['time_12'].describe())
# print(dat['time_13'].describe())

In [None]:
# Y as outcome within 5 yrs
#
# Keep only rows whose time_12 is at most 1825, then leave y2 as the single
# outcome column.
# NOTE(review): 1825 = 5 * 365, so time_12 is presumably measured in days - confirm.

within_5y = dat['time_12'] <= 1825
dat_less_5y = dat.loc[within_5y].drop(
    columns=['y_tot', 'y_base', 'y3', 'time_12', 'time_13']
)

In [None]:
# Y as outcome after 5 yrs
#
# y3 stays as the single outcome column.
# NOTE(review): unlike the within-5-years subset, no row filter on the time
# columns is applied here, so every row of dat is kept - confirm this is intended.

not_needed_for_over_5y = ['y_base', 'y_tot', 'y2', 'time_12', 'time_13']
dat_over_5y = dat.drop(columns=not_needed_for_over_5y)

### Logit Stepwise

In [None]:
def forward_stepwise_selection_with_lock(X, Y, locked_features, significance_level=0.2):
    """Forward stepwise selection for a logit model with some features locked in.

    Starting from ``locked_features``, each round fits one logit model per
    remaining candidate (locked/selected features + that candidate + an
    intercept) and picks the candidate with the smallest p-value. The
    candidate is added if its p-value is below ``significance_level``;
    otherwise the search stops.

    Args:
        X: DataFrame holding the locked and candidate predictor columns.
        Y: Outcome passed to ``sm.Logit`` as the dependent variable.
        locked_features: Column names of ``X`` that are always part of the
            model. The list is copied, never modified.
        significance_level: Entry threshold for a candidate's p-value.

    Returns:
        A new list: the locked features followed by the added features in the
        order they entered the model.
    """
    # Start from a private copy of the locked features. Reading a module-level
    # list here would make the result depend on notebook execution order and
    # would modify that list as features are added.
    selected_features = list(locked_features)
    remaining_features = [col for col in X.columns if col not in selected_features]

    while remaining_features:
        best_pvalue = 1.0
        best_feature = None

        for feature in remaining_features:
            # Current model plus this one candidate, with an intercept column.
            design = sm.add_constant(X[selected_features + [feature]])
            fitted = sm.Logit(Y, design).fit(disp=0)
            p_value = fitted.pvalues[feature]

            if p_value < best_pvalue:
                best_pvalue = p_value
                best_feature = feature

        # Stop when no candidate improved on the initial p-value (e.g. all
        # p-values were NaN) or when the best one misses the entry threshold.
        if best_feature is None or best_pvalue >= significance_level:
            break

        selected_features.append(best_feature)
        remaining_features.remove(best_feature)

    return selected_features


In [None]:
# specify model components
# dat_tot - y_tot; dat_less_5y - y2; dat_over_5y - y3

## Y
outcome_col = 'y3'
Y = dat_over_5y[outcome_col]

## X
# Names of all predictor columns. The outcome is excluded by name rather than
# by position, so this stays correct even if it is not the first column.
column_names = dat_over_5y.columns.tolist()
X = [col for col in column_names if col != outcome_col]

## locked_features
# Everything except the outcome and the dummy columns listed below is locked,
# i.e. always kept in the model; only the listed dummies are stepwise candidates.
columns_to_drop = ['y3',
                   'nbK3paFacilities_2.0', 'N_UNFAV_CT00_2.0', 'G_bla_rk_2.0',
                   'nbK3paFacilities_3.0', 'N_UNFAV_CT00_3.0', 'G_bla_rk_3.0',
                   'nbK3paFacilities_4.0', 'N_UNFAV_CT00_4.0', 'G_bla_rk_4.0']
locked_features = [col for col in column_names if col not in columns_to_drop]
#locked_features = []

## Starting feature list for the stepwise search: a copy of the locked
## features, so that adding to it leaves locked_features untouched
selected_features = locked_features.copy()

In [None]:
# Run the stepwise search on the predictor columns of dat_over_5y and show
# which features end up in the model.
predictor_frame = dat_over_5y[X]
selected_features = forward_stepwise_selection_with_lock(
    predictor_frame, Y, locked_features
)
print("Selected features:", selected_features)

In [None]:
# check model result w selected features
#
# Refit one logit model on the final feature set plus an intercept column.
# The summary call is left as the last expression so the notebook renders it.
X_subset = sm.add_constant(dat_over_5y[selected_features])
model = sm.Logit(endog=Y, exog=X_subset).fit(disp=0)
model.summary()

## Boosting

In [None]:
# Outcome for the whole study period, and every other column of dat_tot as
# the predictor matrix.
y = dat_tot['y_tot']
X = dat_tot.drop(columns='y_tot')

In [None]:
# Fit a gradient-boosting classifier on all predictors (fixed seed for
# reproducibility) and rank the predictors by their fitted importance.
clf = GradientBoostingClassifier(random_state=0)
clf.fit(X, y)

feature_importances = clf.feature_importances_

# One row per predictor, most important first.
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

print("Feature Importances:")
for feature_name, importance in zip(importance_df['Feature'],
                                    importance_df['Importance']):
    print(f"{feature_name}: {importance}")