In [1]:
import numpy as np
import numpy
import pandas as pd
from sksurv.metrics import concordance_index_censored, brier_score, cumulative_dynamic_auc
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.ensemble import ComponentwiseGradientBoostingSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

from scipy import stats
from sklearn import metrics
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve,f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import ShuffleSplit, GridSearchCV
from skopt import BayesSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [2]:
# Directory that holds the train/test feature and outcome CSV files.
# Kept in one place so the notebook can be pointed at another location
# without editing every read_csv call.
DATA_DIR = '/users/PAS2433/dai417osc/WHI_sp23/data'

# Training features and training outcomes.
x_train1 = pd.read_csv(DATA_DIR + '/sp23_nobmd_Xtrain_0820.csv')
y_train1 = pd.read_csv(DATA_DIR + '/sp23_nobmd_Ytrain_competing_risk_0727.csv')

# Test features and test outcomes.
# NOTE(review): the outcome files carry different suffixes ("0727" for train,
# "0816_10y" for test) -- confirm both use the same outcome definition and
# follow-up horizon.
x_test1 = pd.read_csv(DATA_DIR + '/sp23_nobmd_Xtest_0820.csv')
y_test1 = pd.read_csv(DATA_DIR + '/sp23_nobmd_Ytest_competing_risk_0816_10y.csv')

In [3]:
# Predictor columns for the "with GRS" feature set. The list is written once
# and applied to both splits so train and test always share the same columns
# in the same order. It differs from the no-GRS set only by "SCORE"
# (presumably the genetic risk score -- confirm against the data dictionary).
grs_feature_columns = [
    "AGE", "HEIGHTX", "WEIGHTX", "DIABNW",
    "parental_hip_frac", "previous_frac", "DRNKSDAY_3_more",
    "CORT", "RHEUMAT", "Second_Osteo",
    "SCORE",
    "RACE_1", "RACE_2", "RACE_3", "RACE_4", "RACE_5",
    "SMOKING_2",
    "NUMFALLS_0", "NUMFALLS_1", "NUMFALLS_2", "NUMFALLS_3",
]

x_train1_grs = x_train1[grs_feature_columns]
x_test1_grs = x_test1[grs_feature_columns]

In [4]:
# Predictor columns for the "without GRS" feature set (no "SCORE" column).
# The list is written once and applied to both splits so train and test
# always share the same columns in the same order.
nogrs_feature_columns = [
    "AGE", "HEIGHTX", "WEIGHTX", "DIABNW",
    "parental_hip_frac", "previous_frac", "DRNKSDAY_3_more",
    "CORT", "RHEUMAT", "Second_Osteo",
    "RACE_1", "RACE_2", "RACE_3", "RACE_4", "RACE_5",
    "SMOKING_2",
    "NUMFALLS_0", "NUMFALLS_1", "NUMFALLS_2", "NUMFALLS_3",
]

x_train1_nogrs = x_train1[nogrs_feature_columns]
x_test1_nogrs = x_test1[nogrs_feature_columns]

In [5]:
# Oversample the training data with SMOTE (with and without GRS), then pack
# the (BKHIP, BKHIPDY) outcome columns into structured arrays for the
# survival models.

# Record layout of the outcome arrays: boolean event flag + float64 time.
_SURVIVAL_DTYPE = [('Status', '?'), ('Survival_in_days', '<f8')]


def _to_survival_array(status_time_rows):
    """Pack rows of (status, time) into a 1-D structured NumPy array.

    Parameters
    ----------
    status_time_rows : iterable of 2-item rows
        Each row is (event status, time), e.g. the 2-D array returned by
        ``df[["BKHIP", "BKHIPDY"]].to_numpy()``.

    Returns
    -------
    numpy.ndarray
        Structured array with fields ``Status`` (bool) and
        ``Survival_in_days`` (float64), one record per input row.
    """
    # NOTE(review): the '?' dtype maps every non-zero status to True. The
    # outcome files are named "competing_risk"; if BKHIP can hold codes other
    # than 0/1 they would all be counted as events -- confirm.
    records = [(status, days) for status, days in status_time_rows]
    return numpy.array(records, dtype=_SURVIVAL_DTYPE)


# A single sampler serves both feature sets: with an integer random_state the
# same seed is used on every fit_resample call, so sharing the instance gives
# the same result as building a new one per dataset.
sm = SMOTE(random_state=2, sampling_strategy="minority")

# Apply SMOTE to dataset with GRS
# The follow-up time BKHIPDY is appended to the predictors so that it is
# resampled together with them; BKHIP is the class label SMOTE balances.
x_train1smote = pd.concat([x_train1_grs, y_train1[["BKHIPDY"]]], axis=1)
y_train1smote = y_train1[["BKHIP"]]
x_train_ss1, y_train_ss1 = sm.fit_resample(x_train1smote, y_train1smote)

# Split the resampled frame back into predictors and (status, time) outcome.
x_train_s1_grs = x_train_ss1.drop("BKHIPDY", axis=1)
y_train_s1_grs = pd.concat([y_train_ss1, x_train_ss1[["BKHIPDY"]]], axis=1)

# Apply SMOTE to dataset without GRS
# Uses its own label frame and its own output name so the GRS labels in
# y_train_ss1 are no longer overwritten by this second resampling.
x_train1smote_nogrs = pd.concat([x_train1_nogrs, y_train1[["BKHIPDY"]]], axis=1)
y_train1smote_nogrs = y_train1[["BKHIP"]]
x_train_ss1_nogrs, y_train_ss1_nogrs = sm.fit_resample(x_train1smote_nogrs, y_train1smote_nogrs)

x_train_s1_nogrs = x_train_ss1_nogrs.drop("BKHIPDY", axis=1)
y_train_s1_nogrs = pd.concat([y_train_ss1_nogrs, x_train_ss1_nogrs[["BKHIPDY"]]], axis=1)

# Structured outcome arrays for the resampled training sets.
y_train1_array_grs_h = y_train_s1_grs[["BKHIP", "BKHIPDY"]].to_numpy()
y_train1_array_rsf_grs_h = _to_survival_array(y_train1_array_grs_h)

y_train1_array_nogrs_h = y_train_s1_nogrs[["BKHIP", "BKHIPDY"]].to_numpy()
y_train1_array_rsf_nogrs_h = _to_survival_array(y_train1_array_nogrs_h)

# Structured outcome array for the test set (not resampled).
y_test1_array_h = y_test1[["BKHIP", "BKHIPDY"]].to_numpy()
y_test1_array_rsf_h = _to_survival_array(y_test1_array_h)

In [6]:
# Bayesian optimization for Model 4 (FRAX CRFs + GRS)

# Search ranges given to BayesSearchCV as (low, high) tuples.
param_space = {
    'n_estimators': (20, 400),     # number of regression trees to create
    'learning_rate': (0.01, 1.0),  # shrinks the contribution of each tree
    'max_depth': (2, 4),           # maximum depth of the individual regression estimators
}

# Initialize the Bayesian optimization search.
# NOTE(review): cv=10 is an integer; scikit-learn turns that into unshuffled
# K-fold splits for non-classifier estimators. The training rows come from
# SMOTE output, which presumably keeps the original rows first and the
# synthetic rows after them, so the folds may not be representative. Consider
# KFold(n_splits=10, shuffle=True, random_state=...) and re-running the search.
# NOTE(review): n_iter=5 evaluates only five settings of a three-parameter
# space -- confirm this budget is intentional.
opt_grs = BayesSearchCV(
    GradientBoostingSurvivalAnalysis(),
    param_space,
    n_iter=5,
    cv=10,
    n_jobs=-1,
    verbose=0,
    n_points=1,
    random_state=0,
)

# Perform Bayesian optimization on the SMOTE-resampled GRS training set.
opt_grs.fit(x_train_s1_grs, y_train1_array_rsf_grs_h)

# Keep the Model 4 result under model-specific names so it survives a later
# cell that reuses the generic `opt` / `best_params_bayes` names.
best_params_bayes_grs = opt_grs.best_params_

# Generic names kept for any code that still refers to them.
opt = opt_grs
best_params_bayes = best_params_bayes_grs
print("Best Hyperparameters:", best_params_bayes)

Best Hyperparameters: OrderedDict([('learning_rate', 0.28974223277895816), ('max_depth', 4), ('n_estimators', 43)])


In [7]:
# Bayesian optimization for Model 2 (FRAX CRFs)

# Search ranges given to BayesSearchCV as (low, high) tuples.
param_space = {
    'n_estimators': (20, 400),     # number of regression trees to create
    'learning_rate': (0.01, 1.0),  # shrinks the contribution of each tree
    'max_depth': (2, 4),           # maximum depth of the individual regression estimators
}

# Initialize the Bayesian optimization search.
# NOTE(review): cv=10 is an integer; scikit-learn turns that into unshuffled
# K-fold splits for non-classifier estimators. The training rows come from
# SMOTE output, which presumably keeps the original rows first and the
# synthetic rows after them, so the folds may not be representative. Consider
# KFold(n_splits=10, shuffle=True, random_state=...) and re-running the search.
# NOTE(review): n_iter=5 evaluates only five settings of a three-parameter
# space -- confirm this budget is intentional.
opt_nogrs = BayesSearchCV(
    GradientBoostingSurvivalAnalysis(),
    param_space,
    n_iter=5,
    cv=10,
    n_jobs=-1,
    verbose=0,
    n_points=1,
    random_state=0,
)

# Perform Bayesian optimization on the SMOTE-resampled no-GRS training set.
opt_nogrs.fit(x_train_s1_nogrs, y_train1_array_rsf_nogrs_h)

# Keep the Model 2 result under model-specific names so it is not lost if the
# generic `opt` / `best_params_bayes` names are reassigned elsewhere.
best_params_bayes_nogrs = opt_nogrs.best_params_

# Generic names kept for any code that still refers to them.
opt = opt_nogrs
best_params_bayes = best_params_bayes_nogrs
print("Best Hyperparameters:", best_params_bayes)

Best Hyperparameters: OrderedDict([('learning_rate', 0.25697256091740617), ('max_depth', 4), ('n_estimators', 51)])
