In [1]:
from pathlib import Path

import numpy as np
import numpy
import pandas as pd
from sksurv.metrics import concordance_index_censored, brier_score, cumulative_dynamic_auc
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.ensemble import ComponentwiseGradientBoostingSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

from scipy import stats
from sklearn import metrics
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve,f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import ShuffleSplit, GridSearchCV
from skopt import BayesSearchCV

In [2]:
# Load the pre-split predictor (X) and outcome (Y) tables.
# The shared directory is declared once so that relocating the data means
# editing a single line instead of four path literals.
DATA_DIR = Path('/users/PAS2433/dai417osc/WHI_sp23/data')

x_train1 = pd.read_csv(DATA_DIR / 'sp23_nobmd_Xtrain_0820.csv')
y_train1 = pd.read_csv(DATA_DIR / 'sp23_nobmd_Ytrain_competing_risk_0727.csv')
x_test1 = pd.read_csv(DATA_DIR / 'sp23_nobmd_Xtest_0820.csv')
y_test1 = pd.read_csv(DATA_DIR / 'sp23_nobmd_Ytest_competing_risk_0816_10y.csv')

# NOTE(review): the X files carry the suffix 0820 while the Y files carry
# 0727 / 0816_10y -- confirm these exports describe the same participants in
# the same row order. The checks below only catch a row-count mismatch.
assert len(x_train1) == len(y_train1), "train X/Y row counts differ"
assert len(x_test1) == len(y_test1), "test X/Y row counts differ"

In [3]:
# Predictor columns for the feature set that includes "SCORE".
# (Presumably SCORE is the genetic risk score, since the grid-search cell that
# consumes x_train1_grs is labelled "FRAX CRFs + GRS" -- confirm.)
# One list drives both selections, so the train and test matrices always have
# the same columns in the same order.
grs_columns = [
    "AGE", "HEIGHTX", "WEIGHTX", "DIABNW",
    "parental_hip_frac", "previous_frac", "DRNKSDAY_3_more",
    "CORT", "RHEUMAT", "Second_Osteo",
    "SCORE",
    "RACE_1", "RACE_2", "RACE_3", "RACE_4", "RACE_5",
    "SMOKING_2",
    "NUMFALLS_0", "NUMFALLS_1", "NUMFALLS_2", "NUMFALLS_3",
]

x_train1_grs = x_train1[grs_columns]
x_test1_grs = x_test1[grs_columns]

In [4]:
# Predictor columns for the feature set WITHOUT "SCORE"; every other column
# matches the "+SCORE" selection used for x_train1_grs / x_test1_grs.
# One list drives both selections, so the train and test matrices always have
# the same columns in the same order.
nogrs_columns = [
    "AGE", "HEIGHTX", "WEIGHTX", "DIABNW",
    "parental_hip_frac", "previous_frac", "DRNKSDAY_3_more",
    "CORT", "RHEUMAT", "Second_Osteo",
    "RACE_1", "RACE_2", "RACE_3", "RACE_4", "RACE_5",
    "SMOKING_2",
    "NUMFALLS_0", "NUMFALLS_1", "NUMFALLS_2", "NUMFALLS_3",
]

x_train1_nogrs = x_train1[nogrs_columns]
x_test1_nogrs = x_test1[nogrs_columns]

In [5]:
# Structured dtype shared by every outcome array built in this cell:
# a boolean event flag followed by a float64 follow-up time.
SURVIVAL_DTYPE = [('Status', '?'), ('Survival_in_days', '<f8')]


def to_survival_array(status_time_pairs):
    """Pack an (n, 2) array of [event, time] rows into a structured array.

    Parameters
    ----------
    status_time_pairs : numpy.ndarray of shape (n, 2)
        Column 0 holds the event indicator, column 1 the follow-up time.

    Returns
    -------
    numpy.ndarray of shape (n,) with dtype ``SURVIVAL_DTYPE``
        ``Status`` is column 0 cast to bool (every non-zero value becomes
        True) and ``Survival_in_days`` is column 1 cast to float64.
    """
    # Vectorised field assignment replaces the per-row list of tuples; it
    # yields the same values and also handles an empty input.
    packed = np.empty(len(status_time_pairs), dtype=SURVIVAL_DTYPE)
    packed['Status'] = status_time_pairs[:, 0].astype(bool)
    packed['Survival_in_days'] = status_time_pairs[:, 1].astype('<f8')
    return packed


# Outcome "h": event column `mof`, time column `mofDAY`.
y_train1_array_h = y_train1[["mof", "mofDAY"]].to_numpy()
y_train1_array_rsf_h = to_survival_array(y_train1_array_h)

y_test1_array_h = y_test1[["mof", "mofDAY"]].to_numpy()
y_test1_array_rsf_h = to_survival_array(y_test1_array_h)

# Outcome "d": event column `Death_10y`, time column `DeathDAY`.
y_train1_array_d = y_train1[["Death_10y", "DeathDAY"]].to_numpy()
y_train1_array_rsf_d = to_survival_array(y_train1_array_d)

y_test1_array_d = y_test1[["Death_10y", "DeathDAY"]].to_numpy()
y_test1_array_rsf_d = to_survival_array(y_test1_array_d)

# Outcome "cr": event column `mof_cr`, time column `mofDAY`.
# NOTE(review): if `mof_cr` uses codes beyond 0/1 (e.g. 2 for a competing
# event), the bool cast marks every non-zero code as an event -- confirm
# that this is the intended encoding.
y_train1_array_cr = y_train1[["mof_cr", "mofDAY"]].to_numpy()
y_train1_array_rsf_cr = to_survival_array(y_train1_array_cr)

y_test1_array_cr = y_test1[["mof_cr", "mofDAY"]].to_numpy()
y_test1_array_rsf_cr = to_survival_array(y_test1_array_cr)

In [6]:
# Grid search for Model 3 (FRAX CRFs + GRS).
# Features: x_train1_grs; outcome: y_train1_array_rsf_h.
param_grid = {
    'n_estimators': [20, 250, 400],     # number of boosting stages
    'learning_rate': [0.01, 0.1, 1.0],  # shrinkage applied to each base learner's contribution
    'subsample': [0.2, 0.5, 0.8],       # fraction of samples used to fit each base learner
}

# Every candidate uses subsample < 1, so each boosting stage draws a random
# subset of rows. A fixed random_state makes the search repeatable across
# kernel restarts; without it the selected hyperparameters can change per run.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# `score` method.
grid_search = GridSearchCV(
    ComponentwiseGradientBoostingSurvivalAnalysis(random_state=0),
    param_grid,
    cv=10,       # 10-fold cross-validation
    n_jobs=-1,   # use all available cores
    verbose=0,
)

# Perform grid search
grid_search.fit(x_train1_grs, y_train1_array_rsf_h)

# Get the best hyperparameters.
# `grid_search` and `best_params_grid` are rebound by the Model 1 search
# further down, so a model-specific copy is kept for later use.
best_params_grid = grid_search.best_params_
best_params_grs = dict(best_params_grid)
print("Best Hyperparameters:", best_params_grid)

Best Hyperparameters: {'learning_rate': 1.0, 'n_estimators': 250, 'subsample': 0.2}


In [7]:
# Grid search for Model 1 (FRAX CRFs).
# Features: x_train1_nogrs; outcome: y_train1_array_rsf_h.
param_grid = {
    'n_estimators': [20, 250, 400],     # number of boosting stages
    'learning_rate': [0.01, 0.1, 1.0],  # shrinkage applied to each base learner's contribution
    'subsample': [0.2, 0.5, 0.8],       # fraction of samples used to fit each base learner
}

# Every candidate uses subsample < 1, so each boosting stage draws a random
# subset of rows. A fixed random_state makes the search repeatable across
# kernel restarts; without it the selected hyperparameters can change per run.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# `score` method.
grid_search = GridSearchCV(
    ComponentwiseGradientBoostingSurvivalAnalysis(random_state=0),
    param_grid,
    cv=10,       # 10-fold cross-validation
    n_jobs=-1,   # use all available cores
    verbose=0,
)

# Perform grid search
grid_search.fit(x_train1_nogrs, y_train1_array_rsf_h)

# Get the best hyperparameters.
# `grid_search` and `best_params_grid` are generic names that this cell
# rebinds, so a model-specific copy is kept for later use.
best_params_grid = grid_search.best_params_
best_params_nogrs = dict(best_params_grid)
print("Best Hyperparameters:", best_params_grid)

Best Hyperparameters: {'learning_rate': 1.0, 'n_estimators': 250, 'subsample': 0.5}
