In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

import torch
import torchtuples as tt

from pycox.datasets import metabric
from pycox.models import CoxPH
from pycox.evaluation import EvalSurv
from sksurv.metrics import concordance_index_censored

In [2]:
# Load the train and test tables from CSV.
# NOTE(review): both locations are absolute, user-specific paths, so this cell
# only runs where these exact files exist — consider a configurable data dir.
train_csv_path = '/users/PAS2433/dai417osc/WHI_sp23/data/sp23_nobmd_XYtrain_0820.csv'
test_csv_path = '/users/PAS2433/dai417osc/WHI_sp23/data/sp23_nobmd_XYtest_0820.csv'

xy_train1, xy_test1 = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

In [3]:
# Column subset that includes SCORE (the "grs" variant). Listed once so the
# train/test and grs/nogrs selections cannot drift apart.
columns_with_score = [
    "AGE", "HEIGHTX", "WEIGHTX", "DIABNW", "parental_hip_frac", "previous_frac",
    "DRNKSDAY_3_more", "CORT", "RHEUMAT", "Second_Osteo", "SCORE",
    "RACE_1", "RACE_2", "RACE_3", "RACE_4", "RACE_5", "SMOKING_2",
    "NUMFALLS_0", "NUMFALLS_1", "NUMFALLS_2", "NUMFALLS_3", "DEATHALL",
    "mof", "mofDAY", "BKHIP", "BKHIPDY",
]
# The "nogrs" variant is the same selection, in the same order, minus SCORE.
columns_without_score = [name for name in columns_with_score if name != "SCORE"]

xy_train1_grs = xy_train1[columns_with_score]
xy_test1_grs = xy_test1[columns_with_score]
xy_train1_nogrs = xy_train1[columns_without_score]
xy_test1_nogrs = xy_test1[columns_without_score]

In [4]:
# Input features for the networks that include SCORE.
# NOTE(review): DEATHALL is part of the feature list — confirm it is meant to
# be a model input rather than an outcome variable.
# NOTE(review): every column is paired with a None transformer below, so no
# scaling is applied to AGE/HEIGHTX/WEIGHTX/SCORE — confirm this is intended.
cols_leave = [
    "AGE", "HEIGHTX", "WEIGHTX", "DIABNW", "parental_hip_frac", "previous_frac",
    "DRNKSDAY_3_more", "CORT", "RHEUMAT", "Second_Osteo", "SCORE",
    "RACE_1", "RACE_2", "RACE_3", "RACE_4", "RACE_5", "SMOKING_2",
    "NUMFALLS_0", "NUMFALLS_1", "NUMFALLS_2", "NUMFALLS_3", "DEATHALL",
]

# Same features, same order, with SCORE removed.
cols_leave_nogrs = [feature for feature in cols_leave if feature != "SCORE"]

# DataFrameMapper takes (column, transformer) pairs; each transformer here is None.
leave = [(feature, None) for feature in cols_leave]
leave_nogrs = [(feature, None) for feature in cols_leave_nogrs]

x_mapper = DataFrameMapper(leave)
x_mapper_nogrs = DataFrameMapper(leave_nogrs)

In [5]:
# Turn the selected frames into float32 feature matrices.
# Each mapper is fit on the training frame only; the test frame is passed
# through `transform` so nothing is ever re-fit on held-out data.
x_train_grs = x_mapper.fit_transform(xy_train1_grs).astype('float32')
x_test_grs = x_mapper.transform(xy_test1_grs).astype('float32')

x_train_nogrs = x_mapper_nogrs.fit_transform(xy_train1_nogrs).astype('float32')
# Was `fit_transform`: that re-fit the mapper on the test set, unlike the
# `grs` pair above.
x_test_nogrs = x_mapper_nogrs.transform(xy_test1_nogrs).astype('float32')

In [6]:
def get_target(df):
    """Return the ``mofDAY`` and ``mof`` columns of ``df`` as a 2-tuple of arrays.

    The first element is ``df['mofDAY'].values`` and the second is
    ``df['mof'].values``; below they are used as durations and event
    indicators respectively.
    """
    return df['mofDAY'].values, df['mof'].values


# The training target stays as one tuple; the test target is unpacked.
y_train_mof = get_target(xy_train1)
durations_test_mof, events_test_mof = get_target(xy_test1)

In [7]:
# Model 4 (Bayesian optimization + FRAX CRFs + GRS)
# Seed NumPy and PyTorch before the network is constructed.
np.random.seed(1234)
_ = torch.manual_seed(123)

# --- hyperparameters -------------------------------------------------------
num_nodes = [130,130]
dropout = 0.08405376347880598
learning_rate=0.6341016716921827
batch_size = 614
epochs = 43
batch_norm = True
output_bias = False
verbose = True

# --- network ---------------------------------------------------------------
in_features = x_train_grs.shape[1]
out_features = 1
net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm,
                              dropout, output_bias=output_bias)

# --- learning-rate finder --------------------------------------------------
# The finder is run on a first CoxPH wrapper; its suggestion is computed but
# not stored — the fixed `learning_rate` above is what is applied below.
model = CoxPH(net, tt.optim.Adam)
lrfinder = model.lr_finder(x_train_grs, y_train_mof, batch_size, tolerance=10)
lrfinder.get_best_lr()

# --- training --------------------------------------------------------------
# A second CoxPH wrapper is built around the same `net` before fitting.
model = CoxPH(net, tt.optim.Adam)
model.optimizer.set_lr(learning_rate)
# NOTE(review): EarlyStopping() is used with its defaults and fit() is given no
# validation data; the recorded log stops after epoch 9 although epochs=43 —
# confirm which metric early stopping is actually monitoring.
callbacks = [tt.callbacks.EarlyStopping()]
log = model.fit(x_train_grs, y_train_mof, batch_size, epochs, callbacks, verbose)

0:	[0s / 0s],		train_loss: 39.1589
1:	[0s / 0s],		train_loss: 46.2428
2:	[0s / 0s],		train_loss: 42.1546
3:	[0s / 0s],		train_loss: 53.0652
4:	[0s / 0s],		train_loss: 35.5528
5:	[0s / 0s],		train_loss: 30.7007
6:	[0s / 0s],		train_loss: 77.4712
7:	[0s / 0s],		train_loss: 57.5710
8:	[0s / 0s],		train_loss: 36.9522
9:	[0s / 0s],		train_loss: 32.2396


In [8]:
# Predict survival curves for the test features with the model fitted above.
_ = model.compute_baseline_hazards()
surv = model.predict_surv_df(x_test_grs)

# Risk score for the C-index: 1 minus the row of `surv` at position 1000.
# NOTE(review): 1000 is a hard-coded row position (presumably a time index) and
# raises IndexError if `surv` has fewer than 1001 rows — confirm the choice.
risk_score = 1-surv.iloc[1000]
cindex = concordance_index_censored(
    events_test_mof.astype('bool'),
    durations_test_mof,
    risk_score,
)[0]

ev = EvalSurv(surv, durations_test_mof, events_test_mof, censor_surv='km')

# 100 evenly spaced points between the smallest and largest test duration.
time_grid = np.linspace(durations_test_mof.min(), durations_test_mof.max(), 100)

In [9]:
# Report the test-set metrics for Model 4.
print("Model 4: ")
print("C-index: ", cindex)
# integrated_brier_score integrates the Brier score over `time_grid`.
print("Integrated Brier score: ", ev.integrated_brier_score(time_grid))
# concordance_td is pycox's time-dependent concordance index, not an AUC,
# so the former "Dynamic mean auc" label was misleading.
print("Time-dependent concordance: ", ev.concordance_td())

Model 4: 
C-index:  0.7453712179463708




Brier score:  0.007611135690063963
Dynamic mean auc:  0.7146352973722859


In [10]:
# Model 3 (Grid search + FRAX CRFs + GRS)
# Seed NumPy and PyTorch before the network is constructed.
np.random.seed(1234)
_ = torch.manual_seed(123)

# --- hyperparameters -------------------------------------------------------
num_nodes = [200, 200]
dropout = 0.1
batch_size = 600
epochs = 60
batch_norm = True
output_bias = False
verbose = True

# --- network ---------------------------------------------------------------
in_features = x_train_grs.shape[1]
out_features = 1
net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm,
                              dropout, output_bias=output_bias)

# --- learning-rate finder --------------------------------------------------
# The finder's suggestion is computed but not stored; the learning rate is set
# explicitly to 1 below.
model = CoxPH(net, tt.optim.Adam)
lrfinder = model.lr_finder(x_train_grs, y_train_mof, batch_size, tolerance=10)
lrfinder.get_best_lr()

# --- training --------------------------------------------------------------
# A second CoxPH wrapper is built around the same `net` before fitting.
model = CoxPH(net, tt.optim.Adam)
model.optimizer.set_lr(1)
# NOTE(review): EarlyStopping() is used with its defaults and fit() is given no
# validation data; the recorded log stops after epoch 9 although epochs=60 —
# confirm which metric early stopping is actually monitoring.
callbacks = [tt.callbacks.EarlyStopping()]
log = model.fit(x_train_grs, y_train_mof, batch_size, epochs, callbacks, verbose)

0:	[0s / 0s],		train_loss: 225.8998
1:	[0s / 0s],		train_loss: 222.8280
2:	[0s / 0s],		train_loss: 221.9486
3:	[0s / 0s],		train_loss: 161.7642
4:	[0s / 0s],		train_loss: 248.8383
5:	[0s / 0s],		train_loss: 318.6328
6:	[0s / 0s],		train_loss: 304.9946
7:	[0s / 0s],		train_loss: 233.7371
8:	[0s / 0s],		train_loss: 304.2512
9:	[0s / 0s],		train_loss: 471.3184


In [11]:
# Predict survival curves for the test features with the model fitted above.
_ = model.compute_baseline_hazards()
surv = model.predict_surv_df(x_test_grs)

# Risk score for the C-index: 1 minus the row of `surv` at position 1000.
# NOTE(review): 1000 is a hard-coded row position (presumably a time index) and
# raises IndexError if `surv` has fewer than 1001 rows — confirm the choice.
risk_score = 1-surv.iloc[1000]
cindex = concordance_index_censored(
    events_test_mof.astype('bool'),
    durations_test_mof,
    risk_score,
)[0]

ev = EvalSurv(surv, durations_test_mof, events_test_mof, censor_surv='km')

# 100 evenly spaced points between the smallest and largest test duration.
time_grid = np.linspace(durations_test_mof.min(), durations_test_mof.max(), 100)

In [12]:
# Report the test-set metrics for Model 3.
print("Model 3: ")
print("C-index: ", cindex)
# integrated_brier_score integrates the Brier score over `time_grid`.
print("Integrated Brier score: ", ev.integrated_brier_score(time_grid))
# concordance_td is pycox's time-dependent concordance index, not an AUC,
# so the former "Dynamic mean auc" label was misleading.
print("Time-dependent concordance: ", ev.concordance_td())

Model 3: 
C-index:  0.6811785239158825
Brier score:  0.0076320850415950905
Dynamic mean auc:  0.6581551079523033


In [13]:
# Model 2 (Bayesian optimization + FRAX CRFs)
# This cell trains on x_train_nogrs, i.e. the feature set without SCORE
# (presumably the GRS), so the header no longer says "+ GRS".
# Seed NumPy and PyTorch before the network is constructed.
np.random.seed(1234)
_ = torch.manual_seed(123)

# --- hyperparameters -------------------------------------------------------
num_nodes = [250, 250]
dropout = 0.010074801781291724
batch_size = 726
epochs = 60
batch_norm = True
output_bias = False
verbose = True

# --- network ---------------------------------------------------------------
in_features = x_train_nogrs.shape[1]
out_features = 1
net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm,
                              dropout, output_bias=output_bias)

# --- learning-rate finder --------------------------------------------------
# The finder result is kept in `lrfinder` but not consulted in this cell; the
# learning rate is set explicitly below.
model = CoxPH(net, tt.optim.Adam)
lrfinder = model.lr_finder(x_train_nogrs, y_train_mof, batch_size, tolerance=10)

# --- training --------------------------------------------------------------
# A second CoxPH wrapper is built around the same `net` before fitting.
model = CoxPH(net, tt.optim.Adam)
model.optimizer.set_lr(0.010630405728982002)
# NOTE(review): EarlyStopping() is used with its defaults and fit() is given no
# validation data; the recorded log stops after epoch 9 although epochs=60 —
# confirm which metric early stopping is actually monitoring.
callbacks = [tt.callbacks.EarlyStopping()]
log = model.fit(x_train_nogrs, y_train_mof, batch_size, epochs, callbacks, verbose)

0:	[0s / 0s],		train_loss: 5.7247
1:	[0s / 0s],		train_loss: 5.5862
2:	[0s / 0s],		train_loss: 5.5992
3:	[0s / 0s],		train_loss: 5.5672
4:	[0s / 0s],		train_loss: 5.5335
5:	[0s / 0s],		train_loss: 5.5234
6:	[0s / 0s],		train_loss: 5.5674
7:	[0s / 0s],		train_loss: 5.5121
8:	[0s / 0s],		train_loss: 5.4662
9:	[0s / 0s],		train_loss: 5.5043


In [14]:
# Predict survival curves for the test features with the model fitted above.
_ = model.compute_baseline_hazards()
surv = model.predict_surv_df(x_test_nogrs)

# Risk score for the C-index: 1 minus the row of `surv` at position 1000.
# NOTE(review): 1000 is a hard-coded row position (presumably a time index) and
# raises IndexError if `surv` has fewer than 1001 rows — confirm the choice.
risk_score = 1-surv.iloc[1000]
cindex = concordance_index_censored(
    events_test_mof.astype('bool'),
    durations_test_mof,
    risk_score,
)[0]

ev = EvalSurv(surv, durations_test_mof, events_test_mof, censor_surv='km')

# 100 evenly spaced points between the smallest and largest test duration.
time_grid = np.linspace(durations_test_mof.min(), durations_test_mof.max(), 100)

In [15]:
# Report the test-set metrics for Model 2.
print("Model 2: ")
print("C-index: ", cindex)
# integrated_brier_score integrates the Brier score over `time_grid`.
print("Integrated Brier score: ", ev.integrated_brier_score(time_grid))
# concordance_td is pycox's time-dependent concordance index, not an AUC,
# so the former "Dynamic mean auc" label was misleading.
print("Time-dependent concordance: ", ev.concordance_td())

Model 2: 
C-index:  0.6210521605051281
Brier score:  0.007650882692544136
Dynamic mean auc:  0.5803305899439796


In [16]:
# Model 1 (grid search + FRAX CRFs)
# Seed NumPy and PyTorch before the network is constructed.
np.random.seed(1234)
_ = torch.manual_seed(123)

# --- hyperparameters -------------------------------------------------------
num_nodes = [300, 300]
dropout = 0.01
batch_size = 600
epochs = 60
batch_norm = True
output_bias = False
verbose = True

# --- network ---------------------------------------------------------------
in_features = x_train_nogrs.shape[1]
out_features = 1
net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm,
                              dropout, output_bias=output_bias)

# --- learning-rate finder --------------------------------------------------
# The finder result is kept in `lrfinder` but not consulted in this cell; the
# learning rate is set explicitly below.
model = CoxPH(net, tt.optim.Adam)
lrfinder = model.lr_finder(x_train_nogrs, y_train_mof, batch_size, tolerance=10)

# --- training --------------------------------------------------------------
# A second CoxPH wrapper is built around the same `net` before fitting.
model = CoxPH(net, tt.optim.Adam)
model.optimizer.set_lr(0.1)
# NOTE(review): EarlyStopping() is used with its defaults and fit() is given no
# validation data; the recorded log stops after epoch 9 although epochs=60 —
# confirm which metric early stopping is actually monitoring.
callbacks = [tt.callbacks.EarlyStopping()]
log = model.fit(x_train_nogrs, y_train_mof, batch_size, epochs, callbacks, verbose)

0:	[0s / 0s],		train_loss: 8.6250
1:	[0s / 0s],		train_loss: 5.4904
2:	[0s / 0s],		train_loss: 5.3759
3:	[0s / 0s],		train_loss: 5.4487
4:	[0s / 0s],		train_loss: 5.3931
5:	[0s / 0s],		train_loss: 5.3589
6:	[0s / 0s],		train_loss: 5.3525
7:	[0s / 0s],		train_loss: 5.3822
8:	[0s / 0s],		train_loss: 5.3781
9:	[0s / 0s],		train_loss: 5.3331


In [17]:
# Predict survival curves for the test features with the model fitted above.
_ = model.compute_baseline_hazards()
surv = model.predict_surv_df(x_test_nogrs)

# Risk score for the C-index: 1 minus the row of `surv` at position 1000.
# NOTE(review): 1000 is a hard-coded row position (presumably a time index) and
# raises IndexError if `surv` has fewer than 1001 rows — confirm the choice.
risk_score = 1-surv.iloc[1000]
cindex = concordance_index_censored(
    events_test_mof.astype('bool'),
    durations_test_mof,
    risk_score,
)[0]

ev = EvalSurv(surv, durations_test_mof, events_test_mof, censor_surv='km')

# 100 evenly spaced points between the smallest and largest test duration.
time_grid = np.linspace(durations_test_mof.min(), durations_test_mof.max(), 100)

In [18]:
# Report the test-set metrics for Model 1.
print("Model 1: ")
print("C-index: ", cindex)
# integrated_brier_score integrates the Brier score over `time_grid`.
print("Integrated Brier score: ", ev.integrated_brier_score(time_grid))
# concordance_td is pycox's time-dependent concordance index, not an AUC,
# so the former "Dynamic mean auc" label was misleading.
print("Time-dependent concordance: ", ev.concordance_td())

Model 1: 
C-index:  0.557105602545169
Brier score:  0.007675809428274024
Dynamic mean auc:  0.5745047863532992
