In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

import torch
import torchtuples as tt

from pycox.datasets import metabric
from pycox.models import CoxPH
from pycox.evaluation import EvalSurv
from levenberg_marquardt import LevenbergMarquardtReg

In [2]:
xy_train1 = pd.read_csv('/users/PAS2433/dai417osc/WHI_sp23/data/sp23_nobmd_XYtrain_0820.csv')
xy_test1 = pd.read_csv('/users/PAS2433/dai417osc/WHI_sp23/data/sp23_nobmd_XYtest_0820.csv')

In [3]:
xy_train1_grs = xy_train1[["AGE","HEIGHTX","WEIGHTX","DIABNW","parental_hip_frac","previous_frac","DRNKSDAY_3_more","CORT","RHEUMAT","Second_Osteo","SCORE",
"RACE_1","RACE_2","RACE_3","RACE_4","RACE_5","SMOKING_2","NUMFALLS_0","NUMFALLS_1","NUMFALLS_2","NUMFALLS_3","DEATHALL","mof","mofDAY","BKHIP","BKHIPDY"]]
xy_test1_grs = xy_test1[["AGE","HEIGHTX","WEIGHTX","DIABNW","parental_hip_frac","previous_frac","DRNKSDAY_3_more","CORT","RHEUMAT","Second_Osteo","SCORE",
"RACE_1","RACE_2","RACE_3","RACE_4","RACE_5","SMOKING_2","NUMFALLS_0","NUMFALLS_1","NUMFALLS_2","NUMFALLS_3","DEATHALL","mof","mofDAY","BKHIP","BKHIPDY"]]
xy_train1_nogrs = xy_train1[["AGE","HEIGHTX","WEIGHTX","DIABNW","parental_hip_frac","previous_frac","DRNKSDAY_3_more","CORT","RHEUMAT","Second_Osteo",
"RACE_1","RACE_2","RACE_3","RACE_4","RACE_5","SMOKING_2","NUMFALLS_0","NUMFALLS_1","NUMFALLS_2","NUMFALLS_3","DEATHALL","mof","mofDAY","BKHIP","BKHIPDY"]]
xy_test1_nogrs = xy_test1[["AGE","HEIGHTX","WEIGHTX","DIABNW","parental_hip_frac","previous_frac","DRNKSDAY_3_more","CORT","RHEUMAT","Second_Osteo",
"RACE_1","RACE_2","RACE_3","RACE_4","RACE_5","SMOKING_2","NUMFALLS_0","NUMFALLS_1","NUMFALLS_2","NUMFALLS_3","DEATHALL","mof","mofDAY","BKHIP","BKHIPDY"]]

In [4]:
cols_leave = ["AGE","HEIGHTX","WEIGHTX","DIABNW","parental_hip_frac","previous_frac","DRNKSDAY_3_more","CORT","RHEUMAT","Second_Osteo","SCORE",
"RACE_1","RACE_2","RACE_3","RACE_4","RACE_5","SMOKING_2","NUMFALLS_0","NUMFALLS_1","NUMFALLS_2","NUMFALLS_3","DEATHALL"]

cols_leave_nogrs = ["AGE","HEIGHTX","WEIGHTX","DIABNW","parental_hip_frac","previous_frac","DRNKSDAY_3_more","CORT","RHEUMAT","Second_Osteo",
"RACE_1","RACE_2","RACE_3","RACE_4","RACE_5","SMOKING_2","NUMFALLS_0","NUMFALLS_1","NUMFALLS_2","NUMFALLS_3","DEATHALL"]

leave = [(col, None) for col in cols_leave]
leave_nogrs = [(col, None) for col in cols_leave_nogrs]


x_mapper = DataFrameMapper(leave)
x_mapper_nogrs = DataFrameMapper(leave_nogrs)

In [5]:
x_train_grs = x_mapper.fit_transform(xy_train1_grs).astype('float32')
x_test_grs = x_mapper.transform(xy_test1_grs).astype('float32')

x_train_nogrs = x_mapper_nogrs.fit_transform(xy_train1_nogrs).astype('float32')
x_test_nogrs = x_mapper_nogrs.fit_transform(xy_test1_nogrs).astype('float32')

In [6]:
get_target = lambda df: (df['mofDAY'].values, df['mof'].values)
y_train_mof = get_target(xy_train1_grs)
durations_test_mof, events_test_mof = get_target(xy_test1_grs)

In [7]:
# Model 4 (Bayesian optimization + FRAX CRFs + GRS)
np.random.seed(1234)
_ = torch.manual_seed(123)

in_features = x_train_grs.shape[1]
num_nodes = [2317,2317]
out_features = 1
batch_norm = True
dropout = 0.08405376347880598
learning_rate=0.6341016716921827
output_bias = False

net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm,
                              dropout, output_bias=output_bias)
model = CoxPH(net, tt.optim.Adam)
batch_size = 614
lrfinder = model.lr_finder(x_train_grs, y_train_mof, batch_size, tolerance=10)
lrfinder.get_best_lr()
model = CoxPH(net, tt.optim.Adam)
batch_size = 614
model.optimizer.set_lr(learning_rate)
epochs = 43
callbacks = [tt.callbacks.EarlyStopping()]
verbose = True
log = model.fit(x_train_grs, y_train_mof, batch_size, epochs, callbacks, verbose)

0:	[0s / 0s],		train_loss: 931.9271
1:	[0s / 0s],		train_loss: 1878.1660
2:	[0s / 0s],		train_loss: 2096.4102
3:	[0s / 0s],		train_loss: 2227.9536
4:	[0s / 0s],		train_loss: 1847.5720
5:	[0s / 0s],		train_loss: 1822.9082
6:	[0s / 0s],		train_loss: 1418.7189
7:	[0s / 1s],		train_loss: 1082.3897
8:	[0s / 1s],		train_loss: 936.7400
9:	[0s / 1s],		train_loss: 816.2570


In [8]:
_ = model.compute_baseline_hazards()
surv_train = model.predict_surv_df(x_train_grs).tail(1)
score_train_h = pd.DataFrame(surv_train, columns = ["TrainScore"])
train_cal1=pd.concat([xy_train1_grs[["mof"]],xy_train1[["WHOFRAC"]] ],axis="columns")
train_cal2=pd.concat([train_cal1,score_train_h],axis="columns")

X, y = train_cal2[["WHOFRAC"]].values, train_cal2[["TrainScore"]].values
theta_actual = np.array([15., 0.1, 0.4]) # Values that were used to generate the data
sigma_actual = 0.6
    
# Define nonlinear model and declare LevenbergMarquardtReg class
def f(X, theta):
    return theta[0] * np.tanh(theta[1] + theta[2] * X[:, 0])
lr = LevenbergMarquardtReg(model_fn = f)    

# Fit model
lr.fit(X, y, theta_init = np.ones(3)) # starting point = [1., 1., 1.]
    
# Display results
expected_values = lr.__get_optimization_status__(theta_actual)
print("\n*** RESULTS")
print("Estimated theta: {}".format(lr.theta))

Initial WSS: 122.2542873644836
Check after 10 iterations: % displacement = 1.6175459922317652, norm_theta = 0.8228195243379646
Check after 20 iterations: % displacement = 0.00044684750884445605, norm_theta = 0.8284337643557455
Check after 30 iterations: % displacement = 0.0, norm_theta = 0.8284337643557455

*** RESULTS
Estimated theta: [ 5.53319827e-01  6.16554668e-01 -1.13959113e-04]


In [9]:
_ = model.compute_baseline_hazards()
surv_test = model.predict_surv_df(x_test_grs)
score_test = surv_test.tail(1)

import math
sigmoid = lambda x: 1 / (0.553319827 + math.exp(0.616554668*x))-0.000113959113
sigmoid_v = np.vectorize(sigmoid) 

score_test_h = pd.DataFrame(sigmoid_v(score_test))
score_test_h.columns = ['SANN']
score_test_h.to_csv('/users/PAS2433/dai417osc/WHI_sp23/data/SANN_prob_mof.csv')