In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import scipy.io

# Importing Preprocessed Data

In [2]:
# Both preprocessed CSV files are read from the current working directory
path = os.getcwd()
trainSet = pd.read_csv(path + '/TrainSetPCA30.csv')
filename = path + '/TestSetPCA30.csv'
testSet = pd.read_csv(filename)

# Defining Labels

In [3]:
# Names of the outcome (label) columns present in the train/test CSV files.
# The order matters: later cells pick a label by its position in this list.
Labels = [
    'death30', 'death60', 'death90',
    'ICU', 'vent', 'anyCatastrophic',
    'Admit30Days', 'Admit60Days', 'Admit90Days',
    'Admit7Days', 'Admit14Days',
]

## Separating features (X) from labels (y)

In [4]:
# Label columns are cast to int; every other column is treated as a feature
y_train = trainSet.loc[:, Labels].astype(int)
y_test = testSet.loc[:, Labels].astype(int)
X_train = trainSet.drop(columns=Labels)
X_test = testSet.drop(columns=Labels)

In [5]:
# Number of feature columns; used as the input width of the network
n_features = X_train.shape[1]

## Focusing on one label to be predicted

In [6]:
# The label at position 5 of Labels is the single outcome modelled below
labelName = Labels[5]
labelTrain = y_train[labelName]
labelTest = y_test[labelName]
#display(labelTrain.value_counts())
print(f'Predicting label: {labelName}')

Predicting label: anyCatastrophic


## Defining Neural Network Architecture

In [7]:
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow import keras

# Optimizer hyper-parameters: several learning rates were tried and these gave the best results.
# NOTE(review): create_model does not reference this object.
adam_opt = tf.keras.optimizers.Adam(
    learning_rate=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
)

# Combined L1 + L2 (elastic-net) weight penalty
elasticnet_regularizer = regularizers.l1_l2(l1=0.001, l2=0.001)

# define the model
#Our neural network has two hidden layers with variable number of neurons
#The activation function was tested with sigmoid, tanh and relu. Relu gave the best results
#We set an elastic-net (L1 + L2) regularizer to avoid overfitting as well as two dropout layers
#Dropout rate was changed manually
def create_model(n_neuronsL1,n_neuronsL2):
    """Build and compile the binary classifier used throughout the notebook.

    Architecture: Dense(n_neuronsL1, relu) -> Dropout(0.2) ->
    Dense(n_neuronsL2, relu) -> Dropout(0.2) -> Dense(1, sigmoid).
    Both hidden layers use the module-level ``elasticnet_regularizer`` on their
    kernels and expect ``n_features`` input columns.

    Parameters
    ----------
    n_neuronsL1 : int
        Number of units in the first hidden layer.
    n_neuronsL2 : int
        Number of units in the second hidden layer.

    Returns
    -------
    A compiled Keras ``Sequential`` model (binary cross-entropy loss,
    binary accuracy metric).
    """
    # Previously the model was compiled with the string 'adam', which uses the
    # Keras default settings and ignores the tuned hyper-parameters.
    # The optimizer is created here so that every model built during the grid
    # search gets its own instance instead of sharing one object.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

    model = Sequential()
    model.add(Dense(n_neuronsL1, activation='relu', kernel_initializer='lecun_uniform',kernel_regularizer=elasticnet_regularizer,
                    input_shape=(n_features,)))
    model.add(Dropout(0.2))
    model.add(Dense(n_neuronsL2, activation='relu', kernel_initializer='lecun_uniform',kernel_regularizer=elasticnet_regularizer))
    model.add(Dropout(0.2))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['binary_accuracy'])
    return model

# Epoch count and batch size were chosen by manual search
modelNN = KerasClassifier(
    build_fn=create_model,
    epochs=1000,
    batch_size=500,
    initial_epoch=0,
    verbose=1,
)

# Defining parameter grid for optimization

In [8]:
# Candidate layer sizes explored by the grid search
neuronsL1 = range(10, 20, 2)   # first hidden layer: 10, 12, 14, 16, 18
neuronsL2 = range(2, 10, 2)    # second hidden layer: 2, 4, 6, 8

# Keys carry the 'kerasclassifier__' prefix because the network is a step of a pipeline
param_grid = {
    'kerasclassifier__n_neuronsL1': neuronsL1,
    'kerasclassifier__n_neuronsL2': neuronsL2,
}


In [9]:
#This callback is not being used in the final version
#from tensorflow import keras
#es_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

## Defining training pipeline

In [10]:
#We define a pipeline to upsample the minority class on each training split of the crossvalidation
#We upsample with a random sampler which copies random samples from the minority class
#Then we pass the neural network model

from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import RandomOverSampler

# Two-step pipeline: random oversampling first, then the Keras classifier
minority_oversampler = RandomOverSampler(random_state=0)
pipeline_grid = make_pipeline(minority_oversampler, modelNN)

# Training model

In [None]:
#We perform a gridsearch over the hyperparameter grid defined above (number of neurons of layer 1 and 2)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, matthews_corrcoef, f1_score

# Two metrics are tracked for every candidate: F1 and Matthews correlation coefficient
scorer_mcc = make_scorer(matthews_corrcoef,greater_is_better=True)
scoring = {'F1': 'f1', 'MCC': scorer_mcc}

#Create cross-validation object for stratified splits
#A fixed random_state makes the shuffled folds identical between runs
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

#Perform gridsearch and select the best model based on the F1 metric
#refit must name the metric used for selection. It was 'MCC', which made best_params_/best_score_
#disagree with the F1-based tables and plots in the following cells.
grid = GridSearchCV(pipeline_grid, param_grid=param_grid, n_jobs=-1,cv=kfold,scoring=scoring,refit = 'F1',verbose=3)
grid_result = grid.fit(X_train, labelTrain)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
# Keep the selected hyper-parameters for the following cells and report them with their CV score
Best_params = grid_result.best_params_
print("Best: %f using %s" % (grid_result.best_score_, Best_params))

In [None]:
# Keep only the hyper-parameter columns and the mean/rank of each metric,
# and expose the mean F1 under the generic name 'mean_test_score' used by later cells
summary_columns = [
    'param_kerasclassifier__n_neuronsL1',
    'param_kerasclassifier__n_neuronsL2',
    'mean_test_F1',
    'rank_test_F1',
    'mean_test_MCC',
    'rank_test_MCC',
]
results = (
    pd.DataFrame(grid_result.cv_results_)
    .loc[:, summary_columns]
    .assign(mean_test_score=lambda frame: frame['mean_test_F1'])
)
results

# Sensitivity plot

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.patches import Rectangle
import seaborn as sns

fig, ax = plt.subplots(figsize=(9,7))

# Rows: neurons in layer 2, columns: neurons in layer 1, values: mean cross-validated score
heatmap_data = pd.pivot_table(results, values='mean_test_score',
                     index=['param_kerasclassifier__n_neuronsL2'],
                     columns='param_kerasclassifier__n_neuronsL1')

# (row, column) position of the highest score in the pivoted table
ind_max = np.unravel_index(np.argmax(heatmap_data.to_numpy()), heatmap_data.shape)

sns.set(font_scale=1.2)
colormap = sns.diverging_palette(10, 240, n=20)
sns.heatmap(heatmap_data, annot=True,cmap =colormap,linewidths=.5,fmt='.3f',annot_kws={"size":15},robust=True,ax=ax)

# Red frame around the cell with the highest score
ax.add_patch(Rectangle((ind_max[1],ind_max[0]), 1, 1, fill=False, edgecolor='red', lw=3))

# Red ring on the model selected by the grid search.
# Heatmap cells live at integer positions (cell centre = index + 0.5), not at the
# neuron counts themselves, so the parameter values are translated to cell positions.
# Plotting at the raw neuron counts placed the marker outside the visible axes.
best_col = list(heatmap_data.columns).index(Best_params['kerasclassifier__n_neuronsL1'])
best_row = list(heatmap_data.index).index(Best_params['kerasclassifier__n_neuronsL2'])
ax.scatter(best_col + 0.5, best_row + 0.5, s=200, marker= 'o',color='none',edgecolor='r')

ax.set_xlabel("Number Neurons Layer 1",fontsize=15)
ax.set_ylabel("Number Neurons Layer 2",fontsize=15)
ax.set_title("F1 score",fontsize=15)
plt.show()

# Testing Models

In [None]:
# To test the models, we retrain each of them on the complete training set
# The training set is upsampled in a similar way as during the grid search
from sklearn.utils import resample
#The minority class is always the positive class
# Split the training rows by class and attach the label column to each part
majority_class = X_train[labelTrain==0].copy()
majority_class_labeled = majority_class.join(labelTrain[labelTrain==0])
minority_class = X_train[labelTrain==1].copy()
minority_class_labeled = minority_class.join(labelTrain[labelTrain==1])

# Upsample minority class
# The target size is taken from the data instead of a hard-coded row count, so the
# classes stay balanced if the training file or the selected label changes
minority_upsampled = resample(minority_class_labeled,
                                 replace=True,     # sample with replacement
                                 n_samples=len(majority_class_labeled),    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
trainSet_upsampled = pd.concat([majority_class_labeled, minority_upsampled])

# Convert to NumPy arrays for Keras (the label conversion result was previously discarded)
labels_upsampled = trainSet_upsampled.loc[:,labelName].to_numpy()
features_upsampled = trainSet_upsampled.drop(labelName,axis=1).to_numpy()

In [None]:
from sklearn.metrics import plot_confusion_matrix,confusion_matrix, f1_score, classification_report, balanced_accuracy_score

resultsTest = []
global_results = pd.DataFrame()
#Iterate through the combinations of parameters to test the model on the testing set
#Requires to retrain the model for each parameter combination prior to testing

for neurons1 in neuronsL1:
    for neurons2 in neuronsL2:
        # Build the network with the same factory used during the grid search, so the
        # cross-validation score and the testing score describe the same architecture
        # (this loop previously used a different kernel regularizer)
        model = create_model(neurons1, neurons2)
        model.fit(features_upsampled, labels_upsampled, epochs=1000, batch_size= 500, initial_epoch=0, verbose=0)
        # predict returns one probability per row in a column; flatten it to 1-D class labels
        predictions = (model.predict(X_test) > 0.5).astype("int32").ravel()
        testingScore = f1_score(labelTest,predictions,average='binary')
        results_aux = dict(NeuronsLayer1=neurons1,NeuronsLayer2 = neurons2, TestingScore = testingScore)
        resultsTest.append(results_aux)
        print(results_aux)

In [None]:
resultsTest= pd.DataFrame(resultsTest)

#global_results stores the scores during training and testing and calculates the mean of both scores
#The mean score is used to select the "best overall model"
#NOTE(review): this selection uses the testing score, so metrics later reported on the
#testing set for the selected model are optimistically biased

#Cross-validation scores are matched to the testing scores by hyper-parameter values
#instead of by row position, so the result does not depend on both tables being in the same order
cv_scores = (
    results
    .rename(columns={'param_kerasclassifier__n_neuronsL1': 'NeuronsLayer1',
                     'param_kerasclassifier__n_neuronsL2': 'NeuronsLayer2',
                     'mean_test_score': 'TrainingScore'})
    .loc[:, ['NeuronsLayer1', 'NeuronsLayer2', 'TrainingScore']]
    .astype({'NeuronsLayer1': int, 'NeuronsLayer2': int})
)
global_results = resultsTest.merge(cv_scores, on=['NeuronsLayer1', 'NeuronsLayer2'],
                                   how='left', validate='one_to_one')
col = global_results.loc[: , ["TrainingScore","TestingScore"]]
global_results['MeanScore'] = col.mean(axis=1)
#Only the last expression of a cell is rendered automatically, so show this table explicitly
display(global_results)

#We find the best overall model (given by the maximum mean score bw training and testing)
best_overall_model = global_results[global_results.MeanScore==global_results.MeanScore.max()]
best_overall_model

# Predictability plot

In [None]:
#Function to find the min and max score across both training and testing
#Its result is meant to be used as axis limits of the predictability plot
def findlimits(training_score,testing_score):
    """Return ``[lowest - 0.005, highest + 0.005]`` over both score collections.

    Parameters
    ----------
    training_score, testing_score
        Objects exposing ``.min()`` and ``.max()`` (e.g. pandas Series or NumPy arrays).

    Returns
    -------
    list
        Two-element list ``[min_global, max_global]``.
    """
    margin = 0.005  # offset so that markers do not appear on the edge of the plot

    lowest = np.asarray([training_score.min(), testing_score.min()]).min()
    highest = np.asarray([training_score.max(), testing_score.max()]).max()

    return [lowest - margin, highest + margin]


In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

#Rows of the grid-search table that correspond to the model selected during training
is_cv_best = ((results['param_kerasclassifier__n_neuronsL1']==Best_params['kerasclassifier__n_neuronsL1'])&
              (results['param_kerasclassifier__n_neuronsL2']==Best_params['kerasclassifier__n_neuronsL2']))

#We find the training score for the best model during training
#It is looked up by hyper-parameters so it belongs to the same model as testing_result below
#(taking the maximum of the column could pair scores of two different models)
best_training_result = results.loc[is_cv_best, 'mean_test_score']

#We find the testing score for the best model during training
testing_result = resultsTest.TestingScore[(resultsTest.NeuronsLayer1==Best_params['kerasclassifier__n_neuronsL1'])&
                                   (resultsTest.NeuronsLayer2==Best_params['kerasclassifier__n_neuronsL2'])]

ax = plt.figure(figsize=(7,7)).gca()

#We mark the result for the best model during training
ax.scatter(best_training_result,testing_result, s=200, marker= 'o',color='none',edgecolor='r')

#We mark the result for the best overall model
ax.scatter(best_overall_model.TrainingScore,best_overall_model.TestingScore, s=200, marker= 'o',color='none',edgecolor='green')

#We plot all the scores for all models
sc = ax.scatter(results.mean_test_score, resultsTest.TestingScore, s=50, marker= 'x',c='b')

#Axis limits are derived from the scores with findlimits; fixed limits could hide points
#Both axes share the same range so the two scores can be compared directly
axis_limits = findlimits(results.mean_test_score, resultsTest.TestingScore)
ax.set_xlim(axis_limits)
ax.set_ylim(axis_limits)
ax.set_xlabel("Training Score",fontsize=15)
ax.set_ylabel("Testing Score",fontsize=15)
ax.set_title("F1 score",fontsize=15)
plt.show()

# Testing best overall model

In [None]:
#Train and test again the best overall model to get the performance metrics
#best_overall_model is a DataFrame (more than one row if several models tie), so take the
#first row and convert the layer sizes to plain ints before using them as layer widths
best_row = best_overall_model.iloc[0]

#Use the same model factory as the grid search so this model matches the evaluated architecture
model = create_model(int(best_row.NeuronsLayer1), int(best_row.NeuronsLayer2))

model.fit(features_upsampled, labels_upsampled,
              epochs=1000,
              batch_size=500,
              verbose=1)

# Performance metrics for best model

In [None]:
from sklearn.metrics import plot_confusion_matrix,confusion_matrix, f1_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

#Predict once: probabilities feed the ROC curve, thresholded labels feed the other metrics
probabilities = model.predict(X_test).ravel()
predictions = (probabilities > 0.5).astype("int32")

#Calculate f1 score and display
print(f1_score(labelTest,predictions,average='binary'))

#Calculate confusion matrix
confusionMatrix = confusion_matrix(labelTest,predictions)

figure = plt.figure(figsize=(7, 6))

sns.heatmap(confusionMatrix, annot=True,cmap='Blues',fmt='d')
plt.ylabel('True label',fontsize=20)
plt.xlabel('Predicted label',fontsize=20)
plt.title('Truth Table Any Catastrophy Classification',fontsize=20)
#tight_layout runs after the labels and title exist so they are included in the layout
plt.tight_layout()
plt.show()

#Calculate performance metrics
report = classification_report(labelTest,predictions,output_dict=True)
report = pd.DataFrame(report).transpose()

#Calculate ROC based on prediction probabilities
fpr, tpr, thresholds = metrics.roc_curve(labelTest, probabilities)
roc_auc = metrics.auc(fpr, tpr)
display_curve = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
display_curve.plot()
plt.show()

report