# Train models

With this notebook, we train simple untunted recurrent models with sufficient performance to demonstrate the effect of data augmentation strategies

In [1]:
import os, time, pickle, boto3, sys
from datetime import datetime

import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.layers as tfkl
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_curve, average_precision_score, roc_curve, auc, confusion_matrix

In [2]:
from utils.datagen import PROCESSED_DATAPATH, MODEL_INPUT_DATAPATH, RESULTFILE_DATAPATH, DataGen, strategies, targets, get_datafile
from utils.utils import read_data, dump_data
from utils.connections import model_input_bucket, processed_data_bucket, get_s3_keys_as_generator, download_file

# check pre-reqs. 
assert(tf.__version__[0]=='2')
assert len(tf.config.experimental.list_physical_devices('GPU')) > 0

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [8]:
def create_model(max_features, output_dim, num_units, name, timesteps=500, dropout=0.3, unit_type='LSTM'):
    model_input = tfk.Input(shape=(timesteps,), name=f'{name}_input')
    x = tfkl.Embedding(max_features+1, output_dim, name=f'{name}_embed')(model_input)
    if unit_type == 'LSTM':
        x = tfkl.Bidirectional(tfkl.LSTM(units=num_units, name=f'{name}_recurrent',
                                         return_sequences=False, 
                                         kernel_initializer='glorot_uniform', 
                                         bias_initializer='zeros'))(x)
    elif unit_type == 'GRU':
        x = tfkl.Bidirectional(tfkl.GRU(units=num_units, name=f'{name}_recurrent',
                                        return_sequences=False, 
                                        kernel_initializer='glorot_uniform', 
                                        bias_initializer='zeros'))(x)
    else:
        raise NotImplementedError
    x = tfkl.LeakyReLU(alpha=0.3)(x)
    x = tfkl.Dropout(dropout)(x)
    x = tfkl.Dense(num_units//2, activation='relu')(x)
    x = tfkl.Dropout(dropout)(x)
    model_output = tfkl.Dense(2, activation='softmax', name=f'{name}_output')(x)
    return tfk.Model(inputs=model_input, outputs=model_output)

# def create_single_recurrent(max_features, output_dim, name, timesteps=500, dropout=0.3):
#     model_input = tfk.Input(shape=(timesteps,), name=f'{name}_input')
#     x = tfkl.Embedding(max_features+1, output_dim, name=f'{name}_embed')(model_input)
#     x = tfkl.Bidirectional(tfkl.LSTM(units=10, name=f'{name}_recurrent',
#                                      return_sequences=False, 
#                                      kernel_initializer='glorot_uniform', 
#                                      bias_initializer='zeros'))(x)
#     x = tfkl.LeakyReLU(alpha=0.3)(x)
#     x = tfkl.Dropout(dropout)(x)
#     x = tfkl.Dense(10, activation='relu')(x)
#     x = tfkl.Dropout(dropout)(x)
#     model_output = tfkl.Dense(2, activation='softmax', name=f'{name}_output')(x)
#     return tfk.Model(inputs=model_input, outputs=model_output)

# def create_combined(vs_embedding_weight, clin_embedding_weight, timesteps=500, dropout=0.3):
#     embeddings = []
#     model_inputs = []
#     for name, embedding_weight in zip(['vs', 'clin'], [vs_embedding_weight, clin_embedding_weight]):
#         max_features = embedding_weight.shape[0]
#         output_dim = embedding_weight.shape[1]
#         model_input = tfk.Input(shape=(timesteps,), name=f'{name}_input_combined')
#         x = tfkl.Embedding(max_features+1, output_dim, name=f'{name}_embed_combined')(model_input)
#         embeddings.append(x)
#         model_inputs.append(model_input)
#                                #weights=[embedding_weight], name=f'{t}_embed')(model_input)
#     x = tfkl.concatenate(embeddings, axis=1)
#     x = tfkl.Bidirectional(tfkl.LSTM(units=20, name=f'recurrent_combined',
#                                      return_sequences=False, 
#                                      kernel_initializer='glorot_uniform', 
#                                      bias_initializer='zeros'))(x)
#     x = tfkl.LeakyReLU(alpha=0.3)(x)
#     x = tfkl.Dropout(dropout)(x)
#     x = tfkl.Dense(20, activation='relu')(x)
#     x = tfkl.Dropout(dropout)(x)
#     x = tfkl.Dense(10, activation='relu')(x)
#     x = tfkl.Dropout(dropout)(x)
#     model_output = tfkl.Dense(2, activation='softmax', name=f'recurrent_output')(x)
#     return tfk.Model(inputs=model_inputs, outputs=model_output)

In [9]:
import pickle

def valid_epoch(data_valid, model, valid_step, data_index, valid_output=''):
    valid_iter = iter(data_valid)
    preds = []
    ys = []
    id_list = []
    valid_loss = []
    while True:
        try:
            x_clin, x_vs, y, ids, tt = next(valid_iter)
            x = [x_clin, x_vs]
            if data_index >= 0:
                x = x[data_index]
            pred, loss = valid_step(x, y, model)
            valid_loss.append(loss.numpy())
            preds.append(pred.numpy())
            ys.append(y.numpy())
            id_list.append(ids.numpy())
        except:# StopIteration:
            break
    probabilities = np.vstack(preds)[:,0]
    Y = np.hstack(ys)
    IDS = np.hstack(id_list)
    predictions = np.where(probabilities < 0.5, 0., 1.)
    
    if valid_output != '':
        with open(valid_output, 'wb+') as outfile:
            pickle.dump(probabilities, outfile)
            pickle.dump(Y, outfile)
            pickle.dump(IDS, outfile)
    tp = len(np.where((predictions==Y)&(predictions==1))[0])
    fp = len(np.where((predictions!=Y)&(predictions==1))[0])
    tn = len(np.where((predictions==Y)&(predictions==0))[0])
    fn = len(np.where((predictions!=Y)&(predictions==0))[0])
    accuracy = (tp + tn)/len(predictions)
    fpr, tpr, thresholds = roc_curve(Y, probabilities)
    val_auc = auc(fpr, tpr)
    sensitivity = tp/(tp + fn)
    specificity = tn/(tn + fp)
    try:
        WDR = (tp + fp)/tp
    except ZeroDivisionError:
        WDR = 0
    return np.mean(valid_loss), tp, fp, tn, fn, accuracy, val_auc, sensitivity, specificity, WDR

def train_epoch(data_train, model, train_step, data_index):
    t0 = time.clock()
    train_iter = iter(data_train)
    epoch_losses = []
    while True:
        try:
            x_clin, x_vs, y, ids, tt = next(train_iter)
            _, loss = train_step(x_clin, y, clin_model)
            epoch_losses.append(loss.numpy())
        except:# StopIteration:
            break
    t1 = time.clock() - t0
    return t1, np.mean(epoch_losses)

In [10]:
def train_model(data_train, data_valid, model, data_index, epochs, label, epoch_start=0):
    result_strings = []
    @tf.function
    def get_loss(Y, predictions):
        return tfk.backend.binary_crossentropy(predictions[:,0], tf.cast(Y, tf.float32))

    @tf.function
    def train_step(x, y, model):
        with tf.GradientTape() as tape:
            preds = model(x, training=True)
            loss = tf.reduce_mean(get_loss(y, preds))
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return preds, loss

    @tf.function
    def valid_step(x, y, model, training=False):
        preds = model(x, training=training)
        loss = tf.reduce_mean(get_loss(y, preds))
        return preds, loss

    runfile_name = datetime.now().strftime("%Y%m%d_%H:%M:%S")
    losses = []
    min_loss = 1e20
    for e in range(epoch_start, epochs):
        elapsed_time, epoch_loss = train_epoch(data_train, model, train_step, data_index)
        valid_output = f'{label}_{e}_preds'
        val_loss, tp, fp, tn, fn, accuracy, val_auc, sensitivity, specificity, WDR = valid_epoch(data_valid, model, valid_step, data_index, valid_output)
        result_string = (f'{label}\t{e}\t{elapsed_time}\t{val_loss}\t{epoch_loss}\t{tp}\t{fp}\t{tn}\t{fn}\t{accuracy}\t{val_auc}\t{sensitivity}\t{specificity}\t{WDR}')
        print(result_string)
        result_strings.append(result_string)
        model.save_weights(os.path.join(RESULTFILE_DATAPATH, f'{label}_e{e}.h5'))
        with open(os.path.join(RESULTFILE_DATAPATH, 'summaries', f'{label}_{runfile_name}.tsv'), 'a+') as outfile:
            outfile.write(result_string + '\n')
        if val_loss < min_loss:
            min_loss = val_loss
            losses = []
        else:
            losses.append(val_loss)
        if len(losses)>3:
            break
    return result_strings

In [6]:
targets

{'hosp_death': 'tt_dth',
 'icu_death': 'tt_dth',
 'icu_readm': 'tt_readm',
 'long_icu': 'duration'}

In [7]:
x = iter(data_valid)
next(x)

NameError: name 'data_valid' is not defined

In [None]:
fold = 0
batch_size=128

max_features = {'clin': 7916, 'vs': 19405}
embed_dim = {'clin': 150, 'vs': 100}
data_index = {'clin': 0, 'vs': 1}

name = 'clin'

for fold in range(1):
    for unit_type in ['GRU']:
        for target in ['hosp_death', 'icu_death', 'long_icu']:
            for strategy in strategies:
                if 'original' in strategy:
                    epochs = 200
                elif 'augment' not in strategy:
                    epochs = 100
                else:
                    epochs = 10
                data_train = get_datafile(target, strategy, fold=fold, 
                                          phase='train', batch_size=batch_size, 
                                          model_type='both')
                data_valid = get_datafile(target, strategy, fold=fold, 
                                          phase='valid', batch_size=batch_size, 
                                          model_type='both')

                for model_width in [5,10,15]:
                    clin_model = create_model(max_features[name], embed_dim[name], model_width, name, unit_type=unit_type)

                    try:
                        weight_files = [f for f in os.listdir(RESULTFILE_DATAPATH) if strategy in f and target in f and unit_type in f and f'width_{model_width}_fold_{fold}' in f and 'h5' in f]
                        completed_epochs = max([int(w.split('.')[0].split('_')[-1].strip('e')) for w in weight_files])
                        target_weight_file = f'{unit_type}_{strategy}_{target}_width_{model_width}_fold_{fold}_e{completed_epochs}.h5'
                        print(target_weight_file, target_weight_file in weight_files)    
                        clin_model.load_weights(os.path.join(RESULTFILE_DATAPATH, target_weight_file))
                        print(completed_epochs)
                    except (NameError, ValueError):
                        weight_files = []
                        completed_epochs = 0

                    optimizer = tfk.optimizers.Adam(learning_rate=1e-4, clipnorm=1)
                    clin_results = train_model(data_train, data_valid, clin_model, data_index[name], epochs, f'{unit_type}_{strategy}_{target}_width_{model_width}_fold_{fold}', completed_epochs + 1)

GRU_original_hosp_death_width_5_fold_0	1	44.940884	2.4249002933502197	4.954570293426514	0	0	9484	1012	0.9035823170731707	0.5577544893584035	0.0	1.0	0
GRU_original_hosp_death_width_5_fold_0	2	41.293561000000004	1.7256404161453247	3.4765617847442627	0	0	9483	1013	0.9034870426829268	0.5625814115954783	0.0	1.0	0
GRU_original_hosp_death_width_5_fold_0	3	41.388868999999985	1.5447229146957397	3.1342945098876953	0	0	9484	1012	0.9035823170731707	0.5651510740785813	0.0	1.0	0
GRU_original_hosp_death_width_5_fold_0	4	41.40797699999999	1.4862028360366821	2.7483153343200684	0	0	9486	1010	0.9037728658536586	0.5658954937239455	0.0	1.0	0
GRU_original_hosp_death_width_5_fold_0	5	41.188259999999985	1.4740536212921143	2.459167242050171	0	0	9486	1010	0.9037728658536586	0.5671671958467194	0.0	1.0	0
GRU_original_hosp_death_width_5_fold_0	6	41.137280000000004	1.4761946201324463	2.300034284591675	0	0	9482	1014	0.903391768292683	0.566909813964963	0.0	1.0	0
GRU_original_hosp_death_width_5_fold_0	7	41.07222799999

In [22]:
fold = 0
batch_size=64

max_features = {'clin': 7916, 'vs': 19405}
embed_dim = {'clin': 100, 'vs': 100}
data_index = {'clin': 0, 'vs': 1}

def get_probs_for_augment(data_valid, model, data_index, valid_output):
    @tf.function
    def valid_step(x, y, model):
        preds = model(x)
        return preds, 0
    
    valid_epoch(data_valid, model, valid_step, data_index, valid_output)



for name in ['clin', 'vs']:
    for target in ['hosp_death', 'icu_death']:
        for strategy in strategies:
            if 'augment' not in strategy:
                data_valid = get_datafile(target, strategy, fold=fold, 
                                          phase='valid', batch_size=batch_size, 
                                          model_type='both')

                clin_model = create_single_recurrent(max_features[name], embed_dim[name], name)

                #try:
                weight_files = [f for f in os.listdir(RESULTFILE_DATAPATH) if strategy in f and target in f and name in f and 'h5' in f]
                completed_epochs = [int(w.split('.')[0].split('_')[-1].strip('e')) for w in weight_files]
                #    target_weight_file = f'{name}_{strategy}_{target}_e{completed_epochs}.h5'
                #    print(target_weight_file, target_weight_file in weight_files)    
                for target_weight_file, e in zip(weight_files, completed_epochs):
                    clin_model.load_weights(os.path.join(RESULTFILE_DATAPATH, target_weight_file))
                    #    print(completed_epochs)
                    #except (NameError, ValueError):
                    #    weight_files = []
                    #    completed_epochs = 0

                    valid_output = f'{name}_{strategy}_{target}_{e}_preds'
                    get_probs_for_augment(data_valid, clin_model, data_index[name], valid_output)




UnknownError:    Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[functional_7/bidirectional_3/forward_clin_recurrent/PartitionedCall]] [Op:__inference_valid_step_19037]

Function call stack:
valid_step -> valid_step -> valid_step
