<a href="https://colab.research.google.com/github/2667schummr/recurrent-nn-cystic-fibrosis/blob/master/cf_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from google.colab import drive

In [0]:
# Specify training parameters
device = '/device:GPU:0'
print_every = 10
batch_size = 100

In [3]:
# Make google drive files accessible
drive.mount('/content/gdrive')
path = '/content/gdrive/My Drive/Colab Notebooks'
os.chdir(path)
print('Current Working Directory: {}'.format(os.getcwd()))

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
Current Working Directory: /content/gdrive/My Drive/Colab Notebooks


In [4]:
# Load the panda dataframe containing the features
feats = pd.read_pickle('./processed_data_complete_feats.pkl')
print('Feature Shape: {}'.format(feats.shape))

num_feats = feats.shape[1] - 2
max_seq_length = feats.groupby('eDWID').count()['YRM'].max()

drop_cols = ['eDWID', 'YRM'] # These are identification columns
feats.head()

Feature Shape: (513452, 34)


Unnamed: 0,eDWID,YRM,mssa,mrsa,h_flu,pseudo,burkho_complex,alcalig,steno,enterobacter,serratia_marcescens,aspergillus,candida,scedosporium,mabscessus,mai,bd_age,sex,suff,trunc03,all_tob,all_mod,all_bd,dnase,inhcolistin,inhaztreonam,hypersaline,chronic_macrolide,oral_steroids,inh_steroids,inhsteroids_bronchodil,oral_other_abx,txflag,othertx
0,900000702,2003,0,0,0,1,0,0,0,0,0,1,0,0,0,0,6,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0
1,900000702,2004,0,0,0,1,0,0,0,0,0,0,0,0,0,0,7,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0
2,900000702,2005,0,0,0,1,0,0,0,0,0,1,0,0,0,0,8,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0
3,900000702,2006,0,0,0,1,0,0,0,0,0,1,0,0,0,0,9,0,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0
4,900000702,2007,0,0,0,1,0,0,0,0,0,1,0,0,0,0,10,0,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0


In [5]:
# Load the pandas dataframe containing the labels
labels = pd.read_pickle('./dflags.pkl')
print('Labels Shape: {}'.format(labels.shape))
num_obs = labels.shape[0]
labels.head()

Labels Shape: (43454, 2)


Unnamed: 0,eDWID,dflag
0,900000702,0
99,900000736,0
106,900000742,1
196,900000776,0
200,900010702,1


In [0]:
# Define a class that generates batches of training and testing data
class Dataset(object):
    def __init__(self, labels, batch_size, shuffle=False):
        self.X = labels['eDWID'].as_matrix()
        self.y = labels['dflag'].as_matrix()
        self.batch_size, self.shuffle = batch_size, shuffle

    def __iter__(self):
        N, B = self.X.shape[0], self.batch_size
        idxs = np.arange(N)
        if self.shuffle:
            np.random.shuffle(idxs)
        return iter((self.X[i:i+B], self.y[i:i+B]) for i in range(0, N, B))

In [7]:
# Split the data into training and testing sets and create Dataset objects 
labels_train, labels_test = train_test_split(labels, test_size=0.2)
patient_batches_train = Dataset(labels_train, batch_size)
patient_batches_test = Dataset(labels_test, batch_size)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [0]:
# Method for computing testing accuracies 
def check_accuracy(sess, dset, x, scores, seq_mask, is_training=None):
    """
    Check accuracy on a classification model.
    
    Inputs:
    - sess: A TensorFlow Session that will be used to run the graph
    - dset: A Dataset object on which to check accuracy
    - x: A TensorFlow placeholder Tensor where input images should be fed
    - scores: A TensorFlow Tensor representing the scores output from the
      model; this is the Tensor we will ask TensorFlow to evaluate.
      
    Returns: Nothing, but prints the accuracy of the model
    """
    num_correct, num_samples = 0, 0
    for patients, patient_labels in dset:
      x_batch, y_batch, mask_batch = generate_data(patients, patient_labels)
      feed_dict = {x: x_batch, is_training: 0, seq_mask: mask_batch}
      scores_np = sess.run(scores, feed_dict=feed_dict)
      y_pred = scores_np.argmax(axis=1)
      num_samples += x_batch.shape[0]
      num_correct += (y_pred == y_batch).sum()
    acc = float(num_correct) / num_samples
    print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))

In [0]:
# Method for transforming the pandas dataframes into 3-D tensors for the RNN
def generate_data(patients, patient_labels):
  X = np.zeros((batch_size, max_seq_length, num_feats))
  Y = np.zeros((batch_size))
  mask = np.zeros((batch_size, max_seq_length))

  for i, patient in enumerate(patients):
    patient_feats = feats[feats.eDWID == patient].drop(drop_cols, axis=1)
    patient_feats = patient_feats.as_matrix()
    seq_length = patient_feats.shape[0]
    X[i, :seq_length, :] = patient_feats
    mask[i, :seq_length] = True
    mask[i, seq_length:] = False 
    Y[i] = labels[labels.eDWID == patient]['dflag'].as_matrix()[0]
  
  return X, Y, mask

In [0]:
# The training procedure
def train(model_init_fn, optimizer_init_fn, num_epochs=1):
    tf.reset_default_graph()    

    x = tf.placeholder(tf.float32, [None,  max_seq_length, num_feats])
    y = tf.placeholder(tf.int32, [None])
    seq_mask = tf.placeholder(tf.bool, [None, max_seq_length])
    is_training = tf.placeholder(tf.bool, name='is_training')
    
    scores = model_init_fn(x, is_training, seq_mask)

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=scores)
    loss = tf.reduce_mean(loss)

    optimizer = optimizer_init_fn()
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss)
      
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, 
                                            log_device_placement=False))
    with sess:
      sess.run(tf.global_variables_initializer())
      t = 0
      for epoch in range(num_epochs):
          print('Starting epoch %d' % epoch)
          for (patients, patient_labels) in patient_batches_train:
            x_np, y_np, mask_np = generate_data(patients, patient_labels)
            feed_dict = {x: x_np, y: y_np, is_training: 1, seq_mask: mask_np}
            score_np, loss_np, _ = sess.run([scores, loss, train_op], feed_dict=feed_dict)
            
            if t % print_every == 0:
              print('Iteration %d, loss = %.4f' % (t, loss_np))
              check_accuracy(sess, patient_batches_test, x, scores, seq_mask,
                             is_training=is_training)
              print()
            t += 1
            

In [34]:
# Define the model and optimize and train the model
def model_init_fn(inputs, is_training, seq_filter):
    model = tf.keras.Sequential()
    model.add(
        tf.keras.layers.SimpleRNN(2, 
                                  batch_input_shape=(None, max_seq_length, num_feats),
                                  return_sequences=False))
    return model(inputs, mask=seq_filter, training=is_training) 

learning_rate = 1e-3
def optimizer_init_fn():
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    return optimizer

train(model_init_fn, optimizer_init_fn)

Starting epoch 0


  
  del sys.path[0]


Iteration 0, loss = 0.7122
Got 2811 / 8700 correct (32.31%)

Iteration 10, loss = 0.6964
Got 3086 / 8700 correct (35.47%)

Iteration 20, loss = 0.6860
Got 3495 / 8700 correct (40.17%)

Iteration 30, loss = 0.6723
Got 4004 / 8700 correct (46.02%)

Iteration 40, loss = 0.6667
Got 4668 / 8700 correct (53.66%)

Iteration 50, loss = 0.6723
Got 5326 / 8700 correct (61.22%)

Iteration 60, loss = 0.6375
Got 5905 / 8700 correct (67.87%)

Iteration 70, loss = 0.6266
Got 6118 / 8700 correct (70.32%)

Iteration 80, loss = 0.5928
Got 6106 / 8700 correct (70.18%)

Iteration 90, loss = 0.6167
Got 6212 / 8700 correct (71.40%)

Iteration 100, loss = 0.5818
Got 6272 / 8700 correct (72.09%)

Iteration 110, loss = 0.5467
Got 6338 / 8700 correct (72.85%)

Iteration 120, loss = 0.6053
Got 6335 / 8700 correct (72.82%)

Iteration 130, loss = 0.5254
Got 6333 / 8700 correct (72.79%)

Iteration 140, loss = 0.4354
Got 6333 / 8700 correct (72.79%)

Iteration 150, loss = 0.4626
Got 6333 / 8700 correct (72.79%)

Ite