<a href="https://colab.research.google.com/github/ChrisKantor/Deep-Learning/blob/main/Project%202/Deep_Learning_Project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Deep learning model that builds on the one made in HW3 for determining an infants pain level using vital signs
#This improved model uses LSTM to better understand the sequential readings



#Run these commands - NECCESARY to use the GPU
# export CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))
# export LD_LIBRARY_PATH=${CUDNN_PATH}/lib

#To connect to local runtime: jupyter lab --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0

In [1]:
#import libraries
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
import os
from matplotlib import pyplot as plt
import PIL
from tensorflow import keras

In [37]:
#Some hyperparameter setup
batch_size = 64           #set batch size to 64
epochs = 20               #set num epochs to 20

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# dir = 'Deep Learning/Project 2/'
# data_dir = "data/"


## Mount Google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
data_dir = '/content/drive/MyDrive/Colab Notebooks/Deep Learning/Project 2/data/'


import os
print(os.getcwd())

Num GPUs Available:  0
Mounted at /content/drive
/content


# Data Preprocessing

In [None]:
#NEEDS FURTHER CLARIFICATION:
#If a csv file sequence has an invalid row, do we just remove it or is the whole sequence invalid?
#Can we mix the csv sequences together to get a fixed length input sequence? or should each seperate csv file be treated as its own sequence?
#Are we predicting if the baby is in pain during the next timestep?



#lots to do here - roughly follow this tutorial for general RNN processing: https://www.tensorflow.org/text/tutorials/text_generation

#need sequences of data to use as input. Each csv file is 1 sequence, and each of them could potentially have missing/invalid data in them. 602 total csv files
#Each csv file has 30 entries, but due to missing/invalid data, our cleaned sequences could have any number of entries
#After cleaning the csv files, our dataset will be compromised of a bunch of variable length sequences
#we will split our dataset up into training/validation sets on a sequence level

#the inputs at a specific timestep will be the 3 labels in the csv files (Heart Rate, Respiratory Rate, O2 level) and the output will be (either if the baby is in pain OR if the baby is in pain during the NEXT step)? -Need clarification here

In [38]:
#store a list of dataframes representing the data from the csv files we read in
#after reading in all of the csv files as dataframes, iterate through them, cleaning up the data and normalizing the inputs


#Should we normalize on a dataframe level or a dataset level?


data = []


#use the os library to iterate through all files in the current path
for (root,dirs,files) in os.walk(data_dir, topdown=True):
  #open each CSV file, we don't want to open any other type of file
  for f in files:
    if f.endswith(".csv"):
      with open(os.path.join(root, f), mode ='r') as csvFile:

        #read in the csv file
        csvData = csv.reader(csvFile)

        #store the data temporarily before we turn it into a pandas dataframe
        temp = []


        #check if each row is valid
        #one way to tell if the data is invalid is if the label is NOT [0, 1, 2]. This means there was something wrong with one of the sensors at the time of capture
        #another way is to check if the heart rate, respiratory rate, or o2 level is too low or too high
        #for heart Rate, we want a range of 40 - 200
        #for Respiratory Rate, we want a range of 15 - 90
        #for o2 level, we want a rate of 80 - 100

        for row in csvData:
          try:
            if row[4] in "012" and (int(row[1]) >= 40 and int(row[1]) <= 200) and (int(row[2]) >= 15 and int(row[2]) <= 90) and (int(row[3]) >= 80 and int(row[3]) <= 100):   #data is valid, so add it to the main csv
              #row[0] just stores the row # in the csv file, so it is not needed in the training dataset
              temp.append([int(row[1]), int(row[2]), int(row[3]), int(row[4])])

          #Catches errors that occur when an element cannot be parsed into an integer. In this case the row is invalid so we can skip it
          except ValueError:
            continue

        #if this valid and cleaned sequence is NOT empty, add it to our dataset
        if len(temp) > 0:
          df = pd.DataFrame(temp, columns = ['Heart Rate', 'Respiratory Rate', 'O2 Level', 'Label'])
          data.append(df)

print(len(data))

487


In [39]:
#count sequence lengths and how often they appear
freqMap = {}
for d in data:
  if d.shape not in freqMap.keys():
    freqMap[d.shape] = 1

  else:
    freqMap[d.shape] += 1

for key, value in freqMap.items():
    print(f'{key} -> {value}')


#split data into training/validation sets using an 85/15 split
data = np.asarray(data, dtype="object")
np.random.shuffle(data)

trainData, valData = np.split(data, [int(0.85*len(data))])

print(len(trainData), len(valData))

#remove the labels from the trainingData and validation data, also storing them as a list
#we don't need the entire column of labels, just the first rows label
trainLabels = []
valLabels = []

for df in trainData:
  label = df['Label'].iloc[0]    #get the first rows label, and use it for this whole sequence
  df.drop('Label', axis=1, inplace=True)
  trainLabels.append(label)

for df in valData:
  label = df['Label'].iloc[0]    #get the first rows label, and use it for this whole sequence
  df.drop('Label', axis=1, inplace=True)
  valLabels.append(label)

print(trainData.shape, len(trainLabels))
print(valData.shape, len(valLabels))

(31, 4) -> 381
(30, 4) -> 31
(14, 4) -> 2
(29, 4) -> 21
(5, 4) -> 1
(11, 4) -> 1
(21, 4) -> 4
(4, 4) -> 1
(7, 4) -> 1
(27, 4) -> 10
(28, 4) -> 15
(25, 4) -> 4
(26, 4) -> 6
(17, 4) -> 2
(22, 4) -> 2
(24, 4) -> 2
(23, 4) -> 2
(15, 4) -> 1
413 74
(413,) 413
(74,) 74


In [40]:
#normalize data using only the training set
#find the min/max of each column throughout all of the dataframes in the training dataset
#save the values, then iterate through the training and validation dataset and use min-max normalization
#df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())

#norm values stores the min/max of each column, initialized to min = float('inf'), max = float('-inf')
#[[Heart Rate Min, Heart Rate Max], [Respiratory Rate Min, Respiratory Rate Max], [O2 Level Min, O2 Level Max]]
normValues = [[float('inf'), float('-inf')], [float('inf'), float('-inf')], [float('inf'), float('-inf')]]

for df in trainData:
  normValues[0][0] = min(normValues[0][0], df['Heart Rate'].min())
  normValues[0][1] = max(normValues[0][1], df['Heart Rate'].max())

  normValues[1][0] = min(normValues[1][0], df['Respiratory Rate'].min())
  normValues[1][1] = max(normValues[1][1], df['Respiratory Rate'].max())

  normValues[2][0] = min(normValues[2][0], df['O2 Level'].min())
  normValues[2][1] = max(normValues[2][1], df['O2 Level'].max())

#now we have the min/max for each column across our whole dataset from our training data, so we can save this normalization, and apply it to the validation set
print(normValues)



#normalizing the data:
for df in trainData:
  df['Heart Rate'] = (df['Heart Rate'] - normValues[0][0]) / (normValues[0][1] - normValues[0][0])
  df['Respiratory Rate'] = (df['Respiratory Rate'] - normValues[1][0]) / (normValues[1][1] - normValues[1][0])
  df['O2 Level'] = (df['O2 Level'] - normValues[2][0]) / (normValues[2][1] - normValues[2][0])

for df in valData:
  df['Heart Rate'] = (df['Heart Rate'] - normValues[0][0]) / (normValues[0][1] - normValues[0][0])
  df['Respiratory Rate'] = (df['Respiratory Rate'] - normValues[1][0]) / (normValues[1][1] - normValues[1][0])
  df['O2 Level'] = (df['O2 Level'] - normValues[2][0]) / (normValues[2][1] - normValues[2][0])

[[52, 199], [15, 90], [80, 100]]


In [44]:
#Data padding
#inputs to GRU cells are in the form of [batch, timesteps, feature]
#our sequences have multiple lengths, so we need to pad this data to get all of the sequences to have a length of 31 (indices 0-30)
#we can pad the data by adding rows of 0's, and use a masking layer to ignore them when training the model

for i in range(len(trainData)):
  if trainData[i].shape[0] < 31:
    padding = pd.DataFrame(0, index=np.arange(trainData[i].shape[0], 31), columns=trainData[i].columns)
    trainData[i] = pd.concat([trainData[i], padding])

for i in range(len(valData)):
  if valData[i].shape[0] < 31:
    padding = pd.DataFrame(0, index=np.arange(valData[i].shape[0], 31), columns=valData[i].columns)
    valData[i] = pd.concat([valData[i], padding])

# Model Setup

In [31]:
#first model uses 4 GRU layers with 16 unit each
#input is taken in the form of: [batch, timesteps, feature]
#Since our sequences have different lengths, we need to pad them
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(3),
    tf.keras.layers.GRU(16, return_sequences=True),
    tf.keras.layers.GRU(16, return_sequences=True),
    tf.keras.layers.GRU(16, return_sequences=True),
    tf.keras.layers.GRU(16),
    tf.keras.layers.Dense(3)
])

ValueError: Input 0 of layer "gru" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 3)

# Model Training

In [None]:
#our learning rate scheduler. Will gradually shrink the learning rate as training progesses to find a better converging point.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    0.001,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)

checkpoint_filepath = '/content/drive/MyDrive/Colab Notebooks/Deep Learning/Project 2/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)


opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=1e-7)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=opt,
              loss_fn=loss,
              metrics=['accuracy'])
model.summary()

# Testing Script