**Building LSTM Model**

In [12]:
import pandas as pd
from os import listdir
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import BaseEstimator

import tensorflow as tf
#from tensorflow import keras

Loading data

In [13]:
data_path = r'../data/final_data'

# Files to read
data_files = listdir(data_path)

# Load data
data = pd.concat([pd.read_csv(data_path + '/' + file) for file in data_files]).drop(columns=['endtime'])

In [14]:
data.head()

Unnamed: 0,insurance,race,marital_status,gender,anchor_age,value,amount,amountuom,label,Died,Unique Stay,sequence_num
0,Medicare,OTHER,SINGLE,F,53,,3.3,mmol/L,Lactic Acid,False,1000201323581541,1
1,Medicare,OTHER,SINGLE,F,53,,421.0,mmHg,Arterial O2 pressure,False,1000201323581541,2
2,Medicare,OTHER,SINGLE,F,53,,2.8,mmol/L,Lactic Acid,False,1000201323581541,3
3,Medicare,OTHER,SINGLE,F,53,,3.1,mmol/L,Lactic Acid,False,1000201323581541,4
4,Medicare,OTHER,SINGLE,F,53,,384.0,mmHg,Arterial O2 pressure,False,1000201323581541,5


Verifying that data includes only CABG patients

In [15]:
CABG_ICDS = ['0210093', '0210098', '0210099', '021009C', '021009F', '021009W',
'02100A3', '02100A8', '02100A9', '02100AC', '02100AF', '02100AW',
'02100J3', '02100J8', '02100J9', '02100JC', '02100JF', '02100JW',
'02100K3', '02100K8', '02100K9', '02100KC', '02100KF', '02100KW',
'02100Z3', '02100Z8', '02100Z9', '02100ZC', '02100ZF',
'0211093', '0211098', '0211099', '021109C', '021109F', '021109W',
'02110A3', '02110A8', '02110A9', '02110AC', '02110AF', '02110AW',
'02110J3', '02110J8', '02110J9', '02110JC', '02110JF', '02110JW',
'02110K3', '02110K8', '02110K9', '02110KC', '02110KF', '02110KW',
'02110Z3', '02110Z8', '02110Z9', '02110ZC', '02110ZF',
'0212093', '0212098', '0212099', '021209C', '021209F', '021209W',
'02120A3', '02120A8', '02120A9', '02120AC', '02120AF', '02120AW',
'02120J3', '02120J8', '02120J9', '02120JC', '02120JF', '02120JW',
'02120K3', '02120K8', '02120K9', '02120KC', '02120KF', '02120KW',
'02120Z3', '02120Z8', '02120Z9', '02120ZC', '02120ZF',
'0213093', '0213098', '0213099', '021309C', '021309F', '021309W',
'02130A3', '02130A8', '02130A9', '02130AC', '02130AF', '02130AW',
'02130J3', '02130J8', '02130J9', '02130JC', '02130JF', '02130JW',
'02130K3', '02130K8', '02130K9', '02130KC', '02130KF', '02130KW',
'02130Z3', '02130Z8', '02130Z9', '02130ZC', '02130ZF',
'3610', '3611', '3612', '3613', '3614', '3615', '3616', '3617', '3619']

In [18]:
#cabg_pts = pd.read_csv('B:/Databases/MIMIC-IV/CABG-filtered/cabg_pts.csv')

In [19]:
# # Filter data to only include CABG patients
# cabg_pts[cabg_pts.icd_code.isin(CABG_ICDS)]

# # Unique patient IDs
# cabg_pts.subject_id.nunique() # 5647 unique patients

# Create unique stay ID
# unique_stay = cabg_pts.subject_id.astype(str) + cabg_pts.hadm_id.astype(str)
# unique_stay = unique_stay.to_list()

# Number of patients in the data
data.shape[0]

1214970

In [20]:
# Number of patients in the data that are CABG patients
#data['Unique Stay'].astype(str).isin(unique_stay).sum()

### Data Preprocessing

In [21]:
# Recoding race data
data['race_new']  = data.race.replace({
                              'WHITE': 'White',
                              'WHITE - OTHER EUROPEAN': 'White',
                              'WHITE - RUSSIAN': 'White',
                              'WHITE - EASTERN EUROPEAN': 'White',
                              'PORTUGUESE': 'White',
                              'WHITE - BRAZILIAN': 'White',
                              # Black
                              'BLACK/AFRICAN AMERICAN': 'Black',
                              'BLACK/AFRICAN': 'Black',
                              'HISPANIC/LATINO - GUATEMALAN': 'Black',
                              # Asian
                              'ASIAN - ASIAN INDIAN': 'Asian',
                              'ASIAN': 'Asian',
                              'ASIAN - CHINESE': 'Asian',
                              'ASIAN - SOUTH EAST ASIAN': 'Asian',
                              'ASIAN - KOREAN': 'Asian',
                              # Hispanic
                              'HISPANIC/LATINO - PUERTO RICAN': 'Hispanic',
                              'HISPANIC/LATINO - DOMINICAN': 'Hispanic',
                              'HISPANIC OR LATINO': 'Hispanic',
                              'HISPANIC': 'Hispanic',
                              'HISPANIC/LATINO - CUBAN': 'Hispanic',
                              'HISPANIC/LATINO - SALVADORAN': 'Hispanic',
                              'SOUTH AMERICAN': 'Hispanic',
                              'HISPANIC/LATINO - COLUMBIAN': 'Hispanic',
                              'HISPANIC/LATINO - CUBAN': 'Hispanic',
                              'HISPANIC/LATINO - HONDURAN': 'Hispanic',
                              'HISPANIC/LATINO - CENTRAL AMERICAN': 'Hispanic',
                              'HISPANIC/LATINO - MEXICAN': 'Hispanic',
                              # Native American/Pacific Islander
                              'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'Native American',
                              'AMERICAN INDIAN/ALASKA NATIVE': 'Native American',
                              'BLACK/CAPE VERDEAN': 'Black',
                              'BLACK/CARIBBEAN ISLAND' : 'Black',
                              # Other
                              'OTHER': 'Other',
                              'MULTIPLE RACE/ETHNICITY': 'Other',
                              # Unknown
                              'UNABLE TO OBTAIN': np.nan,
                              'UNKNOWN': np.nan,
                              'PATIENT DECLINED TO ANSWER': np.nan,
                              })

data.race = data.race_new
data.drop(['race_new'], axis = 1, inplace = True)

data.race.value_counts

<bound method IndexOpsMixin.value_counts of 0        Other
1        Other
2        Other
3        Other
4        Other
         ...  
99995    White
99996    White
99997    White
99998    White
99999    White
Name: race, Length: 1214970, dtype: object>

In [22]:
# Checking for discrepancies in the race data

# Creating data for patient ID
data['subject_id'] = data['Unique Stay'].astype(str).str[0:8]

# Checking if all race data is the same for each patient
data.groupby('subject_id').race.nunique().value_counts()

race
1    4533
0     813
Name: count, dtype: int64

**Cleaning up the group of variables `value`, `amount`, `amountuom`, `label`.**

In [23]:
data.label.value_counts()

label
Heart Rate                      526542
Arterial Blood Pressure mean    336641
Inspired O2 Fraction             71942
Arterial O2 pressure             61075
Ventilator Type                  41940
Platelet Count                   36745
Ventilator Mode                  34464
Creatinine (serum)               34050
Lactic Acid                      31799
Norepinephrine                   25894
Epinephrine                       8281
Total Bilirubin                   4365
Dobutamine                         634
Dopamine                           591
Epinephrine.                         7
Name: count, dtype: int64

There are 2 label values for epinephrine: 'Epinephrine' and 'Epinephrine.'. We will combine these into one label value.
Also, there are 2 different units of measure for epinephrine: 'mcg' and 'mg'. We will convert all values to 'mg'. But we need to make sure that the values that are coded as mcg are actually mcg and not mg (the values for mcg and mg should different by a factor of 1000).

In [24]:
# Remove the '.' from 'Epinephrine.' in the 'label' column
data.loc[data.label == 'Epinephrine.', 'label'] = 'Epinephrine'

# Convert 'amount' from mcg to mg and change 'amountuom' to 'mg'
data.loc[(data.label == 'Epinephrine') & (data.amountuom == 'mcg'), 'amount'] /= 1000
data.loc[(data.label == 'Epinephrine') & (data.amountuom == 'mcg'), 'amountuom'] = 'mg'


In [25]:
data.label.value_counts()

label
Heart Rate                      526542
Arterial Blood Pressure mean    336641
Inspired O2 Fraction             71942
Arterial O2 pressure             61075
Ventilator Type                  41940
Platelet Count                   36745
Ventilator Mode                  34464
Creatinine (serum)               34050
Lactic Acid                      31799
Norepinephrine                   25894
Epinephrine                       8288
Total Bilirubin                   4365
Dobutamine                         634
Dopamine                           591
Name: count, dtype: int64

In [26]:
# Making sure that units and values are consistent
print('Heart rate: ', data[data.label == 'Heart Rate'].amountuom.value_counts(), '\n')

print('MAP: ', data[data.label == 'Arterial Blood Pressure mean'].amountuom.value_counts(), '\n')

print('FiO2: ', data[data.label == 'Inspired O2 Fraction'].amountuom.value_counts(), '\n')

print('PaO2: ', data[data.label == 'Arterial O2 pressure'].amountuom.value_counts(), '\n')

print('Ventilator type: ', data[data.label == 'Ventilator Type'].amountuom.value_counts(), '\n')

print('Ventilator Mode: ', data[data.label == 'Ventilator Mode'].amountuom.value_counts(), '\n')

print('Platelet Count: ', data[data.label == 'Platelet Count'].amountuom.value_counts(), '\n')

print('Creatinine (serum): ', data[data.label == 'Creatinine (serum)'].amountuom.value_counts(), '\n')

print('Lactic Acid: ', data[data.label == 'Lactic Acid'].amountuom.value_counts(), '\n')

print('Norepinephrine: ', data[data.label == 'Norepinephrine'].amountuom.value_counts(), '\n')

print('Epinephrine: ', data[data.label == 'Epinephrine'].amountuom.value_counts(), '\n')

print('Total Bilirubin: ', data[data.label == 'Total Bilirubin'].amountuom.value_counts(), '\n')

print('Dobutamine: ', data[data.label == 'Dobutamine'].amountuom.value_counts(), '\n')

print('Dopamine: ', data[data.label == 'Dopamine'].amountuom.value_counts(), '\n')

Heart rate:  amountuom
bpm    526542
Name: count, dtype: int64 

MAP:  amountuom
mmHg    336641
Name: count, dtype: int64 

FiO2:  Series([], Name: count, dtype: int64) 

PaO2:  amountuom
mmHg    61075
Name: count, dtype: int64 

Ventilator type:  Series([], Name: count, dtype: int64) 

Ventilator Mode:  Series([], Name: count, dtype: int64) 

Platelet Count:  amountuom
K/uL    36739
Name: count, dtype: int64 

Creatinine (serum):  amountuom
mg/dL    34044
Name: count, dtype: int64 

Lactic Acid:  amountuom
mmol/L    31798
Name: count, dtype: int64 

Norepinephrine:  amountuom
mg    25894
Name: count, dtype: int64 

Epinephrine:  amountuom
mg    8288
Name: count, dtype: int64 

Total Bilirubin:  amountuom
mg/dL    4365
Name: count, dtype: int64 

Dobutamine:  amountuom
mg    634
Name: count, dtype: int64 

Dopamine:  amountuom
mg    591
Name: count, dtype: int64 



Encode categorical features

In [27]:
label_encoder = LabelEncoder()

# Encode categorical columns
categorical_columns = ['insurance', 'race', 'marital_status', 'gender', 'value', 'amountuom', 'amount']
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col].astype(str))

data.drop(columns = ['subject_id'], inplace = True)

In [28]:
# Encode target variable `Died` (binary classification: 0 = Alive, 1 = Died)
data['Died'] = label_encoder.fit_transform(data['Died'])

# Sort by Unique Stay and sequence number to get events in correct order
data.sort_values(by=['Unique Stay', 'sequence_num'], inplace=True)

In [29]:
# Number of observations
print('Number of observations: ', data['Unique Stay'].nunique())

Number of observations:  5361


In [30]:
# Create sequences of features for each Unique Stay
sequence_data = []
sequence_labels = []

unique_stays = data['Unique Stay'].unique()

Iterate over unique stays

In [31]:
for stay in unique_stays:
    stay_data = data[data['Unique Stay'] == stay]
    stay_features = stay_data[['insurance', 'race', 'marital_status', 'gender', 'anchor_age', 'value', 'amount', 'amountuom']].values
    stay_label = stay_data['Died'].values[-1]  # Use the last event to define the label
    
    # Add the sequence and its corresponding label
    sequence_data.append(stay_features)
    sequence_labels.append(stay_label)

Pad sequences to ensure uniform input length

In [33]:
import keras

In [None]:
#from tensorflow.keras.utils import pad_sequences
sequence_data = keras.utils.pad_sequences(sequence_data, padding='post', dtype='float32', maxlen=50)  # Adjust maxlen as needed

In [36]:
# Convert labels to numpy array
sequence_labels = np.array(sequence_labels)

Train-test split

In [37]:
X_train, X_test, y_train, y_test = train_test_split(sequence_data, sequence_labels, test_size=0.2, random_state=42)

Define custom wrapper for the Keras model to use with RandomizedSearchCV

In [40]:
from keras import layers, Sequential

In [49]:
class KerasModelWrapper(BaseEstimator):
    def __init__(self, learning_rate=0.001, gru_units=64, dropout_rate=0.2, epochs=10, batch_size=32):
        self.learning_rate = learning_rate
        self.gru_units = gru_units
        self.dropout_rate = dropout_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None
    def create_model(self):
        model = Sequential([
            layers.InputLayer(input_shape=(X_train.shape[1], X_train.shape[2])), 
            layers.Masking(mask_value=0.0),
            layers.GRU(self.gru_units, return_sequences=False, dropout=self.dropout_rate),
            layers.Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer=Adam(learning_rate=self.learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
        return model
    def fit(self, X, y):
        self.model = self.create_model()
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self
    def predict(self, X):
        return self.model.predict(X)
    def score(self, X, y):
        return self.model.evaluate(X, y, verbose=0)[1]

Instantiate the model wrapper

In [50]:
model = KerasModelWrapper()

Define the parameter grid for tuning

In [51]:
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'gru_units': [32, 64, 128],
    'dropout_rate': [0.2, 0.3, 0.5],
    'batch_size': [16, 32, 64],
    'epochs': [10, 20, 50]
}

RandomizedSearchCV to tune hyperparameters

In [52]:
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, cv=3, verbose=2, n_jobs=-1, random_state=42)

Perform the search

In [53]:
random_search_result = random_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


I0000 00:00:1731200711.487697   18588 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1731200711.487697   18570 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1731200711.496607   18584 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1731200711.504121   18579 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1731200711.504118   18578 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

[CV] END batch_size=64, dropout_rate=0.2, epochs=20, gru_units=64, learning_rate=0.01; total time=   9.5s
[CV] END batch_size=64, dropout_rate=0.2, epochs=20, gru_units=64, learning_rate=0.01; total time=   9.5s
[CV] END batch_size=16, dropout_rate=0.2, epochs=10, gru_units=128, learning_rate=0.001; total time=  11.5s


2024-11-09 20:05:35.756324: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 6.53GiB (7011437568 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-11-09 20:05:36.003386: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 5.88GiB (6310293504 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-11-09 20:05:36.068934: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 6.53GiB (7011167232 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory


[CV] END batch_size=16, dropout_rate=0.2, epochs=10, gru_units=128, learning_rate=0.001; total time=  24.7s


2024-11-09 20:05:36.255545: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 5.88GiB (6310050304 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-11-09 20:05:50.338444: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 5.29GiB (5679264256 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory


[CV] END batch_size=64, dropout_rate=0.5, epochs=20, gru_units=128, learning_rate=0.01; total time=  39.3s
[CV] END batch_size=64, dropout_rate=0.3, epochs=50, gru_units=128, learning_rate=0.1; total time=  39.4s


2024-11-09 20:05:51.539918: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 5.29GiB (5679045120 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory


[CV] END batch_size=32, dropout_rate=0.5, epochs=20, gru_units=128, learning_rate=0.1; total time=  42.0s


2024-11-09 20:05:53.400461: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 4.76GiB (5111337472 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-11-09 20:05:53.467902: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 4.76GiB (5111140352 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-11-09 20:05:54.366955: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 4.28GiB (4600203776 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory


[CV] END batch_size=16, dropout_rate=0.2, epochs=50, gru_units=128, learning_rate=0.001; total time=  43.0s
[CV] END batch_size=64, dropout_rate=0.2, epochs=20, gru_units=64, learning_rate=0.01; total time=  43.2s
[CV] END batch_size=64, dropout_rate=0.5, epochs=50, gru_units=64, learning_rate=0.01; total time=  43.7s
[CV] END batch_size=64, dropout_rate=0.3, epochs=50, gru_units=128, learning_rate=0.1; total time=  44.8s
[CV] END batch_size=16, dropout_rate=0.2, epochs=10, gru_units=128, learning_rate=0.001; total time=  45.0s
[CV] END batch_size=64, dropout_rate=0.5, epochs=20, gru_units=128, learning_rate=0.01; total time=  45.9s


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGABRT(-6)}

Best parameters and score

In [47]:
print("Best Hyperparameters:", random_search_result.best_params_)
print("Best Accuracy:", random_search_result.best_score_)

NameError: name 'random_search_result' is not defined

Evaluate the best model on the test set

In [48]:
best_model = random_search_result.best_estimator_
test_acc = best_model.score(X_test, y_test)
print(f'Test Accuracy: {test_acc:.4f}')

NameError: name 'random_search_result' is not defined

Save the best model

In [None]:
best_model = random_search_result.best_estimator_

Save the best model to a file

In [None]:
model_save_path = 'gru_best_model.keras'
best_model.model.save(model_save_path)

In [None]:
print(f"Best model saved at: {model_save_path}")

Evaluate the best model on the test set

In [None]:
test_acc = best_model.score(X_test, y_test)
print(f'Test Accuracy: {test_acc:.4f}')

Load the saved model and re-evaluate on the test set

In [None]:
loaded_model = load_model(model_save_path)
loaded_test_loss, loaded_test_acc = loaded_model.evaluate(X_test, y_test, verbose=0)

In [None]:
print(f"Loaded Model Test Accuracy: {loaded_test_acc:.4f}")