# GRU Model
## LOAD IN TWO DATA SETS FOR TESTING

In [23]:
import numpy as np
import pandas as pd
from os import listdir
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Masking, InputLayer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import GRU, Dense, Masking, InputLayer
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator
import tensorflow as tf
import keras
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Masking
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall, AUC
from sklearn.metrics import recall_score, precision_score
import keras_tuner as kt
from keras_tuner import HyperParameters, RandomSearch, Objective
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import make_scorer, f1_score

# Local Path
data_path = r'..\data'

# Files to Read
complete_data_files = [i for i in listdir(data_path) if ('model' in i) & ('complete' in i)]
imputed_data_files = [i for i in listdir(data_path) if ('model' in i) & ('impute' in i)]

# Read and concat data
complete_data = pd.concat([pd.read_csv(data_path + '\\' + file) for file in complete_data_files]).reset_index(drop=True).drop(columns=['time_bucket'])
imputed_data = pd.concat([pd.read_csv(data_path + '\\' + file) for file in imputed_data_files]).reset_index(drop=True).drop(columns=['time_bucket'])

In [24]:
def preprocess_with_stratification(data):
    # Sort and group data by 'Unique Stay' and 'sequence_num' to ensure time order
    df = data.sort_values(by=['stay_id', 'seq_num'])
    
    # 2. Normalize Numerical Columns
    scaler = MinMaxScaler()
    numerical_cols = ['map', 'hr', 'pao2', 'fio2', 'creatinine', 'lactate', 'platelets', 'gcs']
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    
    # 3. Encode Categorical Variables
    encoder = OneHotEncoder(sparse_output=False)
    categorical_cols = ['gender', 'race', 'marital_status', 'insurance']
    encoded_cats = encoder.fit_transform(df[categorical_cols])
    encoded_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))
    df = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)
    
    # 4. Group by stay_id and Create Sequences
    grouped = df.groupby('stay_id')
    sequences = [group.drop(columns=['stay_id', 'mortality']).values for _, group in grouped]
    labels = [group['mortality'].iloc[0] for _, group in grouped]
    
    # 5. Convert Labels to NumPy array for Stratified Splitting
    labels = np.array(labels).astype('float32')
    
    # 6. Pad Sequences to have the same length
    max_seq_length = max(len(seq) for seq in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, dtype='float32', padding='post')
    
    # Stratified sampling to maintain label distribution in train/test split
    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(stratified_split.split(padded_sequences, labels))
    
    # Create train/test sequences and labels based on stratified indices
    X_train = padded_sequences[train_idx]
    X_test = padded_sequences[test_idx]
    y_train = labels[train_idx]
    y_test = labels[test_idx]
    
    return X_train, X_test, y_train, y_test

# Apply preprocessing with stratified sampling to both datasets
complete_X_train, complete_X_test, complete_y_train, complete_y_test = preprocess_with_stratification(complete_data)
imputed_X_train, imputed_X_test, imputed_y_train, imputed_y_test = preprocess_with_stratification(imputed_data)

In [25]:
# Define custom wrapper for the Keras model to use with RandomizedSearchCV
class KerasModelWrapper(BaseEstimator):
    def __init__(self, learning_rate=0.001, gru_units=64, dropout_rate=0.2, epochs=10, batch_size=32):
        self.learning_rate = learning_rate
        self.gru_units = gru_units
        self.dropout_rate = dropout_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None

    def create_model(self):
        model = Sequential([
            InputLayer(input_shape=(imputed_X_train.shape[1], imputed_X_train.shape[2])), 
            Masking(mask_value=0.0),
            GRU(self.gru_units, return_sequences=False, dropout=self.dropout_rate),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer=Adam(learning_rate=self.learning_rate), loss='binary_crossentropy', metrics=['accuracy',
                                                                                                            tf.keras.metrics.Precision(name='precision'), 
                                                                                                            tf.keras.metrics.Recall(name='recall'),
                                                                                                            tf.keras.metrics.AUC(name='auc')])
        return model

    def fit(self, X, y):
        self.model = self.create_model()
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def score(self, X, y):
        return self.model.evaluate(X, y, verbose=0)[1]

def tune_model(X_train, y_train, X_test, y_test, tune_type):
    # Instantiate the model wrapper
    model = KerasModelWrapper()
    
    # Define the parameter grid for tuning
    param_grid = {
        'learning_rate': [0.001, 0.01, 0.1],
        'gru_units': [32, 64, 128],
        'dropout_rate': [0.2, 0.3, 0.5],
        'batch_size': [16, 32, 64],
        'epochs': [5]
    }

    f1_scorer = make_scorer(f1_score)

    # RandomizedSearchCV to tune hyperparameters
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, cv=3, verbose=2, scoring = f1_scorer, n_jobs=4, random_state=42)
    
    # Perform the search
    random_search_result = random_search.fit(X_train, y_train)
    
    # Best parameters and score
    print("Best Hyperparameters:", random_search_result.best_params_)
    print("Best Accuracy:", random_search_result.best_score_)
    
    # Evaluate the best model on the test set
    best_model = random_search_result.best_estimator_
    test_acc = best_model.score(X_test, y_test)
    print(f'Test Recall: {test_acc:.4f}')
    
    # Save the best model
    best_model = random_search_result.best_estimator_
    
    # Save the best model to a file
    model_save_path = f'{tune_type}_gru_best_model.keras'
    best_model.model.save(model_save_path)
    
    print(f"Best model saved at: {model_save_path}")
    
    # Evaluate the best model on the test set
    test_acc = best_model.score(X_test, y_test)
    print(f'Test Accuracy: {test_acc:.4f}')

In [26]:
tune_model(X_train = complete_X_train, y_train = complete_y_train, X_test = complete_X_test, y_test = complete_y_test, tune_type = 'complete_f1')

Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [22]:
tune_model(imputed_X_train, imputed_y_train, imputed_X_test, imputed_y_test, 'imputed_f1')

Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [16]:
# Load the saved model and re-evaluate on the test set
loaded_model = load_model('imputed_gru_best_model.keras')
model_metrics = loaded_model.evaluate(imputed_X_test, imputed_y_test, verbose=0)
model_metrics

[nan, 0.9865319728851318, 0.0, 0.0, 0.0]

In [12]:
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
import numpy as np

# Get model predictions
y_pred_prob = loaded_model.predict(imputed_X_test)  # Predicted probabilities
y_pred_class = (y_pred_prob > 0.5).astype(int)      # Convert probabilities to binary predictions

# Calculate metrics
# auc = roc_auc_score(imputed_y_test, y_pred_prob)
# auprc = average_precision_score(imputed_y_test, y_pred_prob)
precision = precision_score(imputed_y_test, y_pred_class)
recall = recall_score(imputed_y_test, y_pred_class)
f1 = f1_score(imputed_y_test, y_pred_class)

# Print results
# print(f"AUC: {auc:.4f}")
# print(f"AUPRC: {auprc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 424ms/step
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#TODO Change to complete data in model and then tune complete model