# GRU Model
## LOAD IN TWO DATA SETS FOR TESTING

In [1]:
import numpy as np
import pandas as pd
from os import listdir
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Masking, InputLayer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import GRU, Dense, Masking, InputLayer
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator
import tensorflow as tf
import keras
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Masking
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall, AUC
from sklearn.metrics import recall_score, precision_score
import keras_tuner as kt
from keras_tuner import HyperParameters, RandomSearch, Objective
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import make_scorer, f1_score
from sklearn.utils.class_weight import compute_class_weight
from functools import partial
from sklearn.metrics import precision_recall_curve, auc

# Local path to the directory containing the CSV files
data_path = r'../data'

# Files to read. The directory is listed once and sorted so that the order in which
# files are concatenated is reproducible (listdir returns entries in arbitrary order).
_data_dir_entries = sorted(listdir(data_path))
complete_data_files = [name for name in _data_dir_entries if 'model' in name and 'complete' in name]
imputed_data_files = [name for name in _data_dir_entries if 'model' in name and 'impute' in name]

# Read and concatenate each group of files, renumber the rows, and drop 'time_bucket'
complete_data = pd.concat(
    [pd.read_csv(f'{data_path}/{name}') for name in complete_data_files]
).reset_index(drop=True).drop(columns=['time_bucket'])
imputed_data = pd.concat(
    [pd.read_csv(f'{data_path}/{name}') for name in imputed_data_files]
).reset_index(drop=True).drop(columns=['time_bucket'])

2024-12-10 11:01:47.070806: I tensorflow/core/platform/cpu_feature_guard.cc:181] Beginning TensorFlow 2.15, this package will be updated to install stock TensorFlow 2.15 alongside Intel's TensorFlow CPU extension plugin, which provides all the optimizations available in the package and more. If a compatible version of stock TensorFlow is present, only the extension will get installed. No changes to code or installation setup is needed as a result of this change.
More information on Intel's optimizations for TensorFlow, delivered as TensorFlow extension plugin can be viewed at https://github.com/intel/intel-extension-for-tensorflow.
2024-12-10 11:01:47.070902: I tensorflow/core/platform/cpu_feature_guard.cc:192] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Preprocess with Stratification for Imbalanced Data Set

In [2]:
def preprocess_with_stratification(data):
    """Build padded per-stay sequences and a stratified train/test split.

    Rows are ordered by stay and sequence number, the numerical columns are
    min-max scaled, the categorical columns are one-hot encoded, and one feature
    matrix is built per ``stay_id``. Sequences are zero-padded at the end to the
    length of the longest stay, then split 80/20 with the label distribution
    preserved.

    Args:
        data: DataFrame with 'stay_id', 'seq_num', 'mortality', the columns in
            ``numerical_cols`` and the columns in ``categorical_cols``. Every
            column other than 'stay_id' and 'mortality' (including 'seq_num')
            ends up as a feature. The input frame is not modified.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The X arrays are float32
        with shape (n_stays, max_seq_length, n_features); the y arrays are
        float32 vectors holding the 'mortality' value of each stay's first row.
    """
    # 1. Order rows by stay and time step. The index is reset because the one-hot
    #    frame built below is positional, and pd.concat(axis=1) aligns on index
    #    labels: keeping the pre-sort index would attach encoded columns to the
    #    wrong rows whenever the input was not already sorted.
    df = data.sort_values(by=['stay_id', 'seq_num']).reset_index(drop=True)

    # 2. Normalize numerical columns to [0, 1].
    # NOTE(review): the scaler and the encoder below are fit on all rows before the
    # train/test split, so test-set statistics influence the training features.
    # Consider fitting them on the training stays only.
    scaler = MinMaxScaler()
    numerical_cols = ['map', 'hr', 'pao2', 'fio2', 'creatinine', 'lactate', 'platelets', 'gcs']
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # 3. One-hot encode categorical variables; share df's index so the column-wise
    #    concat is an exact row-for-row join.
    encoder = OneHotEncoder(sparse_output=False)
    categorical_cols = ['gender', 'race', 'marital_status', 'insurance']
    encoded_cats = encoder.fit_transform(df[categorical_cols])
    encoded_df = pd.DataFrame(
        encoded_cats,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df.index,
    )
    df = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

    # 4. One pass over the stays: collect the feature matrix and the label together
    #    so the two lists cannot get out of step.
    sequences = []
    labels = []
    for _, group in df.groupby('stay_id'):
        sequences.append(group.drop(columns=['stay_id', 'mortality']).values)
        labels.append(group['mortality'].iloc[0])

    # 5. Labels as a NumPy array for stratified splitting
    labels = np.array(labels).astype('float32')

    # 6. Pad sequences (zeros appended at the end) to the longest stay
    max_seq_length = max(len(seq) for seq in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, dtype='float32', padding='post')

    # 7. Stratified sampling keeps the label distribution equal in train and test
    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(stratified_split.split(padded_sequences, labels))

    X_train = padded_sequences[train_idx]
    X_test = padded_sequences[test_idx]
    y_train = labels[train_idx]
    y_test = labels[test_idx]

    return X_train, X_test, y_train, y_test

# Run the stratified preprocessing on each dataset and unpack the four arrays it returns
(
    complete_X_train,
    complete_X_test,
    complete_y_train,
    complete_y_test,
) = preprocess_with_stratification(complete_data)
(
    imputed_X_train,
    imputed_X_test,
    imputed_y_train,
    imputed_y_test,
) = preprocess_with_stratification(imputed_data)

#### Create Model and Tune Based on F1 Score

In [43]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.base import BaseEstimator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, InputLayer, Masking
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

class KerasModelWrapper(BaseEstimator):
    """scikit-learn style estimator around a single-layer GRU binary classifier.

    The network is InputLayer -> Masking(0.0) -> GRU -> Dense(1, sigmoid),
    compiled with Adam and binary cross-entropy. All constructor arguments are
    stored unchanged under the same name so ``get_params`` / ``clone`` (and
    therefore scikit-learn search utilities) work.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        gru_units: Number of units in the GRU layer.
        dropout_rate: Input dropout rate of the GRU layer.
        epochs: Number of training epochs used by ``fit``.
        batch_size: Mini-batch size used by ``fit``.
        threshold: Probability cut-off used by ``predict``; a sample is labelled 1
            when its predicted probability is strictly greater than this value.
            Defaults to 0.12, the value that was previously hard-coded.
    """

    def __init__(self, learning_rate=0.001, gru_units=64, dropout_rate=0.2, epochs=10, batch_size=32,
                 threshold=0.12):
        self.learning_rate = learning_rate
        self.gru_units = gru_units
        self.dropout_rate = dropout_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.threshold = threshold
        self.model = None  # Keras model, built by `fit`
        self.input_shape = None  # Will be dynamically set during `fit`

    def create_model(self):
        """Build and compile the GRU network for ``self.input_shape``.

        Returns:
            A compiled Keras ``Sequential`` model tracking accuracy, precision,
            recall and AUC.

        Raises:
            ValueError: If ``self.input_shape`` has not been set yet.
        """
        if self.input_shape is None:
            raise ValueError("Input shape must be set before creating the model.")
        model = Sequential([
            InputLayer(input_shape=self.input_shape),
            # Timesteps whose features are all 0.0 are masked out for the GRU
            Masking(mask_value=0.0),
            GRU(self.gru_units, return_sequences=False, dropout=self.dropout_rate),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer=Adam(learning_rate=self.learning_rate),
                      loss='binary_crossentropy',
                      metrics=['accuracy',
                               tf.keras.metrics.Precision(name='precision'),
                               tf.keras.metrics.Recall(name='recall'),
                               tf.keras.metrics.AUC(name='auc')])
        return model

    def fit(self, X, y):
        """Build a fresh model sized to ``X`` and train it.

        Args:
            X: 3-D array; ``X.shape[1]`` and ``X.shape[2]`` become the model's
                input shape.
            y: Binary labels, one per sample in ``X``.

        Returns:
            ``self``, as scikit-learn expects.
        """
        # Dynamically set the input shape based on training data
        self.input_shape = (X.shape[1], X.shape[2])
        self.model = self.create_model()
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        """Return hard 0/1 predictions for ``X``.

        Returns:
            Integer array with the same shape as the model output; entries are 1
            where the predicted probability exceeds ``self.threshold``.

        Raises:
            sklearn.exceptions.NotFittedError: If called before ``fit``. This
                exception subclasses both ValueError and AttributeError.
        """
        if self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This KerasModelWrapper is not fitted yet; call fit() before predict().")
        # verbose=0 mirrors `fit` and keeps cross-validation output readable
        probas = self.model.predict(X, verbose=0)
        return (probas > self.threshold).astype(int)

    def score(self, X, y):
        """Return the support-weighted F1 score of ``predict(X)`` against ``y``."""
        y_pred = self.predict(X)
        return f1_score(y, y_pred, average='weighted')

def tune_model(X_train, y_train, X_test, y_test, tune_type):
    """Randomized hyperparameter search for the GRU wrapper, then evaluate and save.

    Runs a 10-candidate, 3-fold ``RandomizedSearchCV`` over ``KerasModelWrapper``
    scored by weighted F1, prints the best parameters and scores, evaluates the
    refit best estimator on the test set, and saves its Keras model.

    Args:
        X_train: Training sequences passed to the search.
        y_train: Training labels.
        X_test: Held-out sequences used only for the final evaluation.
        y_test: Held-out labels.
        tune_type: Prefix for the saved file name
            (``f'{tune_type}_gru_best_model.keras'``).

    Returns:
        None. Results are printed and the model is written to disk.
    """
    # Instantiate the model wrapper
    model = KerasModelWrapper()

    # Candidate values sampled by the randomized search
    param_grid = {
        'learning_rate': [0.001, 0.01, 0.1],
        'gru_units': [32, 64, 128],
        'dropout_rate': [0.2, 0.3, 0.5],
        'batch_size': [16, 32, 64],
        'epochs': [10]
    }

    # Cross-validation scorer: support-weighted F1 on the hard predictions returned
    # by the estimator's predict(). The probability cut-off is applied inside
    # KerasModelWrapper.predict, not here.
    custom_f1_scorer = make_scorer(f1_score, average='weighted')

    # RandomizedSearchCV to tune hyperparameters
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                                       n_iter=10, cv=3, verbose=3, n_jobs=1,
                                       random_state=42, scoring=custom_f1_scorer)
    # Perform the search
    random_search_result = random_search.fit(X_train, y_train)

    # Best parameters and mean cross-validated (weighted) F1
    print("Best Hyperparameters:", random_search_result.best_params_)
    print("Best F1 Score:", random_search_result.best_score_)

    # Evaluate the best model on the test set
    best_model = random_search_result.best_estimator_
    y_test_pred = best_model.predict(X_test)
    # Default averaging: F1 of the positive class only
    test_f1 = f1_score(y_test, y_test_pred)
    print(f'Test F1 Score: {test_f1:.4f}')
    # Also report the weighted average so the test number is directly comparable
    # with the cross-validation score printed above.
    test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
    print(f'Test F1 Score (weighted, same averaging as the CV score): {test_f1_weighted:.4f}')

    # Save the best model
    model_save_path = f'{tune_type}_gru_best_model.keras'
    best_model.model.save(model_save_path)
    print(f"Best model saved at: {model_save_path}")

In [None]:
# Run the hyperparameter search on the complete dataset; the best model is saved to disk
tune_model(
    X_train=complete_X_train,
    y_train=complete_y_train,
    X_test=complete_X_test,
    y_test=complete_y_test,
    tune_type='complete',
)

In [None]:
# Run the hyperparameter search on the imputed dataset; the best model is saved to disk
tune_model(
    X_train=imputed_X_train,
    y_train=imputed_y_train,
    X_test=imputed_X_test,
    y_test=imputed_y_test,
    tune_type='imputed',
)