# Predicting Mortality Risk of ICU Patients Post-CABG

<span style="color:red;"><b>
MODEL BUILDING NOTEBOOK
</span></b>

## Setting up environment and loading data

In [1]:
import os
import random
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, average_precision_score, f1_score
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Masking
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall, AUC
import keras_tuner as kt
from keras_tuner import HyperParameters, RandomSearch, Objective
from tqdm.notebook import tqdm
import seaborn as sns
from keras.models import Model
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import math

2024-12-08 18:22:20.655336: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-08 18:22:22.420316: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-08 18:22:22.792653: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-08 18:22:22.938141: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-08 18:22:23.924033: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Ensure reproducibility by setting random seeds
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Setting eagerly to true // https://www.tensorflow.org/api_docs/python/tf/config/run_functions_eagerly
tf.config.run_functions_eagerly(True)

In [3]:
# Set working directory
os.chdir('/sfs/gpfs/tardis/home/krb3ym/Documents/MSDS/DS6050/final_project/Predicting-Mortality-Risk-of-ICU-Patients-Post-Coronary-Artery-Bypass-Graft-Surgery/')

# Verifying GPU availability
print(tf.config.list_physical_devices('GPU'))

# Verifying tensorflow version
print(tf.__version__)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
2.17.0


In [5]:
# Load data
directory1 = './data/final_data/chunked_complete_data/'
directory2 = './data/final_data/chunked_imputed_data/'

# List all CSV files in the directory
complete_csv_files = [os.path.join(directory1, file) for file in os.listdir(directory1) if file.endswith('.csv')]
imputed_csv_files = [os.path.join(directory2, file) for file in os.listdir(directory2) if file.endswith('.csv')]

# Read and concatenate all CSV files
complete_dfs = [pd.read_csv(csv_file) for csv_file in complete_csv_files]
imputed_dfs = [pd.read_csv(csv_file) for csv_file in imputed_csv_files]

complete_final_df = pd.concat(complete_dfs, ignore_index=True)
imputed_final_df = pd.concat(imputed_dfs, ignore_index=True)

In [6]:
complete_final_df.stay_id = complete_final_df.stay_id.astype(int).astype(str)
imputed_final_df.stay_id = imputed_final_df.stay_id.astype(int).astype(str)

complete_final_df.time_bucket = pd.to_datetime(complete_final_df.time_bucket)
imputed_final_df.time_bucket = pd.to_datetime(imputed_final_df.time_bucket)

In [7]:
pd.set_option('display.max_columns', None)
complete_final_df.head()

Unnamed: 0,stay_id,time_bucket,seq_num,anchor_age,gender,race,marital_status,insurance,vent,charlson,map,hr,pao2,fio2,creatinine,lactate,platelets,gcs,epinephrine,norepinephrine,phenylephrine,dobutamine,milrinone,dopamine,mortality
0,30004530,2165-07-31 12:00:00,1,63,M,White,DIVORCED,Medicare,0,5,73.333333,71.0,305.0,21.0,1.0,1.3,141.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,False
1,30004530,2165-07-31 13:00:00,2,63,M,White,DIVORCED,Medicare,1,5,73.333333,71.0,300.0,100.0,1.0,1.3,141.0,15.0,0.0,0.0,1.0,0.0,0.0,0.0,False
2,30004530,2165-07-31 14:00:00,3,63,M,White,DIVORCED,Medicare,1,5,70.555556,84.0,197.5,40.0,1.0,1.3,141.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3,30004530,2165-07-31 15:00:00,4,63,M,White,DIVORCED,Medicare,1,5,67.777778,83.0,95.0,40.0,1.0,1.3,141.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,30004530,2165-07-31 16:00:00,5,63,M,White,DIVORCED,Medicare,1,5,65.0,71.0,96.0,40.0,1.0,1.3,141.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,False


## Model Building

**Creating helper functions and classes for data preprocessing and model building**

In [8]:
# Creating F1 score class
class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        precision = self.precision.result()
        recall = self.recall.result()
        return 2 * ((precision * recall) / (precision + recall + tf.keras.backend.epsilon()))

    def reset_states(self):
        self.precision.reset_states()
        self.recall.reset_states()

In [9]:
# Create class to preprocess data
class Preprocessor:
    def __init__(self, pad_length=156, seed = 42):
        # Set global seeds
        np.random.seed(seed)
        tf.random.set_seed(seed)
        keras.utils.set_random_seed(seed)
        
        # Enable deterministic operations
        tf.config.experimental.enable_op_determinism()

        self.pad_length = pad_length
        self.label_encoders = {}
        self.numerical_scaler = StandardScaler()
        self.numerical_columns = ["anchor_age", "map", "hr", "pao2", "fio2", "gcs", "charlson",
                                "creatinine", "lactate", "platelets", "epinephrine", 
                                "norepinephrine", "phenylephrine", "dobutamine", 
                                "milrinone", "dopamine"]
        self.categorical_columns = ['insurance', 'gender', 'race', 'marital_status']
        self.binary_columns = ['vent']
        self.outcome = ['mortality']

    def fit(self, df):
        """Fit preprocessor on training data"""
        # Initialize and fit label encoders for categorical columns
        for col in self.categorical_columns:
            self.label_encoders[col] = LabelEncoder()
            self.label_encoders[col].fit(df[col].astype(str))
        
        # Fit scaler on numerical columns (reshaping for time series)
        numerical_data = df[self.numerical_columns].values
        self.numerical_scaler.fit(numerical_data)
                
        return self

    def transform(self, df):
        """Transform the data using fitted preprocessors"""
        # Transform categorical columns
        transformed_categorical = df[self.categorical_columns].copy()
        for col in self.categorical_columns:
            transformed_categorical[col] = self.label_encoders[col].transform(df[col].astype(str))

        # Stay ID
        stay_id = df['stay_id'].copy()
        
        # Scale numerical columns
        transformed_numerical = pd.DataFrame(
            self.numerical_scaler.transform(df[self.numerical_columns]),
            columns=self.numerical_columns
        )

        # Binary columns are used as-is (no transformation)
        transformed_binary = df[self.binary_columns].astype(int)
        transformed_outcome = df[self.outcome].astype(int)
        
        # Concatenate data
        transformed_data = pd.concat([stay_id, transformed_categorical, transformed_numerical, transformed_binary, transformed_outcome], axis=1)
        self.transformed_data = transformed_data
        
        # Combine transformed categorical, numerical, and binary columns
        return transformed_data

    def vectorization(self):
        """Convert transformed data to sequence vectors"""
        # Create sequences
        self.sequence_data = []
        self.sequence_labels = []
        
        for stay in self.transformed_data['stay_id'].unique():
            stay_data = self.transformed_data[self.transformed_data['stay_id'] == stay]
            features = stay_data[self.numerical_columns + self.categorical_columns + self.binary_columns].values
            label = stay_data[self.outcome].values[-1]
            
            self.sequence_data.append(features)
            self.sequence_labels.append(label)
        
        # Pad sequences
        self.sequence_data = keras.preprocessing.sequence.pad_sequences(
            self.sequence_data, 
            padding='post',
            maxlen=self.pad_length,
            value=-99
        )

        self.sequence_labels = np.array(self.sequence_labels)
        
        return self

    def train_test_split(self, size = 0.8):
        """Split data into training testing"""
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(self.sequence_data, self.sequence_labels, test_size=0.2, random_state=42)
        return self

    def Dataset(self, batch_size = 64):
        """Create dataset object"""
        if not hasattr(self, 'X_train'):
            raise ValueError("Must call train_test_split first")

        train_df = tf.data.Dataset.from_tensor_slices((self.X_train, self.y_train))
        train_df = (
            train_df
            .shuffle(buffer_size=len(self.X_train))        # Shuffle the entire dataset in memory
            .batch(batch_size)                        # Batch the data
            .prefetch(buffer_size=tf.data.AUTOTUNE)   # Prefetch batches for performance
        )
        
        val_df = tf.data.Dataset.from_tensor_slices((self.X_val, self.y_val))
        val_df = (
            val_df
            .batch(batch_size)                        # Batch the data
            .prefetch(buffer_size=tf.data.AUTOTUNE)   # Prefetch batches for performance
        )

        return train_df, val_df


In [10]:
# Define custom wrapper for the Keras model to use with keras tuner
class KerasModelWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, class_weight=None, seed = 42, max_epochs=20, max_trials=25):
        # Set global seeds
        np.random.seed(seed)
        tf.random.set_seed(seed)
        self.max_epochs = max_epochs
        self.max_trials = max_trials
        keras.utils.set_random_seed(seed)
        # Enable deterministic operations
        tf.config.experimental.enable_op_determinism()

        self.seed = seed
        self.model = None
        self.best_hp = None
        self.classes_ = np.array([0, 1])  # Binary classification
        self.tuner = None
        self.class_weight = class_weight  # Store class weights
        self.callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=3,
                restore_best_weights=True)
        ]

    def create_model(self, hp, df):
        """Create the model with optional hyperparameters."""
        # Define hyperparameters
        hp_units = hp.Int('units', min_value=16, max_value=256, step = 16)
        hp_dropout = hp.Choice('dropout_rate', [0.2, 0.3, 0.4, 0.5])
        hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

        # Add seed to layer initialization
        initializer = tf.keras.initializers.GlorotUniform(seed=self.seed)

        # Extract the shape of the input data from the Dataset
        for X_batch, _ in df.take(1):
            input_shape = X_batch.shape[1:]  # Exclude the batch dimension
            break
        
        # Defining model architecture
        model = keras.Sequential([
                    keras.layers.InputLayer(shape=((input_shape[0],input_shape[1]))),
                    keras.layers.Masking(mask_value=-99),
                    layers.LSTM(hp_units, return_sequences=False, dropout=hp_dropout, kernel_initializer=initializer, recurrent_initializer=initializer),
                    keras.layers.Dense(1, activation=None, kernel_initializer=initializer)
                ])

        # Defining metrics
        metrics = [
            'accuracy',
            Precision(name='precision'),
            Recall(name='recall'),
            AUC(name='auc'),
            AUC(name='auprc', curve='PR'),
            F1Score(name='f1_score')
        ]

        # Use binary cross-entropy with logits
        loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

        # Compiling model
        model.compile(optimizer=Adam(learning_rate=hp_learning_rate), 
                     loss= loss_fn, 
                     metrics=metrics)
        return model

    def compute_class_weight(self, df):
        """
        Compute class weights automatically if not provided
        
        Args:
            train: TensorFlow dataset
        
        Returns:
            dict: Class weights
        """
        # Extract labels from the dataset
        #y_true = np.array([label.numpy() for _, label in df])
        y_true = np.concatenate([label.numpy().flatten() for _, label in df], axis=0)

        # Compute class distribution
        unique, counts = np.unique(y_true, return_counts=True)
        total_samples = len(y_true)
        
        # Compute balanced class weights
        class_weights = {
            0: total_samples / (2 * counts[0]),
            1: total_samples / (2 * counts[1])
        }
        
        return class_weights
    
    def tune(self, train, validation, project_name = 'LSTM_tuning'):
        """Hyperparameter fine tuning using random grid search"""
        # Automatically compute class weights if not provided
        if self.class_weight is None:
            self.class_weight = self.compute_class_weight(train)

        def build_model(hp, df = train):
            return self.create_model(hp, df = train)
        
        tuner = RandomSearch(
            build_model,
            objective=Objective("val_auprc", direction="max"),
            max_trials=self.max_trials,
            executions_per_trial=1,
            directory='models',
            project_name=project_name,
            seed = self.seed
        )
    
        tuner.search(
            train,
            validation_data=validation,
            epochs=self.max_epochs,
            callbacks=self.callbacks,
            class_weight=self.class_weight
        )
    
        # Save the best hyperparameters and model
        self.best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
        self.model = tuner.get_best_models(num_models=1)[0]
        self.tuner = tuner
    
        return self.best_hp, self.model

    def retrain_best_model(self, train, validation):
        """Retrain the best model to obtain its training history."""
        if self.best_hp is None:
            raise ValueError("No hyperparameters tuned yet. Please run tune().")

        # Automatically compute class weights if not provided
        if self.class_weight is None:
            self.class_weight = self.compute_class_weight(train)
        
        # Rebuild the best model using the best hyperparameters
        best_model = self.create_model(self.best_hp, df = train)
        if best_model is None:
            raise ValueError("Failed to create model")
        
        # Retrain the model and capture the history
        history = best_model.fit(
            train,
            epochs=self.max_epochs,
            validation_data=validation,
            callbacks=self.callbacks,
            verbose=1,
            class_weight=self.class_weight
        )
        if history is None or history.history.get('loss') is None:
            raise RuntimeError("Training failed")
        
        return best_model, history

    def predict(self, df):
        """Return predicted class labels.
        
        Args:
            df: TensorFlow dataset
        
        Returns:
            np.ndarray: Predicted class labels
        """
        if self.model is None:
            raise ValueError("No model has been trained or tuned yet.")

        logits = self.model.predict(df)  # Predict probabilities
        probabilities = tf.nn.sigmoid(logits).numpy()
        return (probabilities > 0.5).astype(int).flatten()  # Convert to binary class labels (0 or 1)
    
    def predict_proba(self, df):
        """Return probability estimates for both classes (negative and positive)"""
        if self.model is None:
            raise ValueError("No model has been trained or tuned yet.")

        logits = self.model.predict(df)  # Predict probabilities
        probabilities = tf.nn.sigmoid(logits).numpy()
        return np.hstack([1 - probabilities, probabilities])  # Class 0 and Class 1 probabilities

#### Building model on complete data

Instantiate the model wrapper, define the parameters for grid tuning, and conduct RandomizedSearchCV to tune hyperparameters

In [11]:
preprocess = Preprocessor()

# Encoding data
preprocess.fit(complete_final_df)
data = preprocess.transform(complete_final_df)

# Vectorization
preprocess.vectorization()

# Splitting train/test
preprocess.train_test_split()

# Creating datasets
train, val = preprocess.Dataset()

2024-12-08 18:24:16.196775: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31027 MB memory:  -> device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:af:00.0, compute capability: 7.0


In [12]:
# Initiate Wrapper
wrapper = KerasModelWrapper()
# Fine tune the model with training and validation set
best_hp, best_model = wrapper.tune(train, val, project_name= 'lstm_tuned_complete')

Trial 25 Complete [00h 01m 44s]
val_auprc: 0.2839135527610779

Best val_auprc So Far: 0.6756983399391174
Total elapsed time: 00h 24m 25s


  saveable.load_own_variables(weights_store.get(inner_path))


In [13]:
# Print best parameters
print(f"Best Hyperparameters: {best_hp.values}")
wrapper.model.summary()  # Show the best model architecture

Best Hyperparameters: {'units': 224, 'dropout_rate': 0.2, 'learning_rate': 0.01}


In [14]:
# Get model metrics
best_model.evaluate(val, verbose = 0)
print(best_model.get_metrics_result())

{'accuracy': 0.9948979616165161, 'auc': 0.8873117566108704, 'auprc': 0.7859706878662109, 'f1_score': 0.7777776718139648, 'loss': 0.25695544481277466, 'precision': 0.7777777910232544, 'recall': 0.7777777910232544}


**Building model on imputed data**

In [16]:
# Fit the preprocessor on the new dataset
preprocess.fit(imputed_final_df)

# Transform the new dataset
imputed_df = preprocess.transform(imputed_final_df)

# Vectorize the transformed data
preprocess.vectorization()

# Split the data into training and validation sets
preprocess.train_test_split()

# Create the datasets for training and validation
train_imp, val_imp = preprocess.Dataset()

  trunc = np.asarray(trunc, dtype=dtype)


In [17]:
# Initiate Wrapper
imputed_wrapper = KerasModelWrapper()
# Fine tune the model with training and validation set
best_imp_hp, best_imp_model = imputed_wrapper.tune(train_imp, val_imp, project_name= 'lstm_tuned_imputed')

2024-12-08 19:13:40.671013: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Reloading Tuner from models/lstm_tuned_imputed/tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))


In [18]:
# Print best parameters
print(f"Best Hyperparameters: {best_imp_hp.values}")
imputed_wrapper.model.summary()  # Show the best model architecture

Best Hyperparameters: {'units': 16, 'dropout_rate': 0.4, 'learning_rate': 0.01}


In [24]:
# Get model metrics
best_imp_model.evaluate(val, verbose = 0)
print(best_imp_model.get_metrics_result())

{'accuracy': 0.9910714030265808, 'auc': 0.8822222352027893, 'auprc': 0.44508033990859985, 'f1_score': 0.5882352590560913, 'loss': 0.4046884775161743, 'precision': 0.625, 'recall': 0.5555555820465088}


In [68]:
class TemporalPreprocessor(Preprocessor):
    def __init__(self, window_size=12, prediction_horizon=24, pad_length=156, stride = 6, seed=42):
        # Validate inputs first
        if not isinstance(window_size, int) or window_size <= 0:
            raise ValueError("window_size must be positive integer")
        if not isinstance(prediction_horizon, int) or prediction_horizon <= 0:
            raise ValueError("prediction_horizon must be positive integer")
            
        # Initialize parent
        super().__init__(pad_length=pad_length, seed=seed)
        
        # Add temporal attributes
        self.time_columns = ['deathtime']
        self.window_size = int(window_size)
        self.prediction_horizon = int(prediction_horizon)
        self._reset_state()
        self.stride = int(stride)

    def _reset_state(self):
        """Reset internal state"""
        self.sequence_data = None
        self.sequence_labels = None

    def fit(self, df):
        """Validate columns and fit preprocessor"""
        # Get available columns
        available_cols = df.columns.tolist()
        
        # Update column lists to only include available columns
        self.categorical_columns = [col for col in self.categorical_columns 
                                  if col in available_cols]
        self.numerical_columns = [col for col in self.numerical_columns 
                                if col in available_cols]
        self.binary_columns = [col for col in self.binary_columns 
                             if col in available_cols]
        
        # Validate essential columns
        essential_cols = ['stay_id', 'deathtime'] + self.outcome
        missing = [col for col in essential_cols if col not in available_cols]
        if missing:
            raise ValueError(f"Missing essential columns: {missing}")
            
        return super().fit(df)

    def transform(self, df):
        """Transform with available columns only"""
        if not hasattr(self, 'label_encoders'):
            raise ValueError("Must call fit() before transform()")
            
        # Convert deathtime
        df['deathtime'] = pd.to_datetime(df['deathtime'], errors='coerce')
        df['time_bucket'] = pd.to_datetime(df['time_bucket'])
        
        # Transform only available columns
        parts = []
        
        # Stay ID
        parts.append(df[['stay_id']].copy())
        
        # Categorical
        if self.categorical_columns:
            transformed_cat = df[self.categorical_columns].copy()
            for col in self.categorical_columns:
                transformed_cat[col] = self.label_encoders[col].transform(
                    df[col].astype(str))
            parts.append(transformed_cat)
            
        # Numerical
        if self.numerical_columns:
            transformed_num = pd.DataFrame(
                self.numerical_scaler.transform(df[self.numerical_columns]),
                columns=self.numerical_columns,
                index=df.index
            )
            parts.append(transformed_num)
            
        # Binary
        if self.binary_columns:
            parts.append(df[self.binary_columns].astype(int))
            
        # Outcome
        parts.append(df[self.outcome].astype(int))
        
        # Deathtime & time bucket
        parts.append(df[['deathtime']])
        parts.append(df[['time_bucket']])

        # Combine all parts
        transformed_data = pd.concat(parts, axis=1)
        
        return transformed_data   

    def create_temporal_windows(self, df):
        """Create sliding windows with labels for temporal prediction using vectorized operations"""
        # Pre-filter stays with sufficient data
        stay_counts = df.groupby('stay_id').size()
        valid_stays = stay_counts[stay_counts >= self.window_size].index
        df_filtered = df[df['stay_id'].isin(valid_stays)]
        
        # Create window indices using numpy operations
        stays = df_filtered.groupby('stay_id')
        
        windows_list = []
        labels_list = []
        
        for stay_id, stay_data in stays:
            # Convert to numpy for faster operations
            values = stay_data[self.numerical_columns + 
                             self.categorical_columns + 
                             self.binary_columns].values
            
            # Create strided array view for windows
            n_windows = (len(stay_data) - self.window_size) // self.stride + 1
            
            # Create strided view of data
            strided_shape = (n_windows, self.window_size, values.shape[1])
            s0, s1 = values.strides[:2]
            
            strided_data = np.lib.stride_tricks.as_strided(
                values,
                shape=strided_shape,
                strides=(self.stride * s0, s0, s1),
                writeable=False
            )

            # Calculate labels vectorized
            window_end_indices = np.arange(self.window_size-1, 
                                         len(stay_data), 
                                         self.stride)[:n_windows]
            
            window_end_times = stay_data['time_bucket'].iloc[window_end_indices].values
            target_times = window_end_times + pd.Timedelta(hours=self.prediction_horizon)
            death_time = stay_data['deathtime'].iloc[0]
            
            if pd.isna(death_time):
                labels = np.zeros(n_windows)
            else:
                labels = (death_time <= target_times).astype(int)
            
            windows_list.append(strided_data)
            labels_list.append(labels)
        
        if not windows_list:
            raise ValueError("No valid windows could be created")
        
        return np.vstack(windows_list), np.concatenate(labels_list)
        
    def transform_temporal(self, df):
        """Transform data with temporal awareness"""
        self._reset_state()
        # Basic transformation
        transformed_data = self.transform(df)  # Use parent's transform
        
        # Create temporal windows
        self.sequence_data, self.sequence_labels = self.create_temporal_windows(transformed_data)
        
        return self

    def prepare_realtime_window(self, current_data):
        """Prepare the most recent window for real-time prediction
        
        Args:
            current_data: DataFrame with current patient data
            
        Returns:
            np.ndarray: Formatted window ready for model input
        """
        # Validate input
        if len(current_data) < self.window_size:
            raise ValueError(f"Not enough data points. Need {self.window_size}, got {len(current_data)}")

        # Transform current data
        transformed = self.transform(current_data)
        
        # Get last window_size hours
        recent_window = transformed.iloc[-self.window_size:]
        
        # Reshape for model input
        window_data = recent_window[self.numerical_columns + 
                                  self.categorical_columns + 
                                  self.binary_columns].values
        
        window_data = np.expand_dims(window_data, axis=0)        
        
        return window_data

    # Override vectorization to prevent conflicts
    def vectorization(self):
        """Not used in temporal processing"""
        raise NotImplementedError("Use transform_temporal instead")

In [109]:
class TemporalLSTMWrapper(KerasModelWrapper, BaseEstimator, ClassifierMixin):
    def __init__(self, model_type='standard', **kwargs):
        self.model_type = model_type
        super().__init__(**kwargs)
        
    def create_model(self, hp, df):
        """Create model based on type"""
        if self.model_type == 'temporal':
            return self._create_temporal_model(hp, df)
        return self._create_standard_model(hp, df)
    
    def _create_temporal_model(self, hp, df):
        """Temporal LSTM model for sliding windows"""
        hp_units = hp.Int('units', min_value=16, max_value=256, step=16)
        hp_dropout = hp.Choice('dropout_rate', [0.2, 0.3, 0.4, 0.5])
        hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

        # Get temporal input shape
        for X_batch, _ in df.take(1):
            input_shape = X_batch.shape[1:]
            break

        model = keras.Sequential([
            keras.layers.InputLayer(shape=input_shape),
            keras.layers.LSTM(
                hp_units,
                return_sequences=True,  # Required for stacked LSTM
                dropout=hp_dropout,
                kernel_regularizer=keras.regularizers.L1L2(l2=1e-6),
                kernel_initializer=tf.keras.initializers.GlorotUniform(seed=self.seed),
                recurrent_initializer=tf.keras.initializers.GlorotUniform(seed=self.seed)
            ),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(hp_dropout),
            keras.layers.LSTM(
                hp_units//2,
                dropout=hp_dropout,
                kernel_regularizer=keras.regularizers.L1L2(l2=1e-6), 
                kernel_initializer=tf.keras.initializers.GlorotUniform(seed=self.seed),
                recurrent_initializer=tf.keras.initializers.GlorotUniform(seed=self.seed)
            ),    
            keras.layers.BatchNormalization(),
            keras.layers.Dense(1, activation=None)
        ])

        metrics = [
            'accuracy', 
            Precision(name='precision'),
            Recall(name='recall'),
            AUC(name='auc'),
            AUC(name='auprc', curve='PR'),
            F1Score(name='f1_score')
        ]

        model.compile(
            optimizer=Adam(learning_rate=hp_learning_rate),
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=metrics
        )
        return model

In [110]:
# Initialize and fit dataset
temporal_prep = TemporalPreprocessor(window_size=12, prediction_horizon=48, stride= 2)
temporal_prep.fit(complete_final_df)
transformed_data = temporal_prep.transform(complete_final_df)  # Just transform features
transformed = temporal_prep.transform_temporal(complete_final_df)
temporal_prep.train_test_split()
train_rltime, val_rltime = temporal_prep.Dataset()



In [102]:
from scipy import stats

def inspect_dataset_windows(dataset):
    """Inspect sliding windows in dataset including positive event counts"""
    print("\nSliding Window Analysis:")
    
    try:
        total_windows = 0
        total_positives = 0
        
        # Analyze all batches
        for batch_x, batch_y in dataset:
            total_windows += batch_x.shape[0]
            total_positives += tf.reduce_sum(batch_y).numpy()
            
            # Print first batch details
            if total_windows == batch_x.shape[0]:  # First batch
                print(f"\nBatch size: {batch_x.shape[0]}")
                print(f"Window size: {batch_x.shape[1]}")
                print(f"Features per timestep: {batch_x.shape[2]}")
                
                # Show example window
                example = batch_x[0]
                print("\nExample Window:")
                print(f"First timestep: {example[0]}")
                print(f"Last timestep: {example[-1]}")
                print(f"Output: {batch_y[0]}")
        
        # Print overall statistics
        print(f"\nTotal windows: {total_windows}")
        print(f"Positive events: {total_positives}")
        print(f"Event rate: {(total_positives/total_windows)*100:.2f}%")
        
    except Exception as e:
        print(f"Error inspecting dataset: {str(e)}")

def analyze_window_distribution(df, window_size, stride=6):
    """Analyze distribution of windows and events per patient"""
    try:
        windows_per_patient = {}
        events_per_patient = {}
        
        for stay_id, stay_data in df.groupby('stay_id'):
            n_windows = (len(stay_data) - window_size) // stride + 1
            if n_windows > 0:
                windows_per_patient[stay_id] = n_windows
                events_per_patient[stay_id] = stay_data['mortality'].iloc[-1]
        
        windows = np.array(list(windows_per_patient.values()))
        events = np.array(list(events_per_patient.values()))
        
        # Calculate statistics
        stats_summary = {
            'min_windows': np.min(windows),
            'max_windows': np.max(windows),
            'mean_windows': np.mean(windows),
            'std_windows': np.std(windows),
            'total_events': np.sum(events),
            'event_rate': np.mean(events) * 100
        }
        
        print("\nDistribution Summary:")
        for stat, value in stats_summary.items():
            print(f"{stat}: {value:.2f}")
        
        print("\nWindow Quantiles:")
        quantiles = np.percentile(windows, [10, 25, 50, 75, 90, 99])
        for q, v in zip([10, 25, 50, 75, 90, 99], quantiles):
            print(f"{q}th percentile: {v:.2f}")
            
        return windows_per_patient, events_per_patient
        
    except Exception as e:
        print(f"Error analyzing distribution: {str(e)}")
        return None, None

def run_window_analysis(train_ds, val_ds, transformed_data, window_size=12):
    """Run complete window analysis for both datasets"""
    print("Training Dataset:")
    inspect_dataset_windows(train_ds)
    windows_train, events_train = analyze_window_distribution(transformed_data, window_size)
    
    print("\nValidation Dataset:")
    inspect_dataset_windows(val_ds)
    windows_val, events_val = analyze_window_distribution(transformed_data, window_size)
    
    return (windows_train, events_train), (windows_val, events_val)

In [107]:
windows_train, windows_val = run_window_analysis(train_rltime, val_rltime, transformed_data)

Training Dataset:

Sliding Window Analysis:

Batch size: 64
Window size: 12
Features per timestep: 21

Example Window:
First timestep: [ 0.48349178 -0.42317584 -0.28042401 -0.73267301 -0.87649828  0.27651707
  0.59770333  0.32879551 -0.308316   -0.31986234 -0.17836889 -0.11762086
 -0.212277   -0.08384237  1.56760226 -0.05252904  1.          1.
  5.          1.          0.        ]
Last timestep: [ 0.48349178 -0.28110242  0.53957507 -0.61035676 -0.87649828  0.27651707
  0.59770333  0.32879551 -0.12934513 -0.24528047 -0.17836889 -0.11762086
 -0.212277   -0.08384237  5.40794587 -0.05252904  1.          1.
  5.          1.          0.        ]
Output: 0.0

Total windows: 102925
Positive events: 969.0
Event rate: 0.94%

Distribution Summary:
min_windows: 1.00
max_windows: 333.00
mean_windows: 11.29
std_windows: 18.71
total_events: 57.00
event_rate: 1.46

Window Quantiles:
10th percentile: 3.00
25th percentile: 3.00
50th percentile: 6.00
75th percentile: 11.00
90th percentile: 22.50
99th per

In [111]:
# Create temporal model
sliding_model = TemporalLSTMWrapper(model_type= 'temporal', max_epochs = 30)
sliding_model.tune(train_rltime, val_rltime, project_name= 'sliding_lstm')

Trial 25 Complete [00h 03m 11s]
val_auprc: 0.053677454590797424

Best val_auprc So Far: 0.13953542709350586
Total elapsed time: 00h 45m 47s


  saveable.load_own_variables(weights_store.get(inner_path))


(<keras_tuner.src.engine.hyperparameters.hyperparameters.HyperParameters at 0x7f74e6bc5f30>,
 <Sequential name=sequential, built=True>)

In [None]:
class ProgressiveTemporalPreprocessor(TemporalPreprocessor):
    def create_progressive_windows(self, df, max_hours=100):
        """Create windows of increasing size from admission"""
        windows = []
        labels = []
        
        for stay_id, stay_data in df.groupby('stay_id'):
            # Get admission time
            admission_time = stay_data.index[0]
            death_time = stay_data['deathtime'].iloc[0]
            
            # For each hour threshold
            for hour in range(1, max_hours + 1):
                cutoff_time = admission_time + pd.Timedelta(hours=hour)
                window_data = stay_data[stay_data.index <= cutoff_time]
                
                if len(window_data) > 0:
                    # Get features
                    features = window_data[self.numerical_columns + 
                                        self.categorical_columns + 
                                        self.binary_columns].values
                    
                    # Determine outcome
                    label = 1 if (pd.notna(death_time) and death_time <= cutoff_time) else 0
                    
                    windows.append(features)
                    labels.append(label)
        
        return np.array(windows), np.array(labels)

def evaluate_progressive_prediction(model, preprocessor, test_df, max_hours=100):
    """Evaluate model performance over increasing time windows"""
    aucs = []
    
    for hour in range(1, max_hours + 1):
        # Get data up to hour
        X, y = preprocessor.create_progressive_windows(test_df, max_hours=hour)
        
        # Get predictions
        y_pred = model.predict(X)
        
        # Calculate AUC
        auc = roc_auc_score(y, y_pred)
        aucs.append(auc)
    
    return aucs