In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.callbacks import EarlyStopping

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Input



#from sklearn.preprocessing import OneHotEncoder, StandardScaler
#from sklearn.compose import ColumnTransformer
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Dropout



#from keras.layers import Dense, Dropout


In [None]:
# Read the MIMIC and eICU CSV exports.
# Forward slashes are used on purpose: they resolve on Windows, Linux and macOS,
# whereas the previous "\\" separators only worked on Windows.
mimic_df = pd.read_csv("CSV/exports/final/mimic_mean_final.csv")
eicu_df = pd.read_csv("CSV/exports/final/eicu_mean_final.csv")

In [None]:
# Stack the MIMIC rows on top of the eICU rows and renumber the index from 0,
# so both sources can be one-hot encoded together with identical columns.
df_combined = pd.concat((mimic_df, eicu_df), axis=0, ignore_index=True)

In [None]:
# Collect every object- or category-typed column of the combined frame
categorical_columns = list(
    df_combined.select_dtypes(include=['object', 'category']).columns
)

# One-hot encode exactly those columns; all other columns are left untouched
df_encoded = pd.get_dummies(data=df_combined, columns=categorical_columns)

In [None]:
# Split the encoded frame back into its MIMIC and eICU parts.
# pd.concat placed the MIMIC rows first and get_dummies keeps row order and
# count, so the boundary is simply the number of MIMIC rows. It is taken from
# the frame itself instead of a hard-coded 55792, so the split stays correct
# when the export changes size.
n_mimic_rows = len(mimic_df)

# .copy() gives independent frames: later in-place fills on mimic_df / eicu_df
# neither write through to df_encoded nor trigger SettingWithCopyWarning.
mimic_df = df_encoded.iloc[:n_mimic_rows, :].copy()  # rows 0 .. n_mimic_rows - 1
eicu_df = df_encoded.iloc[n_mimic_rows:, :].copy()   # rows n_mimic_rows .. end

In [None]:
# Keys identifying one admission, the prediction target, and every column that
# is excluded from the model inputs. Defined once so all splits are guaranteed
# to use the same feature set.
ADMISSION_KEYS = ['subject_id', 'hadm_id']
TARGET_COLUMN = 'hospital_expire_flag'
NON_FEATURE_COLUMNS = ['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count', 'Time_Zone']


def split_features_target(frame):
    """Split ``frame`` into model inputs and target.

    Returns a tuple ``(X, y)`` where ``X`` is ``frame`` without
    ``NON_FEATURE_COLUMNS`` and ``y`` is the ``TARGET_COLUMN`` series.
    """
    return frame.drop(columns=NON_FEATURE_COLUMNS), frame[TARGET_COLUMN]


# One row per (subject_id, hadm_id) pair, so all rows of an admission land in the same split.
# NOTE(review): the split unit is the admission, not the subject - two admissions
# of the same subject_id can end up in different splits. Confirm this is intended.
unique_patients = mimic_df[ADMISSION_KEYS].drop_duplicates()

# Split the unique admissions into train, validation, and test sets
train_patients, test_patients = train_test_split(unique_patients, test_size=0.10, random_state=42)
train_patients, validate_patients = train_test_split(train_patients, test_size=0.11, random_state=42)  # 0.11 * 90% ~= 10%

# Merge the selected admissions back with the original data to get the full records
train_set = mimic_df.merge(train_patients, on=ADMISSION_KEYS)
validate_set = mimic_df.merge(validate_patients, on=ADMISSION_KEYS)
test_set = mimic_df.merge(test_patients, on=ADMISSION_KEYS)

# External validation from eICU
X_external, y_external = split_features_target(eicu_df)

# Separate features and target for the training, validation, and test sets
X_train, y_train = split_features_target(train_set)
X_validate, y_validate = split_features_target(validate_set)
X_test, y_test = split_features_target(test_set)

In [None]:
# Show the training feature matrix
display(X_train)

# Data Preprocessing

- Fill empty cells (missing values) in the training set

- StandardScaler, a popular preprocessing technique, offers a simple yet effective method for standardizing feature values.

In [None]:
# Fill missing feature values with one small regression ANN per column.
#
# For each training column that contains NaNs, a regressor is fitted on the
# training rows where that column is observed. Only the columns that are
# complete in the training set are used as predictors. The fitted scaler and
# regressor then fill the gaps in the training set AND in the validation, test
# and external sets (in place). Without this, the held-out splits keep their
# NaNs, StandardScaler passes them through, and the classifier is evaluated on
# NaN inputs. Nothing is fitted on held-out data.


def build_imputation_model(n_features):
    """Return a compiled regression ANN for predicting one missing column.

    Architecture: ``n_features`` inputs -> Dense(64, relu) -> Dropout(0.3)
    -> Dense(32, relu) -> Dropout(0.3) -> Dense(1, linear), trained with
    Adam on mean squared error.
    """
    regressor = Sequential()
    regressor.add(Input(shape=(n_features,)))  # Use Input layer instead of input_shape in Dense
    regressor.add(Dense(units=64, activation='relu'))
    regressor.add(Dropout(0.3))
    regressor.add(Dense(units=32, activation='relu'))
    regressor.add(Dropout(0.3))
    regressor.add(Dense(units=1, activation='linear'))  # Linear activation for regression tasks
    regressor.compile(optimizer='adam', loss='mean_squared_error')
    return regressor


# Step 1: Identify columns with missing values in X_train
missing_columns = X_train.columns[X_train.isnull().any()].tolist()
print(f"Columns with missing values: {missing_columns}")

# Every split below is filled in place using imputers learned from X_train only
splits_to_fill = {
    'train': X_train,
    'validate': X_validate,
    'test': X_test,
    'external': X_external,
}

# Step 2: Loop through each column with missing values and build an ANN to predict missing values
for col in missing_columns:
    print(f"Filling missing values in column: {col}")

    # Training rows where the current column is observed
    observed_mask = X_train[col].notnull()

    # Skip the column if there is nothing to fill or nothing to learn from
    if observed_mask.all() or not observed_mask.any():
        print(f"Skipping {col}, insufficient data")
        continue

    # Predictors exclude every column that has gaps in the training set
    X_train_missing = X_train.loc[observed_mask].drop(columns=missing_columns)
    y_train_missing = X_train.loc[observed_mask, col]  # Target is the column we're filling

    # Step 3: Preprocess the data (Standard Scaling), fitted on observed training rows only
    scaler = StandardScaler()
    X_train_missing_scaled = scaler.fit_transform(X_train_missing)

    # Step 4/5: Build and train the ANN model for this column
    model_missing = build_imputation_model(X_train_missing_scaled.shape[1])
    model_missing.fit(X_train_missing_scaled, y_train_missing, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

    # Step 6/7: Predict and fill the gaps of this column in every split
    for split_name, split_frame in splits_to_fill.items():
        gap_mask = split_frame[col].isnull()
        if not gap_mask.any():
            continue

        X_test_missing = split_frame.loc[gap_mask].drop(columns=missing_columns)
        X_test_missing_scaled = scaler.transform(X_test_missing)
        predicted_values = model_missing.predict(X_test_missing_scaled, verbose=0)

        # Keras returns shape (n, 1); flatten to 1-D so the assignment does not
        # depend on how pandas handles 2-D values for a single column.
        split_frame.loc[gap_mask, col] = predicted_values.ravel()

    print(f"Filled missing values in column: {col}")

# Verify if there are any remaining missing values in X_train
print(X_train.isnull().sum())

# Report what is left in every split. Gaps can remain when a held-out split has
# NaNs in a column that is complete in X_train (such a column is never imputed,
# and as a predictor it yields NaN predictions for the affected rows).
for split_name, split_frame in splits_to_fill.items():
    print(f"Remaining missing values in {split_name} set: {int(split_frame.isnull().sum().sum())}")

In [None]:
# Learn the per-feature mean and standard deviation from the training features
# only, then standardise the training matrix with them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Re-use the training statistics for the validation, test and external
# (eICU) feature matrices so that nothing is fitted on held-out data.
X_validate_scaled, X_test_scaled, X_external_scaled = (
    scaler.transform(features) for features in (X_validate, X_test, X_external)
)

# ANN

In [None]:
# Binary classifier: three ReLU hidden layers (64 -> 32 -> 16 units), each
# followed by 30% dropout, and a single sigmoid output unit.
model = Sequential([
    # One input per scaled training feature
    Input(shape=(X_train_scaled.shape[1],)),

    # First hidden layer
    Dense(units=64, activation='relu'),
    Dropout(0.3),

    # Second hidden layer
    Dense(units=32, activation='relu'),
    Dropout(0.3),

    # Third hidden layer
    Dense(units=16, activation='relu'),
    Dropout(0.3),

    # Output layer for binary classification
    Dense(units=1, activation='sigmoid'),
])


## Compiling the Model

In [None]:
# Adam optimizer with binary cross-entropy loss; accuracy is tracked as a metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Training the ANN

EarlyStopping prevents overfitting by stopping training when the validation loss stops improving.

In [None]:
from sklearn.utils import class_weight

# Handle class imbalance with class weighting: during training, penalize the
# misclassification of the minority class (non-survivors) more heavily.

# Calculate one 'balanced' weight per label present in y_train
class_labels = np.unique(y_train)
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=class_labels,
                                                  y=y_train)

# Pair every label with its own weight instead of assuming that positions 0 and
# 1 of the weight array correspond to labels 0 and 1 (and that both exist).
class_weight_map = {int(label): float(weight) for label, weight in zip(class_labels, class_weights)}

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with class weights
history = model.fit(X_train_scaled, y_train,
                    validation_data=(X_validate_scaled, y_validate),
                    epochs=100, batch_size=32,
                    callbacks=[early_stopping],
                    class_weight=class_weight_map)

In [None]:
# NOTE(review): this fits the same `model` object as the class-weighted cell,
# but without `class_weight`, and it rebinds `history`. When both cells are
# run, training continues from the weights left by the previous fit and the
# earlier history is lost. Confirm that running both is intended.

# Stop once the validation loss has not improved for 10 epochs and roll back
# to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Unweighted training run
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    validation_data=(X_validate_scaled, y_validate),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
)

## Evaluating the Model

In [None]:
# Loss and accuracy on the held-out MIMIC test split
test_loss, test_acc = model.evaluate(x=X_test_scaled, y=y_test)
print(f"Test Accuracy: {test_acc * 100:.2f}%")

# Loss and accuracy on the external (eICU) validation set
external_loss, external_acc = model.evaluate(x=X_external_scaled, y=y_external)
print(f"External Validation Accuracy: {external_acc * 100:.2f}%")

In [None]:
# Disabled variant using the default 0.5 decision threshold, kept for reference:
#y_pred_test = (model.predict(X_test_scaled) > 0.5).astype("int32")
#print(classification_report(y_test, y_pred_test, zero_division=0))
#print(confusion_matrix(y_test, y_pred_test))

# Keep the raw sigmoid outputs so the decision threshold can be chosen freely
y_pred_prob = model.predict(X_test_scaled)

# Label a sample positive when its predicted probability exceeds 0.3
y_pred_adjusted = np.greater(y_pred_prob, 0.3).astype("int32")

# Classification report (undefined precision reported as 0) followed by the
# confusion matrix, both for the adjusted threshold
print(
    classification_report(y_test, y_pred_adjusted, zero_division=0),
    confusion_matrix(y_test, y_pred_adjusted),
    sep="\n",
)


# Fill missing values

In [None]:
# Separate rows with and without missing values in the 'Lactate' column (as an example)
lactate_is_missing = mimic_df['Lactate'].isnull()
missing_rows = mimic_df[lactate_is_missing]
non_missing_rows = mimic_df[~lactate_is_missing]

# Candidate predictors: everything except the column being filled, the
# identifier/bookkeeping columns and the outcome columns.
# NOTE(review): 'Time_Zone' is dropped from the classifier features elsewhere in
# this notebook but is kept as a predictor here - confirm this is intended.
excluded_columns = ['Lactate', 'subject_id', 'hadm_id', 'row_count', 'hospital_expire_flag', 'los']
candidate_features = mimic_df.drop(columns=excluded_columns)

# Use only predictors without any missing value. StandardScaler keeps NaNs in
# its output, and NaN inputs make the regression loss (and every prediction)
# NaN. This matches the per-column imputation loop, which also excludes
# incomplete columns from its predictors.
complete_feature_columns = candidate_features.columns[candidate_features.notnull().all()].tolist()

# Features and target for the rows where Lactate is observed
X_train_missing = non_missing_rows[complete_feature_columns]  # Features
y_train_missing = non_missing_rows['Lactate']  # Target (Lactate)

# Features for the rows with missing values (we'll predict Lactate for these)
X_test_missing = missing_rows[complete_feature_columns]


In [None]:
# Inspect the feature rows whose 'Lactate' value is about to be predicted
display(X_test_missing)

In [None]:
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input

# Scale the features (statistics are learned from the rows with observed Lactate only)
# NOTE(review): this rebinds the global `scaler` that the classifier cells fit
# on X_train; re-running those transform cells after this one would use the
# wrong scaler.
scaler = StandardScaler()
X_train_missing_scaled = scaler.fit_transform(X_train_missing)
X_test_missing_scaled = scaler.transform(X_test_missing)

# Build the ANN model
model_missing = Sequential()

# Explicit Input layer instead of the deprecated `input_shape=` argument on
# Dense, consistent with the other models defined in this notebook
model_missing.add(Input(shape=(X_train_missing_scaled.shape[1],)))

# First hidden layer
model_missing.add(Dense(units=64, activation='relu'))
model_missing.add(Dropout(0.3))

# Second hidden layer
model_missing.add(Dense(units=32, activation='relu'))
model_missing.add(Dropout(0.3))

# Output layer (predict Lactate)
model_missing.add(Dense(units=1, activation='linear'))  # Linear activation for regression

# Compile the model
model_missing.compile(optimizer='adam', loss='mean_squared_error')

# Train the model; the last 10% of the rows are held out by Keras for validation
model_missing.fit(X_train_missing_scaled, y_train_missing, epochs=50, batch_size=32, validation_split=0.1)


In [None]:
# Predict the missing Lactate values (Keras returns an array of shape (n, 1))
predicted_lactate = model_missing.predict(X_test_missing_scaled)

# Fill the missing values back into mimic_df.
# The predictions are flattened to 1-D so that assigning them to a single
# column does not depend on how pandas handles 2-D values in `.loc`.
mimic_df.loc[mimic_df['Lactate'].isnull(), 'Lactate'] = predicted_lactate.ravel()


In [None]:
# Show the first 90 rows of mimic_df to check the filled 'Lactate' values
display(mimic_df.head(90))