# Import

In [None]:
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid, train_test_split

In [None]:
# Read the MIMIC CSV file.
# Forward slashes are used (as in the other cells of this notebook) because
# backslash-separated relative paths only resolve on Windows.
mimic_df = pd.read_csv("CSV/exports/final/mimic_mean_median_min_max_final.csv")

# Read the eICU CSV file.
eicu_df = pd.read_csv("CSV/exports/final/eicu_mean_median_min_max_final.csv")

In [None]:
# Upper bound (exclusive) on the 'los' column.
day = 10

# Keep only the MIMIC rows whose ICU stay is shorter than `day`.
mimic_short_stay = mimic_df['los'] < day
mimic_df = mimic_df.loc[mimic_short_stay]

# Apply the same cut-off to the eICU rows.
eicu_short_stay = eicu_df['los'] < day
eicu_df = eicu_df.loc[eicu_short_stay]

In [None]:
"""
I'm gonna concat and split the mimic and icu
at this point. I must create the same columns
from the tranformation of categorical data.
"""
# Number of MIMIC rows; used below to cut the combined frame back in two.
row_count = len(mimic_df)
print(f"Row count: {row_count}")

# Stack MIMIC on top of eICU with a fresh 0..n-1 index so that positional
# slicing can separate them again after encoding.
df_combined = pd.concat((mimic_df, eicu_df), axis=0, ignore_index=True)

# Every object/category column of the combined frame gets one-hot encoded.
categorical_columns = list(
    df_combined.select_dtypes(include=['object', 'category']).columns
)
df_encoded = pd.get_dummies(df_combined, columns=categorical_columns)

# First `row_count` rows are MIMIC, the remainder is eICU.
mimic_df = df_encoded.iloc[:row_count]
eicu_df = df_encoded.iloc[row_count:]

In [None]:
# Fraction of patients held out of training, and how that hold-out is divided
# between validation and test.
total_test_val_perc = 0.2
split_between_test_val_perc = 0.5

# A patient is identified by the (subject_id, hadm_id) pair.
patient_keys = ['subject_id', 'hadm_id']

# One row per patient, carrying the first hospital_expire_flag of the group.
grouped_df = mimic_df.groupby(patient_keys)
patient_df = grouped_df['hospital_expire_flag'].first().reset_index()

# Split patients (not rows) into train / validation / test, stratified on
# hospital_expire_flag so every split keeps the same outcome ratio.
train, temp = train_test_split(
    patient_df,
    test_size=total_test_val_perc,
    stratify=patient_df['hospital_expire_flag'],
    random_state=42,
)
val, test = train_test_split(
    temp,
    test_size=split_between_test_val_perc,
    stratify=temp['hospital_expire_flag'],
    random_state=42,
)

# Pull all rows of each split's patients back out of the full MIMIC frame.
train_df, val_df, test_df = (
    mimic_df.merge(patients[patient_keys], on=patient_keys, how='inner')
    for patients in (train, val, test)
)

# Check the sizes of the splits
print(f'Training set size: {train_df.shape[0]}')
print(f'Validation set size: {val_df.shape[0]}')
print(f'Test set size: {test_df.shape[0]}')

In [None]:
# Columns removed from every feature matrix (the target 'los' among them).
non_feature_columns = ['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count', 'Time_Zone']

# External validation set from eICU: features and 'los' target.
y_external = eicu_df['los']
X_external = eicu_df.drop(columns=non_feature_columns)

# Training set.
y_train = train_df['los']
X_train = train_df.drop(columns=non_feature_columns)

# Validation set.
y_validate = val_df['los']
X_validate = val_df.drop(columns=non_feature_columns)

# Test set.
y_test = test_df['los']
X_test = test_df.drop(columns=non_feature_columns)

# Load Train - Validation - Test & External Validation Sets

In [None]:
# Sub-directory of CSV/exports/impute/ that the splits are read from.
subfolder = "o2_interpolation_impute"


def _read_split(name):
    """Read ``<name>.csv`` from the selected impute sub-directory."""
    return pd.read_csv(f"CSV/exports/impute/{subfolder}/{name}.csv")


# Load every split; these assignments replace any X_*/y_* variables that an
# earlier cell may have defined.
X_external = _read_split("X_external")
y_external = _read_split("y_external")
X_train = _read_split("X_train")
y_train = _read_split("y_train")
X_validate = _read_split("X_validate")
y_validate = _read_split("y_validate")
X_test = _read_split("X_test")
y_test = _read_split("y_test")

# o01 Simple GAN

In [None]:
# Custom Dataset class for PyTorch
def prepare_dataset(X):
    """Wrap an array in a map-style PyTorch ``Dataset`` of float32 rows.

    Args:
        X: Array-like exposing ``astype`` (e.g. a NumPy array); one sample
            per entry along the first axis.

    Returns:
        A ``CustomDataset`` whose ``data`` attribute is a float32 tensor and
        whose items are ``data[idx]``.
    """
    class CustomDataset(Dataset):
        """Dataset backed by a single float32 tensor."""

        def __init__(self, data):
            # Cast on the NumPy side first so object-dtype arrays are
            # converted to numbers before the tensor is built.
            as_float32 = data.astype(np.float32)
            self.data = torch.tensor(as_float32, dtype=torch.float32)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]

    dataset = CustomDataset(X)
    return dataset

# Define Generator model
class Generator(nn.Module):
    """Fully connected network mapping ``input_dim`` features to ``input_dim`` outputs.

    Three hidden layers of width 128 with ReLU activations; the output layer
    is linear (no activation).
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_dim = 128
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack."""
        output = self.main(x)
        return output

# Define Discriminator model
class Discriminator(nn.Module):
    """Fully connected network scoring each row with one value in (0, 1).

    Three hidden layers of width 128 with ReLU activations, followed by a
    single-unit linear layer and a sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_dim = 128
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid score for every row of ``x``."""
        score = self.main(x)
        return score

# GAN Imputation Function with Early Stopping
def impute_missing_values_with_gan(X, epochs=1000, batch_size=64, learning_rate=0.0002, patience=10):
    """Fill the NaN cells of ``X`` with values predicted by a GAN generator.

    A fresh ``Generator``/``Discriminator`` pair is trained on the zero-filled
    data. While training, cells that were originally missing are copied from
    the input into the generator output, so the adversarial loss only judges
    the generator on cells that were actually observed.

    Args:
        X: DataFrame of numeric features; NaN marks a missing cell. It is not
            modified.
        epochs: Maximum number of training epochs.
        batch_size: Number of rows per mini-batch.
        learning_rate: Adam learning rate used for both networks.
        patience: Stop once this many consecutive epochs fail to improve on
            the best generator loss seen so far.

    Returns:
        A float32 DataFrame with the columns of ``X`` and a default
        RangeIndex. Observed cells keep their original values; cells that
        were NaN hold the generator's output. If ``X`` has no missing cell
        (including an empty frame) no training is performed.
    """
    # Remember which cells are missing, then zero-fill so the networks only
    # ever see finite numbers.
    mask = X.isna()
    X_missing = X.fillna(0)

    data = torch.tensor(X_missing.values.astype(np.float32), dtype=torch.float32)
    missing = torch.tensor(mask.values, dtype=torch.bool)

    # Nothing to impute: skip training. This also guards the empty-frame
    # case, where the training loop would never define a loss to report.
    if not bool(missing.any()):
        return pd.DataFrame(data.numpy(), columns=X.columns)

    n_rows, input_dim = data.shape

    # Initialize Generator and Discriminator
    generator = Generator(input_dim)
    discriminator = Discriminator(input_dim)

    # Optimizers
    optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
    optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

    # Loss function
    adversarial_loss = nn.BCELoss()

    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        # Shuffle row *indices* so that every batch can look up the mask rows
        # belonging to exactly the rows it contains.
        for batch_rows in torch.randperm(n_rows).split(batch_size):
            real_data = data[batch_rows]
            batch_missing = missing[batch_rows]

            # Adversarial ground truths
            valid = torch.ones(real_data.size(0), 1)
            fake = torch.zeros(real_data.size(0), 1)

            # Train Generator
            optimizer_G.zero_grad()

            # Originally-missing cells take the input's value, so generator
            # and real rows agree there and only observed cells drive the loss.
            gen_data = torch.where(batch_missing, real_data, generator(real_data))

            # Generator loss
            g_loss = adversarial_loss(discriminator(gen_data), valid)
            g_loss.backward()
            optimizer_G.step()

            # Train Discriminator
            optimizer_D.zero_grad()

            # Real and fake losses
            real_loss = adversarial_loss(discriminator(real_data), valid)
            fake_loss = adversarial_loss(discriminator(gen_data.detach()), fake)
            d_loss = (real_loss + fake_loss) / 2

            d_loss.backward()
            optimizer_D.step()

        # Display progress (losses of the last mini-batch of the epoch)
        print(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

        # Early stopping.
        # NOTE(review): this tracks the last mini-batch's generator loss, which
        # is a noisy signal; an epoch average may be a steadier criterion.
        if g_loss.item() < best_loss:
            best_loss = g_loss.item()
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1} - Generator Loss: {g_loss.item()}")
            break

    # Impute: keep every observed value and take the generator's output only
    # where the input was missing.
    with torch.no_grad():
        generated = generator(data)
    X_imputed = torch.where(missing, generated, data).numpy()

    return pd.DataFrame(X_imputed, columns=X.columns)

# Run the single-pass GAN imputation on each split; every call trains its own
# generator/discriminator pair on that split alone.
X_external_imputed = impute_missing_values_with_gan(X=X_external)
X_train_imputed = impute_missing_values_with_gan(X=X_train)
X_validate_imputed = impute_missing_values_with_gan(X=X_validate)
X_test_imputed = impute_missing_values_with_gan(X=X_test)

# o02 Run GAN multiple times

In [None]:
# Custom Dataset class for PyTorch
def prepare_dataset(X):
    """Build a map-style PyTorch ``Dataset`` over the entries of ``X``.

    Args:
        X: Array-like exposing ``astype`` (e.g. a NumPy array).

    Returns:
        A ``CustomDataset`` holding ``X`` as a float32 tensor in ``data``;
        indexing the dataset indexes that tensor.
    """
    class CustomDataset(Dataset):
        """Thin ``Dataset`` wrapper around one float32 tensor."""

        def __init__(self, data):
            # Convert to float32 in NumPy first; this avoids errors when the
            # incoming array has object dtype.
            numeric = data.astype(np.float32)
            self.data = torch.tensor(numeric, dtype=torch.float32)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            sample = self.data[idx]
            return sample

    return CustomDataset(X)

# Define Generator model
class Generator(nn.Module):
    """MLP with three 128-unit ReLU hidden layers and a linear output.

    Input and output both have ``input_dim`` features.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = [input_dim, 128, 128, 128]
        layers = []
        # One Linear + ReLU pair per consecutive pair of widths.
        for in_features, out_features in zip(widths, widths[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        # Final projection back to the feature dimension, no activation.
        layers.append(nn.Linear(widths[-1], input_dim))
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x``."""
        return self.main(x)

# Define Discriminator model
class Discriminator(nn.Module):
    """MLP with three 128-unit ReLU hidden layers and a sigmoid output unit.

    Maps rows of ``input_dim`` features to one score in (0, 1) each.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = [input_dim, 128, 128, 128]
        layers = []
        # One Linear + ReLU pair per consecutive pair of widths.
        for in_features, out_features in zip(widths, widths[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        # Single output unit squashed to (0, 1).
        layers.append(nn.Linear(widths[-1], 1))
        layers.append(nn.Sigmoid())
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Score every row of ``x``."""
        return self.main(x)

# GAN Imputation Function with Multiple Passes and Early Stopping
def impute_missing_values_with_multiple_passes(X, num_passes=3, epochs=1000, batch_size=64, learning_rate=0.0002, patience=10):
    """Impute the NaN cells of ``X`` by repeated GAN training passes.

    Missing cells start at 0. Each pass trains a fresh
    ``Generator``/``Discriminator`` pair on the current estimate and then
    overwrites only the originally-missing cells with the generator's output,
    so later passes train on the refined values.

    Args:
        X: DataFrame of numeric features; NaN marks a missing cell. It is not
            modified.
        num_passes: Number of train-then-refine passes.
        epochs: Maximum number of training epochs per pass.
        batch_size: Number of rows per mini-batch.
        learning_rate: Adam learning rate used for both networks.
        patience: Stop a pass once this many consecutive epochs fail to
            improve on the best generator loss seen in that pass.

    Returns:
        A DataFrame with the index and columns of ``X``. Observed cells are
        unchanged; cells that were NaN hold the imputed values. If ``X`` has
        no missing cell (including an empty frame) no training is performed.
    """
    # Prepare data and mask for missing values
    mask = X.isna()
    X_imputed = X.fillna(0)
    missing_cells = mask.values

    # Nothing to impute: skip training. This also guards the empty-frame
    # case, where the training loop would never define a loss to report.
    if not missing_cells.any():
        return pd.DataFrame(X_imputed, columns=X.columns)

    # The working estimate lives in a tensor of its own. Writing results
    # through ``DataFrame.values`` is unreliable because that attribute can
    # be a copy (or read-only), which silently discards the update.
    current = torch.tensor(X_imputed.values.astype(np.float32), dtype=torch.float32)
    missing = torch.tensor(missing_cells, dtype=torch.bool)
    n_rows, input_dim = current.shape

    # Multiple passes
    for pass_num in range(num_passes):
        print(f"\n--- Imputation Pass {pass_num + 1}/{num_passes} ---\n")

        # Initialize Generator and Discriminator
        generator = Generator(input_dim)
        discriminator = Discriminator(input_dim)

        # Optimizers
        optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
        optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

        # Loss function
        adversarial_loss = nn.BCELoss()

        best_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            # Shuffle row *indices* so that every batch can look up the mask
            # rows belonging to exactly the rows it contains.
            for batch_rows in torch.randperm(n_rows).split(batch_size):
                real_data = current[batch_rows]
                batch_missing = missing[batch_rows]

                # Adversarial ground truths
                valid = torch.ones(real_data.size(0), 1)
                fake = torch.zeros(real_data.size(0), 1)

                # Train Generator
                optimizer_G.zero_grad()

                # Originally-missing cells take the input's value, so only
                # observed cells drive the adversarial loss.
                gen_data = torch.where(batch_missing, real_data, generator(real_data))

                # Generator loss
                g_loss = adversarial_loss(discriminator(gen_data), valid)
                g_loss.backward()
                optimizer_G.step()

                # Train Discriminator
                optimizer_D.zero_grad()

                # Real and fake losses
                real_loss = adversarial_loss(discriminator(real_data), valid)
                fake_loss = adversarial_loss(discriminator(gen_data.detach()), fake)
                d_loss = (real_loss + fake_loss) / 2

                d_loss.backward()
                optimizer_D.step()

            # Display progress (losses of the last mini-batch of the epoch)
            print(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

            # Early stopping
            if g_loss.item() < best_loss:
                best_loss = g_loss.item()
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1} - Generator Loss: {g_loss.item()}")
                break

        # Refine the estimate: only originally-missing cells are updated.
        with torch.no_grad():
            refined_data = generator(current)
        current = torch.where(missing, refined_data, current)

    # Write the imputed cells back column by column, so columns without any
    # missing value keep their original dtype and content.
    imputed_values = current.numpy()
    for col_pos in range(input_dim):
        col_missing = missing_cells[:, col_pos]
        if col_missing.any():
            X_imputed.iloc[col_missing, col_pos] = imputed_values[col_missing, col_pos]

    return pd.DataFrame(X_imputed, columns=X.columns)

# Run the multi-pass GAN imputation on each split; every call trains its own
# networks on that split alone.
X_external_imputed = impute_missing_values_with_multiple_passes(X=X_external)
X_train_imputed = impute_missing_values_with_multiple_passes(X=X_train)
X_validate_imputed = impute_missing_values_with_multiple_passes(X=X_validate)
X_test_imputed = impute_missing_values_with_multiple_passes(X=X_test)

# o03 Run GAN multiple times with more layers

In [None]:
# Custom Dataset class for PyTorch
def prepare_dataset(X):
    """Return a map-style PyTorch ``Dataset`` serving entries of ``X`` as float32.

    Args:
        X: Array-like exposing ``astype`` (e.g. a NumPy array).

    Returns:
        A ``CustomDataset`` whose ``data`` attribute is the float32 tensor
        built from ``X``; item ``idx`` is ``data[idx]``.
    """
    class CustomDataset(Dataset):
        """Tensor-backed dataset; one item per entry along the first axis."""

        def __init__(self, data):
            # Force a numeric dtype before building the tensor so that
            # object-dtype input does not raise.
            float_array = data.astype(np.float32)
            self.data = torch.tensor(float_array, dtype=torch.float32)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]

    wrapped = CustomDataset(X)
    return wrapped

# Define Generator model
class Generator(nn.Module):
    """Wider MLP generator: ``input_dim`` -> 128 -> 256 -> 512 -> 256 -> ``input_dim``.

    Every hidden layer is followed by ReLU; the output layer is linear.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = [input_dim, 128, 256, 512, 256]
        layers = []
        # One Linear + ReLU pair per consecutive pair of widths.
        for in_features, out_features in zip(widths, widths[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        # Final projection back to the feature dimension, no activation.
        layers.append(nn.Linear(widths[-1], input_dim))
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x``."""
        return self.main(x)

# Define Discriminator model
class Discriminator(nn.Module):
    """MLP discriminator producing one sigmoid score per input row.

    Three 128-unit hidden layers with ReLU, then a single-unit linear layer
    followed by a sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_dim = 128
        layers = []
        in_features = input_dim
        # Three identical hidden stages; only the first reads ``input_dim``.
        for _ in range(3):
            layers.extend((nn.Linear(in_features, hidden_dim), nn.ReLU()))
            in_features = hidden_dim
        layers.extend((nn.Linear(hidden_dim, 1), nn.Sigmoid()))
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Score every row of ``x``."""
        return self.main(x)

# GAN Imputation Function with Multiple Passes and Early Stopping
def impute_missing_values_with_multiple_passes(X, num_passes=3, epochs=1000, batch_size=64, learning_rate=0.0002, patience=10):
    """Impute the NaN cells of ``X`` by repeated GAN training passes.

    Missing cells start at 0. Each pass trains a fresh
    ``Generator``/``Discriminator`` pair on the current estimate and then
    overwrites only the originally-missing cells with the generator's output,
    so later passes train on the refined values.

    Args:
        X: DataFrame of numeric features; NaN marks a missing cell. It is not
            modified.
        num_passes: Number of train-then-refine passes.
        epochs: Maximum number of training epochs per pass.
        batch_size: Number of rows per mini-batch.
        learning_rate: Adam learning rate used for both networks.
        patience: Stop a pass once this many consecutive epochs fail to
            improve on the best generator loss seen in that pass.

    Returns:
        A DataFrame with the index and columns of ``X``. Observed cells are
        unchanged; cells that were NaN hold the imputed values. If ``X`` has
        no missing cell (including an empty frame) no training is performed.
    """
    # Prepare data and mask for missing values
    mask = X.isna()
    X_imputed = X.fillna(0)
    missing_cells = mask.values

    # Nothing to impute: skip training. This also guards the empty-frame
    # case, where the training loop would never define a loss to report.
    if not missing_cells.any():
        return pd.DataFrame(X_imputed, columns=X.columns)

    # The working estimate lives in a tensor of its own. Writing results
    # through ``DataFrame.values`` is unreliable because that attribute can
    # be a copy (or read-only), which silently discards the update.
    current = torch.tensor(X_imputed.values.astype(np.float32), dtype=torch.float32)
    missing = torch.tensor(missing_cells, dtype=torch.bool)
    n_rows, input_dim = current.shape

    # Multiple passes
    for pass_num in range(num_passes):
        print(f"\n--- Imputation Pass {pass_num + 1}/{num_passes} ---\n")

        # Initialize Generator and Discriminator
        generator = Generator(input_dim)
        discriminator = Discriminator(input_dim)

        # Optimizers
        optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
        optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

        # Loss function
        adversarial_loss = nn.BCELoss()

        best_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            # Shuffle row *indices* so that every batch can look up the mask
            # rows belonging to exactly the rows it contains.
            for batch_rows in torch.randperm(n_rows).split(batch_size):
                real_data = current[batch_rows]
                batch_missing = missing[batch_rows]

                # Adversarial ground truths
                valid = torch.ones(real_data.size(0), 1)
                fake = torch.zeros(real_data.size(0), 1)

                # Train Generator
                optimizer_G.zero_grad()

                # Originally-missing cells take the input's value, so only
                # observed cells drive the adversarial loss.
                gen_data = torch.where(batch_missing, real_data, generator(real_data))

                # Generator loss
                g_loss = adversarial_loss(discriminator(gen_data), valid)
                g_loss.backward()
                optimizer_G.step()

                # Train Discriminator
                optimizer_D.zero_grad()

                # Real and fake losses
                real_loss = adversarial_loss(discriminator(real_data), valid)
                fake_loss = adversarial_loss(discriminator(gen_data.detach()), fake)
                d_loss = (real_loss + fake_loss) / 2

                d_loss.backward()
                optimizer_D.step()

            # Display progress (losses of the last mini-batch of the epoch)
            print(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

            # Early stopping
            if g_loss.item() < best_loss:
                best_loss = g_loss.item()
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1} - Generator Loss: {g_loss.item()}")
                break

        # Refine the estimate: only originally-missing cells are updated.
        with torch.no_grad():
            refined_data = generator(current)
        current = torch.where(missing, refined_data, current)

    # Write the imputed cells back column by column, so columns without any
    # missing value keep their original dtype and content.
    imputed_values = current.numpy()
    for col_pos in range(input_dim):
        col_missing = missing_cells[:, col_pos]
        if col_missing.any():
            X_imputed.iloc[col_missing, col_pos] = imputed_values[col_missing, col_pos]

    return pd.DataFrame(X_imputed, columns=X.columns)

# Run the multi-pass GAN imputation on each split; every call trains its own
# networks on that split alone.
X_external_imputed = impute_missing_values_with_multiple_passes(X=X_external)
X_train_imputed = impute_missing_values_with_multiple_passes(X=X_train)
X_validate_imputed = impute_missing_values_with_multiple_passes(X=X_validate)
X_test_imputed = impute_missing_values_with_multiple_passes(X=X_test)

# Save dataframes

In [None]:
# Count the NaN cells left in each imputed frame (column sums, then total).
external_total_missing_values = X_external_imputed.isna().sum().sum()
train_total_missing_values = X_train_imputed.isna().sum().sum()
validation_total_missing_values = X_validate_imputed.isna().sum().sum()
test_total_missing_values = X_test_imputed.isna().sum().sum()

# Report the four totals in the same order they were computed.
for total, message in (
    (external_total_missing_values, 'missing values in external dataset\n'),
    (train_total_missing_values, 'missing values in train dataset\n'),
    (validation_total_missing_values, 'missing values in validation dataset\n'),
    (test_total_missing_values, 'missing values in test dataset\n'),
):
    print(total, message)

In [None]:
# Give each original/imputed pair a fresh 0..n-1 index (old index discarded)
# so that the two frames of a pair share the same row labels.
X_external, X_external_imputed = (
    X_external.reset_index(drop=True),
    X_external_imputed.reset_index(drop=True),
)

X_train, X_train_imputed = (
    X_train.reset_index(drop=True),
    X_train_imputed.reset_index(drop=True),
)

X_validate, X_validate_imputed = (
    X_validate.reset_index(drop=True),
    X_validate_imputed.reset_index(drop=True),
)

X_test, X_test_imputed = (
    X_test.reset_index(drop=True),
    X_test_imputed.reset_index(drop=True),
)

In [None]:
# Columns restored from the original data: 'age' plus every 'race_*' column.
race_columns = [name for name in X_external.columns if name.startswith('race_')]
columns_to_replace = ['age', *race_columns]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    """Overwrite ``columns`` of ``imputed_df`` with those of ``original_df``.

    The assignment is done in place on ``imputed_df``; the same object is
    returned for convenience.

    Args:
        imputed_df: DataFrame to modify.
        original_df: DataFrame supplying the replacement values.
        columns: List of column labels present in ``original_df``.

    Returns:
        ``imputed_df`` itself, after the replacement.
    """
    original_values = original_df[columns]
    imputed_df[columns] = original_values
    return imputed_df

# Restore the selected columns of the external set from the un-imputed frame.
X_external_imputed = replace_columns(
    imputed_df=X_external_imputed,
    original_df=X_external,
    columns=columns_to_replace,
)
print("External replacement completed.")

In [None]:
# Columns restored from the original data: 'age' plus every 'race_*' column.
race_columns = [name for name in X_train.columns if name.startswith('race_')]
columns_to_replace = ['age', *race_columns]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    """Copy ``columns`` from ``original_df`` over the same columns of ``imputed_df``.

    ``imputed_df`` is modified in place and also returned.

    Args:
        imputed_df: DataFrame to modify.
        original_df: DataFrame supplying the replacement values.
        columns: List of column labels present in ``original_df``.

    Returns:
        ``imputed_df`` itself, after the replacement.
    """
    replacement = original_df[columns]
    imputed_df[columns] = replacement
    return imputed_df


# Restore the selected columns of the training set from the un-imputed frame.
X_train_imputed = replace_columns(
    imputed_df=X_train_imputed,
    original_df=X_train,
    columns=columns_to_replace,
)
print("Train replacement completed.")

In [None]:
# Columns restored from the original data: 'age' plus every 'race_*' column.
race_columns = [name for name in X_validate.columns if name.startswith('race_')]
columns_to_replace = ['age', *race_columns]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    """Put the original values of ``columns`` back into ``imputed_df``.

    ``imputed_df`` is modified in place and also returned.

    Args:
        imputed_df: DataFrame to modify.
        original_df: DataFrame supplying the replacement values.
        columns: List of column labels present in ``original_df``.

    Returns:
        ``imputed_df`` itself, after the replacement.
    """
    source_columns = original_df[columns]
    imputed_df[columns] = source_columns
    return imputed_df


# Restore the selected columns of the validation set from the un-imputed frame.
X_validate_imputed = replace_columns(
    imputed_df=X_validate_imputed,
    original_df=X_validate,
    columns=columns_to_replace,
)
print("Validate replacement completed.")

In [None]:
# Columns restored from the original data: 'age' plus every 'race_*' column.
race_columns = [name for name in X_test.columns if name.startswith('race_')]
columns_to_replace = ['age', *race_columns]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    """Replace ``columns`` in ``imputed_df`` with the values from ``original_df``.

    ``imputed_df`` is modified in place and also returned.

    Args:
        imputed_df: DataFrame to modify.
        original_df: DataFrame supplying the replacement values.
        columns: List of column labels present in ``original_df``.

    Returns:
        ``imputed_df`` itself, after the replacement.
    """
    kept_from_original = original_df[columns]
    imputed_df[columns] = kept_from_original
    return imputed_df


# Restore the selected columns of the test set from the un-imputed frame.
X_test_imputed = replace_columns(
    imputed_df=X_test_imputed,
    original_df=X_test,
    columns=columns_to_replace,
)
print("Test replacement completed.")

In [None]:
# Output directory for the GAN-imputed splits
save_path = 'CSV/exports/impute/o6_GAN/'

# Create the directory (and any missing parents). exist_ok=True replaces the
# separate existence check, which could race with another process creating
# the directory between the check and the call.
os.makedirs(save_path, exist_ok=True)

# Save external validation set from eICU
X_external_imputed.to_csv(os.path.join(save_path, 'X_external.csv'), index=False)
y_external.to_csv(os.path.join(save_path, 'y_external.csv'), index=False)

# Save training, validation, and test sets
X_train_imputed.to_csv(os.path.join(save_path, 'X_train.csv'), index=False)
y_train.to_csv(os.path.join(save_path, 'y_train.csv'), index=False)

X_validate_imputed.to_csv(os.path.join(save_path, 'X_validate.csv'), index=False)
y_validate.to_csv(os.path.join(save_path, 'y_validate.csv'), index=False)

X_test_imputed.to_csv(os.path.join(save_path, 'X_test.csv'), index=False)
y_test.to_csv(os.path.join(save_path, 'y_test.csv'), index=False)