# Import

In [5]:
import os
import logging
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid, train_test_split

In [6]:
# Read MIMICs CSV file
mimic_df = pd.read_csv("CSV\\exports\\final\\mimic_mean_median_min_max_final.csv")

# Read eICUs CSV file
eicu_df = pd.read_csv("CSV\\exports\\final\\eicu_mean_median_min_max_final.csv")

In [7]:
day = 10

# Filter icu stay less than 10 days
mimic_df = mimic_df[mimic_df['los'] < day]

# Filter icu stay less than 10 days
eicu_df = eicu_df[eicu_df['los'] < day]

In [8]:
"""
I'm gonna concat and split the mimic and icu
at this point. I must create the same columns
from the tranformation of categorical data.
"""
row_count = mimic_df.shape[0]
print(f"Row count: {row_count}")

# Concat dataframes
df_combined = pd.concat([mimic_df, eicu_df], ignore_index=True)

# Find all categorical columns in mimic
categorical_columns = df_combined.select_dtypes(include=['object', 'category']).columns.tolist()

# Apply one-hot encoding to all categorical columns
df_encoded = pd.get_dummies(df_combined, columns=categorical_columns)

# Split the concatenate dataframe
mimic_df = df_encoded.iloc[:row_count, :]
eicu_df = df_encoded.iloc[row_count:, :]

Row count: 51040


In [10]:
total_test_val_perc = 0.2
split_between_test_val_perc = 0.5

# Group data by subject_id and hadm_id
grouped_df = mimic_df.groupby(['subject_id', 'hadm_id'])

# Get a new dataframe with one row per patient (subject_id, hadm_id) pair
patient_df = grouped_df['hospital_expire_flag'].first().reset_index()

# Split the patient_df into training (80%), validation (10%), and test (10%) while keeping the ratio of hospital_expired_flag
train, temp = train_test_split(patient_df, test_size=total_test_val_perc, stratify=patient_df['hospital_expire_flag'], random_state=42)
val, test = train_test_split(temp, test_size=split_between_test_val_perc, stratify=temp['hospital_expire_flag'], random_state=42)

# Step 4: Merge back with the original df to get the rows for each patient in the splits
train_df = mimic_df.merge(train[['subject_id', 'hadm_id']], on=['subject_id', 'hadm_id'], how='inner')
val_df = mimic_df.merge(val[['subject_id', 'hadm_id']], on=['subject_id', 'hadm_id'], how='inner')
test_df = mimic_df.merge(test[['subject_id', 'hadm_id']], on=['subject_id', 'hadm_id'], how='inner')

# Check the sizes of the splits
print(f'Training set size: {train_df.shape[0]}')
print(f'Validation set size: {val_df.shape[0]}')
print(f'Test set size: {test_df.shape[0]}')

Training set size: 40832
Validation set size: 5104
Test set size: 5104


In [11]:
# External validation from eICU
X_external = eicu_df.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count', 'Time_Zone'])
y_external = eicu_df['los']

# Separate features and target for the training, validation, and test sets
X_train = train_df.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count', 'Time_Zone'])
y_train = train_df['los']

X_validate = val_df.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count', 'Time_Zone'])
y_validate = val_df['los']

X_test = test_df.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count', 'Time_Zone'])
y_test = test_df['los']

# Load Train - Validation - Test & External Validation Sets

In [None]:
# Define subfolder
subfolder = "o2_interpolation_impute"

# Load CSV files into corresponding variables
X_external = pd.read_csv(f"CSV/exports/impute/{subfolder}/X_external.csv")
y_external = pd.read_csv(f"CSV/exports/impute/{subfolder}/y_external.csv")
X_train = pd.read_csv(f"CSV/exports/impute/{subfolder}/X_train.csv")
y_train = pd.read_csv(f"CSV/exports/impute/{subfolder}/y_train.csv")
X_validate = pd.read_csv(f"CSV/exports/impute/{subfolder}/X_validate.csv")
y_validate = pd.read_csv(f"CSV/exports/impute/{subfolder}/y_validate.csv")
X_test = pd.read_csv(f"CSV/exports/impute/{subfolder}/X_test.csv")
y_test = pd.read_csv(f"CSV/exports/impute/{subfolder}/y_test.csv")

# o01 Simple GAN

In [None]:
# Custom Dataset class for PyTorch
def prepare_dataset(X):
    class CustomDataset(Dataset):
        def __init__(self, data):
            # Ensure data is converted to numeric type to avoid object type errors
            self.data = torch.tensor(data.astype(np.float32), dtype=torch.float32)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]
    
    return CustomDataset(X)

# Define Generator model
class Generator(nn.Module):
    def __init__(self, input_dim):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        return self.main(x)

# Define Discriminator model
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.main(x)

# GAN Imputation Function with Early Stopping
def impute_missing_values_with_gan(X, epochs=1000, batch_size=64, learning_rate=0.0002, patience=10):
    # Prepare data and mask for missing values
    X_missing = X.copy()
    mask = X_missing.isna()
    X_missing.fillna(0, inplace=True)
    
    # Prepare dataset and dataloader
    dataset = prepare_dataset(X_missing.values)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    input_dim = X.shape[1]

    # Initialize Generator and Discriminator
    generator = Generator(input_dim)
    discriminator = Discriminator(input_dim)

    # Device placement
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    generator.to(device)
    discriminator.to(device)

    # Optimizers
    optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
    optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

    # Loss function
    adversarial_loss = nn.BCELoss().to(device)

    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        for i, real_data in enumerate(dataloader):
            # Move data to the correct device
            real_data = real_data.to(device)

            # Adversarial ground truths
            valid = torch.ones(real_data.size(0), 1).to(device)
            fake = torch.zeros(real_data.size(0), 1).to(device)

            # Train Generator
            optimizer_G.zero_grad()

            # Generate data and replace missing values with generated data
            gen_data = generator(real_data)
            batch_mask = mask.iloc[i * batch_size:(i + 1) * batch_size].values
            gen_data[batch_mask] = real_data[batch_mask]

            # Generator loss
            g_loss = adversarial_loss(discriminator(gen_data), valid)
            g_loss.backward()
            optimizer_G.step()

            # Train Discriminator
            optimizer_D.zero_grad()

            # Real and fake losses
            real_loss = adversarial_loss(discriminator(real_data), valid)
            fake_loss = adversarial_loss(discriminator(gen_data.detach()), fake)
            d_loss = (real_loss + fake_loss) / 2

            d_loss.backward()
            optimizer_D.step()

        # Logging progress
        logging.basicConfig(level=logging.INFO)
        logging.info(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

        # Early stopping
        if g_loss.item() < best_loss:
            best_loss = g_loss.item()
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            logging.info(f"Early stopping at epoch {epoch + 1} - Generator Loss: {g_loss.item()}")
            break

    # Impute missing values using the trained generator
    X_imputed = X_missing.copy()
    X_imputed = torch.tensor(X_imputed.values.astype(np.float32), dtype=torch.float32).to(device)
    X_imputed = generator(X_imputed).detach().cpu().numpy()  # Move back to CPU for numpy compatibility
    X_imputed[mask.values] = X_missing.values[mask.values]

    return pd.DataFrame(X_imputed, columns=X.columns)

# Save models (optional)
# torch.save(generator.state_dict(), "generator.pth")
# torch.save(discriminator.state_dict(), "discriminator.pth")

# Impute missing values for each dataset separately
X_external_imputed = impute_missing_values_with_gan(X_external)
X_train_imputed = impute_missing_values_with_gan(X_train)
X_validate_imputed = impute_missing_values_with_gan(X_validate)
X_test_imputed = impute_missing_values_with_gan(X_test)

# o02 Run GAN multiple times

In [22]:
# Custom Dataset class for PyTorch
def prepare_dataset(X):
    class CustomDataset(Dataset):
        def __init__(self, data):
            # Ensure data is converted to numeric type to avoid object type errors
            self.data = torch.tensor(data.astype(np.float32), dtype=torch.float32)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]
    
    return CustomDataset(X)

# Define Generator model
class Generator(nn.Module):
    def __init__(self, input_dim):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        return self.main(x)

# Define Discriminator model
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.main(x)

# GAN Imputation Function with Multiple Passes and Early Stopping
def impute_missing_values_with_multiple_passes(X, num_passes=3, epochs=1000, batch_size=64, learning_rate=0.0002, patience=10):
    # Prepare data and mask for missing values
    X_imputed = X.copy()
    mask = X_imputed.isna()
    X_imputed.fillna(0, inplace=True)
    
    # Input dimension
    input_dim = X.shape[1]

    # Multiple passes
    for pass_num in range(num_passes):
        print(f"\n--- Imputation Pass {pass_num + 1}/{num_passes} ---\n")
        
        # Prepare dataset and dataloader
        dataset = prepare_dataset(X_imputed.values)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Initialize Generator and Discriminator
        generator = Generator(input_dim)
        discriminator = Discriminator(input_dim)

        # Optimizers
        optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
        optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

        # Loss function
        adversarial_loss = nn.BCELoss()

        best_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            for i, real_data in enumerate(dataloader):
                # Adversarial ground truths
                valid = torch.ones(real_data.size(0), 1)
                fake = torch.zeros(real_data.size(0), 1)

                # Train Generator
                optimizer_G.zero_grad()

                # Generate data and replace missing values with generated data
                gen_data = generator(real_data)
                gen_data[mask.iloc[i * batch_size:(i + 1) * batch_size].values] = real_data[mask.iloc[i * batch_size:(i + 1) * batch_size].values]

                # Generator loss
                g_loss = adversarial_loss(discriminator(gen_data), valid)
                g_loss.backward()
                optimizer_G.step()

                # Train Discriminator
                optimizer_D.zero_grad()

                # Real and fake losses
                real_loss = adversarial_loss(discriminator(real_data), valid)
                fake_loss = adversarial_loss(discriminator(gen_data.detach()), fake)
                d_loss = (real_loss + fake_loss) / 2

                d_loss.backward()
                optimizer_D.step()

            # Display progress
            #print(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

            # Logging progress
            logging.basicConfig(level=logging.INFO)
            logging.info(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

            # Early stopping
            if g_loss.item() < best_loss:
                best_loss = g_loss.item()
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1} - Generator Loss: {g_loss.item()}")
                break

        # Update X_imputed with refined values
        X_imputed_tensor = torch.tensor(X_imputed.values.astype(np.float32), dtype=torch.float32)
        refined_data = generator(X_imputed_tensor).detach().numpy()
        X_imputed.values[mask.values] = refined_data[mask.values]  # Only update missing values

    return pd.DataFrame(X_imputed, columns=X.columns)

# Impute missing values for each dataset separately
X_external_imputed = impute_missing_values_with_multiple_passes(X_external)
X_train_imputed = impute_missing_values_with_multiple_passes(X_train)
X_validate_imputed = impute_missing_values_with_multiple_passes(X_validate)
X_test_imputed = impute_missing_values_with_multiple_passes(X_test)


--- Imputation Pass 1/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 1.0082497596740723, Discriminator Loss: 0.5422036647796631
INFO:root:Epoch 2/1000 - Generator Loss: 1.0394421815872192, Discriminator Loss: 0.6764670014381409
INFO:root:Epoch 3/1000 - Generator Loss: 2.589684247970581, Discriminator Loss: 0.4176105558872223
INFO:root:Epoch 4/1000 - Generator Loss: 1.7230149507522583, Discriminator Loss: 0.39542990922927856
INFO:root:Epoch 5/1000 - Generator Loss: 2.265847682952881, Discriminator Loss: 0.5392922163009644
INFO:root:Epoch 6/1000 - Generator Loss: 2.0950677394866943, Discriminator Loss: 0.5000324845314026
INFO:root:Epoch 7/1000 - Generator Loss: 1.3824129104614258, Discriminator Loss: 0.450406938791275
INFO:root:Epoch 8/1000 - Generator Loss: 0.7340129613876343, Discriminator Loss: 0.8645924925804138
INFO:root:Epoch 9/1000 - Generator Loss: 1.25557541847229, Discriminator Loss: 0.47079285979270935
INFO:root:Epoch 10/1000 - Generator Loss: 2.1566567420959473, Discriminator Loss: 0.3441897928714752
IN

Early stopping at epoch 18 - Generator Loss: 1.1662495136260986

--- Imputation Pass 2/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 2.394395589828491, Discriminator Loss: 0.44875815510749817
INFO:root:Epoch 2/1000 - Generator Loss: 1.752920150756836, Discriminator Loss: 0.430203378200531
INFO:root:Epoch 3/1000 - Generator Loss: 1.6148054599761963, Discriminator Loss: 0.4096754789352417
INFO:root:Epoch 4/1000 - Generator Loss: 1.868181824684143, Discriminator Loss: 0.22493763267993927
INFO:root:Epoch 5/1000 - Generator Loss: 1.9125481843948364, Discriminator Loss: 0.3880309462547302
INFO:root:Epoch 6/1000 - Generator Loss: 1.5893750190734863, Discriminator Loss: 0.3431832790374756
INFO:root:Epoch 7/1000 - Generator Loss: 2.5374791622161865, Discriminator Loss: 0.2048741579055786
INFO:root:Epoch 8/1000 - Generator Loss: 1.961550235748291, Discriminator Loss: 0.2524179220199585
INFO:root:Epoch 9/1000 - Generator Loss: 0.9677797555923462, Discriminator Loss: 0.5013019442558289
INFO:root:Epoch 10/1000 - Generator Loss: 0.8962404131889343, Discriminator Loss: 0.5983597040176392
IN

Early stopping at epoch 21 - Generator Loss: 1.6633814573287964

--- Imputation Pass 3/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 1.829801082611084, Discriminator Loss: 0.4282008409500122
INFO:root:Epoch 2/1000 - Generator Loss: 1.6474900245666504, Discriminator Loss: 0.5971099138259888
INFO:root:Epoch 3/1000 - Generator Loss: 1.4369300603866577, Discriminator Loss: 0.32113298773765564
INFO:root:Epoch 4/1000 - Generator Loss: 1.4276542663574219, Discriminator Loss: 0.5142741203308105
INFO:root:Epoch 5/1000 - Generator Loss: 1.2652450799942017, Discriminator Loss: 0.6639271974563599
INFO:root:Epoch 6/1000 - Generator Loss: 1.6130473613739014, Discriminator Loss: 0.34582555294036865
INFO:root:Epoch 7/1000 - Generator Loss: 1.256985068321228, Discriminator Loss: 0.43211841583251953
INFO:root:Epoch 8/1000 - Generator Loss: 1.5107439756393433, Discriminator Loss: 0.42448848485946655
INFO:root:Epoch 9/1000 - Generator Loss: 1.7163219451904297, Discriminator Loss: 0.40907660126686096
INFO:root:Epoch 10/1000 - Generator Loss: 1.5173746347427368, Discriminator Loss: 0.3813726007938

Early stopping at epoch 21 - Generator Loss: 1.4881471395492554

--- Imputation Pass 1/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 2.4057345390319824, Discriminator Loss: 0.3147991895675659
INFO:root:Epoch 2/1000 - Generator Loss: 0.8814415335655212, Discriminator Loss: 0.4204072952270508
INFO:root:Epoch 3/1000 - Generator Loss: 0.9709899425506592, Discriminator Loss: 0.5099101662635803
INFO:root:Epoch 4/1000 - Generator Loss: 1.5976848602294922, Discriminator Loss: 0.45520010590553284
INFO:root:Epoch 5/1000 - Generator Loss: 1.5165610313415527, Discriminator Loss: 0.4048083424568176
INFO:root:Epoch 6/1000 - Generator Loss: 2.6727261543273926, Discriminator Loss: 1.4006140232086182
INFO:root:Epoch 7/1000 - Generator Loss: 2.068336009979248, Discriminator Loss: 0.2970495820045471
INFO:root:Epoch 8/1000 - Generator Loss: 0.9433561563491821, Discriminator Loss: 1.268726110458374
INFO:root:Epoch 9/1000 - Generator Loss: 3.3379454612731934, Discriminator Loss: 0.4845249056816101
INFO:root:Epoch 10/1000 - Generator Loss: 2.6959967613220215, Discriminator Loss: 0.26887989044189453

Early stopping at epoch 12 - Generator Loss: 2.4376678466796875

--- Imputation Pass 2/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 2.684602737426758, Discriminator Loss: 0.2201017141342163
INFO:root:Epoch 2/1000 - Generator Loss: 1.6774810552597046, Discriminator Loss: 0.5878006815910339
INFO:root:Epoch 3/1000 - Generator Loss: 0.5851223468780518, Discriminator Loss: 0.5692627429962158
INFO:root:Epoch 4/1000 - Generator Loss: 1.6462422609329224, Discriminator Loss: 0.33479762077331543
INFO:root:Epoch 5/1000 - Generator Loss: 1.6301050186157227, Discriminator Loss: 0.30858755111694336
INFO:root:Epoch 6/1000 - Generator Loss: 1.4128875732421875, Discriminator Loss: 0.5154665112495422
INFO:root:Epoch 7/1000 - Generator Loss: 2.9197819232940674, Discriminator Loss: 0.28425318002700806
INFO:root:Epoch 8/1000 - Generator Loss: 2.1828725337982178, Discriminator Loss: 0.3826148509979248
INFO:root:Epoch 9/1000 - Generator Loss: 1.7192368507385254, Discriminator Loss: 0.2480093240737915
INFO:root:Epoch 10/1000 - Generator Loss: 1.8703583478927612, Discriminator Loss: 0.25554087758064

Early stopping at epoch 13 - Generator Loss: 3.5306384563446045

--- Imputation Pass 3/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 3.6860761642456055, Discriminator Loss: 0.11381968855857849
INFO:root:Epoch 2/1000 - Generator Loss: 3.303877115249634, Discriminator Loss: 0.5801141262054443
INFO:root:Epoch 3/1000 - Generator Loss: 0.8494479656219482, Discriminator Loss: 1.0243263244628906
INFO:root:Epoch 4/1000 - Generator Loss: 1.9596960544586182, Discriminator Loss: 0.3046152591705322
INFO:root:Epoch 5/1000 - Generator Loss: 1.6356396675109863, Discriminator Loss: 0.3846980333328247
INFO:root:Epoch 6/1000 - Generator Loss: 3.7746381759643555, Discriminator Loss: 0.267703652381897
INFO:root:Epoch 7/1000 - Generator Loss: 2.0661325454711914, Discriminator Loss: 0.15901914238929749
INFO:root:Epoch 8/1000 - Generator Loss: 2.1351068019866943, Discriminator Loss: 0.2607773542404175
INFO:root:Epoch 9/1000 - Generator Loss: 1.202926516532898, Discriminator Loss: 0.5531449317932129
INFO:root:Epoch 10/1000 - Generator Loss: 2.2369563579559326, Discriminator Loss: 0.2535463571548462


Early stopping at epoch 13 - Generator Loss: 1.7050392627716064

--- Imputation Pass 1/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 1.297393798828125, Discriminator Loss: 0.44704121351242065
INFO:root:Epoch 2/1000 - Generator Loss: 2.6656785011291504, Discriminator Loss: 0.09690233319997787
INFO:root:Epoch 3/1000 - Generator Loss: 3.368907928466797, Discriminator Loss: 0.3314264118671417
INFO:root:Epoch 4/1000 - Generator Loss: 2.142860174179077, Discriminator Loss: 0.42715418338775635
INFO:root:Epoch 5/1000 - Generator Loss: 1.9437092542648315, Discriminator Loss: 0.29888617992401123
INFO:root:Epoch 6/1000 - Generator Loss: 3.652674913406372, Discriminator Loss: 0.504962682723999
INFO:root:Epoch 7/1000 - Generator Loss: 6.129988193511963, Discriminator Loss: 2.514206886291504
INFO:root:Epoch 8/1000 - Generator Loss: 1.6948474645614624, Discriminator Loss: 0.710557758808136
INFO:root:Epoch 9/1000 - Generator Loss: 4.771276950836182, Discriminator Loss: 0.4144764244556427
INFO:root:Epoch 10/1000 - Generator Loss: 1.7355247735977173, Discriminator Loss: 0.33168894052505493
INF

Early stopping at epoch 22 - Generator Loss: 2.573007345199585

--- Imputation Pass 2/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 0.9816348552703857, Discriminator Loss: 0.4399782121181488
INFO:root:Epoch 2/1000 - Generator Loss: 1.7063993215560913, Discriminator Loss: 0.3047957122325897
INFO:root:Epoch 3/1000 - Generator Loss: 3.127580404281616, Discriminator Loss: 0.3877939283847809
INFO:root:Epoch 4/1000 - Generator Loss: 1.448107123374939, Discriminator Loss: 0.890807032585144
INFO:root:Epoch 5/1000 - Generator Loss: 1.1145659685134888, Discriminator Loss: 0.38379257917404175
INFO:root:Epoch 6/1000 - Generator Loss: 1.351382851600647, Discriminator Loss: 0.5762107372283936
INFO:root:Epoch 7/1000 - Generator Loss: 1.2451422214508057, Discriminator Loss: 0.835567831993103
INFO:root:Epoch 8/1000 - Generator Loss: 1.8186486959457397, Discriminator Loss: 0.6141198873519897
INFO:root:Epoch 9/1000 - Generator Loss: 0.7294478416442871, Discriminator Loss: 0.7219887971878052
INFO:root:Epoch 10/1000 - Generator Loss: 1.9627013206481934, Discriminator Loss: 0.6058574914932251
INF

Early stopping at epoch 19 - Generator Loss: 1.9590567350387573

--- Imputation Pass 3/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 3.8123714923858643, Discriminator Loss: 0.3910503685474396
INFO:root:Epoch 2/1000 - Generator Loss: 2.2676925659179688, Discriminator Loss: 1.5035704374313354
INFO:root:Epoch 3/1000 - Generator Loss: 0.5126456022262573, Discriminator Loss: 0.9762917160987854
INFO:root:Epoch 4/1000 - Generator Loss: 1.528992772102356, Discriminator Loss: 0.4011877775192261
INFO:root:Epoch 5/1000 - Generator Loss: 1.5248061418533325, Discriminator Loss: 0.3986969590187073
INFO:root:Epoch 6/1000 - Generator Loss: 2.1725447177886963, Discriminator Loss: 0.6821770668029785
INFO:root:Epoch 7/1000 - Generator Loss: 2.441396951675415, Discriminator Loss: 0.6338729858398438
INFO:root:Epoch 8/1000 - Generator Loss: 1.8738764524459839, Discriminator Loss: 0.64334636926651
INFO:root:Epoch 9/1000 - Generator Loss: 1.7176944017410278, Discriminator Loss: 0.64170241355896
INFO:root:Epoch 10/1000 - Generator Loss: 0.628803551197052, Discriminator Loss: 2.8911068439483643
INFO:r

Early stopping at epoch 13 - Generator Loss: 1.6316622495651245

--- Imputation Pass 1/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 4.151882648468018, Discriminator Loss: 0.6270756721496582
INFO:root:Epoch 2/1000 - Generator Loss: 2.7622272968292236, Discriminator Loss: 0.1614200919866562
INFO:root:Epoch 3/1000 - Generator Loss: 3.5745601654052734, Discriminator Loss: 0.05827218294143677
INFO:root:Epoch 4/1000 - Generator Loss: 3.646433115005493, Discriminator Loss: 1.0920857191085815
INFO:root:Epoch 5/1000 - Generator Loss: 2.3976762294769287, Discriminator Loss: 1.139224886894226
INFO:root:Epoch 6/1000 - Generator Loss: 2.28826642036438, Discriminator Loss: 0.13038185238838196
INFO:root:Epoch 7/1000 - Generator Loss: 2.066287040710449, Discriminator Loss: 0.19943952560424805
INFO:root:Epoch 8/1000 - Generator Loss: 1.9305462837219238, Discriminator Loss: 0.40234142541885376
INFO:root:Epoch 9/1000 - Generator Loss: 1.8910999298095703, Discriminator Loss: 2.280714988708496
INFO:root:Epoch 10/1000 - Generator Loss: 5.218822479248047, Discriminator Loss: 1.0946160554885864
INF

Early stopping at epoch 29 - Generator Loss: 2.106024980545044

--- Imputation Pass 2/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 5.5709919929504395, Discriminator Loss: 1.0839076042175293
INFO:root:Epoch 2/1000 - Generator Loss: 2.964562177658081, Discriminator Loss: 0.0833107978105545
INFO:root:Epoch 3/1000 - Generator Loss: 2.9127371311187744, Discriminator Loss: 0.2790181338787079
INFO:root:Epoch 4/1000 - Generator Loss: 3.130985975265503, Discriminator Loss: 3.5889933109283447
INFO:root:Epoch 5/1000 - Generator Loss: 3.240677833557129, Discriminator Loss: 0.18484221398830414
INFO:root:Epoch 6/1000 - Generator Loss: 2.6178717613220215, Discriminator Loss: 0.26956987380981445
INFO:root:Epoch 7/1000 - Generator Loss: 4.347444534301758, Discriminator Loss: 0.12983649969100952
INFO:root:Epoch 8/1000 - Generator Loss: 1.6717772483825684, Discriminator Loss: 0.3316692113876343
INFO:root:Epoch 9/1000 - Generator Loss: 2.439965009689331, Discriminator Loss: 0.2308366596698761
INFO:root:Epoch 10/1000 - Generator Loss: 1.359175205230713, Discriminator Loss: 0.24769434332847595
I

Early stopping at epoch 22 - Generator Loss: 1.8190360069274902

--- Imputation Pass 3/3 ---



INFO:root:Epoch 1/1000 - Generator Loss: 2.461106300354004, Discriminator Loss: 0.108174629509449
INFO:root:Epoch 2/1000 - Generator Loss: 3.7974936962127686, Discriminator Loss: 0.19220669567584991
INFO:root:Epoch 3/1000 - Generator Loss: 2.274738073348999, Discriminator Loss: 0.11671611666679382
INFO:root:Epoch 4/1000 - Generator Loss: 3.8480770587921143, Discriminator Loss: 0.06884589791297913
INFO:root:Epoch 5/1000 - Generator Loss: 2.05767560005188, Discriminator Loss: 0.3324459195137024
INFO:root:Epoch 6/1000 - Generator Loss: 2.1923422813415527, Discriminator Loss: 0.16906222701072693
INFO:root:Epoch 7/1000 - Generator Loss: 0.9525883793830872, Discriminator Loss: 2.6885037422180176
INFO:root:Epoch 8/1000 - Generator Loss: 2.0926873683929443, Discriminator Loss: 1.260503888130188
INFO:root:Epoch 9/1000 - Generator Loss: 4.128427505493164, Discriminator Loss: 1.2086541652679443
INFO:root:Epoch 10/1000 - Generator Loss: 4.917284965515137, Discriminator Loss: 1.1284611225128174
INF

Early stopping at epoch 17 - Generator Loss: 1.0050984621047974


# o03 Run GAN multiple times with more layers

In [None]:
# Custom Dataset class for PyTorch
def prepare_dataset(X):
    class CustomDataset(Dataset):
        def __init__(self, data):
            # Ensure data is converted to numeric type to avoid object type errors
            self.data = torch.tensor(data.astype(np.float32), dtype=torch.float32)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]
    
    return CustomDataset(X)

# Define Generator model
class Generator(nn.Module):
    def __init__(self, input_dim):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim)
        )

    def forward(self, x):
        return self.main(x)

# Define Discriminator model
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.main(x)

# GAN Imputation Function with Multiple Passes and Early Stopping
def impute_missing_values_with_multiple_passes(X, num_passes=3, epochs=1000, batch_size=64, learning_rate=0.0002, patience=10):
    # Prepare data and mask for missing values
    X_imputed = X.copy()
    mask = X_imputed.isna()
    X_imputed.fillna(0, inplace=True)
    
    # Input dimension
    input_dim = X.shape[1]

    # Multiple passes
    for pass_num in range(num_passes):
        print(f"\n--- Imputation Pass {pass_num + 1}/{num_passes} ---\n")
        
        # Prepare dataset and dataloader
        dataset = prepare_dataset(X_imputed.values)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Initialize Generator and Discriminator
        generator = Generator(input_dim)
        discriminator = Discriminator(input_dim)

        # Optimizers
        optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
        optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

        # Loss function
        adversarial_loss = nn.BCELoss()

        best_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            for i, real_data in enumerate(dataloader):
                # Adversarial ground truths
                valid = torch.ones(real_data.size(0), 1)
                fake = torch.zeros(real_data.size(0), 1)

                # Train Generator
                optimizer_G.zero_grad()

                # Generate data and replace missing values with generated data
                gen_data = generator(real_data)
                gen_data[mask.iloc[i * batch_size:(i + 1) * batch_size].values] = real_data[mask.iloc[i * batch_size:(i + 1) * batch_size].values]

                # Generator loss
                g_loss = adversarial_loss(discriminator(gen_data), valid)
                g_loss.backward()
                optimizer_G.step()

                # Train Discriminator
                optimizer_D.zero_grad()

                # Real and fake losses
                real_loss = adversarial_loss(discriminator(real_data), valid)
                fake_loss = adversarial_loss(discriminator(gen_data.detach()), fake)
                d_loss = (real_loss + fake_loss) / 2

                d_loss.backward()
                optimizer_D.step()

            # Display progress
            print(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

            # Early stopping
            if g_loss.item() < best_loss:
                best_loss = g_loss.item()
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1} - Generator Loss: {g_loss.item()}")
                break

        # Update X_imputed with refined values
        X_imputed_tensor = torch.tensor(X_imputed.values.astype(np.float32), dtype=torch.float32)
        refined_data = generator(X_imputed_tensor).detach().numpy()
        X_imputed.values[mask.values] = refined_data[mask.values]  # Only update missing values

    return pd.DataFrame(X_imputed, columns=X.columns)

# Impute missing values for each dataset separately
X_external_imputed = impute_missing_values_with_multiple_passes(X_external)
X_train_imputed = impute_missing_values_with_multiple_passes(X_train)
X_validate_imputed = impute_missing_values_with_multiple_passes(X_validate)
X_test_imputed = impute_missing_values_with_multiple_passes(X_test)

# Save dataframes

In [24]:
external_total_missing_values = X_external_imputed.isna().sum().sum()
train_total_missing_values = X_train_imputed.isna().sum().sum()
validation_total_missing_values = X_validate_imputed.isna().sum().sum()
test_total_missing_values = X_test_imputed.isna().sum().sum()
print(external_total_missing_values, 'missing values in external dataset\n')
print(train_total_missing_values, 'missing values in train dataset\n')
print(validation_total_missing_values, 'missing values in validation dataset\n')
print(test_total_missing_values, 'missing values in test dataset\n')

0 missing values in external dataset

0 missing values in train dataset

0 missing values in validation dataset

0 missing values in test dataset



In [25]:
X_external = X_external.reset_index(drop=True)
X_external_imputed = X_external_imputed.reset_index(drop=True)

X_train = X_train.reset_index(drop=True)
X_train_imputed = X_train_imputed.reset_index(drop=True)

X_validate = X_validate.reset_index(drop=True)
X_validate_imputed = X_validate_imputed.reset_index(drop=True)

X_test = X_test.reset_index(drop=True)
X_test_imputed = X_test_imputed.reset_index(drop=True)

In [26]:
# List of columns to replace: 'age' and columns starting with 'race_'
columns_to_replace = ['age'] + [col for col in X_external.columns if col.startswith('race_')]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    imputed_df[columns] = original_df[columns]
    return imputed_df

X_external_imputed = replace_columns(X_external_imputed, X_external, columns_to_replace)
print("External replacement completed.")

External replacement completed.


In [27]:
# List of columns to replace: 'age' and columns starting with 'race_'
columns_to_replace = ['age'] + [col for col in X_train.columns if col.startswith('race_')]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    imputed_df[columns] = original_df[columns]
    return imputed_df


X_train_imputed = replace_columns(X_train_imputed, X_train, columns_to_replace)
print("Train replacement completed.")

Train replacement completed.


In [28]:
# List of columns to replace: 'age' and columns starting with 'race_'
columns_to_replace = ['age'] + [col for col in X_validate.columns if col.startswith('race_')]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    imputed_df[columns] = original_df[columns]
    return imputed_df


X_validate_imputed = replace_columns(X_validate_imputed, X_validate, columns_to_replace)
print("Validate replacement completed.")

Validate replacement completed.


In [29]:
# List of columns to replace: 'age' and columns starting with 'race_'
columns_to_replace = ['age'] + [col for col in X_test.columns if col.startswith('race_')]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    imputed_df[columns] = original_df[columns]
    return imputed_df


X_test_imputed = replace_columns(X_test_imputed, X_test, columns_to_replace)
print("Test replacement completed.")

Test replacement completed.


In [30]:
# Path
save_path = 'CSV/exports/impute/o6_GAN/o02/'

# Check if the directory exists, and if not, create it
if not os.path.exists(save_path):
    os.makedirs(save_path)

# Save external validation set from eICU
X_external_imputed.to_csv(save_path + 'X_external.csv', index=False)
y_external.to_csv(save_path + 'y_external.csv', index=False)

# Save training, validation, and test sets
X_train_imputed.to_csv(save_path + 'X_train.csv', index=False)
y_train.to_csv(save_path + 'y_train.csv', index=False)

X_validate_imputed.to_csv(save_path + 'X_validate.csv', index=False)
y_validate.to_csv(save_path + 'y_validate.csv', index=False)

X_test_imputed.to_csv(save_path + 'X_test.csv', index=False)
y_test.to_csv(save_path + 'y_test.csv', index=False)

# Test