# Import

In [1]:
import os
import logging
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid, train_test_split

In [2]:
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("data_loading.log"),
        logging.StreamHandler()
    ]
)

# Load Train - Validation - Test & External Validation Sets

In [3]:
# Define subfolder
subfolder = "split_set"
name = "o4"

# Load CSV files into corresponding variables
logging.info(f"Loading X_external from ../CSV/exports/{subfolder}/{name}_X_external.csv")
X_external = pd.read_csv(f"../CSV/exports/{subfolder}/{name}_X_external.csv")

logging.info(f"Loading y_external from ../CSV/exports/{subfolder}/{name}_y_external_los.csv")
y_external = pd.read_csv(f"../CSV/exports/{subfolder}/{name}_y_external_los.csv")

logging.info(f"Loading X_train from ../CSV/exports/{subfolder}/{name}_X_train.csv")
X_train = pd.read_csv(f"../CSV/exports/{subfolder}/{name}_X_train.csv")

logging.info(f"Loading y_train from ../CSV/exports/{subfolder}/{name}_y_train_los.csv")
y_train = pd.read_csv(f"../CSV/exports/{subfolder}/{name}_y_train_los.csv")

logging.info(f"Loading X_validate from ../CSV/exports/{subfolder}/{name}_X_validate.csv")
X_validate = pd.read_csv(f"../CSV/exports/{subfolder}/{name}_X_validate.csv")

logging.info(f"Loading y_validate from ../CSV/exports/{subfolder}/{name}_y_validate_los.csv")
y_validate = pd.read_csv(f"../CSV/exports/{subfolder}/{name}_y_validate_los.csv")

logging.info(f"Loading X_test from ../CSV/exports/{subfolder}/{name}_X_test.csv")
X_test = pd.read_csv(f"../CSV/exports/{subfolder}/{name}_X_test.csv")

logging.info(f"Loading y_test from ../CSV/exports/{subfolder}/{name}_y_test_los.csv")
y_test = pd.read_csv(f"../CSV/exports/{subfolder}/{name}_y_test_los.csv")

2024-12-15 20:11:33,241 - INFO - Loading X_external from ../CSV/exports/split_set/o4_X_external.csv
2024-12-15 20:11:34,887 - INFO - Loading y_external from ../CSV/exports/split_set/o4_y_external_los.csv
2024-12-15 20:11:34,904 - INFO - Loading X_train from ../CSV/exports/split_set/o4_X_train.csv
2024-12-15 20:11:35,867 - INFO - Loading y_train from ../CSV/exports/split_set/o4_y_train_los.csv
2024-12-15 20:11:35,883 - INFO - Loading X_validate from ../CSV/exports/split_set/o4_X_validate.csv
2024-12-15 20:11:36,028 - INFO - Loading y_validate from ../CSV/exports/split_set/o4_y_validate_los.csv
2024-12-15 20:11:36,033 - INFO - Loading X_test from ../CSV/exports/split_set/o4_X_test.csv
2024-12-15 20:11:36,182 - INFO - Loading y_test from ../CSV/exports/split_set/o4_y_test_los.csv


# o01 Simple GAN

In [None]:
# Custom Dataset class for PyTorch
def prepare_dataset(X):
    class CustomDataset(Dataset):
        def __init__(self, data):
            # Ensure data is converted to numeric type to avoid object type errors
            self.data = torch.tensor(data.astype(np.float32), dtype=torch.float32)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]
    
    return CustomDataset(X)

# Define Generator model
class Generator(nn.Module):
    def __init__(self, input_dim):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        return self.main(x)

# Define Discriminator model
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.main(x)

# GAN Imputation Function with Early Stopping
def impute_missing_values_with_gan(X, epochs=1000, batch_size=64, learning_rate=0.0002, patience=10):
    # Prepare data and mask for missing values
    X_missing = X.copy()
    mask = X_missing.isna()
    X_missing.fillna(0, inplace=True)
    
    # Prepare dataset and dataloader
    dataset = prepare_dataset(X_missing.values)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    input_dim = X.shape[1]

    # Initialize Generator and Discriminator
    generator = Generator(input_dim)
    discriminator = Discriminator(input_dim)

    # Device placement
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    generator.to(device)
    discriminator.to(device)

    # Optimizers
    optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
    optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

    # Loss function
    adversarial_loss = nn.BCELoss().to(device)

    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        for i, real_data in enumerate(dataloader):
            # Move data to the correct device
            real_data = real_data.to(device)

            # Adversarial ground truths
            valid = torch.ones(real_data.size(0), 1).to(device)
            fake = torch.zeros(real_data.size(0), 1).to(device)

            # Train Generator
            optimizer_G.zero_grad()

            # Generate data and replace missing values with generated data
            gen_data = generator(real_data)
            batch_mask = mask.iloc[i * batch_size:(i + 1) * batch_size].values
            gen_data[batch_mask] = real_data[batch_mask]

            # Generator loss
            g_loss = adversarial_loss(discriminator(gen_data), valid)
            g_loss.backward()
            optimizer_G.step()

            # Train Discriminator
            optimizer_D.zero_grad()

            # Real and fake losses
            real_loss = adversarial_loss(discriminator(real_data), valid)
            fake_loss = adversarial_loss(discriminator(gen_data.detach()), fake)
            d_loss = (real_loss + fake_loss) / 2

            d_loss.backward()
            optimizer_D.step()

        # Logging progress
        logging.basicConfig(level=logging.INFO)
        logging.info(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

        # Early stopping
        if g_loss.item() < best_loss:
            best_loss = g_loss.item()
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            logging.info(f"Early stopping at epoch {epoch + 1} - Generator Loss: {g_loss.item()}")
            break

    # Impute missing values using the trained generator
    X_imputed = X_missing.copy()
    X_imputed = torch.tensor(X_imputed.values.astype(np.float32), dtype=torch.float32).to(device)
    X_imputed = generator(X_imputed).detach().cpu().numpy()  # Move back to CPU for numpy compatibility
    X_imputed[mask.values] = X_missing.values[mask.values]

    return pd.DataFrame(X_imputed, columns=X.columns)

# Save models (optional)
# torch.save(generator.state_dict(), "generator.pth")
# torch.save(discriminator.state_dict(), "discriminator.pth")

# Impute missing values for each dataset separately
X_external_imputed = impute_missing_values_with_gan(X_external)
X_train_imputed = impute_missing_values_with_gan(X_train)
X_validate_imputed = impute_missing_values_with_gan(X_validate)
X_test_imputed = impute_missing_values_with_gan(X_test)

# o02 Run GAN multiple times

In [None]:
# Custom Dataset class for PyTorch
def prepare_dataset(X):
    class CustomDataset(Dataset):
        def __init__(self, data):
            # Ensure data is converted to numeric type to avoid object type errors
            self.data = torch.tensor(data.astype(np.float32), dtype=torch.float32)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]
    
    return CustomDataset(X)

# Define Generator model
class Generator(nn.Module):
    def __init__(self, input_dim):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        return self.main(x)

# Define Discriminator model
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.main(x)

# GAN Imputation Function with Multiple Passes and Early Stopping
def impute_missing_values_with_multiple_passes(X, num_passes=3, epochs=1000, batch_size=64, learning_rate=0.0002, patience=10):
    # Prepare data and mask for missing values
    X_imputed = X.copy()
    mask = X_imputed.isna()
    X_imputed.fillna(0, inplace=True)
    
    # Input dimension
    input_dim = X.shape[1]

    # Multiple passes
    for pass_num in range(num_passes):
        print(f"\n--- Imputation Pass {pass_num + 1}/{num_passes} ---\n")
        
        # Prepare dataset and dataloader
        dataset = prepare_dataset(X_imputed.values)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Initialize Generator and Discriminator
        generator = Generator(input_dim)
        discriminator = Discriminator(input_dim)

        # Optimizers
        optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
        optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

        # Loss function
        adversarial_loss = nn.BCELoss()

        best_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            for i, real_data in enumerate(dataloader):
                # Adversarial ground truths
                valid = torch.ones(real_data.size(0), 1)
                fake = torch.zeros(real_data.size(0), 1)

                # Train Generator
                optimizer_G.zero_grad()

                # Generate data and replace missing values with generated data
                gen_data = generator(real_data)
                gen_data[mask.iloc[i * batch_size:(i + 1) * batch_size].values] = real_data[mask.iloc[i * batch_size:(i + 1) * batch_size].values]

                # Generator loss
                g_loss = adversarial_loss(discriminator(gen_data), valid)
                g_loss.backward()
                optimizer_G.step()

                # Train Discriminator
                optimizer_D.zero_grad()

                # Real and fake losses
                real_loss = adversarial_loss(discriminator(real_data), valid)
                fake_loss = adversarial_loss(discriminator(gen_data.detach()), fake)
                d_loss = (real_loss + fake_loss) / 2

                d_loss.backward()
                optimizer_D.step()

            # Display progress
            #print(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

            # Logging progress
            logging.basicConfig(level=logging.INFO)
            logging.info(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

            # Early stopping
            if g_loss.item() < best_loss:
                best_loss = g_loss.item()
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1} - Generator Loss: {g_loss.item()}")
                break

        # Update X_imputed with refined values
        X_imputed_tensor = torch.tensor(X_imputed.values.astype(np.float32), dtype=torch.float32)
        refined_data = generator(X_imputed_tensor).detach().numpy()
        X_imputed.values[mask.values] = refined_data[mask.values]  # Only update missing values

    return pd.DataFrame(X_imputed, columns=X.columns)

# Impute missing values for each dataset separately
X_external_imputed = impute_missing_values_with_multiple_passes(X_external)
X_train_imputed = impute_missing_values_with_multiple_passes(X_train)
X_validate_imputed = impute_missing_values_with_multiple_passes(X_validate)
X_test_imputed = impute_missing_values_with_multiple_passes(X_test)

# o03 Run GAN multiple times with more layers

In [4]:
# Custom Dataset class for PyTorch
def prepare_dataset(X):
    class CustomDataset(Dataset):
        def __init__(self, data):
            # Ensure data is converted to numeric type to avoid object type errors
            self.data = torch.tensor(data.astype(np.float32), dtype=torch.float32)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]
    
    return CustomDataset(X)

# Define Generator model
class Generator(nn.Module):
    def __init__(self, input_dim):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim)
        )

    def forward(self, x):
        return self.main(x)

# Define Discriminator model
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.main(x)

# GAN Imputation Function with Multiple Passes and Early Stopping
def impute_missing_values_with_multiple_passes(X, num_passes=3, epochs=1000, batch_size=64, learning_rate=0.0002, patience=10):
    # Prepare data and mask for missing values
    X_imputed = X.copy()
    mask = X_imputed.isna()
    X_imputed.fillna(0, inplace=True)
    
    # Input dimension
    input_dim = X.shape[1]

    # Multiple passes
    for pass_num in range(num_passes):
        logging.info(f"\n--- Imputation Pass {pass_num + 1}/{num_passes} ---\n")
        
        # Prepare dataset and dataloader
        dataset = prepare_dataset(X_imputed.values)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Initialize Generator and Discriminator
        generator = Generator(input_dim)
        discriminator = Discriminator(input_dim)

        # Optimizers
        optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
        optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

        # Loss function
        adversarial_loss = nn.BCELoss()

        best_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            for i, real_data in enumerate(dataloader):
                # Adversarial ground truths
                valid = torch.ones(real_data.size(0), 1)
                fake = torch.zeros(real_data.size(0), 1)

                # Train Generator
                optimizer_G.zero_grad()

                # Generate data and replace missing values with generated data
                gen_data = generator(real_data)
                gen_data[mask.iloc[i * batch_size:(i + 1) * batch_size].values] = real_data[mask.iloc[i * batch_size:(i + 1) * batch_size].values]

                # Generator loss
                g_loss = adversarial_loss(discriminator(gen_data), valid)
                g_loss.backward()
                optimizer_G.step()

                # Train Discriminator
                optimizer_D.zero_grad()

                # Real and fake losses
                real_loss = adversarial_loss(discriminator(real_data), valid)
                fake_loss = adversarial_loss(discriminator(gen_data.detach()), fake)
                d_loss = (real_loss + fake_loss) / 2

                d_loss.backward()
                optimizer_D.step()

            # Display progress
            logging.info(f"Epoch {epoch + 1}/{epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

            # Early stopping
            if g_loss.item() < best_loss:
                best_loss = g_loss.item()
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1} - Generator Loss: {g_loss.item()}")
                break

        # Update X_imputed with refined values
        X_imputed_tensor = torch.tensor(X_imputed.values.astype(np.float32), dtype=torch.float32)
        refined_data = generator(X_imputed_tensor).detach().numpy()
        X_imputed.values[mask.values] = refined_data[mask.values]  # Only update missing values

    return pd.DataFrame(X_imputed, columns=X.columns)

# Impute missing values for each dataset separately
X_external_imputed = impute_missing_values_with_multiple_passes(X_external)
X_train_imputed = impute_missing_values_with_multiple_passes(X_train)
X_validate_imputed = impute_missing_values_with_multiple_passes(X_validate)
X_test_imputed = impute_missing_values_with_multiple_passes(X_test)

2024-12-15 20:11:44,463 - INFO - 
--- Imputation Pass 1/3 ---

2024-12-15 20:12:04,419 - INFO - Epoch 1/1000 - Generator Loss: 1.0747894048690796, Discriminator Loss: 1.30485999584198
2024-12-15 20:12:20,620 - INFO - Epoch 2/1000 - Generator Loss: 1.54397451877594, Discriminator Loss: 0.44356492161750793
2024-12-15 20:12:37,300 - INFO - Epoch 3/1000 - Generator Loss: 0.8169678449630737, Discriminator Loss: 0.49852806329727173
2024-12-15 20:12:54,606 - INFO - Epoch 4/1000 - Generator Loss: 0.8335322141647339, Discriminator Loss: 0.7359015941619873
2024-12-15 20:13:11,794 - INFO - Epoch 5/1000 - Generator Loss: 1.3846280574798584, Discriminator Loss: 0.7499973177909851
2024-12-15 20:13:28,861 - INFO - Epoch 6/1000 - Generator Loss: 0.6470097303390503, Discriminator Loss: 0.6486812829971313
2024-12-15 20:13:46,190 - INFO - Epoch 7/1000 - Generator Loss: 1.4880684614181519, Discriminator Loss: 0.43448740243911743
2024-12-15 20:14:03,668 - INFO - Epoch 8/1000 - Generator Loss: 1.03450882434

Early stopping at epoch 16 - Generator Loss: 1.1156466007232666


2024-12-15 20:16:32,487 - INFO - 
--- Imputation Pass 2/3 ---

2024-12-15 20:16:50,043 - INFO - Epoch 1/1000 - Generator Loss: 0.31663355231285095, Discriminator Loss: 1.1050056219100952
2024-12-15 20:17:06,616 - INFO - Epoch 2/1000 - Generator Loss: 0.7460512518882751, Discriminator Loss: 0.48876112699508667
2024-12-15 20:17:23,888 - INFO - Epoch 3/1000 - Generator Loss: 0.4728429317474365, Discriminator Loss: 1.0725009441375732
2024-12-15 20:17:40,678 - INFO - Epoch 4/1000 - Generator Loss: 0.8350852131843567, Discriminator Loss: 0.7614063620567322
2024-12-15 20:17:57,264 - INFO - Epoch 5/1000 - Generator Loss: 0.8276912569999695, Discriminator Loss: 0.5790270566940308
2024-12-15 20:18:15,041 - INFO - Epoch 6/1000 - Generator Loss: 0.5297325849533081, Discriminator Loss: 0.7068414092063904
2024-12-15 20:18:32,163 - INFO - Epoch 7/1000 - Generator Loss: 1.3719007968902588, Discriminator Loss: 0.48656290769577026
2024-12-15 20:18:49,714 - INFO - Epoch 8/1000 - Generator Loss: 0.6725568

Early stopping at epoch 11 - Generator Loss: 1.0925068855285645


2024-12-15 20:19:47,583 - INFO - 
--- Imputation Pass 3/3 ---

2024-12-15 20:20:05,172 - INFO - Epoch 1/1000 - Generator Loss: 1.2086658477783203, Discriminator Loss: 0.4787372350692749
2024-12-15 20:20:21,568 - INFO - Epoch 2/1000 - Generator Loss: 0.7568482160568237, Discriminator Loss: 1.2621774673461914
2024-12-15 20:20:38,895 - INFO - Epoch 3/1000 - Generator Loss: 0.9534355998039246, Discriminator Loss: 0.5031024217605591
2024-12-15 20:20:55,816 - INFO - Epoch 4/1000 - Generator Loss: 0.979882538318634, Discriminator Loss: 0.7164821624755859
2024-12-15 20:21:13,340 - INFO - Epoch 5/1000 - Generator Loss: 0.8582409620285034, Discriminator Loss: 0.44867050647735596
2024-12-15 20:21:31,137 - INFO - Epoch 6/1000 - Generator Loss: 0.45888277888298035, Discriminator Loss: 0.9068809151649475
2024-12-15 20:21:48,630 - INFO - Epoch 7/1000 - Generator Loss: 1.1851071119308472, Discriminator Loss: 0.6434530019760132
2024-12-15 20:22:06,910 - INFO - Epoch 8/1000 - Generator Loss: 1.009891629

Early stopping at epoch 16 - Generator Loss: 0.8439058661460876


2024-12-15 20:24:38,909 - INFO - 
--- Imputation Pass 1/3 ---

2024-12-15 20:24:48,354 - INFO - Epoch 1/1000 - Generator Loss: 1.3271923065185547, Discriminator Loss: 0.563438892364502
2024-12-15 20:24:56,159 - INFO - Epoch 2/1000 - Generator Loss: 1.1219958066940308, Discriminator Loss: 0.35019057989120483
2024-12-15 20:25:04,933 - INFO - Epoch 3/1000 - Generator Loss: 0.7714702486991882, Discriminator Loss: 0.7006574869155884
2024-12-15 20:25:13,150 - INFO - Epoch 4/1000 - Generator Loss: 2.10231351852417, Discriminator Loss: 0.5426560640335083
2024-12-15 20:25:22,515 - INFO - Epoch 5/1000 - Generator Loss: 0.843502402305603, Discriminator Loss: 0.6731006503105164
2024-12-15 20:25:30,733 - INFO - Epoch 6/1000 - Generator Loss: 1.686538577079773, Discriminator Loss: 0.39538073539733887
2024-12-15 20:25:39,799 - INFO - Epoch 7/1000 - Generator Loss: 1.7751681804656982, Discriminator Loss: 0.25392577052116394
2024-12-15 20:25:48,196 - INFO - Epoch 8/1000 - Generator Loss: 1.490135312080

Early stopping at epoch 13 - Generator Loss: 1.4669846296310425


2024-12-15 20:26:35,680 - INFO - 
--- Imputation Pass 2/3 ---

2024-12-15 20:26:45,042 - INFO - Epoch 1/1000 - Generator Loss: 0.45326852798461914, Discriminator Loss: 0.7565370202064514
2024-12-15 20:26:53,025 - INFO - Epoch 2/1000 - Generator Loss: 2.9247403144836426, Discriminator Loss: 0.5761120915412903
2024-12-15 20:27:01,765 - INFO - Epoch 3/1000 - Generator Loss: 0.7050061225891113, Discriminator Loss: 0.6115669012069702
2024-12-15 20:27:10,381 - INFO - Epoch 4/1000 - Generator Loss: 1.405816674232483, Discriminator Loss: 0.900619387626648
2024-12-15 20:27:19,103 - INFO - Epoch 5/1000 - Generator Loss: 1.6139945983886719, Discriminator Loss: 0.45358848571777344
2024-12-15 20:27:28,125 - INFO - Epoch 6/1000 - Generator Loss: 1.526914119720459, Discriminator Loss: 0.3692247271537781
2024-12-15 20:27:36,922 - INFO - Epoch 7/1000 - Generator Loss: 1.0742970705032349, Discriminator Loss: 0.5782427787780762
2024-12-15 20:27:46,349 - INFO - Epoch 8/1000 - Generator Loss: 1.52593171596

Early stopping at epoch 11 - Generator Loss: 1.5421669483184814


2024-12-15 20:28:14,762 - INFO - 
--- Imputation Pass 3/3 ---

2024-12-15 20:28:23,673 - INFO - Epoch 1/1000 - Generator Loss: 3.3230063915252686, Discriminator Loss: 3.0241923332214355
2024-12-15 20:28:32,347 - INFO - Epoch 2/1000 - Generator Loss: 1.6814552545547485, Discriminator Loss: 0.6391303539276123
2024-12-15 20:28:40,409 - INFO - Epoch 3/1000 - Generator Loss: 1.0187832117080688, Discriminator Loss: 0.5255692601203918
2024-12-15 20:28:49,508 - INFO - Epoch 4/1000 - Generator Loss: 0.9439082741737366, Discriminator Loss: 0.527639627456665
2024-12-15 20:28:57,997 - INFO - Epoch 5/1000 - Generator Loss: 1.2998247146606445, Discriminator Loss: 0.6741635799407959
2024-12-15 20:29:07,186 - INFO - Epoch 6/1000 - Generator Loss: 0.7202398777008057, Discriminator Loss: 0.5032034516334534
2024-12-15 20:29:15,496 - INFO - Epoch 7/1000 - Generator Loss: 1.1760680675506592, Discriminator Loss: 0.34310466051101685
2024-12-15 20:29:24,729 - INFO - Epoch 8/1000 - Generator Loss: 0.9272780418

Early stopping at epoch 16 - Generator Loss: 0.9700429439544678


2024-12-15 20:30:39,910 - INFO - 
--- Imputation Pass 1/3 ---

2024-12-15 20:30:41,159 - INFO - Epoch 1/1000 - Generator Loss: 2.196349859237671, Discriminator Loss: 0.1490233689546585
2024-12-15 20:30:42,190 - INFO - Epoch 2/1000 - Generator Loss: 2.130969762802124, Discriminator Loss: 0.23882807791233063
2024-12-15 20:30:43,157 - INFO - Epoch 3/1000 - Generator Loss: 1.4125478267669678, Discriminator Loss: 0.3225172758102417
2024-12-15 20:30:44,136 - INFO - Epoch 4/1000 - Generator Loss: 6.5022711753845215, Discriminator Loss: 3.0890088081359863
2024-12-15 20:30:45,101 - INFO - Epoch 5/1000 - Generator Loss: 0.9934859871864319, Discriminator Loss: 0.8561071157455444
2024-12-15 20:30:46,071 - INFO - Epoch 6/1000 - Generator Loss: 2.848104238510132, Discriminator Loss: 0.6010442972183228
2024-12-15 20:30:47,041 - INFO - Epoch 7/1000 - Generator Loss: 4.104102611541748, Discriminator Loss: 1.798781156539917
2024-12-15 20:30:47,996 - INFO - Epoch 8/1000 - Generator Loss: 1.38829278945922

Early stopping at epoch 19 - Generator Loss: 2.6762821674346924


2024-12-15 20:31:00,947 - INFO - Epoch 1/1000 - Generator Loss: 6.511636734008789, Discriminator Loss: 3.1273419857025146
2024-12-15 20:31:01,926 - INFO - Epoch 2/1000 - Generator Loss: 1.7458586692810059, Discriminator Loss: 0.2569155991077423
2024-12-15 20:31:02,937 - INFO - Epoch 3/1000 - Generator Loss: 2.7525248527526855, Discriminator Loss: 0.4538579285144806
2024-12-15 20:31:03,901 - INFO - Epoch 4/1000 - Generator Loss: 2.6167635917663574, Discriminator Loss: 1.566697597503662
2024-12-15 20:31:04,871 - INFO - Epoch 5/1000 - Generator Loss: 2.7925329208374023, Discriminator Loss: 1.7390623092651367
2024-12-15 20:31:05,828 - INFO - Epoch 6/1000 - Generator Loss: 1.523999571800232, Discriminator Loss: 1.9946774244308472
2024-12-15 20:31:06,798 - INFO - Epoch 7/1000 - Generator Loss: 2.475281000137329, Discriminator Loss: 0.6025263667106628
2024-12-15 20:31:07,770 - INFO - Epoch 8/1000 - Generator Loss: 0.7063435912132263, Discriminator Loss: 0.9137990474700928
2024-12-15 20:31:08,

Early stopping at epoch 18 - Generator Loss: 1.2624757289886475


2024-12-15 20:31:18,757 - INFO - 
--- Imputation Pass 3/3 ---

2024-12-15 20:31:19,862 - INFO - Epoch 1/1000 - Generator Loss: 2.3089189529418945, Discriminator Loss: 0.1619698703289032
2024-12-15 20:31:20,839 - INFO - Epoch 2/1000 - Generator Loss: 1.7068437337875366, Discriminator Loss: 0.22026391327381134
2024-12-15 20:31:21,831 - INFO - Epoch 3/1000 - Generator Loss: 5.350123405456543, Discriminator Loss: 2.3201537132263184
2024-12-15 20:31:22,827 - INFO - Epoch 4/1000 - Generator Loss: 3.1551671028137207, Discriminator Loss: 1.4398109912872314
2024-12-15 20:31:23,794 - INFO - Epoch 5/1000 - Generator Loss: 2.2355706691741943, Discriminator Loss: 0.8548267483711243
2024-12-15 20:31:24,910 - INFO - Epoch 6/1000 - Generator Loss: 1.7704933881759644, Discriminator Loss: 0.8258506059646606
2024-12-15 20:31:26,095 - INFO - Epoch 7/1000 - Generator Loss: 1.0017517805099487, Discriminator Loss: 1.0646023750305176
2024-12-15 20:31:27,261 - INFO - Epoch 8/1000 - Generator Loss: 0.7478488087

Early stopping at epoch 18 - Generator Loss: 2.772284984588623


2024-12-15 20:31:37,617 - INFO - 
--- Imputation Pass 1/3 ---

2024-12-15 20:31:38,727 - INFO - Epoch 1/1000 - Generator Loss: 2.1367287635803223, Discriminator Loss: 0.14177684485912323
2024-12-15 20:31:39,702 - INFO - Epoch 2/1000 - Generator Loss: 1.9269686937332153, Discriminator Loss: 0.14973507821559906
2024-12-15 20:31:40,830 - INFO - Epoch 3/1000 - Generator Loss: 3.029080629348755, Discriminator Loss: 0.12929172813892365
2024-12-15 20:31:42,018 - INFO - Epoch 4/1000 - Generator Loss: 6.7752580642700195, Discriminator Loss: 2.547776937484741
2024-12-15 20:31:43,227 - INFO - Epoch 5/1000 - Generator Loss: 2.8426356315612793, Discriminator Loss: 1.428961992263794
2024-12-15 20:31:44,401 - INFO - Epoch 6/1000 - Generator Loss: 3.743220329284668, Discriminator Loss: 0.549566924571991
2024-12-15 20:31:45,372 - INFO - Epoch 7/1000 - Generator Loss: 1.5729196071624756, Discriminator Loss: 0.31706130504608154
2024-12-15 20:31:46,354 - INFO - Epoch 8/1000 - Generator Loss: 2.13966345787

Early stopping at epoch 24 - Generator Loss: 1.3399971723556519


2024-12-15 20:32:03,881 - INFO - 
--- Imputation Pass 2/3 ---

2024-12-15 20:32:04,986 - INFO - Epoch 1/1000 - Generator Loss: 2.486748218536377, Discriminator Loss: 0.1497921347618103
2024-12-15 20:32:05,959 - INFO - Epoch 2/1000 - Generator Loss: 1.8400861024856567, Discriminator Loss: 0.2548302710056305
2024-12-15 20:32:06,952 - INFO - Epoch 3/1000 - Generator Loss: 1.2466868162155151, Discriminator Loss: 0.4656599164009094
2024-12-15 20:32:07,921 - INFO - Epoch 4/1000 - Generator Loss: 3.7990012168884277, Discriminator Loss: 1.189254641532898
2024-12-15 20:32:08,900 - INFO - Epoch 5/1000 - Generator Loss: 3.1964035034179688, Discriminator Loss: 0.8369903564453125
2024-12-15 20:32:09,858 - INFO - Epoch 6/1000 - Generator Loss: 1.4493598937988281, Discriminator Loss: 0.43827635049819946
2024-12-15 20:32:10,822 - INFO - Epoch 7/1000 - Generator Loss: 1.3341928720474243, Discriminator Loss: 1.3142163753509521
2024-12-15 20:32:11,776 - INFO - Epoch 8/1000 - Generator Loss: 1.12696695327

Early stopping at epoch 21 - Generator Loss: 0.8624267578125


2024-12-15 20:32:25,999 - INFO - 
--- Imputation Pass 3/3 ---

2024-12-15 20:32:27,116 - INFO - Epoch 1/1000 - Generator Loss: 4.024667739868164, Discriminator Loss: 1.3190240859985352
2024-12-15 20:32:28,116 - INFO - Epoch 2/1000 - Generator Loss: 4.638310432434082, Discriminator Loss: 1.074483871459961
2024-12-15 20:32:29,292 - INFO - Epoch 3/1000 - Generator Loss: 2.920468807220459, Discriminator Loss: 0.11640326678752899
2024-12-15 20:32:30,480 - INFO - Epoch 4/1000 - Generator Loss: 2.711681842803955, Discriminator Loss: 0.192236989736557
2024-12-15 20:32:31,650 - INFO - Epoch 5/1000 - Generator Loss: 2.3497092723846436, Discriminator Loss: 0.28061193227767944
2024-12-15 20:32:32,761 - INFO - Epoch 6/1000 - Generator Loss: 2.023092269897461, Discriminator Loss: 2.508610725402832
2024-12-15 20:32:33,769 - INFO - Epoch 7/1000 - Generator Loss: 1.4030590057373047, Discriminator Loss: 0.516247034072876
2024-12-15 20:32:34,730 - INFO - Epoch 8/1000 - Generator Loss: 2.757786989212036, 

Early stopping at epoch 19 - Generator Loss: 1.1578110456466675


# Save dataframes

In [5]:
external_total_missing_values = X_external_imputed.isna().sum().sum()
train_total_missing_values = X_train_imputed.isna().sum().sum()
validation_total_missing_values = X_validate_imputed.isna().sum().sum()
test_total_missing_values = X_test_imputed.isna().sum().sum()
print(external_total_missing_values, 'missing values in external dataset\n')
print(train_total_missing_values, 'missing values in train dataset\n')
print(validation_total_missing_values, 'missing values in validation dataset\n')
print(test_total_missing_values, 'missing values in test dataset\n')

0 missing values in external dataset

0 missing values in train dataset

0 missing values in validation dataset

0 missing values in test dataset



In [6]:
# Prepare dataframe in order to replace ages and race with the original ones.
# Reset indices and log information
logging.info("Resetting index for X_external")
X_external = X_external.reset_index(drop=True)

logging.info("Resetting index for X_external_imputed")
X_external_imputed = X_external_imputed.reset_index(drop=True)

logging.info("Resetting index for X_train")
X_train = X_train.reset_index(drop=True)

logging.info("Resetting index for X_train_imputed")
X_train_imputed = X_train_imputed.reset_index(drop=True)

logging.info("Resetting index for X_validate")
X_validate = X_validate.reset_index(drop=True)

logging.info("Resetting index for X_validate_imputed")
X_validate_imputed = X_validate_imputed.reset_index(drop=True)

logging.info("Resetting index for X_test")
X_test = X_test.reset_index(drop=True)

logging.info("Resetting index for X_test_imputed")
X_test_imputed = X_test_imputed.reset_index(drop=True)

2024-12-15 20:46:06,897 - INFO - Resetting index for X_external
2024-12-15 20:46:06,959 - INFO - Resetting index for X_external_imputed
2024-12-15 20:46:07,014 - INFO - Resetting index for X_train
2024-12-15 20:46:07,047 - INFO - Resetting index for X_train_imputed
2024-12-15 20:46:07,079 - INFO - Resetting index for X_validate
2024-12-15 20:46:07,089 - INFO - Resetting index for X_validate_imputed
2024-12-15 20:46:07,094 - INFO - Resetting index for X_test
2024-12-15 20:46:07,099 - INFO - Resetting index for X_test_imputed


In [7]:
# List of columns to replace: 'age' and columns starting with 'race_'
columns_to_replace = ['age'] + [col for col in X_external.columns if col.startswith('race_')]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    imputed_df[columns] = original_df[columns]
    return imputed_df

X_external_imputed = replace_columns(X_external_imputed, X_external, columns_to_replace)
logging.info("External replacement completed.")

2024-12-15 20:46:09,179 - INFO - External replacement completed.


In [8]:
# List of columns to replace: 'age' and columns starting with 'race_'
columns_to_replace = ['age'] + [col for col in X_train.columns if col.startswith('race_')]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    imputed_df[columns] = original_df[columns]
    return imputed_df


X_train_imputed = replace_columns(X_train_imputed, X_train, columns_to_replace)
logging.info("Train replacement completed.")

2024-12-15 20:46:10,462 - INFO - Train replacement completed.


In [9]:
# List of columns to replace: 'age' and columns starting with 'race_'
columns_to_replace = ['age'] + [col for col in X_validate.columns if col.startswith('race_')]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    imputed_df[columns] = original_df[columns]
    return imputed_df


X_validate_imputed = replace_columns(X_validate_imputed, X_validate, columns_to_replace)
logging.info("Validate replacement completed.")

2024-12-15 20:46:11,067 - INFO - Validate replacement completed.


In [10]:
# List of columns to replace: 'age' and columns starting with 'race_'
columns_to_replace = ['age'] + [col for col in X_test.columns if col.startswith('race_')]

# Function to replace specified columns in the imputed dataframe with original data
def replace_columns(imputed_df, original_df, columns):
    imputed_df[columns] = original_df[columns]
    return imputed_df


X_test_imputed = replace_columns(X_test_imputed, X_test, columns_to_replace)
logging.info("Test replacement completed.")

2024-12-15 20:46:11,335 - INFO - Test replacement completed.


In [11]:
save_path = '../CSV/exports/impute/o1_GAN/o01/'
logging.info(f"Save path set to: {save_path}")

# Check if the directory exists, and if not, create it
if not os.path.exists(save_path):
    logging.info(f"Directory does not exist. Creating directory: {save_path}")
    os.makedirs(save_path)
else:
    logging.info(f"Directory already exists: {save_path}")

# Save external validation set from eICU
logging.info(f"Saving X_external_imputed to {save_path + name + '_X_external.csv'}")
X_external_imputed.to_csv(save_path + name + '_X_external.csv', index=False)

logging.info(f"Saving y_external to {save_path + name + '_y_external.csv'}")
y_external.to_csv(save_path + name + '_y_external.csv', index=False)

# Save training set
logging.info(f"Saving X_train_imputed to {save_path + name + '_X_train.csv'}")
X_train_imputed.to_csv(save_path + name + '_X_train.csv', index=False)

logging.info(f"Saving y_train to {save_path + name + '_y_train.csv'}")
y_train.to_csv(save_path + name + '_y_train.csv', index=False)

# Save validation set
logging.info(f"Saving X_validate_imputed to {save_path + name + '_X_validate.csv'}")
X_validate_imputed.to_csv(save_path + name + '_X_validate.csv', index=False)

logging.info(f"Saving y_validate to {save_path + name + '_y_validate.csv'}")
y_validate.to_csv(save_path + name + '_y_validate.csv', index=False)

# Save test set
logging.info(f"Saving X_test_imputed to {save_path + name + '_X_test.csv'}")
X_test_imputed.to_csv(save_path + name + '_X_test.csv', index=False)

logging.info(f"Saving y_test to {save_path + name + '_y_test.csv'}")
y_test.to_csv(save_path + name + '_y_test.csv', index=False)

2024-12-15 20:46:14,234 - INFO - Save path set to: ../CSV/exports/impute/o1_GAN/o01/
2024-12-15 20:46:14,236 - INFO - Directory already exists: ../CSV/exports/impute/o1_GAN/o01/
2024-12-15 20:46:14,237 - INFO - Saving X_external_imputed to ../CSV/exports/impute/o1_GAN/o01/o4_X_external.csv
2024-12-15 20:46:26,622 - INFO - Saving y_external to ../CSV/exports/impute/o1_GAN/o01/o4_y_external.csv
2024-12-15 20:46:26,684 - INFO - Saving X_train_imputed to ../CSV/exports/impute/o1_GAN/o01/o4_X_train.csv
2024-12-15 20:46:33,107 - INFO - Saving y_train to ../CSV/exports/impute/o1_GAN/o01/o4_y_train.csv
2024-12-15 20:46:33,164 - INFO - Saving X_validate_imputed to ../CSV/exports/impute/o1_GAN/o01/o4_X_validate.csv
2024-12-15 20:46:33,982 - INFO - Saving y_validate to ../CSV/exports/impute/o1_GAN/o01/o4_y_validate.csv
2024-12-15 20:46:33,992 - INFO - Saving X_test_imputed to ../CSV/exports/impute/o1_GAN/o01/o4_X_test.csv
2024-12-15 20:46:34,809 - INFO - Saving y_test to ../CSV/exports/impute/o1_