In [1]:
import inspect
import logging
import os

import numpy as np
import pandas as pd

# Iterative imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

# GAN imputation using PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
# Logging configuration: records at INFO level and above are formatted as
# "<timestamp> - <level> - <message>" and sent to two handlers, the file
# "data_loading.log" and a stream handler.
logging.basicConfig(
    handlers=[logging.FileHandler("data_loading.log"), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [3]:
# CSVs Directory
data_path = "../CSV/exports/split_set/without_multiple_rows"
# os.listdir() returns entries in arbitrary order; sorting keeps the load
# sequence (and therefore the log) the same on every run and every machine.
all_files = sorted(os.listdir(data_path))

# Load CSVs into a dictionary of dataframes, keyed by the file name without
# its ".csv" extension and with "-" replaced by "_".
dataframes = {}
for file in all_files:
    if not file.endswith(".csv"):
        continue
    # Strip only the trailing extension; a plain str.replace would also
    # remove ".csv" if it appeared in the middle of the name.
    var_name = file[: -len(".csv")].replace("-", "_")
    logging.info(f"Loading... -> {file}")
    # Every column is cast to float32, so the files must be fully numeric.
    dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Log loaded datasets
for var_name, df in dataframes.items():
    # Later cells refer to the frames by bare name (e.g. o3_X_train), so each
    # frame is also published as a notebook-level variable.
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")

2025-02-10 19:05:39,477 - INFO - Loading... -> o1_X_external.csv
2025-02-10 19:05:46,724 - INFO - Loading... -> o1_X_test.csv
2025-02-10 19:05:47,221 - INFO - Loading... -> o1_X_train.csv
2025-02-10 19:05:51,200 - INFO - Loading... -> o1_X_validate.csv
2025-02-10 19:05:51,700 - INFO - Loading... -> o1_y_external_los.csv
2025-02-10 19:05:51,747 - INFO - Loading... -> o1_y_external_mortality.csv
2025-02-10 19:05:51,777 - INFO - Loading... -> o1_y_test_los.csv
2025-02-10 19:05:51,787 - INFO - Loading... -> o1_y_test_mortality.csv
2025-02-10 19:05:51,793 - INFO - Loading... -> o1_y_train_los.csv
2025-02-10 19:05:51,839 - INFO - Loading... -> o1_y_train_mortality.csv
2025-02-10 19:05:51,861 - INFO - Loading... -> o1_y_validate_los.csv
2025-02-10 19:05:51,872 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-02-10 19:05:51,878 - INFO - Loading... -> o2_X_external.csv
2025-02-10 19:05:55,462 - INFO - Loading... -> o2_X_test.csv
2025-02-10 19:05:55,733 - INFO - Loading... -> o2_X_train.c

In [4]:
percent = 30

# Dimensions of the training frame
total_rows, total_columns = o3_X_train.shape

# Share of missing cells in every row, on a 0-100 scale
missing_percentage_per_row = 100 * o3_X_train.isna().mean(axis=1)

# How many rows have at most `percent` % of their cells missing
missing_rows = missing_percentage_per_row.le(percent).sum()

# The same count expressed as a percentage of all rows
percent_between = 100 * missing_rows / total_rows

# Display the results
print(f"Total Rows: {total_rows}, Total Columns: {total_columns}")
print(f"Number of rows with missing values up to {percent} %: {missing_rows}")
print(f"The percentage between total and missing values sets are {percent_between:.2f}%")

Total Rows: 40832, Total Columns: 345
Number of rows with missing values up to 30 %: 10912
The percentage between total and missing values sets are 26.72%


# GAN Imputation Method

In [5]:
# Custom Dataset class for PyTorch
def prepare_dataset(data):
    """Wrap an array in a PyTorch ``Dataset`` that serves float32 tensors.

    Parameters
    ----------
    data : array-like exposing ``astype`` (e.g. a NumPy array)
        Values to serve; they are cast to float32.

    Returns
    -------
    Dataset
        ``len()`` is the length of the stored tensor and item ``idx`` is
        that tensor indexed at ``idx``.
    """
    class CustomDataset(Dataset):
        """Holds the whole array in memory as a single float32 tensor."""

        def __init__(self, array):
            as_float32 = array.astype(np.float32)
            self.data = torch.tensor(as_float32, dtype=torch.float32)

        def __getitem__(self, idx):
            return self.data[idx]

        def __len__(self):
            return len(self.data)

    wrapped = CustomDataset(data)
    return wrapped

# Define Generator model
class Generator(nn.Module):
    def __init__(self, input_dim):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim)
        )

    def forward(self, x):
        return self.main(x)

# Define Discriminator model
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.main(x)

In [6]:
# GAN Imputation Function
def gan_impute_method(X, num_passes=3, epochs=1000, batch_size=64, learning_rate=0.0002, patience=10,
                      random_state=None):
    """Fill the missing cells of ``X`` with the output of an adversarially trained generator.

    Missing cells start at 0. On every pass a fresh Generator/Discriminator
    pair is trained on the current table, after which the generator's output
    is written into the originally-missing cells only. Observed cells are
    never changed, neither during training nor in the returned frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric table in which NaN marks the missing cells. Not modified.
    num_passes : int
        Number of train-then-refill rounds.
    epochs : int
        Maximum number of training epochs per pass.
    batch_size : int
        Rows per training batch.
    learning_rate : float
        Learning rate of both Adam optimizers.
    patience : int
        A pass stops early once the epoch-mean generator loss has failed to
        improve for this many consecutive epochs.
    random_state : int or None
        When given, it is passed to ``torch.manual_seed`` before training.
        Note that this reseeds torch's global random generator.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``X``; observed cells are unchanged and the
        originally-missing cells hold the values from the last pass.
    """
    if random_state is not None:
        torch.manual_seed(random_state)

    # True where the ORIGINAL value is missing: these are the only cells the
    # generator is allowed to fill.
    mask = X.isna()
    missing = mask.values
    # Fill missing values with 0 as a starting point
    X_imputed = X.fillna(0)

    # Nothing to impute (this also covers a frame with no rows): skip training.
    if not missing.any():
        return X_imputed

    input_dim = X.shape[1]
    mask_tensor = torch.tensor(missing, dtype=torch.bool)
    # Current state of the table as a plain array; replaced after each pass.
    filled = X_imputed.values

    for pass_num in range(num_passes):
        logging.info(f"GAN Imputation Pass {pass_num + 1}/{num_passes}")

        # Pair every row with its own mask row so that shuffling can never
        # separate them (slicing the mask by batch position is only valid for
        # an unshuffled loader).
        data_tensor = torch.tensor(filled.astype(np.float32), dtype=torch.float32)
        dataset = torch.utils.data.TensorDataset(data_tensor, mask_tensor)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Initialize Generator and Discriminator
        generator = Generator(input_dim)
        discriminator = Discriminator(input_dim)

        optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
        optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)
        adversarial_loss = nn.BCELoss()

        best_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            g_loss_total = 0.0
            d_loss_total = 0.0
            n_batches = 0

            for real_data, batch_mask in dataloader:
                # Prepare labels
                valid = torch.ones(real_data.size(0), 1)
                fake = torch.zeros(real_data.size(0), 1)

                # Train Generator
                optimizer_G.zero_grad()
                generated = generator(real_data)
                # Keep observed cells exactly as they are and take the
                # generator's output only where the value was missing.
                gen_data = torch.where(batch_mask, generated, real_data)

                g_loss = adversarial_loss(discriminator(gen_data), valid)
                g_loss.backward()
                optimizer_G.step()

                # Train Discriminator
                optimizer_D.zero_grad()
                real_loss = adversarial_loss(discriminator(real_data), valid)
                fake_loss = adversarial_loss(discriminator(gen_data.detach()), fake)
                d_loss = (real_loss + fake_loss) / 2
                d_loss.backward()
                optimizer_D.step()

                g_loss_total += g_loss.item()
                d_loss_total += d_loss.item()
                n_batches += 1

            # Mean of the per-batch losses: the loss of the last shuffled
            # batch alone is too noisy to drive early stopping.
            epoch_g_loss = g_loss_total / n_batches
            epoch_d_loss = d_loss_total / n_batches
            logging.info(f"Epoch {epoch + 1}/{epochs} - G_loss: {epoch_g_loss:.4f}, D_loss: {epoch_d_loss:.4f}")

            # Early stopping based on generator loss
            if epoch_g_loss < best_loss:
                best_loss = epoch_g_loss
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                logging.info(f"Early stopping at epoch {epoch + 1} with G_loss: {epoch_g_loss:.4f}")
                break

        # After training, update the missing values
        with torch.no_grad():
            refined_data = generator(data_tensor).numpy()
        # Only update the positions where data was missing. A new array is
        # built instead of writing through DataFrame.values, which is not
        # guaranteed to be a writable view of the frame.
        filled = np.where(missing, refined_data, filled)

    return pd.DataFrame(filled, index=X.index, columns=X.columns)

# Iterative Imputation Method

In [7]:
"""
Impute using IterativeImputer with an ExtraTreesRegressor.
Returns a DataFrame with imputed values.
"""
def iterative_impute_method(data, random_state=0):
    """Impute ``data`` with IterativeImputer driven by an ExtraTreesRegressor.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to impute; its index and columns are reused for the result.
    random_state : int
        Seed given to both the imputer and the 10-tree regressor.

    Returns
    -------
    pandas.DataFrame
        Output of ``fit_transform`` on ``data``, labelled like ``data``.
    """
    imputer = IterativeImputer(
        estimator=ExtraTreesRegressor(n_estimators=10, random_state=random_state),
        random_state=random_state,
    )
    filled_values = imputer.fit_transform(data)
    return pd.DataFrame(filled_values, index=data.index, columns=data.columns)

# Hierarchical Imputation Function

In [8]:
# Dynamic hierarchical imputation function

def _accepts_random_state(method):
    """Return True when ``method`` can be called with a ``random_state`` keyword."""
    try:
        parameters = inspect.signature(method).parameters
    except (TypeError, ValueError):
        # Signature cannot be inspected: call the method without the seed.
        return False
    return "random_state" in parameters or any(
        parameter.kind is inspect.Parameter.VAR_KEYWORD for parameter in parameters.values()
    )


def hierarchical_impute_dynamic(df, thresholds, methods, random_state=0):
    """Impute ``df`` group by group, from the least to the most incomplete rows.

    Rows are split into groups by their fraction of missing cells. Groups are
    processed in order; each group is imputed together with all rows imputed
    so far, so later groups can learn from the earlier, more complete ones.
    A column that is entirely missing inside a group is pre-filled with that
    column's mean over the whole of ``df`` (0 if the column has no observed
    value at all).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table with NaN marking missing cells. Not modified. Rows are
        addressed by index label, so the index is assumed to be unique.
    thresholds : sequence of float
        Group widths, e.g. ``[0.1, 0.1, 0.8]``, ``[0.05, 0.05, 0.1, 0.1, 0.7]``
        or ``[0.05] * 20``. They are converted to cumulative upper bounds and
        must sum to 1.
    methods : sequence of callable
        One imputation function per group. Each is called as
        ``method(frame, random_state=random_state)`` when it accepts that
        keyword and as ``method(frame)`` otherwise, and must return a
        DataFrame carrying the same index labels as ``frame``.
    random_state : int
        Seed forwarded to the methods that accept it.

    Returns
    -------
    pandas.DataFrame
        float64 frame with the index and columns of ``df`` and no NaN.

    Raises
    ------
    ValueError
        If ``thresholds`` is empty, differs in length from ``methods``, does
        not sum to 1, or if NaN values remain after all groups are processed.
    """
    if len(thresholds) != len(methods):
        raise ValueError("The number of thresholds must equal the number of methods provided.")
    if len(thresholds) == 0:
        raise ValueError("At least one threshold is required.")

    # Compute cumulative thresholds.
    cum_thresholds = np.cumsum(np.asarray(thresholds, dtype=float))
    if not np.isclose(cum_thresholds[-1], 1.0):
        raise ValueError("The sum of thresholds must be 1.0 (or very close to it).")
    # Rounding can leave the last bound just below 1 (e.g. [0.1] * 10), which
    # would exclude fully-missing rows from every group; pin it to exactly 1.
    cum_thresholds[-1] = 1.0

    cols = df.columns
    # Fraction of missing cells per row, kept as a separate Series so that no
    # helper column is added to (or can collide with) the data columns.
    missing_pct = df.isnull().mean(axis=1)

    # Compute global means for each column.
    global_means = df.mean().fillna(0)

    # Float result frame; an untyped empty frame would be object dtype.
    imputed_df = pd.DataFrame(np.nan, index=df.index, columns=cols, dtype="float64")

    previous_imputed = None

    # Process each group in sequence.
    for i, upper_bound in enumerate(cum_thresholds):
        if i == 0:
            lower_bound = 0.0
            # The first group also contains rows with nothing missing.
            in_group = missing_pct <= upper_bound
        else:
            lower_bound = cum_thresholds[i - 1]
            in_group = (missing_pct > lower_bound) & (missing_pct <= upper_bound)
        idx = df.index[in_group]

        group_data = df.loc[idx, cols].copy()
        # For any column completely missing in this group, fill with the global mean.
        for col in group_data.columns:
            if group_data[col].isnull().all():
                group_data[col] = global_means[col]

        # Count the total missing values in the current group.
        missing_count = group_data.isnull().sum().sum()

        logging.info(f"Group {i+1} (missing_pct in ({lower_bound:.2f}, {upper_bound:.2f}]): "
                     f"{group_data.shape[0]} rows, {missing_count} missing values")

        if group_data.empty:
            continue

        # Combine previously imputed rows (if any) with current group data.
        if previous_imputed is None:
            combined = group_data
        else:
            combined = pd.concat([previous_imputed, group_data])

        # Apply the selected imputation method for this group. The signature
        # is checked up front so that a TypeError raised inside the method is
        # not mistaken for "does not take random_state" and the method re-run.
        current_method = methods[i]
        if _accepts_random_state(current_method):
            combined_imputed = current_method(combined, random_state=random_state)
        else:
            combined_imputed = current_method(combined)

        # Extract imputed values corresponding to the current group.
        group_imputed = combined_imputed.loc[idx]
        imputed_df.loc[idx] = group_imputed

        # Update previous_imputed by combining with the newly imputed rows.
        if previous_imputed is None:
            previous_imputed = group_imputed.copy()
        else:
            previous_imputed = pd.concat([previous_imputed, group_imputed])

    if imputed_df.isnull().values.any():
        raise ValueError("NaN values remain after hierarchical imputation!")

    return imputed_df

# Set Up Thresholds and Imputation Methods we are going to use

In [11]:
"""
dynamic_thresholds : List of thresholds that specify group widths
                    e.g. [0.1, 0.1, 0.8]
                    or   [0.05, 0.05, 0.1, 0.1, 0.7]
                    or   [0.05]*20.
                    These will be converted to cumulative thresholds.
                    No matter what the sum of the thresholds must be 1
"""


# Group widths: 25 equal slices of 4% missingness each.
dynamic_thresholds = [0.04 for _ in range(25)]

# Setting up which method we are going to use for every threshold.
# The number of methods must be the same as the number of thresholds:
# the first 7 groups use the iterative imputer, the remaining 18 the GAN.
dynamic_methods = (
    [iterative_impute_method for _ in range(7)]
    + [gan_impute_method for _ in range(18)]
)

# Full list of dataset names, kept for reference (currently not used):
#   "o1_X_train", "o2_X_train", "o3_X_train", "o4_X_train",
#   "o1_X_test", "o2_X_test", "o3_X_test", "o4_X_test",
#   "o1_X_validate", "o2_X_validate", "o3_X_validate", "o4_X_validate",
#   "o1_X_external", "o2_X_external", "o3_X_external", "o4_X_external"

# List of dataset names actually imputed in this run
datasets_to_impute = [
    f"o3_X_{split}" for split in ("train", "test", "validate", "external")
]

# Create the save path.
# NOTE(review): the folder is named "o4_..." while the datasets above are
# "o3_..." - confirm this is intended.
output_path = '../CSV/exports/impute/o4_Hierarchical/'
os.makedirs(output_path, exist_ok=True)

# Function Call

In [None]:
imputed_datasets = {}
# Names of datasets that were missing or failed, so that the closing log
# line reports what actually happened.
failed_datasets = []
# Assuming your CSVs have already been loaded into a dictionary called 'dataframes'
for dataset_name in datasets_to_impute:
    if dataset_name not in globals().get("dataframes", {}):
        logging.warning(f"Dataset {dataset_name} not found!")
        failed_datasets.append(dataset_name)
        continue

    logging.info(f"Processing dataset: {dataset_name}")
    df = dataframes[dataset_name]
    try:
        imputed_df = hierarchical_impute_dynamic(df, thresholds=dynamic_thresholds, methods=dynamic_methods, random_state=0)
        # Defensive re-check before anything is written to disk.
        if imputed_df.isnull().values.any():
            raise ValueError(f"NaN values found in dataset {dataset_name} after imputation.")
        imputed_datasets[dataset_name] = imputed_df
        # Each result is saved as soon as it is ready, so a later failure
        # does not lose the datasets that were already imputed.
        output_file = os.path.join(output_path, f"{dataset_name}.csv")
        imputed_df.to_csv(output_file, index=False)
        logging.info(f"Imputed dataset saved as {output_file}")
    except ValueError as e:
        logging.error(f"Error in {dataset_name}: {e}")
        failed_datasets.append(dataset_name)

if failed_datasets:
    logging.warning(f"Imputation finished, but these datasets were not imputed: {failed_datasets}")
else:
    logging.info("All datasets have been imputed successfully.")

2025-02-10 19:06:34,766 - INFO - Processing dataset: o3_X_train
2025-02-10 19:06:35,309 - INFO - Group 1 (missing_pct in (0.00, 0.04]): 0 rows, 0 missing values
2025-02-10 19:06:35,383 - INFO - Group 2 (missing_pct in (0.04, 0.08]): 16 rows, 0 missing values
2025-02-10 19:06:39,566 - INFO - Group 3 (missing_pct in (0.08, 0.12]): 160 rows, 5440 missing values
2025-02-10 19:08:47,644 - INFO - Group 4 (missing_pct in (0.12, 0.16]): 560 rows, 27584 missing values
2025-02-10 19:17:43,257 - INFO - Group 5 (missing_pct in (0.16, 0.20]): 2112 rows, 131712 missing values
2025-02-10 19:57:53,701 - INFO - Group 6 (missing_pct in (0.20, 0.24]): 2688 rows, 205824 missing values
2025-02-10 21:41:34,201 - INFO - Group 7 (missing_pct in (0.24, 0.28]): 3840 rows, 346816 missing values
