In [1]:
import pandas as pd
import numpy as np
import os
import logging
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
# Route INFO-and-above records to both a log file and the notebook output.
log_handlers = [
    logging.FileHandler("data_loading.log"),  # persistent copy on disk
    logging.StreamHandler(),                  # echoed below the running cell
]
logging.basicConfig(
    handlers=log_handlers,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [3]:
# Directory holding the split CSV exports
data_path = "../CSV/exports/split_set/without_multiple_rows"
# os.listdir() returns entries in arbitrary order; sort so every run loads
# (and logs) the files in the same sequence.
all_files = sorted(os.listdir(data_path))

# Load each CSV into a dictionary of float32 dataframes, keyed by the file
# name without its extension and with '-' replaced by '_'.
dataframes = {}
for file in all_files:
    # splitext only strips the trailing extension, unlike str.replace which
    # would also remove a ".csv" occurring in the middle of the name.
    stem, extension = os.path.splitext(file)
    if extension != ".csv":
        continue
    var_name = stem.replace("-", "_")
    logging.info(f"Loading... -> {file}")
    dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Expose every dataframe as a notebook-level variable and log its shape
for var_name, df in dataframes.items():
    globals()[var_name] = df  # later cells refer to the datasets by bare name
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")

2025-02-01 11:33:59,055 - INFO - Loading... -> o1_X_external.csv
2025-02-01 11:34:06,469 - INFO - Loading... -> o1_X_test.csv
2025-02-01 11:34:07,013 - INFO - Loading... -> o1_X_train.csv
2025-02-01 11:34:11,347 - INFO - Loading... -> o1_X_validate.csv
2025-02-01 11:34:11,903 - INFO - Loading... -> o1_y_external_los.csv
2025-02-01 11:34:11,945 - INFO - Loading... -> o1_y_external_mortality.csv
2025-02-01 11:34:11,976 - INFO - Loading... -> o1_y_test_los.csv
2025-02-01 11:34:11,986 - INFO - Loading... -> o1_y_test_mortality.csv
2025-02-01 11:34:11,991 - INFO - Loading... -> o1_y_train_los.csv
2025-02-01 11:34:12,031 - INFO - Loading... -> o1_y_train_mortality.csv
2025-02-01 11:34:12,049 - INFO - Loading... -> o1_y_validate_los.csv
2025-02-01 11:34:12,059 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-02-01 11:34:12,064 - INFO - Loading... -> o2_X_external.csv
2025-02-01 11:34:15,749 - INFO - Loading... -> o2_X_test.csv
2025-02-01 11:34:16,030 - INFO - Loading... -> o2_X_train.c

In [None]:
# Preview the o1 training features (same object the loader exposed as `o1_X_train`)
display(dataframes["o1_X_train"])

In [4]:
def _impute_pending_rows(pending, reference, fallback_values, estimator):
    """Impute the rows of ``pending``, fitting on them stacked beneath ``reference``.

    Args:
        pending: Non-empty DataFrame of rows that still contain missing values.
        reference: DataFrame of already-imputed rows with the same columns,
            used as additional fitting data. May be empty.
        fallback_values: Per-column constants assigned to any column that has
            no observed value at all within ``pending``.
        estimator: Regressor passed to ``IterativeImputer``.

    Returns:
        2-D array holding one imputed row per row of ``pending``, in the same order.
    """
    pending = pending.copy()
    # A column with no observed value inside this group is set to its fallback
    # constant before fitting, so the imputer never sees an all-missing column.
    for col in pending.columns[pending.isnull().all(axis=0)]:
        pending.loc[:, col] = fallback_values[col]

    # Skip the concat when there is nothing to prepend (e.g. the first group).
    stacked = pending if reference.empty else pd.concat([reference, pending])
    imputer = IterativeImputer(random_state=0, estimator=estimator)
    filled = np.asarray(imputer.fit_transform(stacked))
    # The pending rows were stacked last, so they form the trailing block.
    return filled[len(stacked) - len(pending):]


def hierarchical_impute(df, thresholds=(0.10, 0.20, 1.00)):
    """Perform hierarchical imputation by splitting rows into groups based on missingness.

    Groups are imputed in order, and each later group is fitted together with
    the rows that were already imputed:
        - Group 1: missing percentage <= thresholds[0]
        - Group 2: thresholds[0] < missing percentage <= thresholds[1]
        - Group 3: missing percentage > thresholds[1]

    Within a group, a column that has no observed value is filled with that
    column's mean over the whole of ``df`` (or 0 when the column is entirely
    missing in ``df``) before the iterative imputer runs.

    Args:
        df: DataFrame to impute. It is not modified.
        thresholds: Sequence of row-missingness cut-offs as fractions. Only the
            first two entries are read; any further entry is ignored.

    Returns:
        df_imputed: float64 DataFrame with imputed values and the same index
        and columns as ``df``.

    Raises:
        ValueError: If fewer than two thresholds are given, or if any NaN
            remains after all groups have been processed.
    """
    if len(thresholds) < 2:
        raise ValueError("thresholds must contain at least two cut-offs")
    low, high = thresholds[0], thresholds[1]

    # Fraction of missing values per row, kept outside the frame so that no
    # helper column is mixed into the data being imputed.
    missing_pct = df.isnull().mean(axis=1).to_numpy()

    # Column means over the full frame; a column that is entirely missing has
    # a NaN mean, which is replaced by 0.
    global_means = df.mean().fillna(0)

    # Explicit float dtype: a frame built from index/columns alone would be
    # object-typed and the function would hand back object columns.
    imputed_df = pd.DataFrame(index=df.index, columns=df.columns, dtype='float64')

    # ExtraTreesRegressor is used inside IterativeImputer for numerical stability.
    base_estimator = ExtraTreesRegressor(n_estimators=10, random_state=0)

    groups = [
        (1, f"Group 1 (<= {low*100:.0f}% missing)", missing_pct <= low),
        (2, f"Group 2 (> {low*100:.0f}% and <= {high*100:.0f}% missing)",
         (missing_pct > low) & (missing_pct <= high)),
        (3, f"Group 3 (> {high*100:.0f}% missing)", missing_pct > high),
    ]

    # Positional mask of rows that already hold imputed values
    done = np.zeros(len(df), dtype=bool)
    for number, label, mask in groups:
        if not mask.any():
            logging.info(f"No rows found for Group {number}.")
            continue
        pending = df.loc[mask]
        logging.info(f"{label}: {pending.shape[0]} rows")
        filled = _impute_pending_rows(pending, imputed_df.loc[done], global_means, base_estimator)
        # Positional write keeps the rows aligned even if index labels repeat.
        imputed_df.iloc[np.flatnonzero(mask)] = filled
        done |= mask

    # Final check: if any NaN remains, raise an error
    if imputed_df.isnull().values.any():
        raise ValueError("NaN values remain after hierarchical imputation!")

    return imputed_df

In [None]:
# Feature sets to impute (train / test / validate / external for o1..o4)
datasets_to_impute = [
    "o1_X_train", "o2_X_train", "o3_X_train", "o4_X_train",
    "o1_X_test", "o2_X_test", "o3_X_test", "o4_X_test",
    "o1_X_validate", "o2_X_validate", "o3_X_validate", "o4_X_validate",
    "o1_X_external", "o2_X_external", "o3_X_external", "o4_X_external"
]

# Set the output path for imputed CSVs
output_path = '../CSV/exports/impute/hierarchical/'
os.makedirs(output_path, exist_ok=True)

# Process and impute each dataset using the hierarchical strategy.
# Failures and missing datasets are recorded so the closing log line
# reflects what actually happened instead of always reporting success.
imputed_datasets = {}
failed_datasets = []
missing_datasets = []
for dataset_name in datasets_to_impute:
    if dataset_name not in dataframes:
        logging.warning(f"Dataset {dataset_name} not found!")
        missing_datasets.append(dataset_name)
        continue

    logging.info(f"Processing dataset: {dataset_name}")
    df = dataframes[dataset_name]
    try:
        imputed_df = hierarchical_impute(df, thresholds=[0.10, 0.20, 1.00])

        # Validate no NaN values remain
        if imputed_df.isnull().values.any():
            raise ValueError(f"NaN values found in dataset {dataset_name} after imputation.")

        imputed_datasets[dataset_name] = imputed_df

        # Save the imputed dataset
        output_file = os.path.join(output_path, f"{dataset_name}.csv")
        imputed_df.to_csv(output_file, index=False)
        logging.info(f"Imputed dataset saved as {output_file}")
    except ValueError as e:
        # Keep going with the remaining datasets, but remember the failure
        logging.error(f"Error in {dataset_name}: {e}")
        failed_datasets.append(dataset_name)

if failed_datasets or missing_datasets:
    logging.warning(
        f"Imputation finished with problems. Failed: {failed_datasets}; not found: {missing_datasets}"
    )
else:
    logging.info("All datasets have been imputed successfully.")

2025-02-01 11:35:01,855 - INFO - Processing dataset: o1_X_train
2025-02-01 11:35:03,027 - INFO - Group 1 (<= 10% missing): 48 rows
2025-02-01 11:35:07,623 - INFO - Group 2 (> 10% and <= 20% missing): 8496 rows
2025-02-01 13:43:03,593 - INFO - Group 3 (> 20% missing): 113952 rows
