# CIR-03: Hierarchical Imputation Framework

In [1]:
import pandas as pd
import numpy as np
import os
import io
import sys
import time
import logging
import copy
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

from matplotlib.patches import Patch
from xgboost import XGBRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.validation import check_is_fitted

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, Dense, Concatenate, GRU
from tensorflow.keras.optimizers import Adam

In [2]:
# Initial logger setup: configure the root logger so that both ``logger.*`` and
# the module-level ``logging.*`` calls used in this notebook share the handlers.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Global variable to hold the active file handler
current_file_handler = None

# Create the stream handler
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)


def switch_log_file(filename):
    """Route file logging to ``filename``, replacing any previous log file.

    The previously active file handler (if any) is detached and closed, the
    parent directory of ``filename`` is created when missing, and a new
    ``logging.FileHandler`` using the shared formatter is attached to the
    root logger. The stream handler is left untouched.

    Parameters:
        filename (str): Path of the log file to write to from now on.
    """
    global current_file_handler

    # If a file handler already exists, remove and close it
    if current_file_handler is not None:
        logger.removeHandler(current_file_handler)
        current_file_handler.close()
        # Drop the reference so a failure below never leaves a closed handler
        # registered as the "current" one.
        current_file_handler = None

    # FileHandler does not create missing directories; do it here so the
    # first run in a fresh checkout does not fail with FileNotFoundError.
    log_dir = os.path.dirname(filename)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Create a new file handler
    current_file_handler = logging.FileHandler(filename)
    current_file_handler.setFormatter(formatter)
    logger.addHandler(current_file_handler)

    logger.info(f"Switched logging to {filename}")

In [3]:
# Build log file: from here on, file logging goes to logs/CIR-2.log
# (switch_log_file detaches and closes any previously active file handler).
switch_log_file('logs/CIR-2.log')
logger.info("This is being logged to CIR-2.log")

2025-05-05 18:32:15,166 - INFO - Switched logging to logs/CIR-2.log
2025-05-05 18:32:15,166 - INFO - This is being logged to CIR-2.log


In [4]:
# Load datasets: every CSV found in the export folder becomes a float32 DataFrame
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
all_files = os.listdir(data_path)

logging.info("+++++++++++++++++CIR-2+++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Read the CSVs into a dictionary keyed by the file stem, with "-" turned
# into "_" so the key is also usable as a Python identifier.
dataframes = {}
for file in all_files:
    if not file.endswith(".csv"):
        continue  # ignore anything that is not a CSV export
    var_name = file.replace(".csv", "").replace("-", "_")
    logging.info(f"Loading... -> {file}")
    dataframes[var_name] = pd.read_csv(
        os.path.join(data_path, file)
    ).astype('float32')

# Publish each frame as a notebook-level variable (later cells look the
# frames up by name) and report its shape.
for var_name, df in dataframes.items():
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-05-05 18:32:15,182 - INFO - +++++++++++++++++CIR-2+++++++++++++++++++++++++
2025-05-05 18:32:15,182 - INFO - Start Loading Dataframes.
2025-05-05 18:32:15,182 - INFO - Loading... -> o1_X_external.csv
2025-05-05 18:32:22,038 - INFO - Loading... -> o1_X_test.csv
2025-05-05 18:32:22,541 - INFO - Loading... -> o1_X_train.csv
2025-05-05 18:32:26,827 - INFO - Loading... -> o1_X_validate.csv
2025-05-05 18:32:27,373 - INFO - Loading... -> o1_y_external_los.csv
2025-05-05 18:32:27,411 - INFO - Loading... -> o1_y_external_mortality.csv
2025-05-05 18:32:27,442 - INFO - Loading... -> o1_y_test_los.csv
2025-05-05 18:32:27,458 - INFO - Loading... -> o1_y_test_mortality.csv
2025-05-05 18:32:27,458 - INFO - Loading... -> o1_y_train_los.csv
2025-05-05 18:32:27,505 - INFO - Loading... -> o1_y_train_mortality.csv
2025-05-05 18:32:27,511 - INFO - Loading... -> o1_y_validate_los.csv
2025-05-05 18:32:27,527 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-05-05 18:32:27,527 - INFO - Loading... -

# CIR-14: Implement Row Segmentation by Missingness

In [5]:
# Build log file: from here on, file logging goes to logs/CIR-14.log
# (switch_log_file detaches and closes the previously active file handler).
switch_log_file('logs/CIR-14.log')
logger.info("This is being logged to CIR-14.log")

2025-05-05 18:32:41,443 - INFO - Switched logging to logs/CIR-14.log
2025-05-05 18:32:41,443 - INFO - This is being logged to CIR-14.log


In [6]:
"""
Segments the dataframe rows into categories based on the percentage of missing values.
"""
def segment_rows_by_missingness(df: pd.DataFrame):
    row_missing_perc = df.isnull().mean(axis=1)

    segments = {
        'very_low_missing 0% < 20%': df[(row_missing_perc <= 0.20)],
        'low_missing 21% <= 40%': df[(row_missing_perc > 0.20) & (row_missing_perc <= 0.40)],
        'moderate_missing 41% <= 60%': df[(row_missing_perc > 0.40) & (row_missing_perc <= 0.60)],
        'high_missing > 60%': df[(row_missing_perc > 0.60)]
    }

    row_indices = {
        name: segment.index.tolist() for name, segment in segments.items()
    }

    return segments, row_indices, row_missing_perc

In [7]:
# Segment every feature frame (names starting with "o" and containing "_X_")
# and log how many rows fall into each missingness band.
logging.info("---------------")
for var_name, df in dataframes.items():
    if var_name.startswith("o") and "_X_" in var_name:
        logging.info(f"Segmenting rows by missingness for: {var_name}")
        logging.info(f"{var_name} - Total rows: {df.shape[0]}")

        segments, row_indices, row_missing_perc = segment_rows_by_missingness(df)

        # One log line per band, followed by a separator for readability
        for segment_name, segment_df in segments.items():
            logging.info(f"{var_name} - {segment_name}: {len(segment_df)} rows")
        logging.info("---------------")
    # anything else (target frames and other non-feature data) is skipped

2025-05-05 18:32:41,463 - INFO - ---------------
2025-05-05 18:32:41,463 - INFO - Segmenting rows by missingness for: o1_X_external
2025-05-05 18:32:41,463 - INFO - o1_X_external - Total rows: 234720
2025-05-05 18:32:41,875 - INFO - o1_X_external - very_low_missing 0% < 20%: 576 rows
2025-05-05 18:32:41,875 - INFO - o1_X_external - low_missing 21% <= 40%: 49729 rows
2025-05-05 18:32:41,875 - INFO - o1_X_external - moderate_missing 41% <= 60%: 153407 rows
2025-05-05 18:32:41,875 - INFO - o1_X_external - high_missing > 60%: 31008 rows
2025-05-05 18:32:41,875 - INFO - ---------------
2025-05-05 18:32:41,875 - INFO - Segmenting rows by missingness for: o1_X_test
2025-05-05 18:32:41,875 - INFO - o1_X_test - Total rows: 15312
2025-05-05 18:32:41,944 - INFO - o1_X_test - very_low_missing 0% < 20%: 1248 rows
2025-05-05 18:32:41,944 - INFO - o1_X_test - low_missing 21% <= 40%: 7872 rows
2025-05-05 18:32:41,944 - INFO - o1_X_test - moderate_missing 41% <= 60%: 5424 rows
2025-05-05 18:32:41,944 -

In [8]:
# Ensure output directory exists
base_plot_path = "figures/CIR-14"
os.makedirs(base_plot_path, exist_ok=True)

# Seaborn aesthetic settings
sns.set(style="whitegrid", context="talk", palette="deep")

# One histogram of row-wise missingness per feature frame, saved as a PNG.
for var_name, df in dataframes.items():
    if not var_name.startswith("o") or "_X_" not in var_name:
        continue  # Skip targets

    logging.info(f"Processing missing distribution plot for {var_name}")

    # The helper already returns the row-wise missing fraction, so reuse it
    # instead of scanning the whole frame a second time.
    segments, _, row_missing_perc = segment_rows_by_missingness(df)

    # Prepare summary box content
    summary_text = (
        f"Total rows: {len(df):,}\n"
        f"Very low (≤20%): {len(segments['very_low_missing 0% < 20%']):,}\n"
        f"Low (21–40%): {len(segments['low_missing 21% <= 40%']):,}\n"
        f"Moderate (41–60%): {len(segments['moderate_missing 41% <= 60%']):,}\n"
        f"High (>60%): {len(segments['high_missing > 60%']):,}"
    )

    # Create the figure
    fig, ax = plt.subplots(figsize=(12, 7))
    sns.histplot(row_missing_perc, bins=20, kde=True, color='#2c7fb8', edgecolor='black', ax=ax)

    # Customize titles and labels
    ax.set_title(f"Row-wise Missing Value Distribution\n{var_name}", fontsize=18, fontweight='bold')
    ax.set_xlabel("Proportion of Missing Values", fontsize=15)
    ax.set_ylabel("Number of Rows", fontsize=15)

    # Add summary box to top-right
    ax.text(
        0.99, 0.95, summary_text,
        transform=ax.transAxes,
        fontsize=12,
        verticalalignment='top',
        horizontalalignment='right',
        bbox=dict(boxstyle="round,pad=0.4", facecolor='whitesmoke', alpha=0.85, edgecolor='gray')
    )

    # Add grid with transparency
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
    ax.tick_params(axis='both', labelsize=12)

    # Optional: Add watermark tag
    ax.text(0.01, 0.01, "CIR-14", transform=ax.transAxes,
            fontsize=10, color='gray', alpha=0.7, ha='left', va='bottom')

    # Save the figure; lay out this specific figure rather than whichever
    # one pyplot currently considers "current".
    fig.tight_layout()
    plot_filename = os.path.join(base_plot_path, f"{var_name}_missing_distribution.png")
    fig.savefig(plot_filename, dpi=300)
    plt.close(fig)  # release the figure so memory does not grow per frame

    logging.info(f"Saved professional missingness plot to {plot_filename}")

2025-05-05 18:32:43,204 - INFO - Processing missing distribution plot for o1_X_external
2025-05-05 18:32:46,078 - INFO - Saved professional missingness plot to figures/CIR-14\o1_X_external_missing_distribution.png
2025-05-05 18:32:46,078 - INFO - Processing missing distribution plot for o1_X_test
2025-05-05 18:32:46,982 - INFO - Saved professional missingness plot to figures/CIR-14\o1_X_test_missing_distribution.png
2025-05-05 18:32:46,982 - INFO - Processing missing distribution plot for o1_X_train
2025-05-05 18:32:48,717 - INFO - Saved professional missingness plot to figures/CIR-14\o1_X_train_missing_distribution.png
2025-05-05 18:32:48,717 - INFO - Processing missing distribution plot for o1_X_validate
2025-05-05 18:32:49,580 - INFO - Saved professional missingness plot to figures/CIR-14\o1_X_validate_missing_distribution.png
2025-05-05 18:32:49,580 - INFO - Processing missing distribution plot for o2_X_external
2025-05-05 18:32:51,253 - INFO - Saved professional missingness plot t

# CIR-15: Register Multiple Imputation Methods
## mean, median, knn, iterative, xgboost, gan, LSTM, RNN

In [9]:
# Build log file: from here on, file logging goes to logs/CIR-15.log
# (switch_log_file detaches and closes the previously active file handler).
switch_log_file('logs/CIR-15.log')
logger.info("This is being logged to CIR-15.log")

2025-05-05 18:33:02,335 - INFO - Switched logging to logs/CIR-15.log
2025-05-05 18:33:02,335 - INFO - This is being logged to CIR-15.log


In [10]:
"""
Impute missing values using XGBoost regression for each column independently.
"""
def xgboost_imputer(df, random_state=0):
    df_imputed = df.copy()

    for col in df.columns:
        if df[col].isnull().sum() == 0:
            continue  # Skip fully observed columns

        # Split rows with and without missing values in this column
        not_null_idx = df[col].notnull()
        null_idx = df[col].isnull()

        X_train = df.loc[not_null_idx].drop(columns=[col])
        y_train = df.loc[not_null_idx, col]
        X_pred = df.loc[null_idx].drop(columns=[col])

        # Skip if nothing to predict
        if X_pred.empty:
            continue

        # Drop columns that are completely NaN
        X_train = X_train.dropna(axis=1, how='all')
        X_pred = X_pred[X_train.columns]  # keep same columns

        # Fill remaining NaNs with column means (simple fallback)
        X_train = X_train.fillna(X_train.mean())
        X_pred = X_pred.fillna(X_train.mean())

        # Train XGBoost model
        model = XGBRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=random_state,
            verbosity=0
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_pred)

        # Impute predicted values
        df_imputed.loc[null_idx, col] = y_pred

    return df_imputed

In [11]:
"""
Impute missing values using an LSTM autoencoder.
Works best for dense rows (e.g., <40% missing).
"""
"""
Impute missing values using an LSTM autoencoder.
Works best for dense rows (e.g., <40% missing).
"""

# Cache the model outside the function (top-level variable)
_lstm_model = None

def lstm_imputer(df, random_state=0, epochs=30, batch_size=64):
    global _lstm_model

    df_copy = df.copy()
    idx = df_copy.index
    cols = df_copy.columns

    # Fill missing values and normalize
    df_filled = df_copy.fillna(df_copy.mean())
    scaler = MinMaxScaler()
    df_scaled = scaler.fit_transform(df_filled)
    X = df_scaled.reshape((df_scaled.shape[0], 1, df_scaled.shape[1]))
    input_dim = X.shape[2]

    tf.random.set_seed(random_state)
    np.random.seed(random_state)

    # Only build the model once
    if _lstm_model is None:
        input_layer = Input(shape=(1, input_dim))
        encoded = LSTM(64, activation="relu", return_sequences=False)(input_layer)
        repeated = RepeatVector(1)(encoded)
        decoded = LSTM(input_dim, activation="sigmoid", return_sequences=True)(repeated)
        _lstm_model = Model(inputs=input_layer, outputs=decoded)
        _lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss="mse")

    # Train and log loss
    for epoch in range(epochs):
        history = _lstm_model.fit(X, X, epochs=1, batch_size=batch_size, verbose=0)
        if epoch % 10 == 0 or epoch == epochs - 1:
            logging.info(f"[LSTM Epoch {epoch}] Loss: {history.history['loss'][0]:.4f}")

    # Predict and inverse transform
    X_imputed = _lstm_model.predict(X, verbose=0)
    df_imputed_array = scaler.inverse_transform(X_imputed[:, 0, :])
    df_imputed = pd.DataFrame(df_imputed_array, columns=cols, index=idx)

    # Only fill missing values
    for col in cols:
        missing_mask = df[col].isnull()
        df_copy.loc[missing_mask, col] = df_imputed.loc[missing_mask, col]

    return df_copy

In [12]:
"""
GAN-style imputer for missing data based on GAIN.
Arguments:
    df (pd.DataFrame): Input dataframe with missing values.
    random_state (int): Seed for reproducibility.
    epochs (int): Number of training iterations.
    batch_size (int): Batch size for training.
Returns:
    pd.DataFrame: Imputed dataframe.
"""
def gan_imputer(df, random_state=0, epochs=1000, batch_size=128):
    """Impute missing cells of ``df`` with a generator network (GAIN-style setup).

    The frame is zero-filled and min-max scaled, a generator and a
    discriminator (both two hidden Dense(128, relu) layers with a sigmoid
    output of width ``n_features``) are trained for ``epochs`` single-batch
    iterations, and the generator output is used for the cells that were
    missing. Observed cells keep their (scaled, then inverse-scaled) values.

    Parameters:
        df (pd.DataFrame): Input dataframe with missing values.
        random_state (int): Seed for TensorFlow and NumPy.
        epochs (int): Number of training iterations (one batch per iteration).
        batch_size (int): Number of rows per training batch.

    Returns:
        pd.DataFrame: Imputed dataframe with the same columns and index as ``df``.
    """
    tf.random.set_seed(random_state)
    np.random.seed(random_state)

    df_copy = df.copy()
    cols = df_copy.columns
    idx = df_copy.index

    # ===== Step 1: Normalize & Create mask =====
    scaler = MinMaxScaler()
    # NOTE(review): the scaler is fitted on the zero-filled data, so the
    # placeholder zeros take part in each column's min/max — confirm intended.
    df_scaled = scaler.fit_transform(df_copy.fillna(0))  # Fill NA with 0 for scaling
    # Boolean array; it is used arithmetically below (True -> 1, False -> 0).
    mask = ~df_copy.isnull().values  # 1 where observed, 0 where missing

    data_dim = df_scaled.shape[1]

    # ===== Step 2: Generator =====
    # Input is the data concatenated with the mask, hence data_dim * 2.
    def build_generator():
        inputs = Input(shape=(data_dim * 2,))
        x = Dense(128, activation='relu')(inputs)
        x = Dense(128, activation='relu')(x)
        x = Dense(data_dim, activation='sigmoid')(x)
        return Model(inputs, x)

    # ===== Step 3: Discriminator =====
    # Same architecture as the generator; its per-feature sigmoid output is
    # trained against the mask (see Step 4).
    def build_discriminator():
        inputs = Input(shape=(data_dim * 2,))
        x = Dense(128, activation='relu')(inputs)
        x = Dense(128, activation='relu')(x)
        x = Dense(data_dim, activation='sigmoid')(x)
        return Model(inputs, x)

    G = build_generator()
    D = build_discriminator()
    G.compile(loss='binary_crossentropy', optimizer=Adam(0.001))
    D.compile(loss='binary_crossentropy', optimizer=Adam(0.001))

    # ===== Step 4: Training =====
    for epoch in range(epochs):
        # === Consistent batch size to avoid retracing ===
        if df_scaled.shape[0] < batch_size:
            # Fewer rows than a batch: repeat the rows until batch_size is reached.
            repeat_factor = int(np.ceil(batch_size / df_scaled.shape[0]))
            X_batch = np.tile(df_scaled, (repeat_factor, 1))[:batch_size]
            M_batch = np.tile(mask, (repeat_factor, 1))[:batch_size]
        else:
            # Random batch of distinct rows.
            batch_idx = np.random.choice(df_scaled.shape[0], batch_size, replace=False)
            X_batch = df_scaled[batch_idx]
            M_batch = mask[batch_idx]

        # Small uniform noise stands in for the missing cells in the generator input.
        Z_batch = np.random.uniform(0, 0.01, size=X_batch.shape)
        X_hat = M_batch * X_batch + (1 - M_batch) * Z_batch
        G_input = np.concatenate([X_hat, M_batch], axis=1)

        # Observed cells stay as they are; missing cells take the generator output.
        G_sample = G.predict(G_input, verbose=0)
        X_fake = M_batch * X_batch + (1 - M_batch) * G_sample

        D_input_real = np.concatenate([X_batch, M_batch], axis=1)
        D_input_fake = np.concatenate([X_fake, M_batch], axis=1)

        # The discriminator is fitted to the mask on both inputs.
        D_loss_real = D.train_on_batch(D_input_real, M_batch)
        D_loss_fake = D.train_on_batch(D_input_fake, M_batch)

        # === Train Generator ===
        # NOTE(review): the generator's training target here is the mask itself
        # (binary cross-entropy against M_batch); the discriminator output and
        # the observed data values are not part of this loss. Confirm this is
        # the intended objective for a GAIN-style imputer.
        G_loss = G.train_on_batch(G_input, M_batch)

        if epoch % 100 == 0:
            logging.info(f"[{epoch}] D_loss: {(D_loss_real + D_loss_fake) / 2:.4f} | G_loss: {G_loss:.4f}")

    # ===== Step 5: Imputation =====
    # Same construction as during training, applied to the full frame.
    Z_full = np.random.uniform(0, 0.01, size=df_scaled.shape)
    X_hat_full = mask * df_scaled + (1 - mask) * Z_full
    G_input_full = np.concatenate([X_hat_full, mask], axis=1)

    G_imputed = G.predict(G_input_full, verbose=0)
    # Keep observed cells, take generator output only where data was missing.
    X_final = mask * df_scaled + (1 - mask) * G_imputed

    # Back to the original value range.
    df_imputed_array = scaler.inverse_transform(X_final)
    df_imputed = pd.DataFrame(df_imputed_array, columns=cols, index=idx)

    return df_imputed

In [13]:
"""
Impute missing values using a GRU-based autoencoder.
"""

# Cache model to avoid retracing
_rnn_model = None

def rnn_imputer(df, random_state=0, epochs=30, batch_size=64):
    global _rnn_model

    tf.random.set_seed(random_state)
    np.random.seed(random_state)

    df_copy = df.copy()
    idx = df_copy.index
    cols = df_copy.columns

    # Step 1: Pre-fill missing with mean
    df_filled = df_copy.fillna(df_copy.mean())

    # Step 2: Scale
    scaler = MinMaxScaler()
    df_scaled = scaler.fit_transform(df_filled)

    # Step 3: Reshape to 3D [samples, timesteps, features]
    X = df_scaled.reshape((df_scaled.shape[0], 1, df_scaled.shape[1]))
    input_dim = X.shape[2]

    # Step 4: Build GRU Autoencoder once
    if _rnn_model is None:
        input_layer = Input(shape=(1, input_dim))
        encoded = GRU(64, activation='relu', return_sequences=False)(input_layer)
        repeated = RepeatVector(1)(encoded)
        decoded = GRU(input_dim, activation='sigmoid', return_sequences=True)(repeated)
        _rnn_model = Model(inputs=input_layer, outputs=decoded)
        _rnn_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    # Step 5: Train with loss logging
    for epoch in range(epochs):
        history = _rnn_model.fit(X, X, epochs=1, batch_size=batch_size, verbose=0)
        if epoch % 10 == 0 or epoch == epochs - 1:
            logging.info(f"[RNN Epoch {epoch}] Loss: {history.history['loss'][0]:.4f}")

    # Step 6: Predict (impute)
    X_imputed = _rnn_model.predict(X, verbose=0)
    df_imputed_array = scaler.inverse_transform(X_imputed[:, 0, :])
    df_imputed = pd.DataFrame(df_imputed_array, columns=cols, index=idx)

    # Step 7: Replace only missing values
    for col in cols:
        missing_mask = df[col].isnull()
        df_copy.loc[missing_mask, col] = df_imputed.loc[missing_mask, col]

    return df_copy


In [14]:
# --- Tee class to redirect output to both stdout and logging ---
class Tee:
    """Write-only file-like object that fans each write out to several sinks.

    Every chunk written is forwarded to all ``files`` (each is flushed right
    away). With ``use_logging=True`` the chunk is additionally split into
    lines, after stripping trailing whitespace, and each line is sent to
    ``logging.info``.
    """

    def __init__(self, *files, use_logging=False):
        self.use_logging = use_logging
        self.files = files

    def write(self, obj):
        """Forward ``obj`` to every sink and, optionally, to the logger."""
        for sink in self.files:
            sink.write(obj)
            sink.flush()

        if not self.use_logging:
            return

        for text_line in obj.rstrip().splitlines():
            logging.info(text_line)

    def flush(self):
        """Flush every sink."""
        for sink in self.files:
            sink.flush()

# --- Iterative Imputer Function ---
def impute_with_iterative(input_df, method, output_path, n_iter, log_verbose_file_path=None):
    """Impute ``input_df`` with scikit-learn's ``IterativeImputer`` and save it as CSV.

    While the imputer runs, ``sys.stdout`` is replaced by a ``Tee`` so the
    imputer's verbose progress output is forwarded to ``sys.__stdout__``, to
    the logger, and (optionally) to ``log_verbose_file_path``. The previous
    ``sys.stdout`` is always restored afterwards.

    Parameters:
        input_df (pd.DataFrame): Frame with missing values (not modified).
        method (str): ``"ExtraTrees"`` or ``"HistGradientBoosting"``.
        output_path (str): CSV file the imputed frame is written to
            (without the index column).
        n_iter (int): ``max_iter`` passed to ``IterativeImputer``.
        log_verbose_file_path (str | None): Optional file that also receives
            the imputer's verbose output.

    Returns:
        pd.DataFrame: Imputed frame with the columns and index of ``input_df``.

    Raises:
        ValueError: If ``method`` is not one of the supported names. This is
            checked before any directory or file is created.
    """
    logging.info(f"Starting Iterative Imputer with method={method} on input DataFrame of shape {input_df.shape}.")

    # Estimator selection (done first so a bad method has no side effects)
    if method == "ExtraTrees":
        estimator = ExtraTreesRegressor(n_estimators=10, random_state=0, n_jobs=-1)
    elif method == "HistGradientBoosting":
        # Imported here because the notebook's import cell only brings in
        # ExtraTreesRegressor from sklearn.ensemble.
        from sklearn.ensemble import HistGradientBoostingRegressor
        estimator = HistGradientBoostingRegressor(random_state=0)
    else:
        raise ValueError(f"Unsupported method: {method}. Use 'ExtraTrees' or 'HistGradientBoosting'.")

    data_copy = input_df.copy()

    # os.makedirs("") raises, so only create a directory when the path has one
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    imputer = IterativeImputer(
        estimator=estimator,
        max_iter=n_iter,
        random_state=0,
        verbose=2,
        sample_posterior=False
    )

    start_time = time.time()

    # Remember whatever stdout is active now (in a notebook this is not
    # sys.__stdout__) so it can be put back exactly as it was.
    original_stdout = sys.stdout

    if log_verbose_file_path is not None:
        verbose_dir = os.path.dirname(log_verbose_file_path)
        if verbose_dir:
            os.makedirs(verbose_dir, exist_ok=True)
        with open(log_verbose_file_path, "w") as log_file:
            sys.stdout = Tee(sys.__stdout__, log_file, use_logging=True)
            try:
                imputed_array = imputer.fit_transform(data_copy)
            finally:
                sys.stdout = original_stdout
    else:
        sys.stdout = Tee(sys.__stdout__, use_logging=True)
        try:
            imputed_array = imputer.fit_transform(data_copy)
        finally:
            sys.stdout = original_stdout

    end_time = time.time()
    runtime = end_time - start_time

    # Retain original index to avoid downstream assignment errors
    imputed_df = pd.DataFrame(imputed_array, columns=data_copy.columns, index=data_copy.index)
    imputed_df.to_csv(output_path, index=False)

    logging.info(f"Imputation completed in {runtime:.2f} seconds.")
    logging.info(f"Number of NaNs after imputation: {np.isnan(imputed_df.values).sum()}")
    logging.info(f"Imputed dataset saved at {output_path}")

    return imputed_df


In [15]:
# --- Registry of Imputation Methods ---
# Maps a method name to either a scikit-learn imputer instance (anything with
# fit/transform, which get_imputer deep-copies before use) or a plain callable
# that takes a DataFrame and returns the imputed DataFrame.
imputer_registry = {
    # scikit-learn imputer objects
    "mean": SimpleImputer(strategy="mean"),
    "median": SimpleImputer(strategy="median"),
    "knn": KNNImputer(n_neighbors=5, weights="uniform"),
    # Wrapper around impute_with_iterative: takes only the DataFrame (no
    # random_state argument) and writes the imputed frame to the fixed
    # output_path below on every call.
    "iterative": lambda df: impute_with_iterative(
        input_df=df,
        method="ExtraTrees",
        output_path="imputed_outputs/tmp.csv",  # dummy or default path
        n_iter=20
    ),
    # Custom imputers defined in this notebook; each accepts (df, random_state=...)
    "xgboost": xgboost_imputer,
    "gan": gan_imputer,
    "lstm": lstm_imputer,
    "rnn": rnn_imputer
}

# CIR-16: Build Core Hierarchical Controller Function

In [16]:
"""
Dynamic hierarchical imputer using cumulative row-wise missingness and assigned methods.

Parameters:
    df (pd.DataFrame): Dataset with missing values.
    thresholds (list): List of group widths (must sum to ~1.0).
    method_names (list): List of method names (must match thresholds).
    method_registry (dict): Registered methods with keys as names and values as callables or sklearn objects.
    random_state (int): Random seed for reproducibility.
    return_method_log (bool): Return pd.Series logging method used per row.

Returns:
    imputed_df (pd.DataFrame)
    method_log (pd.Series) — only if return_method_log=True
"""

def hierarchical_impute_dynamic(
    df,
    thresholds,
    method_names,
    method_registry,
    random_state=0,
    return_method_log=False,
    dataset_name=None
):
    if len(thresholds) != len(method_names):
        raise ValueError("The number of thresholds must match the number of methods.")

    df_copy = df.copy()
    df_copy["missing_pct"] = df_copy.isnull().mean(axis=1)
    cols = df_copy.columns.drop("missing_pct")

    global_means = df_copy[cols].mean().fillna(0)
    global_min = df_copy[cols].min()
    global_max = df_copy[cols].max()

    imputed_df = pd.DataFrame(index=df_copy.index, columns=cols)
    method_log = pd.Series(index=df_copy.index, dtype="object")

    cum_thresholds = np.cumsum(thresholds)
    if not np.isclose(cum_thresholds[-1], 1.0):
        raise ValueError("Thresholds must sum to 1.0")

    previous_imputed = None

    # For visualization
    group_names = []
    cumulative_rows = []
    method_names_actual = []
    cumulative_total = 0

    for i, upper_bound in enumerate(cum_thresholds):
        lower_bound = cum_thresholds[i - 1] if i > 0 else 0.0
        idx = df_copy.index[
            (df_copy["missing_pct"] > lower_bound) & (df_copy["missing_pct"] <= upper_bound)
        ]
        group_data = df_copy.loc[idx, cols].copy()

        for col in group_data.columns:
            if group_data[col].isnull().all():
                group_data[col] = global_means[col]

        if group_data.empty:
            continue

        method_name = method_names[i]
        logging.info(f"Group {i+1} ({lower_bound:.2f}, {upper_bound:.2f}] -> {method_name} | {len(group_data)} rows")

        imputer = get_imputer(method_name, method_registry)

        if previous_imputed is None:
            combined = group_data
        else:
            combined = pd.concat([previous_imputed, group_data])

        try:
            if hasattr(imputer, "fit_transform"):
                combined_imputed = imputer.fit_transform(combined)
                combined_imputed = pd.DataFrame(combined_imputed, columns=combined.columns, index=combined.index)
            else:
                combined_imputed = imputer(combined, random_state=random_state)
        except TypeError:
            combined_imputed = imputer(combined)

        group_imputed = combined_imputed.loc[idx].clip(lower=global_min, upper=global_max, axis=1)

        imputed_df.loc[idx] = group_imputed
        method_log.loc[idx] = method_name

        previous_imputed = pd.concat([previous_imputed, group_imputed]) if previous_imputed is not None else group_imputed.copy()

        group_label = f"{int(lower_bound * 100)}%–{int(upper_bound * 100)}%"
        group_names.append(group_label)
        cumulative_total += len(group_data)
        cumulative_rows.append(cumulative_total)
        method_names_actual.append(method_name)

    if imputed_df.isnull().values.any():
        raise ValueError("NaNs remain after hierarchical imputation!")

    # === Plot ===
    output_dir = "figures/CIR-16"
    os.makedirs(output_dir, exist_ok=True)

    unique_methods = list(set(method_names_actual))
    palette = sns.color_palette("tab10", n_colors=len(unique_methods))
    method_color_map = {method: palette[i] for i, method in enumerate(unique_methods)}
    colors = [method_color_map[m] for m in method_names_actual]

    plt.figure(figsize=(10, 10))
    plt.barh(
        y=range(1, len(cumulative_rows) + 1),
        width=cumulative_rows,
        color=colors,
        edgecolor='black'
    )

    plt.yticks(ticks=range(1, len(group_names) + 1), labels=group_names)
    plt.title("Cumulative Rows Used for Imputation by Group", fontsize=14, fontweight='bold')
    plt.ylabel("Missingness Range", fontsize=12)
    plt.xlabel("Cumulative Rows Used", fontsize=12)
    plt.grid(True, axis='x', linestyle='--', alpha=0.6)

    legend_handles = [Patch(color=color, label=method) for method, color in method_color_map.items()]
    plt.legend(handles=legend_handles, title="Imputation Method", loc="lower right")

    plt.tight_layout()

    # === Save plot with dataset-specific name ===
    if dataset_name:
        filename = f"{dataset_name}_cumulative_imputation_rows.png"
    else:
        filename = "cumulative_imputation_rows.png"

    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

    return (imputed_df, method_log) if return_method_log else imputed_df

In [17]:
def get_imputer(method_name, registry):
    """Look up ``method_name`` in ``registry`` and return a ready-to-use imputer.

    Entries that expose both ``fit`` and ``transform`` are returned as a deep
    copy, so fitting never alters the registered object. Any other entry
    (e.g. a plain function) is returned as is.

    Raises:
        ValueError: If the name is missing from the registry or maps to None.
    """
    entry = registry.get(method_name)
    if entry is None:
        raise ValueError(f"Method '{method_name}' not found or not implemented.")

    is_estimator = all(hasattr(entry, attr) for attr in ("fit", "transform"))
    return copy.deepcopy(entry) if is_estimator else entry

In [None]:
"""
Methods we define and can use:
mean, median, knn, iterative, xgboost, gan, lstm, rnn
"""

"""
# List of dataset names to impute
datasets = [
    "o1_X_train", "o1_X_validate", "o1_X_test", "o1_X_external",
    "o2_X_train", "o2_X_validate", "o2_X_test", "o2_X_external",
    "o3_X_train", "o3_X_validate", "o3_X_test", "o3_X_external",
    "o4_X_train", "o4_X_validate", "o4_X_test", "o4_X_external"
]
"""
datasets = [
    "o4_X_validate", "o4_X_test", "o4_X_external"
]

# Define thresholds and corresponding methods
thresholds = [0.05] * 20
method_names = ["knn"] * 2 + ["iterative"] * 4 + ["lstm"] * 4 + ["rnn"] * 4 + ["gan"] * 6

# Loop through and apply imputation
for name in datasets:
    logging.info(f"Imputing: {name}")
    df = globals().get(name)

    if df is None or not isinstance(df, pd.DataFrame):
        logging.info(f"Skipping {name} (not found or not a DataFrame)")
        continue

    try:
        imputed_df, method_log = hierarchical_impute_dynamic(
            df=df,
            thresholds=thresholds,
            method_names=method_names,
            method_registry=imputer_registry,
            random_state=0,
            return_method_log=True,
            dataset_name=name
        )

        output_path = f"CSV/exports/CIR-16/impute/{name}_imputed.csv"
        imputed_df.to_csv(output_path, index=False)
        logging.info(f"Saved: {output_path}")

    except Exception as e:
        logging.info(f"Failed to impute {name}: {e}")

2025-05-05 18:33:02,487 - INFO - Imputing: o4_X_validate
2025-05-05 18:33:02,712 - INFO - Group 2 (0.05, 0.10] -> knn | 12 rows
2025-05-05 18:33:02,781 - INFO - Group 3 (0.10, 0.15] -> iterative | 24 rows
2025-05-05 18:33:02,781 - INFO - Starting Iterative Imputer with method=ExtraTrees on input DataFrame of shape (36, 345).
2025-05-05 18:33:02,796 - INFO - [IterativeImputer] Completing matrix with shape (36, 345)
2025-05-05 18:33:12,232 - INFO - [IterativeImputer] Ending imputation round 1/20, elapsed time 9.43
2025-05-05 18:33:12,235 - INFO - [IterativeImputer] Change: 200.86813354492188, scaled tolerance: 0.8510000109672546
2025-05-05 18:33:20,384 - INFO - [IterativeImputer] Ending imputation round 2/20, elapsed time 17.58
2025-05-05 18:33:20,387 - INFO - [IterativeImputer] Change: 131.36111450195312, scaled tolerance: 0.8510000109672546
2025-05-05 18:33:28,606 - INFO - [IterativeImputer] Ending imputation round 3/20, elapsed time 25.80
2025-05-05 18:33:28,609 - INFO - [IterativeImp







2025-05-05 18:57:06,367 - INFO - [100] D_loss: 0.0425 | G_loss: 0.0731
2025-05-05 18:58:46,645 - INFO - [200] D_loss: 0.0223 | G_loss: 0.0384
2025-05-05 19:00:41,946 - INFO - [300] D_loss: 0.0155 | G_loss: 0.0263
2025-05-05 19:03:01,264 - INFO - [400] D_loss: 0.0120 | G_loss: 0.0202
2025-05-05 19:05:57,597 - INFO - [500] D_loss: 0.0097 | G_loss: 0.0164
2025-05-05 19:09:03,726 - INFO - [600] D_loss: 0.0081 | G_loss: 0.0138
2025-05-05 19:12:34,151 - INFO - [700] D_loss: 0.0070 | G_loss: 0.0119
2025-05-05 19:16:38,685 - INFO - [800] D_loss: 0.0061 | G_loss: 0.0105
2025-05-05 19:21:05,096 - INFO - [900] D_loss: 0.0054 | G_loss: 0.0093


In [None]:
# Write whatever `imputed_df` / `method_log` currently hold to the CIR-16
# export folder (created if missing).
output_path = "CSV/exports/CIR-16/impute/"
os.makedirs(output_path, exist_ok=True)

# NOTE(review): `file_name` is not assigned in any of the visible cells, so
# this presumably raises NameError unless it is defined elsewhere — confirm
# (the imputation loop uses `name` for the dataset identifier).
imputed_df.to_csv(os.path.join(output_path, f"{file_name}.csv"), index=False)
# The method log is written with its index, unlike the imputed frame above.
method_log.to_csv(os.path.join(output_path, f"{file_name}_method_log.csv"))

In [None]:
# Small working subset: the first 1000 rows by position, all columns.
small_data = o4_X_train.iloc[:1000, :]  # picking rows