# CIR-03: Hierarchical Imputation Framework

In [1]:
import pandas as pd
import numpy as np
import os
import io
import logging
import copy

from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

from xgboost import XGBRegressor

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Initial logger setup.
# Handlers are attached to the ROOT logger so that both `logger.info(...)` and the
# module-level `logging.info(...)` calls used in later cells end up in the same sinks.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Global variable to hold the active file handler (managed by switch_log_file).
# When this cell is re-run in a live kernel, keep the handler that is already attached
# instead of resetting to None; otherwise it could never be removed/closed again and
# every record would be written to two files after the next switch.
current_file_handler = globals().get("current_file_handler")

# Create the stream handler only once per kernel. Re-running the cell reuses the same
# object, and Logger.addHandler ignores a handler that is already attached, so log
# lines are not duplicated on the console.
stream_handler = globals().get("stream_handler")
if stream_handler is None:
    stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

def switch_log_file(filename):
    """
    Redirect file logging to `filename`.

    The previously active file handler (if any) is detached from the module-level
    `logger` and closed, then a new `logging.FileHandler` using the shared
    `formatter` is attached and remembered in the global `current_file_handler`.

    Parameters:
        filename (str): Path of the log file. Missing parent directories are created.

    Raises:
        OSError: If the directory or the log file cannot be created/opened.
    """
    global current_file_handler

    # If a file handler already exists, remove and close it
    if current_file_handler is not None:
        logger.removeHandler(current_file_handler)
        current_file_handler.close()
        # Forget it right away so a failure below cannot leave a closed handler behind
        current_file_handler = None

    # FileHandler does not create directories; a fresh checkout without e.g. "logs/"
    # would otherwise fail with FileNotFoundError.
    log_dir = os.path.dirname(filename)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Create a new file handler
    new_handler = logging.FileHandler(filename)
    new_handler.setFormatter(formatter)
    logger.addHandler(new_handler)
    current_file_handler = new_handler

    logger.info(f"Switched logging to {filename}")

In [3]:
# Route the log records of this section to their own file; switch_log_file also
# detaches and closes whichever file handler was active before.
# NOTE(review): the notebook title says "CIR-03" while this log file and the banner
# logged by the loading cell use "CIR-2" — confirm which ticket id is intended.
switch_log_file('logs/CIR-2.log')
logger.info("This is being logged to CIR-2.log")

2025-05-04 01:54:57,201 - INFO - Switched logging to logs/CIR-2.log
2025-05-04 01:54:57,204 - INFO - This is being logged to CIR-2.log


In [4]:
# Load datasets
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
# os.listdir returns entries in arbitrary order; sort so that the load order, the
# iteration order of `dataframes` and the log output are reproducible across runs.
all_files = sorted(os.listdir(data_path))

logging.info("+++++++++++++++++CIR-2+++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Load CSVs into a dictionary of dataframes keyed by a Python-friendly name
# (file name without the extension, dashes replaced by underscores).
dataframes = {}
for file in all_files:
    if not file.endswith(".csv"):
        continue
    # Strip only the trailing extension: str.replace would also remove a ".csv"
    # that happens to occur in the middle of a file name.
    var_name = file[:-len(".csv")].replace("-", "_")
    logging.info(f"Loading... -> {file}")
    dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Expose every dataframe as a notebook-level variable (later cells refer to names
# such as `o4_X_train` directly) and log what was loaded.
for var_name, df in dataframes.items():
    if not var_name.isidentifier():
        # Still registered, but only reachable through globals()[...] / dataframes[...]
        logging.warning(f"{var_name} is not a valid Python identifier; use dataframes['{var_name}'] instead")
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-05-04 01:54:57,217 - INFO - +++++++++++++++++CIR-2+++++++++++++++++++++++++
2025-05-04 01:54:57,218 - INFO - Start Loading Dataframes.
2025-05-04 01:54:57,219 - INFO - Loading... -> o1_X_external.csv
2025-05-04 01:55:04,447 - INFO - Loading... -> o1_X_test.csv
2025-05-04 01:55:05,056 - INFO - Loading... -> o1_X_train.csv
2025-05-04 01:55:09,174 - INFO - Loading... -> o1_X_validate.csv
2025-05-04 01:55:09,683 - INFO - Loading... -> o1_y_external_los.csv
2025-05-04 01:55:09,727 - INFO - Loading... -> o1_y_external_mortality.csv
2025-05-04 01:55:09,755 - INFO - Loading... -> o1_y_test_los.csv
2025-05-04 01:55:09,765 - INFO - Loading... -> o1_y_test_mortality.csv
2025-05-04 01:55:09,769 - INFO - Loading... -> o1_y_train_los.csv
2025-05-04 01:55:09,805 - INFO - Loading... -> o1_y_train_mortality.csv
2025-05-04 01:55:09,822 - INFO - Loading... -> o1_y_validate_los.csv
2025-05-04 01:55:09,832 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-05-04 01:55:09,836 - INFO - Loading... -

# CIR-14: Implement Row Segmentation by Missingness

In [5]:
# Route the log records of the CIR-14 section to their own file; switch_log_file
# also detaches and closes the file handler that was active before.
switch_log_file('logs/CIR-14.log')
logger.info("This is being logged to CIR-14.log")

2025-05-04 01:55:23,452 - INFO - Switched logging to logs/CIR-14.log
2025-05-04 01:55:23,454 - INFO - This is being logged to CIR-14.log


In [6]:
"""
Segments the dataframe rows into categories based on the percentage of missing values.
"""
def segment_rows_by_missingness(df: pd.DataFrame):
    """
    Segment the rows of a dataframe into four bands by their fraction of missing values.

    Bands (fraction of NaN cells per row):
        - very low : <= 0.20
        - low      : >  0.20 and <= 0.40
        - moderate : >  0.40 and <= 0.60
        - high     : >  0.60

    Parameters:
        df (pd.DataFrame): Dataframe whose rows are to be segmented.

    Returns:
        tuple:
            segments (dict[str, pd.DataFrame]): Band label -> rows of `df` in that band.
            row_indices (dict[str, list]): Band label -> index labels of those rows.
            row_missing_perc (pd.Series): Fraction of missing values per row of `df`.
    """
    row_missing_perc = df.isnull().mean(axis=1)

    # One boolean mask per band. The labels are used verbatim as dictionary keys by
    # other cells of this notebook, so they must not be reworded here.
    band_masks = {
        'very_low_missing 0% < 20%': row_missing_perc <= 0.20,
        'low_missing 21% <= 40%': (row_missing_perc > 0.20) & (row_missing_perc <= 0.40),
        'moderate_missing 41% <= 60%': (row_missing_perc > 0.40) & (row_missing_perc <= 0.60),
        'high_missing > 60%': row_missing_perc > 0.60,
    }

    segments = {name: df[mask] for name, mask in band_masks.items()}
    row_indices = {name: segment.index.tolist() for name, segment in segments.items()}

    return segments, row_indices, row_missing_perc

In [7]:
# For every feature dataframe (name starts with "o" and contains "_X_"), log how many
# rows fall into each missingness band. All other dataframes are left out.
logging.info("---------------")
for var_name, df in dataframes.items():
    if var_name.startswith("o") and "_X_" in var_name:
        logging.info(f"Segmenting rows by missingness for: {var_name}")
        logging.info(f"{var_name} - Total rows: {df.shape[0]}")

        segments, row_indices, row_missing_perc = segment_rows_by_missingness(df)

        # One log line per band, in the order the bands were defined
        for segment_name in segments:
            segment_df = segments[segment_name]
            logging.info(f"{var_name} - {segment_name}: {len(segment_df)} rows")
        logging.info("---------------")

2025-05-04 01:55:23,476 - INFO - ---------------
2025-05-04 01:55:23,477 - INFO - Segmenting rows by missingness for: o1_X_external
2025-05-04 01:55:23,478 - INFO - o1_X_external - Total rows: 234720
2025-05-04 01:55:23,934 - INFO - o1_X_external - very_low_missing 0% < 20%: 576 rows
2025-05-04 01:55:23,935 - INFO - o1_X_external - low_missing 21% <= 40%: 49729 rows
2025-05-04 01:55:23,936 - INFO - o1_X_external - moderate_missing 41% <= 60%: 153407 rows
2025-05-04 01:55:23,938 - INFO - o1_X_external - high_missing > 60%: 31008 rows
2025-05-04 01:55:23,939 - INFO - ---------------
2025-05-04 01:55:23,939 - INFO - Segmenting rows by missingness for: o1_X_test
2025-05-04 01:55:23,940 - INFO - o1_X_test - Total rows: 15312
2025-05-04 01:55:24,002 - INFO - o1_X_test - very_low_missing 0% < 20%: 1248 rows
2025-05-04 01:55:24,003 - INFO - o1_X_test - low_missing 21% <= 40%: 7872 rows
2025-05-04 01:55:24,004 - INFO - o1_X_test - moderate_missing 41% <= 60%: 5424 rows
2025-05-04 01:55:24,005 -

In [8]:
# Ensure output directory exists
base_plot_path = "figures/CIR-14"
os.makedirs(base_plot_path, exist_ok=True)

# Seaborn aesthetic settings
sns.set(style="whitegrid", context="talk", palette="deep")

# One histogram of row-wise missingness per feature dataframe, saved as PNG.
for var_name, df in dataframes.items():
    if not var_name.startswith("o") or "_X_" not in var_name:
        continue  # Skip targets

    logging.info(f"Processing missing distribution plot for {var_name}")

    # segment_rows_by_missingness already returns the row-wise missing fraction as its
    # third value; reuse it instead of scanning the whole frame a second time.
    segments, _, row_missing_perc = segment_rows_by_missingness(df)

    # Prepare summary box content
    summary_text = (
        f"Total rows: {len(df):,}\n"
        f"Very low (≤20%): {len(segments['very_low_missing 0% < 20%']):,}\n"
        f"Low (21–40%): {len(segments['low_missing 21% <= 40%']):,}\n"
        f"Moderate (41–60%): {len(segments['moderate_missing 41% <= 60%']):,}\n"
        f"High (>60%): {len(segments['high_missing > 60%']):,}"
    )

    # Create the figure
    fig, ax = plt.subplots(figsize=(12, 7))
    sns.histplot(row_missing_perc, bins=20, kde=True, color='#2c7fb8', edgecolor='black', ax=ax)

    # Customize titles and labels
    ax.set_title(f"Row-wise Missing Value Distribution\n{var_name}", fontsize=18, fontweight='bold')
    ax.set_xlabel("Proportion of Missing Values", fontsize=15)
    ax.set_ylabel("Number of Rows", fontsize=15)

    # Add summary box to top-right
    ax.text(
        0.99, 0.95, summary_text,
        transform=ax.transAxes,
        fontsize=12,
        verticalalignment='top',
        horizontalalignment='right',
        bbox=dict(boxstyle="round,pad=0.4", facecolor='whitesmoke', alpha=0.85, edgecolor='gray')
    )

    # Add grid with transparency
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
    ax.tick_params(axis='both', labelsize=12)

    # Optional: Add watermark tag
    ax.text(0.01, 0.01, "CIR-14", transform=ax.transAxes,
            fontsize=10, color='gray', alpha=0.7, ha='left', va='bottom')

    # Save the figure. Lay out this specific figure rather than whatever pyplot
    # currently considers the "current" one, then close it to free memory in the loop.
    fig.tight_layout()
    plot_filename = os.path.join(base_plot_path, f"{var_name}_missing_distribution.png")
    fig.savefig(plot_filename, dpi=300)
    plt.close(fig)

    logging.info(f"Saved professional missingness plot to {plot_filename}")

2025-05-04 01:55:25,313 - INFO - Processing missing distribution plot for o1_X_external
2025-05-04 01:55:28,165 - INFO - Saved professional missingness plot to figures/CIR-14\o1_X_external_missing_distribution.png
2025-05-04 01:55:28,166 - INFO - Processing missing distribution plot for o1_X_test
2025-05-04 01:55:29,062 - INFO - Saved professional missingness plot to figures/CIR-14\o1_X_test_missing_distribution.png
2025-05-04 01:55:29,063 - INFO - Processing missing distribution plot for o1_X_train
2025-05-04 01:55:30,797 - INFO - Saved professional missingness plot to figures/CIR-14\o1_X_train_missing_distribution.png
2025-05-04 01:55:30,798 - INFO - Processing missing distribution plot for o1_X_validate
2025-05-04 01:55:31,641 - INFO - Saved professional missingness plot to figures/CIR-14\o1_X_validate_missing_distribution.png
2025-05-04 01:55:31,642 - INFO - Processing missing distribution plot for o2_X_external
2025-05-04 01:55:33,320 - INFO - Saved professional missingness plot t

# CIR-15: Register Multiple Imputation Methods
## Methods: mean, median, KNN, iterative, XGBoost, LSTM (implemented); GAN, RNN (placeholders, not yet implemented)

In [9]:
# Route the log records of the CIR-15 section to their own file; switch_log_file
# also detaches and closes the file handler that was active before.
switch_log_file('logs/CIR-15.log')
logger.info("This is being logged to CIR-15.log")

2025-05-04 01:55:44,130 - INFO - Switched logging to logs/CIR-15.log
2025-05-04 01:55:44,131 - INFO - This is being logged to CIR-15.log


In [10]:
def xgboost_imputer(df, random_state=0):
    """
    Fill missing values column by column with an XGBoost regressor.

    For each column that contains NaNs, a regressor is fitted on the rows where that
    column is observed (all other columns of the ORIGINAL `df` act as features) and
    used to predict the rows where it is missing. NaNs inside the feature columns
    are replaced by the feature means of the fitting rows.

    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        random_state (int): Seed passed to XGBRegressor.

    Returns:
        pd.DataFrame: Copy of `df` with the predictable gaps filled.
    """
    result = df.copy()

    for target in df.columns:
        is_missing = df[target].isnull()
        if not is_missing.any():
            continue  # nothing to impute in this column

        is_observed = df[target].notnull()

        # Rows to learn from vs. rows to fill, both without the target column
        features_fit = df.loc[is_observed].drop(columns=[target])
        target_fit = df.loc[is_observed, target]
        features_missing = df.loc[is_missing].drop(columns=[target])

        if features_missing.empty:
            continue  # e.g. no feature columns are left

        # Features that are NaN in every fitting row carry no information
        features_fit = features_fit.dropna(axis=1, how='all')
        features_missing = features_missing[features_fit.columns]

        # Simple fallback for the remaining gaps: means of the fitting rows
        features_fit = features_fit.fillna(features_fit.mean())
        features_missing = features_missing.fillna(features_fit.mean())

        regressor = XGBRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=random_state,
            verbosity=0
        )
        regressor.fit(features_fit, target_fit)

        # Write the predictions into the gaps of the output copy only
        result.loc[is_missing, target] = regressor.predict(features_missing)

    return result

In [11]:
def lstm_imputer(df, random_state=0):
    """
    Fill missing values with the reconstruction of an LSTM autoencoder.

    The frame is mean-filled, min-max scaled, fed to a small autoencoder as
    sequences of length one, and the reconstructed values are written back only
    into the cells that were NaN in `df`.
    Works best for dense rows (e.g., <40% missing).

    NOTE(review): `random_state` is accepted but never read in this function, so the
    result is not seeded from it — confirm whether seeding was intended.

    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        random_state (int): Currently unused (see note above).

    Returns:
        pd.DataFrame: Copy of `df` with originally missing cells replaced.
    """
    result = df.copy()
    row_index = result.index
    feature_names = result.columns

    # Provisional fill so the network receives numeric input everywhere
    mean_filled = result.fillna(result.mean())

    # Scale to [0, 1]; the decoder's sigmoid output lives in the same range
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(mean_filled)

    # Shape (samples, timesteps=1, features): every row is a one-step "sequence"
    n_rows, n_features = scaled.shape
    sequences = scaled.reshape((n_rows, 1, n_features))

    # Encoder -> repeat -> decoder
    inputs = Input(shape=(1, n_features))
    latent = LSTM(64, activation="relu", return_sequences=False)(inputs)
    latent_sequence = RepeatVector(1)(latent)
    outputs = LSTM(n_features, activation="sigmoid", return_sequences=True)(latent_sequence)

    autoencoder = Model(inputs=inputs, outputs=outputs)
    autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss="mse")

    # Train the network to reproduce its own input
    autoencoder.fit(sequences, sequences, epochs=30, batch_size=64, verbose=0)

    # Reconstruct, drop the timestep axis and undo the scaling
    reconstructed = autoencoder.predict(sequences, verbose=0)[:, 0, :]
    reconstructed_df = pd.DataFrame(
        scaler.inverse_transform(reconstructed),
        columns=feature_names,
        index=row_index,
    )

    # Observed cells stay untouched; only the original gaps take reconstructed values
    for name in feature_names:
        gaps = df[name].isnull()
        result.loc[gaps, name] = reconstructed_df.loc[gaps, name]

    return result


In [12]:
# --- Placeholder Functions for Custom Imputation Models ---

def gan_imputer(df, random_state=0):
    """
    Placeholder for GAN-based imputation.
    Should return imputed DataFrame.

    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        random_state (int): Accepted so the signature matches the other
            function-based imputers, which are called with `random_state=`.

    Raises:
        NotImplementedError: Always, until the method is implemented.
    """
    raise NotImplementedError("GAN imputer not yet implemented.")

def rnn_imputer(df, random_state=0):
    """
    Placeholder for RNN-based imputation.
    Should return imputed DataFrame.

    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        random_state (int): Accepted so the signature matches the other
            function-based imputers, which are called with `random_state=`.

    Raises:
        NotImplementedError: Always, until the method is implemented.
    """
    raise NotImplementedError("RNN imputer not yet implemented.")

In [13]:
# --- Registry of Imputation Methods ---
# Maps a method name to the object that performs the imputation.

# Estimator instances (expose fit/transform)
_estimator_imputers = {
    "mean": SimpleImputer(strategy="mean"),
    "median": SimpleImputer(strategy="median"),
    "knn": KNNImputer(n_neighbors=5, weights="uniform"),
    "iterative": IterativeImputer(
        estimator=ExtraTreesRegressor(n_estimators=10, random_state=42),
        max_iter=10,
        random_state=42,
    ),
}

# Plain functions defined in this notebook; "gan" and "rnn" are placeholders that raise
_function_imputers = {
    "xgboost": xgboost_imputer,
    "gan": gan_imputer,
    "lstm": lstm_imputer,
    "rnn": rnn_imputer,
}

imputer_registry = {**_estimator_imputers, **_function_imputers}

# CIR-16: Build Core Hierarchical Controller Function

In [14]:
def _accepts_random_state(func):
    """Return True when `func` can be called with a `random_state` keyword argument."""
    import inspect  # local import keeps this block independent of the import cell

    try:
        parameters = inspect.signature(func).parameters
    except (TypeError, ValueError):
        # Not introspectable (or not callable at all): call it without the keyword.
        return False
    return "random_state" in parameters or any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in parameters.values()
    )


def hierarchical_impute_dynamic(
    df,
    thresholds,
    method_names,
    method_registry,
    random_state=0,
    return_method_log=False
):
    """
    Dynamic hierarchical imputer using cumulative row-wise missingness and assigned methods.

    Rows are grouped into consecutive bands `(lower, upper]` of their fraction of
    missing values, from least to most incomplete. Each band is imputed with its own
    method; the method is run on the rows imputed so far together with the current
    band, so that denser rows support sparser ones. Imputed values are clipped to
    the observed per-column minimum/maximum of `df`.

    Fully observed rows (0% missing) need no imputation: they are copied through,
    logged as "observed", and used as context for the first band.

    Parameters:
        df (pd.DataFrame): Dataset with missing values. The index must be unique.
        thresholds (list): List of group widths (must sum to ~1.0).
        method_names (list): List of method names (must match thresholds).
        method_registry (dict): Registered methods with keys as names and values as callables or sklearn objects.
        random_state (int): Random seed, passed to function-based imputers that accept it.
        return_method_log (bool): Return pd.Series logging method used per row.

    Returns:
        imputed_df (pd.DataFrame): float64 frame with the index/columns of `df`.
        method_log (pd.Series) — only if return_method_log=True

    Raises:
        ValueError: If thresholds/method_names are empty or differ in length, the
            thresholds do not sum to 1.0, the index is not unique, a method is
            unknown, or NaNs remain after imputation.
    """
    if len(thresholds) != len(method_names):
        raise ValueError("The number of thresholds must match the number of methods.")
    if len(thresholds) == 0:
        raise ValueError("At least one threshold/method pair is required.")

    cum_thresholds = np.cumsum(np.asarray(thresholds, dtype=float))
    if not np.isclose(cum_thresholds[-1], 1.0):
        raise ValueError("Thresholds must sum to 1.0")
    # Floating-point accumulation can leave the last bound slightly below 1.0, which
    # would silently exclude rows that are 100% missing. Pin it to exactly 1.0.
    cum_thresholds[-1] = 1.0

    if not df.index.is_unique:
        raise ValueError("df must have a unique index for label-based row assignment.")

    cols = df.columns
    # Kept as a separate Series so a column of `df` can never be overwritten by it
    missing_pct = df.isnull().mean(axis=1)

    global_means = df[cols].mean().fillna(0)
    global_min = df[cols].min()
    global_max = df[cols].max()

    # Numeric result frame (an untyped empty frame would end up with object dtype)
    imputed_df = pd.DataFrame(np.nan, index=df.index, columns=cols, dtype="float64")
    method_log = pd.Series(index=df.index, dtype="object")

    previous_imputed = None

    # Rows without any missing value fall outside every (lower, upper] band because
    # the first lower bound is an exclusive 0.0. Carry them over unchanged.
    complete_idx = df.index[missing_pct == 0]
    if len(complete_idx) > 0:
        previous_imputed = df.loc[complete_idx, cols].astype("float64")
        imputed_df.loc[complete_idx, cols] = previous_imputed.to_numpy()
        method_log.loc[complete_idx] = "observed"

    for i, upper_bound in enumerate(cum_thresholds):
        lower_bound = cum_thresholds[i - 1] if i > 0 else 0.0
        idx = df.index[(missing_pct > lower_bound) & (missing_pct <= upper_bound)]
        if len(idx) == 0:
            continue

        group_data = df.loc[idx, cols].copy()

        # A column that is entirely missing within this band is pre-filled with the
        # global column mean (0 if the column is missing everywhere).
        for col in group_data.columns:
            if group_data[col].isnull().all():
                group_data[col] = global_means[col]

        method_name = method_names[i]
        logging.info(f"Group {i+1} ({lower_bound:.2f}, {upper_bound:.2f}] -> {method_name} | {len(group_data)} rows")

        imputer = get_imputer(method_name, method_registry)

        if previous_imputed is None:
            combined = group_data
        else:
            combined = pd.concat([previous_imputed, group_data])

        # Decide up front how to invoke the imputer. Catching TypeError around the
        # call would also swallow genuine TypeErrors raised inside the imputer and
        # trigger a second, misleading invocation.
        if hasattr(imputer, "fit_transform"):
            combined_imputed = pd.DataFrame(
                imputer.fit_transform(combined),
                columns=combined.columns,
                index=combined.index,
            )
        elif _accepts_random_state(imputer):
            combined_imputed = imputer(combined, random_state=random_state)
        else:
            combined_imputed = imputer(combined)

        # Keep only the rows of the current band and bound them by the observed range
        group_imputed = combined_imputed.loc[idx].clip(lower=global_min, upper=global_max, axis=1)

        imputed_df.loc[idx, cols] = group_imputed.loc[idx, cols].to_numpy(dtype="float64")
        method_log.loc[idx] = method_name

        if previous_imputed is None:
            previous_imputed = group_imputed
        else:
            previous_imputed = pd.concat([previous_imputed, group_imputed])

    if imputed_df.isnull().values.any():
        raise ValueError("NaNs remain after hierarchical imputation!")

    return (imputed_df, method_log) if return_method_log else imputed_df

In [15]:
def get_imputer(method_name, registry):
    """
    Look up an imputation method by name.

    Entries that expose both `fit` and `transform` are returned as a deep copy, so
    fitting never alters the instance stored in the registry. Any other entry
    (e.g. a plain function) is returned as is.

    Parameters:
        method_name (str): Key to look up.
        registry (dict): Mapping of method names to imputers.

    Returns:
        The imputer registered under `method_name` (copied for estimator objects).

    Raises:
        ValueError: If the name is missing from the registry or maps to None.
    """
    entry = registry.get(method_name)
    if entry is None:
        raise ValueError(f"Method '{method_name}' not found or not implemented.")

    is_estimator = hasattr(entry, "fit") and hasattr(entry, "transform")
    return copy.deepcopy(entry) if is_estimator else entry

In [None]:
# 25 missingness bands of equal width (4% each); the first 10 bands are imputed
# with the LSTM autoencoder, the remaining 15 with XGBoost.
thresholds = [0.04 for _ in range(25)]
method_names = 10 * ["lstm"] + 15 * ["xgboost"]

# Alternative assignment that was tried:
#method_names = ["iterative"] * 3 + ["lstm"] * 12 + ["xgboost"] * 10

# The dataset to impute is looked up by name in `dataframes`
# (the same object that the loading cell exposes as `o4_X_train`).
imputed_df, method_log = hierarchical_impute_dynamic(
    dataframes["o4_X_train"],
    thresholds,
    method_names,
    imputer_registry,
    random_state=0,
    return_method_log=True,
)

2025-05-04 01:55:44,758 - INFO - Group 2 (0.04, 0.08] -> lstm | 12 rows
2025-05-04 01:55:50,117 - INFO - Group 3 (0.08, 0.12] -> lstm | 120 rows
2025-05-04 01:55:56,383 - INFO - Group 4 (0.12, 0.16] -> lstm | 420 rows
2025-05-04 01:56:04,338 - INFO - Group 5 (0.16, 0.20] -> lstm | 1584 rows
2025-05-04 01:56:20,084 - INFO - Group 6 (0.20, 0.24] -> lstm | 2016 rows
2025-05-04 01:56:48,297 - INFO - Group 7 (0.24, 0.28] -> lstm | 2880 rows
2025-05-04 01:57:27,183 - INFO - Group 8 (0.28, 0.32] -> lstm | 3276 rows
2025-05-04 01:58:18,496 - INFO - Group 9 (0.32, 0.36] -> lstm | 4560 rows
2025-05-04 01:59:31,041 - INFO - Group 10 (0.36, 0.40] -> lstm | 3192 rows
2025-05-04 02:01:09,579 - INFO - Group 11 (0.40, 0.44] -> xgboost | 3072 rows


In [None]:
# Persist the imputation result and the per-row method log as CSV files.
# NOTE(review): the folder name says "o1_lstm-15_xgboost-10", but the run cell above
# imputes o4_X_train with 10 "lstm" and 15 "xgboost" groups — confirm the intended
# folder name before relying on it to identify the experiment.
output_path = "../CSV/exports/CIR-16/impute/o1_lstm-15_xgboost-10/"
os.makedirs(output_path, exist_ok=True)

# The imputed frame is written without its index; the method log keeps its index column.
imputed_df.to_csv(os.path.join(output_path, "o4_X_train_imputed.csv"), index=False)
method_log.to_csv(os.path.join(output_path, "o4_X_train_method_log.csv"))

In [None]:
# --- Tracking for Visualization ---
# The grouping is recomputed here from notebook-level inputs. The variables of the
# same name inside hierarchical_impute_dynamic are local to that function and do not
# exist in the notebook namespace, so they cannot be read from this cell.
source_df = dataframes["o4_X_train"]  # must be the frame that was passed to the imputer
missing_pct = source_df.isnull().mean(axis=1)

cum_thresholds = np.cumsum(np.asarray(thresholds, dtype=float))
if len(cum_thresholds) > 0 and np.isclose(cum_thresholds[-1], 1.0):
    # Guard against float accumulation leaving the last bound just below 1.0
    cum_thresholds[-1] = 1.0
lower_bounds = np.concatenate(([0.0], cum_thresholds[:-1]))

group_names = []
rows_per_group = []

# --- Count the rows of each (lower, upper] missingness band ---
for i, (lower_bound, upper_bound) in enumerate(zip(lower_bounds, cum_thresholds)):
    in_group = (missing_pct > lower_bound) & (missing_pct <= upper_bound)
    group_names.append(f"Group {i+1}\n({lower_bound:.2f}-{upper_bound:.2f})")
    rows_per_group.append(int(in_group.sum()))

cumulative_rows = np.cumsum(rows_per_group).tolist()
cumulative_total = cumulative_rows[-1] if cumulative_rows else 0

# --- Visualization ---
positions = list(range(1, len(cumulative_rows) + 1))

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(positions, cumulative_rows, marker='o', linestyle='-', color='blue')
ax.set_xticks(positions)
ax.set_xticklabels(group_names, rotation=45, ha='right')
ax.set_title("Cumulative Rows Used for Imputation Across Groups", fontsize=14, fontweight='bold')
ax.set_xlabel("Group (Missingness Range)", fontsize=12)
ax.set_ylabel("Cumulative Rows Used", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()

# --- Save the figure ---
output_dir = "figures/CIR-16"
os.makedirs(output_dir, exist_ok=True)
fig.savefig(os.path.join(output_dir, "cumulative_imputation_rows.png"), dpi=300)
plt.show()