# CIR-2: Baseline Imputation Methods

In [1]:
import pandas as pd
import numpy as np
import os
import io
import logging
import time
import datetime
import random
import matplotlib.pyplot as plt
import sys
import re

from tqdm import tqdm

from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.ensemble import ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import BayesianRidge

In [2]:
# Initial logger setup
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Global variable to hold the active file handler
current_file_handler = None

# Create the stream handler
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

def switch_log_file(filename):
    global current_file_handler

    # If a file handler already exists, remove and close it
    if current_file_handler:
        logger.removeHandler(current_file_handler)
        current_file_handler.close()

    # Create a new file handler
    current_file_handler = logging.FileHandler(filename)
    current_file_handler.setFormatter(formatter)
    logger.addHandler(current_file_handler)

    logger.info(f"Switched logging to {filename}")

In [3]:
# Build log file
switch_log_file('logs/CIR-2.log')
logger.info("This is being logged to CIR-2.log")

2025-05-03 12:14:26,093 - INFO - Switched logging to logs/CIR-2.log
2025-05-03 12:14:26,096 - INFO - This is being logged to CIR-2.log


In [4]:
# Load datasets
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
all_files = os.listdir(data_path)

logging.info("+++++++++++++++++CIR-2+++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Load CSVs into a dictionary of dataframes
dataframes = {}
for file in all_files:
    if file.endswith(".csv"):
        var_name = file.replace(".csv", "").replace("-", "_")
        logging.info(f"Loading... -> {file}")
        dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Log loaded datasets
for var_name, df in dataframes.items():
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-05-03 12:14:26,109 - INFO - +++++++++++++++++CIR-2+++++++++++++++++++++++++
2025-05-03 12:14:26,111 - INFO - Start Loading Dataframes.
2025-05-03 12:14:26,112 - INFO - Loading... -> o1_X_external.csv
2025-05-03 12:14:33,391 - INFO - Loading... -> o1_X_test.csv
2025-05-03 12:14:33,974 - INFO - Loading... -> o1_X_train.csv
2025-05-03 12:14:38,428 - INFO - Loading... -> o1_X_validate.csv
2025-05-03 12:14:39,000 - INFO - Loading... -> o1_y_external_los.csv
2025-05-03 12:14:39,050 - INFO - Loading... -> o1_y_external_mortality.csv
2025-05-03 12:14:39,084 - INFO - Loading... -> o1_y_test_los.csv
2025-05-03 12:14:39,096 - INFO - Loading... -> o1_y_test_mortality.csv
2025-05-03 12:14:39,101 - INFO - Loading... -> o1_y_train_los.csv
2025-05-03 12:14:39,150 - INFO - Loading... -> o1_y_train_mortality.csv
2025-05-03 12:14:39,169 - INFO - Loading... -> o1_y_validate_los.csv
2025-05-03 12:14:39,180 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-05-03 12:14:39,185 - INFO - Loading... -

# CIR-23: Implement Iterative Imputer

In [None]:
# Tee class to duplicate stdout
class Tee:
    def __init__(self, *files):
        self.files = files
    def write(self, obj):
        for f in self.files:
            f.write(obj)
            f.flush()
    def flush(self):
        for f in self.files:
            f.flush()

def impute_with_iterative(input_df, method, output_path, n_iter, log_verbose_file_path=None):
    logging.info(f"Starting Iterative Imputer with method={method} on input DataFrame of shape {input_df.shape}.")

    data_copy = input_df.copy()
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Estimator selection
    if method == "ExtraTrees":
        estimator = ExtraTreesRegressor(n_estimators=10, random_state=0, n_jobs=-1)
    elif method == "HistGradientBoosting":
        estimator = HistGradientBoostingRegressor(random_state=0)
    else:
        raise ValueError(f"Unsupported method: {method}. Use 'ExtraTrees', or 'HistGradientBoosting'.")

    # IterativeImputer
    imputer = IterativeImputer(
        estimator=estimator,
        max_iter=n_iter,
        random_state=0,
        verbose=2,
        sample_posterior=False
    )

    start_time = time.time()

    # Handle logging duplication
    if log_verbose_file_path is not None:
        log_dir = os.path.dirname(log_verbose_file_path)
        os.makedirs(log_dir, exist_ok=True)

        original_stdout = sys.stdout  # Save original stdout
        with open(log_verbose_file_path, "w") as log_file:
            sys.stdout = Tee(sys.__stdout__, log_file)

            try:
                # Fit and transform
                imputed_array = imputer.fit_transform(data_copy)
            finally:
                sys.stdout = original_stdout  # Restore stdout
    else:
        # If no logging redirection
        imputed_array = imputer.fit_transform(data_copy)

    end_time = time.time()
    runtime = end_time - start_time

    # Save imputed data
    imputed_df = pd.DataFrame(imputed_array, columns=data_copy.columns)
    imputed_df.to_csv(output_path, index=False)

    # Logging
    logging.info(f"Imputation completed in {runtime:.2f} seconds.")
    nan_count = np.isnan(imputed_df.values).sum()
    logging.info(f"Number of NaNs after imputation: {nan_count}")
    logging.info(f"Imputed dataset saved at {output_path}")
    logging.info(f"Basic statistics after imputation:\n{imputed_df.describe()}")

    # Save full describe output to a separate CSV file
    # The logging.info could not represent all the statistics
    describe_output_path = output_path.replace(".csv", "_describe.csv")
    imputed_df.describe().to_csv(describe_output_path)
    logging.info(f"Basic statistics saved at {describe_output_path}")

    return imputed_df

In [None]:
# Create log file
switch_log_file('logs/CIR-23_ExtraTrees.log')
logger.info("This is being logged to CIR-23_ExtraTrees.log")

# ExtraTrees estimator
impute_with_iterative(
    input_df=o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o4_X_train_Iterative_ExtraTrees.csv",
    method="ExtraTrees",
    n_iter=3,
    log_verbose_file_path="logs/CIR-23_ExtraTrees.log"
)

In [None]:
# Create log file
switch_log_file('logs/CIR-23_HistGradientBoosting.log')
logger.info("This is being logged to CIR-23_HistGradientBoosting.log")

# HistGradientBoosting estimator
impute_with_iterative(
    input_df=o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o4_X_train_Iterative_HistGradientBoosting.csv",
    method="HistGradientBoosting",
    n_iter=20,
    log_verbose_file_path="logs/CIR-23_HistGradientBoosting.log"
)

# CIR-24 Implement Knn Imputer

In [None]:
# Build log file
switch_log_file('logs/CIR-24.log')
logger.info("This is being logged to CIR-24.log")

In [None]:
def impute_with_knn(input_df, output_path, n_neighbors=5, weights="uniform"):

    logging.info(f"Starting KNN Imputer on input DataFrame of shape {input_df.shape}.")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    data_copy = input_df.copy()

    # Start timing
    start_time = time.time()

    # Create and apply KNNImputer
    imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
    imputed_array = imputer.fit_transform(data_copy)

    # End timing
    end_time = time.time()
    runtime = end_time - start_time

    # Convert to DataFrame and cast to float32
    imputed_df = pd.DataFrame(imputed_array, columns=data_copy.columns).astype(np.float32)
    imputed_df.to_csv(output_path, index=False)

    # Logging
    logging.info(f"KNN Imputation completed in {runtime:.2f} seconds.")
    nan_count = np.isnan(imputed_df.values).sum()
    logging.info(f"Number of NaNs after imputation: {nan_count}")
    logging.info(f"Imputed dataset saved at {output_path}")

    # Save .describe() summary
    describe_output_path = output_path.replace(".csv", "_describe.csv")
    imputed_df.describe().to_csv(describe_output_path)
    logging.info(f"Basic statistics saved at {describe_output_path}")

    return imputed_df

# Run KNN Imputer
o4_X_train_knn_imputed = impute_with_knn(
    input_df=o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_distance.csv",
    n_neighbors=5,
    weights="distance"  # "uniform" or "distance" 
)

In [None]:
# Run KNN Imputer
o4_X_train_knn_imputed = impute_with_knn(
    input_df=o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_distance.csv",
    n_neighbors=5,
    weights="distance"  # "uniform" or "distance" 
)

# CIR-25: Implement Mean - Median Imputer

In [None]:
# Build log file
switch_log_file('logs/CIR-25.log')
logger.info("This is being logged to CIR-25.log")

In [None]:
def impute_with_simple(input_df, strategy, output_path):

    logging.info(f"Starting {strategy.capitalize()} Imputer on input DataFrame of shape {input_df.shape}.")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    data_copy = input_df.copy()

    start_time = time.time()
    
    imputer = SimpleImputer(strategy=strategy)
    imputed_array = imputer.fit_transform(data_copy)

    end_time = time.time()
    runtime = end_time - start_time

    imputed_df = pd.DataFrame(imputed_array, columns=data_copy.columns).astype(np.float32)
    imputed_df.to_csv(output_path, index=False)

    logging.info(f"{strategy.capitalize()} Imputation completed in {runtime:.2f} seconds.")
    logging.info(f"Number of NaNs after imputation: {np.isnan(imputed_df.values).sum()}")
    logging.info(f"Imputed dataset saved at {output_path}")

    # Save statistics
    describe_output_path = output_path.replace(".csv", "_describe.csv")
    imputed_df.describe().to_csv(describe_output_path)
    logging.info(f"Basic statistics saved at {describe_output_path}")

    return imputed_df, runtime

In [None]:
# Mean Imputation
mean_imputed, runtime_mean = impute_with_simple(
    input_df=o4_X_train,
    strategy="mean",
    output_path="CSV/exports/CRI-02/o1_impute_baselines/03_mean/o4_X_train_Mean.csv"
)

# Median Imputation
median_imputed, runtime_median = impute_with_simple(
    input_df=o4_X_train,
    strategy="median",
    output_path="CSV/exports/CRI-02/o1_impute_baselines/04_median/o4_X_train_Median.csv"
)


# Histogram Plot

In [None]:
# Build log file
switch_log_file('logs/CIR-24.log')
logger.info("This is being logged to CIR-24.log")

In [None]:
def sanitize_filename(name):
    # Replace characters that are not safe in filenames
    return re.sub(r'[\\/*?:"<>|]', "_", name)

def plot_before_after_distributions(input_df_before, input_df_after, output_folder, sample_features=None):
    os.makedirs(output_folder, exist_ok=True)

    features = list(input_df_before.columns)

    # If sample_features is None ➔ use all features
    if sample_features is None:
        sampled_features = features
    else:
        sampled_features = random.sample(features, min(sample_features, len(features)))

    logging.info(f"Plotting distributions for features: {sampled_features}")

    for feature in sampled_features:
        plt.figure(figsize=(10, 5))
        plt.hist(input_df_before[feature].dropna(), bins=50, alpha=0.5, label='Before Imputation')
        plt.hist(input_df_after[feature].dropna(), bins=50, alpha=0.5, label='After Imputation')

        plt.title(f'Distribution of {feature}')
        plt.legend()
        plt.xlabel('Value')
        plt.ylabel('Frequency')
        plt.tight_layout()

        # ✅ Use sanitized feature name for the filename
        safe_feature_name = sanitize_filename(feature)
        plot_path = os.path.join(output_folder, f"{safe_feature_name}_distribution_comparison.png")

        plt.savefig(plot_path)
        plt.close()

    logging.info(f"Distribution plots saved in {output_folder}")


def check_extreme_values(input_df_before, input_df_after, threshold=5.0):
    """
    Check for extreme outliers introduced after imputation.
    """
    suspicious_features = []

    for feature in input_df_before.columns:
        before_max = input_df_before[feature].max()
        before_min = input_df_before[feature].min()
        after_max = input_df_after[feature].max()
        after_min = input_df_after[feature].min()

        if before_max != 0 and (after_max > threshold * before_max or after_max < before_max / threshold):
            suspicious_features.append((feature, 'max', before_max, after_max))
        if before_min != 0 and (after_min < before_min / threshold or after_min > threshold * before_min):
            suspicious_features.append((feature, 'min', before_min, after_min))

    suspicious_df = pd.DataFrame(
        suspicious_features, 
        columns=["Feature", "Type", "Before_Value", "After_Value"]
    )

    if not suspicious_df.empty:
        logging.warning(f"Found {len(suspicious_df)} suspicious extreme values after imputation!")

    return suspicious_df

In [None]:
# Load data

# ExtraTrees
#switch_log_file('logs/CIR-27.log')
#logger.info("This is being logged to CIR-24.log")
#data_imputed = pd.read_csv(f"CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o4_X_train_Iterative_ExtraTrees.csv")
#method_imputed = "ExtraTrees"
#dataset = o4_X_train
#dataset_name = "o4_X_train"

# HistGradientBoosting
#switch_log_file('logs/CIR-27.log')
#logger.info("This is being logged to CIR-24.log")
#data_imputed = pd.read_csv(f"CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o4_X_train_Iterative_HistGradientBoosting.csv")
#method_imputed = "HistGradientBoosting"
#dataset = o4_X_train
#dataset_name = "o4_X_train"

# knn
#switch_log_file('logs/CIR-24.log')
#logger.info("This is being logged to CIR-24.log")
#data_imputed = pd.read_csv(f"CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_distance.csv")
#method_imputed = "knn_distance"
#dataset = o4_X_train
#dataset_name = "o4_X_train"

# mean
#switch_log_file('logs/CIR-25.log')
#logger.info("This is being logged to CIR-25.log")
#data_imputed = pd.read_csv(f"CSV/exports/CRI-02/o1_impute_baselines/03_mean/o4_X_train_Mean.csv")
#method_imputed = "Mean"
#dataset = o4_X_train
#dataset_name = "o4_X_train"

# Median
switch_log_file('logs/CIR-25.log')
logger.info("This is being logged to CIR-25.log")
data_imputed = pd.read_csv(f"CSV/exports/CRI-02/o1_impute_baselines/04_median/o4_X_train_Median.csv")
method_imputed = "Median"
dataset = o4_X_train
dataset_name = "o4_X_train"

# Plot distributions before vs after
plot_before_after_distributions(
    input_df_before=dataset,
    input_df_after=data_imputed,
    output_folder=f"figures/CRI-27/o1_impute_baselines/01_iterative/{method_imputed}/",
)

# Check for extreme values
extreme_values_df = check_extreme_values(
    input_df_before=dataset,
    input_df_after=data_imputed,
    threshold=5.0
)

# Save report if needed

output_folder = "CSV/exports/CRI-02/00_extreme_values/"
os.makedirs(output_folder, exist_ok=True) 

if not extreme_values_df.empty:
    output_path = os.path.join(output_folder, f"{dataset_name}_{method_imputed}_extreme_values.csv")
    extreme_values_df.to_csv(output_path, index=False)
    logging.info("Extreme values report saved.")
else:
    logging.info("No suspicious extreme values detected.")

logging.info("+++++++++++++++++++++++++++++++++++++++++++")

# Comparison
## Compaire imputation techniques

In [7]:
mean_path = os.path.join("CSV", "exports", "CRI-02", "o1_impute_baselines", "03_mean", "o4_X_train_Mean.csv")
median_path = os.path.join("CSV", "exports", "CRI-02", "o1_impute_baselines", "04_median", "o4_X_train_Median.csv")
knn_distance = os.path.join("CSV", "exports", "CRI-02", "o1_impute_baselines", "02_knn", "o4_X_train_KNN_distance.csv")
knn_uniform = os.path.join("CSV", "exports", "CRI-02", "o1_impute_baselines", "02_knn", "o4_X_train_KNN_distance.csv")
iter_extratrees_path = os.path.join("CSV", "exports", "CRI-02", "o1_impute_baselines", "01_iterative", "o4_X_train_Iterative_ExtraTrees.csv")
iter_hgb_path = os.path.join("CSV", "exports", "CRI-02", "o1_impute_baselines", "01_iterative", "o4_X_train_Iterative_HistGradientBoosting.csv")

# Load original and imputed datasets
#o4_X_train = pd.read_csv(mean_path).astype(np.float32)
o4_X_train_Mean = pd.read_csv(mean_path).astype(np.float32)
o4_X_train_Median = pd.read_csv(median_path).astype(np.float32)
o4_X_train_KNN_distance = pd.read_csv(knn_distance).astype(np.float32)
o4_X_train_KNN_uniform = pd.read_csv(knn_uniform).astype(np.float32)
o4_X_train_Iterative_ExtraTrees = pd.read_csv(iter_extratrees_path).astype(np.float32)
o4_X_train_Iterative_HistGradientBoosting = pd.read_csv(iter_hgb_path).astype(np.float32)

In [11]:
def summarize_imputation_results(original_df, imputed_df, method_name, extreme_values_path):
    # Detect where original data had NaNs
    mask_missing = original_df.isna()

    # Calculate difference only at missing locations
    diff = np.abs(imputed_df - original_df)
    mean_change = diff[mask_missing].mean().mean()

    # Count remaining NaNs
    nan_count = np.isnan(imputed_df.values).sum()

    # Load extreme values file and count
    if os.path.exists(extreme_values_path):
        extreme_df = pd.read_csv(extreme_values_path)
        n_extreme = len(extreme_df)
    else:
        n_extreme = 0
        logging.warning(f"Extreme values file not found for method {method_name}: {extreme_values_path}")

    return {
        "method": method_name,
        "nan_count": nan_count,
        "mean_change": round(mean_change, 4),
        "n_extreme_values": n_extreme
    }


In [12]:
results = []

base_extreme_dir = os.path.join("CSV", "exports", "CRI-02", "00_extreme_values")

results.append(summarize_imputation_results(
    o4_X_train, o4_X_train_Mean, "Mean",
    os.path.join(base_extreme_dir, "o4_X_train_Mean_extreme_values.csv")
))

results.append(summarize_imputation_results(
    o4_X_train, o4_X_train_Median, "Median",
    os.path.join(base_extreme_dir, "o4_X_train_Median_extreme_values.csv")
))

results.append(summarize_imputation_results(
    o4_X_train, o4_X_train_KNN_distance, "KNN (distance)",
    os.path.join(base_extreme_dir, "o4_X_train_knn_distance_extreme_values.csv")
))

results.append(summarize_imputation_results(
    o4_X_train, o4_X_train_KNN_uniform, "KNN (uniform)",
    os.path.join(base_extreme_dir, "o4_X_train_knn_uniform_extreme_values.csv")
))

results.append(summarize_imputation_results(
    o4_X_train, o4_X_train_Iterative_ExtraTrees, "Iterative (ExtraTrees)",
    os.path.join(base_extreme_dir, "o4_X_train_ExtraTrees_extreme_values.csv")
))

results.append(summarize_imputation_results(
    o4_X_train, o4_X_train_Iterative_HistGradientBoosting, "Iterative (HistGradientBoosting)",
    os.path.join(base_extreme_dir, "o4_X_train_HistGradientBoosting_extreme_values.csv")
))


In [14]:
# Convert the list of dictionaries into a DataFrame
results_df = pd.DataFrame(results)

# Sort by mean change (optional)
results_df = results_df.sort_values(by="mean_change", ascending=True).reset_index(drop=True)

# Display
display(results_df)

Unnamed: 0,method,nan_count,mean_change,n_extreme_values
0,Mean,0,,17
1,Median,0,,17
2,KNN (distance),0,,17
3,KNN (uniform),0,,17
4,Iterative (ExtraTrees),0,,17
5,Iterative (HistGradientBoosting),0,,47


# Testing Field

In [None]:
# Create a small subset of the dataframe (for faster testing)
small_data = o4_X_train.iloc[:450, :]  # pick first 50 features