# CIR-2 Baseline Imputation Methods

In [1]:
import pandas as pd
import numpy as np
import os
import io
import logging
import time
import datetime
import random
import matplotlib.pyplot as plt
import sys

from tqdm import tqdm

from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import BayesianRidge

In [2]:
# Initial logger setup: configure the root logger, so plain logging.info(...)
# calls made anywhere in the notebook go through the handlers installed here.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Global variable to hold the active file handler (None until switch_log_file runs)
current_file_handler = None

# Create the stream handler (to console)
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)


def switch_log_file(filename):
    """
    Route file logging to `filename`, replacing the previously active log file.

    The parent directory of `filename` is created when it does not exist yet.
    The new handler is opened before the old one is detached, so a failure to
    open `filename` leaves the current file handler in place.

    Parameters
    ----------
    filename : str
        Path of the log file to write to from now on.
    """
    global current_file_handler

    # logging.FileHandler does not create missing directories; a bare file
    # name has an empty dirname, which os.makedirs would reject.
    log_dir = os.path.dirname(filename)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Create the new file handler first: if this raises, nothing has changed.
    new_handler = logging.FileHandler(filename)
    new_handler.setFormatter(formatter)

    # If a file handler already exists, remove and close it
    if current_file_handler is not None:
        logger.removeHandler(current_file_handler)
        current_file_handler.close()

    current_file_handler = new_handler
    logger.addHandler(current_file_handler)

    logger.info(f"Switched logging to {filename}")

In [3]:
# Build log file: direct file logging to logs/CIR-2.log, then emit one record
# so the new file immediately shows that the switch took effect.
switch_log_file('logs/CIR-2.log')
logger.info("This is being logged to CIR-2.log")

2025-04-28 21:46:53,575 - INFO - Switched logging to logs/CIR-2.log
2025-04-28 21:46:53,578 - INFO - This is being logged to CIR-2.log


In [4]:
# Folder that holds the exported CSV splits
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
all_files = os.listdir(data_path)

logging.info("+++++++++++++++++CIR-2+++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Read every CSV into a dict keyed by a variable-friendly version of its
# file name (extension removed, dashes turned into underscores).
dataframes = {}
for file in all_files:
    if not file.endswith(".csv"):
        continue  # ignore anything that is not a CSV export
    var_name = file.replace(".csv", "").replace("-", "_")
    logging.info(f"Loading... -> {file}")
    dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Publish each frame as a notebook-level variable and report its shape
for var_name, df in dataframes.items():
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-04-28 21:46:53,592 - INFO - +++++++++++++++++CIR-2+++++++++++++++++++++++++
2025-04-28 21:46:53,593 - INFO - Start Loading Dataframes.
2025-04-28 21:46:53,594 - INFO - Loading... -> o1_X_external.csv
2025-04-28 21:47:00,730 - INFO - Loading... -> o1_X_test.csv
2025-04-28 21:47:01,299 - INFO - Loading... -> o1_X_train.csv
2025-04-28 21:47:05,432 - INFO - Loading... -> o1_X_validate.csv
2025-04-28 21:47:05,969 - INFO - Loading... -> o1_y_external_los.csv
2025-04-28 21:47:06,014 - INFO - Loading... -> o1_y_external_mortality.csv
2025-04-28 21:47:06,043 - INFO - Loading... -> o1_y_test_los.csv
2025-04-28 21:47:06,053 - INFO - Loading... -> o1_y_test_mortality.csv
2025-04-28 21:47:06,058 - INFO - Loading... -> o1_y_train_los.csv
2025-04-28 21:47:06,096 - INFO - Loading... -> o1_y_train_mortality.csv
2025-04-28 21:47:06,113 - INFO - Loading... -> o1_y_validate_los.csv
2025-04-28 21:47:06,124 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-04-28 21:47:06,128 - INFO - Loading... -

# CIR-23 Implement Iterative Imputer

In [5]:
# Tee: a minimal write-only stream that fans output out to several streams
class Tee:
    """Forward every write() and flush() to each of the wrapped streams."""

    def __init__(self, *files):
        # Kept as the tuple of positional arguments, in the order given
        self.files = files

    def write(self, obj):
        """Write `obj` to each stream in turn, flushing each one right after."""
        for stream in self.files:
            stream.write(obj)
            stream.flush()

    def flush(self):
        """Flush each wrapped stream."""
        for stream in self.files:
            stream.flush()

def impute_with_iterative(input_df, method, output_path, n_iter, log_verbose_file_path=None):
    """
    Impute missing values with scikit-learn's IterativeImputer and save the result.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Data to impute. A copy is fitted, the caller's frame is left alone.
    method : str
        Regressor used inside the imputer: "ExtraTrees", "BayesianRidge"
        or "HistGradientBoosting".
    output_path : str
        Destination CSV for the imputed data. The `describe()` summary is
        written next to it with "_describe" appended to the file stem.
    n_iter : int
        Passed to IterativeImputer as `max_iter`.
    log_verbose_file_path : str, optional
        When given, everything printed to stdout while fitting is also
        appended to this file.

    Returns
    -------
    pandas.DataFrame
        The imputed data.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names.
    """
    logging.info(f"Starting Iterative Imputer with method={method} on input DataFrame of shape {input_df.shape}.")

    data_copy = input_df.copy()

    # A bare file name has an empty dirname, which os.makedirs rejects.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Estimator selection
    if method == "ExtraTrees":
        estimator = ExtraTreesRegressor(n_estimators=10, random_state=0, n_jobs=-1)
    elif method == "BayesianRidge":
        estimator = BayesianRidge()
    elif method == "HistGradientBoosting":
        estimator = HistGradientBoostingRegressor(random_state=0)
    else:
        raise ValueError(f"Unsupported method: {method}. Use 'ExtraTrees', 'BayesianRidge', or 'HistGradientBoosting'.")

    # IterativeImputer
    imputer = IterativeImputer(
        estimator=estimator,
        max_iter=n_iter,
        random_state=0,
        verbose=2,  # Print to screen
        sample_posterior=False
    )

    start_time = time.time()

    if log_verbose_file_path is None:
        imputed_array = imputer.fit_transform(data_copy)
    else:
        log_dir = os.path.dirname(log_verbose_file_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        # Append rather than truncate: callers may pass the file that the
        # logging FileHandler is already writing to, and opening it with "w"
        # would wipe the records logged so far.
        with open(log_verbose_file_path, "a") as log_file:
            # Tee onto whatever stdout currently is. sys.__stdout__ is the
            # interpreter's start-up stream, which is not necessarily the
            # stream in use now and may even be None.
            original_stdout = sys.stdout
            sys.stdout = Tee(original_stdout, log_file)
            try:
                # Fit and transform
                imputed_array = imputer.fit_transform(data_copy)
            finally:
                sys.stdout = original_stdout

    runtime = time.time() - start_time

    # By default IterativeImputer leaves out features that were entirely
    # missing at fit time, so the result can be narrower than the input.
    # Label what is left instead of failing after the (long) fit.
    columns = data_copy.columns
    if imputed_array.shape[1] != len(columns):
        has_observed = data_copy.notna().any(axis=0).to_numpy()
        logging.warning(
            f"Imputer returned {imputed_array.shape[1]} of {len(columns)} columns; "
            f"columns without any observed value: {list(columns[~has_observed])}"
        )
        columns = columns[has_observed]

    # Save imputed data
    imputed_df = pd.DataFrame(imputed_array, columns=columns)
    imputed_df.to_csv(output_path, index=False)

    # Logging
    logging.info(f"Imputation completed in {runtime:.2f} seconds.")
    nan_count = np.isnan(imputed_df.values).sum()
    logging.info(f"Number of NaNs after imputation: {nan_count}")
    logging.info(f"Imputed dataset saved at {output_path}")
    summary = imputed_df.describe()
    logging.info(f"Basic statistics after imputation:\n{summary}")

    # Save full describe output to a separate CSV file. Build the name from
    # the file stem: str.replace(".csv", ...) returns output_path unchanged
    # when it has another extension, which would overwrite the data file.
    stem, extension = os.path.splitext(output_path)
    describe_output_path = f"{stem}_describe{extension or '.csv'}"
    summary.to_csv(describe_output_path)
    logging.info(f"Basic statistics saved at {describe_output_path}")

    return imputed_df

In [None]:
# Send log records for this run to their own file
switch_log_file('logs/CIR-23_ExtraTrees.log')
# ExtraTrees estimator
# NOTE(review): log_verbose_file_path below is the same file the logging
# handler was just switched to; confirm that the two writers can share it
# without one truncating or overwriting the other's output.
impute_with_iterative(
    input_df=o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o4_X_train_Iterative_ExtraTrees.csv",
    method="ExtraTrees",
    n_iter=3,
    log_verbose_file_path="logs/CIR-23_ExtraTrees.log"
)

2025-04-28 21:47:40,246 - INFO - Switched logging to logs/CIR-23_ExtraTrees.log
2025-04-28 21:47:40,248 - INFO - Starting Iterative Imputer with method=ExtraTrees on input DataFrame of shape (30624, 345).


In [None]:
# Send log records for this run to their own file and mark the switch
switch_log_file('logs/CIR-23_HistGradientBoosting.log')
logger.info("This is being logged to CIR-23_HistGradientBoosting.log")
# HistGradientBoosting estimator
# NOTE(review): log_verbose_file_path below is the same file the logging
# handler was just switched to; confirm that the two writers can share it
# without one truncating or overwriting the other's output.
impute_with_iterative(
    input_df=o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o4_X_train_Iterative_HistGradientBoosting.csv",
    method="HistGradientBoosting",
    n_iter=20,
    log_verbose_file_path="logs/CIR-23_HistGradientBoosting.log"
)

# Plot

In [None]:
def plot_before_after_distributions(input_df_before, input_df_after, output_folder, sample_features=None):
    """
    Save one overlaid before/after histogram per selected feature.

    Features are the columns of input_df_before. With sample_features=None
    every column is plotted; otherwise that many columns (capped at the
    number available) are picked at random. Each figure is written to
    output_folder as "<feature>_distribution_comparison.png".
    """
    os.makedirs(output_folder, exist_ok=True)

    features = list(input_df_before.columns)
    sampled_features = (
        features
        if sample_features is None
        else random.sample(features, min(sample_features, len(features)))
    )

    logging.info(f"Plotting distributions for features: {sampled_features}")

    for feature in sampled_features:
        fig, ax = plt.subplots(figsize=(10, 5))

        # Missing values are left out of both histograms
        before_values = input_df_before[feature].dropna()
        after_values = input_df_after[feature].dropna()
        ax.hist(before_values, bins=50, alpha=0.5, label='Before Imputation')
        ax.hist(after_values, bins=50, alpha=0.5, label='After Imputation')

        ax.set_title(f'Distribution of {feature}')
        ax.legend()
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')

        fig.tight_layout()
        fig.savefig(os.path.join(output_folder, f"{feature}_distribution_comparison.png"))
        plt.close(fig)

    logging.info(f"Distribution plots saved in {output_folder}")

def check_extreme_values(input_df_before, input_df_after, threshold=5.0):
    """
    Check for extreme outliers introduced after imputation.

    For every column of input_df_before, the column maximum and minimum are
    compared with those of the same column in input_df_after. An extreme is
    reported when the ratio after / before is larger than `threshold` or
    smaller than 1 / `threshold`. Working on the ratio keeps the test
    meaningful for negative values: an unchanged negative extreme has ratio 1
    and is not reported, while a sign flip gives a negative ratio and is.
    Extremes that are exactly 0 before imputation are skipped.

    Parameters
    ----------
    input_df_before : pandas.DataFrame
        Data before imputation; its columns define what is checked.
    input_df_after : pandas.DataFrame
        Data after imputation; must contain the same columns.
    threshold : float
        Allowed growth/shrink factor, must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per suspicious extreme with the columns
        "Feature", "Type" ('max' or 'min'), "Before_Value", "After_Value".

    Raises
    ------
    ValueError
        If `threshold` is not positive.
    """
    if threshold <= 0:
        raise ValueError(f"threshold must be positive, got {threshold}")

    suspicious_features = []

    for feature in input_df_before.columns:
        before_column = input_df_before[feature]
        after_column = input_df_after[feature]
        extremes = (
            ('max', before_column.max(), after_column.max()),
            ('min', before_column.min(), after_column.min()),
        )

        for kind, before_value, after_value in extremes:
            if before_value == 0:
                continue  # no meaningful ratio against a zero baseline
            # A NaN extreme yields a NaN ratio, for which both tests are False.
            ratio = after_value / before_value
            if ratio > threshold or ratio < 1.0 / threshold:
                suspicious_features.append((feature, kind, before_value, after_value))

    suspicious_df = pd.DataFrame(
        suspicious_features,
        columns=["Feature", "Type", "Before_Value", "After_Value"]
    )

    if not suspicious_df.empty:
        logging.warning(f"Found {len(suspicious_df)} suspicious extreme values after imputation!")

    return suspicious_df

In [None]:
# Choose which imputation run to inspect. The file-name prefix and the
# DataFrame are kept in separate variables: the paths below need the name
# as text, while the comparison helpers need the frame itself.
dataset_name = "o4_X_train"    # prefix used when the imputed CSV was exported
dataset = o4_X_train           # the matching frame before imputation
method_imputed = "ExtraTrees"  # estimator suffix used in the export name

# Load imputed data
data_imputed = pd.read_csv(f"CSV/exports/CRI-02/o1_impute_baselines/01_iterative/{dataset_name}_Iterative_{method_imputed}.csv")

# Plot distributions before vs after
plot_before_after_distributions(
    input_df_before=dataset,
    input_df_after=data_imputed,
    output_folder=f"figures/CRI-02/o1_impute_baselines/01_iterative/{method_imputed}/",
)

# Check for extreme values
extreme_values_df = check_extreme_values(
    input_df_before=dataset,
    input_df_after=data_imputed,
    threshold=5.0
)

# Save report if needed
if not extreme_values_df.empty:
    extreme_values_df.to_csv(f"CSV/exports/CRI-02/o1_impute_baselines/01_iterative/{dataset_name}_{method_imputed}_extreme_values.csv", index=False)
    logging.info("Extreme values report saved.")
else:
    logging.info("No suspicious extreme values detected.")

In [None]:
# Create a small subset of the dataframe (for faster testing)
small_data = o4_X_train.iloc[:450, :]  # first 450 rows, all columns

In [None]:
"""
Play with sample
"""

# Create a small subset of the dataframe (for faster testing)
small_data = o4_X_train.iloc[:450, :]  # first 450 rows, all columns

# Test imputation on small_data
# NOTE(review): the output file name says "o3" although the rows come from
# o4_X_train; a later cell reads this exact path, so confirm before renaming.
impute_with_iterative(
    input_df=small_data,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/small_o3_X_train_imputed_Iterative_ExtraTrees.csv",
    method="ExtraTrees",
    n_iter=20
)

In [None]:
def plot_before_after_distributions(input_df_before, input_df_after, output_folder, sample_features=10):
    """
    Save overlaid before/after histograms for a random selection of features.

    Up to `sample_features` columns of input_df_before are picked at random
    (fewer when the frame has fewer columns). Each figure is written to
    output_folder as "<feature>_distribution_comparison.png".
    """
    os.makedirs(output_folder, exist_ok=True)

    features = list(input_df_before.columns)
    how_many = min(sample_features, len(features))
    sampled_features = random.sample(features, how_many)

    logging.info(f"Plotting distributions for features: {sampled_features}")

    for feature in sampled_features:
        fig, ax = plt.subplots(figsize=(10, 5))

        # Missing values are left out of both histograms
        before_values = input_df_before[feature].dropna()
        after_values = input_df_after[feature].dropna()
        ax.hist(before_values, bins=50, alpha=0.5, label='Before Imputation')
        ax.hist(after_values, bins=50, alpha=0.5, label='After Imputation')

        ax.set_title(f'Distribution of {feature}')
        ax.legend()
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')

        fig.tight_layout()
        fig.savefig(os.path.join(output_folder, f"{feature}_distribution_comparison.png"))
        plt.close(fig)

    logging.info(f"Distribution plots saved in {output_folder}")

def check_extreme_values(input_df_before, input_df_after, threshold=5.0):
    """
    Check for extreme outliers introduced after imputation.

    For every column of input_df_before, the column maximum and minimum are
    compared with those of the same column in input_df_after. An extreme is
    reported when the ratio after / before is larger than `threshold` or
    smaller than 1 / `threshold`. Working on the ratio keeps the test
    meaningful for negative values: an unchanged negative extreme has ratio 1
    and is not reported, while a sign flip gives a negative ratio and is.
    Extremes that are exactly 0 before imputation are skipped.

    Parameters
    ----------
    input_df_before : pandas.DataFrame
        Data before imputation; its columns define what is checked.
    input_df_after : pandas.DataFrame
        Data after imputation; must contain the same columns.
    threshold : float
        Allowed growth/shrink factor, must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per suspicious extreme with the columns
        "Feature", "Type" ('max' or 'min'), "Before_Value", "After_Value".

    Raises
    ------
    ValueError
        If `threshold` is not positive.
    """
    if threshold <= 0:
        raise ValueError(f"threshold must be positive, got {threshold}")

    suspicious_features = []

    for feature in input_df_before.columns:
        before_column = input_df_before[feature]
        after_column = input_df_after[feature]
        extremes = (
            ('max', before_column.max(), after_column.max()),
            ('min', before_column.min(), after_column.min()),
        )

        for kind, before_value, after_value in extremes:
            if before_value == 0:
                continue  # no meaningful ratio against a zero baseline
            # A NaN extreme yields a NaN ratio, for which both tests are False.
            ratio = after_value / before_value
            if ratio > threshold or ratio < 1.0 / threshold:
                suspicious_features.append((feature, kind, before_value, after_value))

    suspicious_df = pd.DataFrame(
        suspicious_features,
        columns=["Feature", "Type", "Before_Value", "After_Value"]
    )

    if not suspicious_df.empty:
        logging.warning(f"Found {len(suspicious_df)} suspicious extreme values after imputation!")

    return suspicious_df

In [None]:
# Load the imputed small-sample data back from the CSV export
small_data_imputed = pd.read_csv("CSV/exports/CRI-02/o1_impute_baselines/01_iterative/small_o3_X_train_imputed_Iterative_ExtraTrees.csv")

# Plot distributions before vs after
plot_before_after_distributions(
    input_df_before=small_data,
    input_df_after=small_data_imputed,
    output_folder="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/plots_small_test/",
    # sample_features is not passed, so the function's default applies
)

# Check for extreme values
extreme_values_df = check_extreme_values(
    input_df_before=small_data,
    input_df_after=small_data_imputed,
    threshold=5.0
)

# Save the report only when something was flagged; log the outcome either way
if not extreme_values_df.empty:
    extreme_values_df.to_csv("CSV/exports/CRI-02/o1_impute_baselines/01_iterative/extreme_values_report_small_test.csv", index=False)
    logging.info("Extreme values report saved.")
else:
    logging.info("No suspicious extreme values detected.")

In [None]:
# ExtraTrees estimator on the full training frame (no verbose log file)
# NOTE(review): the output file name says "o3" although the input is
# o4_X_train; confirm which dataset label is intended.
impute_with_iterative(
    input_df=o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o3_X_train_imputed_Iterative_ExtraTrees.csv",
    method="ExtraTrees",
    n_iter=20
)

In [None]:
# BayesianRidge estimator on the full training frame (no verbose log file)
# NOTE(review): confirm that impute_with_iterative accepts
# method="BayesianRidge"; it raises ValueError for methods it does not handle.
# NOTE(review): the output file name says "o3" although the input is
# o4_X_train; confirm which dataset label is intended.
impute_with_iterative(
    input_df=o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o3_X_train_imputed_Iterative_BayesianRidge.csv",
    method="BayesianRidge",
    n_iter=20
)