# CIR-2 Baseline Imputation Methods

In [1]:
import pandas as pd
import numpy as np
import os
import logging
import time

import random
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge

In [2]:
# Set up logging: every record goes both to a log file and to the cell output.
# FileHandler opens its file as soon as it is constructed and raises if the
# parent directory is missing, so make sure "logs/" exists first.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("logs/CIR-22.log"),
        logging.StreamHandler()
    ]
)

In [3]:
# Directory that holds the CSV exports to load
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
# os.listdir returns entries in arbitrary order; sort them so the load order
# (and therefore the log) is the same on every machine and every run.
all_files = sorted(os.listdir(data_path))

logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Load every CSV into a dictionary of float32 dataframes.
# The key is the file name without ".csv" and with "-" replaced by "_".
dataframes = {}
for file in all_files:
    if file.endswith(".csv"):
        var_name = file.replace(".csv", "").replace("-", "_")
        logging.info(f"Loading... -> {file}")
        dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Expose each dataframe as a notebook-level variable named after its key
# (later cells refer to them directly by name) and log its shape.
for var_name, df in dataframes.items():
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-04-27 17:14:52,818 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-04-27 17:14:52,820 - INFO - Start Loading Dataframes.
2025-04-27 17:14:52,821 - INFO - Loading... -> o1_X_external.csv
2025-04-27 17:14:59,798 - INFO - Loading... -> o1_X_test.csv
2025-04-27 17:15:00,337 - INFO - Loading... -> o1_X_train.csv
2025-04-27 17:15:04,801 - INFO - Loading... -> o1_X_validate.csv
2025-04-27 17:15:05,339 - INFO - Loading... -> o1_y_external_los.csv
2025-04-27 17:15:05,382 - INFO - Loading... -> o1_y_external_mortality.csv
2025-04-27 17:15:05,410 - INFO - Loading... -> o1_y_test_los.csv
2025-04-27 17:15:05,420 - INFO - Loading... -> o1_y_test_mortality.csv
2025-04-27 17:15:05,425 - INFO - Loading... -> o1_y_train_los.csv
2025-04-27 17:15:05,462 - INFO - Loading... -> o1_y_train_mortality.csv
2025-04-27 17:15:05,480 - INFO - Loading... -> o1_y_validate_los.csv
2025-04-27 17:15:05,489 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-04-27 17:15:05,495 - INFO - Loading... -> o2_

# CIR-23 Implement Iterative Imputer

In [4]:
def impute_with_iterative(input_df, output_path, method, n_iter):
    """
    Impute missing values in a dataframe using IterativeImputer with
    ExtraTreesRegressor or BayesianRidge, and save the result as a CSV.

    Parameters:
        input_df (pd.DataFrame): Input dataframe to impute. The imputer works
            on a copy of it.
        output_path (str): Path to save the imputed CSV. Missing parent
            directories are created.
        method (str): "ExtraTrees" or "BayesianRidge".
        n_iter (int): Number of max iterations for the imputer.

    Returns:
        pd.DataFrame: The imputed data with a default (0..n-1) row index.
            Columns that the imputer dropped because they had no observed
            value are absent from the result.

    Raises:
        ValueError: If method is not "ExtraTrees" or "BayesianRidge".
    """
    # Choose the estimator first, so an unsupported method fails before any
    # directory is created or any data is copied.
    if method == "ExtraTrees":
        estimator = ExtraTreesRegressor(
            n_estimators=10,
            random_state=0,
            n_jobs=-1
        )
    elif method == "BayesianRidge":
        estimator = BayesianRidge()
    else:
        raise ValueError(f"Unsupported method: {method}. Use 'ExtraTrees' or 'BayesianRidge'.")

    # Start
    logging.info(f"Starting Iterative Imputer with method={method} on input DataFrame of shape {input_df.shape}.")

    # Copy input
    data_copy = input_df.copy()

    # Create the output folder if needed. A bare file name has an empty
    # dirname, and os.makedirs("") raises, so only create when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Create imputer
    imputer = IterativeImputer(
        estimator=estimator,
        max_iter=n_iter,
        random_state=0,
        verbose=2,  # Show progress per iteration
        sample_posterior=False
    )

    # Time only the fit/transform step
    start_time = time.time()
    imputed_array = imputer.fit_transform(data_copy)
    runtime = time.time() - start_time

    # By default the imputer discards columns that contain no observed value
    # at all, so the result can be narrower than the input. Label the result
    # with the surviving columns instead of failing on a width mismatch.
    column_labels = data_copy.columns
    if imputed_array.shape[1] != len(column_labels):
        has_observed = data_copy.notna().any(axis=0).to_numpy()
        column_labels = data_copy.columns[has_observed]
        dropped_columns = list(data_copy.columns[~has_observed])
        logging.warning(
            f"Imputer dropped {len(dropped_columns)} column(s) with no observed values: {dropped_columns}"
        )

    # Convert back to DataFrame
    imputed_df = pd.DataFrame(
        imputed_array,
        columns=column_labels
    )

    # Save
    imputed_df.to_csv(output_path, index=False)

    # Logs
    logging.info(f"Imputation completed in {runtime:.2f} seconds.")
    nan_count = np.isnan(imputed_df.values).sum()
    logging.info(f"Number of NaNs after imputation: {nan_count}")
    logging.info(f"Imputed dataset saved at {output_path}")
    logging.info(f"Basic statistics after imputation:\n{imputed_df.describe()}")

    return imputed_df

In [None]:
# Impute o4_X_train with the ExtraTrees estimator (at most 20 imputer iterations).
# NOTE(review): the input is o4_X_train but the output file is named o3_X_train —
# confirm which dataset is intended.
impute_with_iterative(
    o4_X_train,
    "CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o3_X_train_imputed_Iterative_ExtraTrees.csv",
    "ExtraTrees",
    20,
)

2025-04-27 17:15:28,549 - INFO - Starting Iterative Imputer with method=ExtraTrees on input DataFrame of shape (30624, 345).


[IterativeImputer] Completing matrix with shape (30624, 345)


In [None]:
# Impute o4_X_train with the BayesianRidge estimator (at most 20 imputer iterations).
# NOTE(review): the input is o4_X_train but the output file is named o3_X_train —
# confirm which dataset is intended.
impute_with_iterative(
    o4_X_train,
    "CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o3_X_train_imputed_Iterative_BayesianRidge.csv",
    "BayesianRidge",
    20,
)

# Plot Distributions and Check Extreme Values

In [None]:
def plot_before_after_distributions(input_df_before, input_df_after, output_folder, sample_features=None):
    """
    Plot before and after distributions for selected features and save the figures.

    One PNG per feature is written to output_folder, named
    "<feature>_distribution_comparison.png". Missing values are dropped before
    plotting, and both histograms of a feature share the same 50 bins so that
    their bar heights can be compared directly.

    Parameters:
        input_df_before (pd.DataFrame): Data before imputation; its columns
            define the candidate features.
        input_df_after (pd.DataFrame): Data after imputation. Features that
            are not present in it are skipped with a warning.
        output_folder (str): Folder for the PNG files; created if missing.
        sample_features (int or None): Number of randomly chosen features to
            plot. If None, all features are plotted.
    """
    os.makedirs(output_folder, exist_ok=True)

    features = list(input_df_before.columns)

    # If sample_features is None ➔ use all features
    if sample_features is None:
        sampled_features = features
    else:
        sampled_features = random.sample(features, min(sample_features, len(features)))

    logging.info(f"Plotting distributions for features: {sampled_features}")

    for feature in sampled_features:
        # A column can be absent after imputation (e.g. dropped because it had
        # no observed values); skip it instead of failing on the lookup.
        if feature not in input_df_after.columns:
            logging.warning(f"Skipping {feature}: column not present after imputation.")
            continue

        before_values = input_df_before[feature].dropna()
        after_values = input_df_after[feature].dropna()

        # Compute the bin edges once over both samples. Separate bins=50 calls
        # would give each histogram its own bin width.
        bin_edges = np.histogram_bin_edges(
            np.concatenate([before_values.to_numpy(), after_values.to_numpy()]),
            bins=50
        )

        fig = plt.figure(figsize=(10,5))
        try:
            plt.hist(before_values, bins=bin_edges, alpha=0.5, label='Before Imputation')
            plt.hist(after_values, bins=bin_edges, alpha=0.5, label='After Imputation')

            plt.title(f'Distribution of {feature}')
            plt.legend()
            plt.xlabel('Value')
            plt.ylabel('Frequency')

            plt.tight_layout()

            plot_path = os.path.join(output_folder, f"{feature}_distribution_comparison.png")
            plt.savefig(plot_path)
        finally:
            # Always release the figure, even if saving fails, so figures do
            # not pile up in memory across the loop.
            plt.close(fig)

    logging.info(f"Distribution plots saved in {output_folder}")

def check_extreme_values(input_df_before, input_df_after, threshold=5.0):
    """
    Check for extreme outliers introduced after imputation.

    For every column of input_df_before, the column maximum and minimum are
    compared with those of the same column in input_df_after. An extreme is
    reported when its magnitude grew beyond threshold times the original,
    shrank below the original divided by threshold, or changed sign.
    Extremes whose original value is exactly 0 are not checked, because no
    ratio can be formed for them.

    Parameters:
        input_df_before (pd.DataFrame): Data before imputation.
        input_df_after (pd.DataFrame): Data after imputation. Columns that are
            not present in it are skipped with a warning.
        threshold (float): Allowed ratio between the new and the original
            extreme.

    Returns:
        pd.DataFrame: One row per suspicious extreme, with the columns
            "Feature", "Type" ('max' or 'min'), "Before_Value", "After_Value".
            Empty if nothing is suspicious.
    """
    suspicious_features = []
    missing_after = []

    for feature in input_df_before.columns:
        if feature not in input_df_after.columns:
            missing_after.append(feature)
            continue

        extremes = (
            ('max', input_df_before[feature].max(), input_df_after[feature].max()),
            ('min', input_df_before[feature].min(), input_df_after[feature].min()),
        )

        for kind, before_value, after_value in extremes:
            if before_value == 0:
                continue

            # Compare magnitudes. For a negative reference, multiplying or
            # dividing by the threshold flips the direction of the test, which
            # would flag values that did not change at all; mirroring both
            # values onto the positive side keeps one test valid for both signs.
            if before_value < 0:
                reference, candidate = -before_value, -after_value
            else:
                reference, candidate = before_value, after_value

            if candidate > threshold * reference or candidate < reference / threshold:
                suspicious_features.append((feature, kind, before_value, after_value))

    if missing_after:
        logging.warning(f"Skipped {len(missing_after)} column(s) not present after imputation: {missing_after}")

    suspicious_df = pd.DataFrame(
        suspicious_features,
        columns=["Feature", "Type", "Before_Value", "After_Value"]
    )

    if not suspicious_df.empty:
        logging.warning(f"Found {len(suspicious_df)} suspicious extreme values after imputation!")

    return suspicious_df

In [None]:
# Play with sample: try the imputer on a small slice before the full run.

# Keep only the first 500 rows (all columns) so the test finishes quickly
small_data = o4_X_train.iloc[:500]

# ExtraTrees imputation of the sample, written to its own "small_" CSV
impute_with_iterative(
    small_data,
    "CSV/exports/CRI-02/o1_impute_baselines/01_iterative/small_o3_X_train_imputed_Iterative_ExtraTrees.csv",
    "ExtraTrees",
    20,
)

In [None]:
def plot_before_after_distributions(input_df_before, input_df_after, output_folder, sample_features=10):
    """
    Plot before and after distributions for random features and save the figures.

    One PNG per feature is written to output_folder, named
    "<feature>_distribution_comparison.png". Missing values are dropped before
    plotting, and both histograms of a feature share the same 50 bins so that
    their bar heights can be compared directly.

    NOTE(review): this notebook defines a function with this name twice; this
    later definition is the one in effect for the cells that follow it.

    Parameters:
        input_df_before (pd.DataFrame): Data before imputation; its columns
            define the candidate features.
        input_df_after (pd.DataFrame): Data after imputation. Features that
            are not present in it are skipped with a warning.
        output_folder (str): Folder for the PNG files; created if missing.
        sample_features (int or None): Number of randomly chosen features to
            plot (default 10). If None, all features are plotted.
    """
    os.makedirs(output_folder, exist_ok=True)

    features = list(input_df_before.columns)

    # None means "plot everything"; min(None, ...) would raise a TypeError
    if sample_features is None:
        sampled_features = features
    else:
        sampled_features = random.sample(features, min(sample_features, len(features)))

    logging.info(f"Plotting distributions for features: {sampled_features}")

    for feature in sampled_features:
        # A column can be absent after imputation (e.g. dropped because it had
        # no observed values); skip it instead of failing on the lookup.
        if feature not in input_df_after.columns:
            logging.warning(f"Skipping {feature}: column not present after imputation.")
            continue

        before_values = input_df_before[feature].dropna()
        after_values = input_df_after[feature].dropna()

        # Compute the bin edges once over both samples. Separate bins=50 calls
        # would give each histogram its own bin width.
        bin_edges = np.histogram_bin_edges(
            np.concatenate([before_values.to_numpy(), after_values.to_numpy()]),
            bins=50
        )

        fig = plt.figure(figsize=(10,5))
        try:
            plt.hist(before_values, bins=bin_edges, alpha=0.5, label='Before Imputation')
            plt.hist(after_values, bins=bin_edges, alpha=0.5, label='After Imputation')

            plt.title(f'Distribution of {feature}')
            plt.legend()
            plt.xlabel('Value')
            plt.ylabel('Frequency')

            plt.tight_layout()

            plot_path = os.path.join(output_folder, f"{feature}_distribution_comparison.png")
            plt.savefig(plot_path)
        finally:
            # Always release the figure, even if saving fails, so figures do
            # not pile up in memory across the loop.
            plt.close(fig)

    logging.info(f"Distribution plots saved in {output_folder}")

def check_extreme_values(input_df_before, input_df_after, threshold=5.0):
    """
    Check for extreme outliers introduced after imputation.

    For every column of input_df_before, the column maximum and minimum are
    compared with those of the same column in input_df_after. An extreme is
    reported when its magnitude grew beyond threshold times the original,
    shrank below the original divided by threshold, or changed sign.
    Extremes whose original value is exactly 0 are not checked, because no
    ratio can be formed for them.

    NOTE(review): this notebook defines a function with this name twice; this
    later definition is the one in effect for the cells that follow it.

    Parameters:
        input_df_before (pd.DataFrame): Data before imputation.
        input_df_after (pd.DataFrame): Data after imputation. Columns that are
            not present in it are skipped with a warning.
        threshold (float): Allowed ratio between the new and the original
            extreme.

    Returns:
        pd.DataFrame: One row per suspicious extreme, with the columns
            "Feature", "Type" ('max' or 'min'), "Before_Value", "After_Value".
            Empty if nothing is suspicious.
    """
    suspicious_features = []
    missing_after = []

    for feature in input_df_before.columns:
        if feature not in input_df_after.columns:
            missing_after.append(feature)
            continue

        extremes = (
            ('max', input_df_before[feature].max(), input_df_after[feature].max()),
            ('min', input_df_before[feature].min(), input_df_after[feature].min()),
        )

        for kind, before_value, after_value in extremes:
            if before_value == 0:
                continue

            # Compare magnitudes. For a negative reference, multiplying or
            # dividing by the threshold flips the direction of the test, which
            # would flag values that did not change at all; mirroring both
            # values onto the positive side keeps one test valid for both signs.
            if before_value < 0:
                reference, candidate = -before_value, -after_value
            else:
                reference, candidate = before_value, after_value

            if candidate > threshold * reference or candidate < reference / threshold:
                suspicious_features.append((feature, kind, before_value, after_value))

    if missing_after:
        logging.warning(f"Skipped {len(missing_after)} column(s) not present after imputation: {missing_after}")

    suspicious_df = pd.DataFrame(
        suspicious_features,
        columns=["Feature", "Type", "Before_Value", "After_Value"]
    )

    if not suspicious_df.empty:
        logging.warning(f"Found {len(suspicious_df)} suspicious extreme values after imputation!")

    return suspicious_df

In [None]:
# Read back the imputed sample that the small-sample ExtraTrees run saved
small_data_imputed = pd.read_csv(
    "CSV/exports/CRI-02/o1_impute_baselines/01_iterative/small_o3_X_train_imputed_Iterative_ExtraTrees.csv"
)

# Before-vs-after histograms; sample_features is not passed, so the
# function's own default decides how many features get plotted
plot_before_after_distributions(
    small_data,
    small_data_imputed,
    "CSV/exports/CRI-02/o1_impute_baselines/01_iterative/plots_small_test/",
)

# Compare column extremes before and after imputation
extreme_values_df = check_extreme_values(small_data, small_data_imputed, threshold=5.0)

# Write a report only when something was flagged
if extreme_values_df.empty:
    logging.info("No suspicious extreme values detected.")
else:
    extreme_values_df.to_csv("CSV/exports/CRI-02/o1_impute_baselines/01_iterative/extreme_values_report_small_test.csv", index=False)
    logging.info("Extreme values report saved.")

In [None]:
# Impute o4_X_train with the ExtraTrees estimator (at most 20 imputer iterations).
# NOTE(review): this repeats an earlier cell of the notebook with the same
# arguments, and the input is o4_X_train while the output file is named
# o3_X_train — confirm both are intended.
impute_with_iterative(
    o4_X_train,
    "CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o3_X_train_imputed_Iterative_ExtraTrees.csv",
    "ExtraTrees",
    20,
)

In [None]:
# Impute o4_X_train with the BayesianRidge estimator (at most 20 imputer iterations).
# NOTE(review): this repeats an earlier cell of the notebook with the same
# arguments, and the input is o4_X_train while the output file is named
# o3_X_train — confirm both are intended.
impute_with_iterative(
    o4_X_train,
    "CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o3_X_train_imputed_Iterative_BayesianRidge.csv",
    "BayesianRidge",
    20,
)