# CIR-2: Baseline Imputation Methods

In [1]:
import pandas as pd
import numpy as np
import os
import io
import logging
import time
import datetime
import random
import matplotlib.pyplot as plt
import sys
import re

from tqdm import tqdm

from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.ensemble import ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import BayesianRidge

In [2]:
# Initial logger setup
# The root logger is configured so the bare logging.info(...) calls used
# throughout the notebook share the same handlers as `logger`.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Global variable to hold the active file handler.
# Keep a handler left over from an earlier run of this cell so that
# switch_log_file() can still detach and close it instead of stacking a second one.
current_file_handler = globals().get("current_file_handler")

# Create the stream handler
STREAM_HANDLER_NAME = "cir2_notebook_stream"
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
stream_handler = logging.StreamHandler()
stream_handler.set_name(STREAM_HANDLER_NAME)
stream_handler.setFormatter(formatter)

# Re-running this cell must not add a second console handler, otherwise every
# message is printed once per run of the cell.
for _stale_handler in [h for h in logger.handlers if h.get_name() == STREAM_HANDLER_NAME]:
    logger.removeHandler(_stale_handler)
logger.addHandler(stream_handler)

def switch_log_file(filename):
    """Send root-logger file output to ``filename`` instead of the current log file.

    The parent directory of ``filename`` is created when missing. The new file is
    opened before the previous handler is detached, so a failure to open it
    leaves logging to the previous file intact.

    Args:
        filename: Path of the log file to write to (opened by ``logging.FileHandler``).
    """
    global current_file_handler

    # FileHandler does not create missing directories; a bare filename has no
    # directory part and os.makedirs("") would raise.
    log_dir = os.path.dirname(filename)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Create the new file handler first
    new_file_handler = logging.FileHandler(filename)
    new_file_handler.setFormatter(formatter)

    # If a file handler already exists, remove and close it
    if current_file_handler:
        logger.removeHandler(current_file_handler)
        current_file_handler.close()

    current_file_handler = new_file_handler
    logger.addHandler(current_file_handler)

    logger.info(f"Switched logging to {filename}")

In [3]:
# Route file logging for the CIR-2 section to its own log file
switch_log_file(filename='logs/CIR-2.log')
logger.info("This is being logged to CIR-2.log")

2025-05-03 15:36:37,524 - INFO - Switched logging to logs/CIR-2.log
2025-05-03 15:36:37,526 - INFO - This is being logged to CIR-2.log


In [4]:
# Load datasets
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
# os.listdir() returns entries in arbitrary order; sort so the load order (and
# the log) is the same on every machine and every run.
all_files = sorted(os.listdir(data_path))

logging.info("+++++++++++++++++CIR-2+++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Load CSVs into a dictionary of dataframes
dataframes = {}
for file in all_files:
    # Strip only the trailing extension; a ".csv" elsewhere in the name is kept.
    stem, extension = os.path.splitext(file)
    if extension != ".csv":
        continue
    var_name = stem.replace("-", "_")
    logging.info(f"Loading... -> {file}")
    # Every column is cast to float32, so the CSVs must be fully numeric.
    dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Log loaded datasets
# Each frame is also published as a notebook-level variable named after its file
# (e.g. o4_X_train); later cells refer to those names directly.
for var_name, df in dataframes.items():
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-05-03 15:36:37,539 - INFO - +++++++++++++++++CIR-2+++++++++++++++++++++++++
2025-05-03 15:36:37,540 - INFO - Start Loading Dataframes.
2025-05-03 15:36:37,541 - INFO - Loading... -> o1_X_external.csv
2025-05-03 15:36:44,617 - INFO - Loading... -> o1_X_test.csv
2025-05-03 15:36:45,125 - INFO - Loading... -> o1_X_train.csv
2025-05-03 15:36:49,244 - INFO - Loading... -> o1_X_validate.csv
2025-05-03 15:36:49,841 - INFO - Loading... -> o1_y_external_los.csv
2025-05-03 15:36:49,892 - INFO - Loading... -> o1_y_external_mortality.csv
2025-05-03 15:36:49,924 - INFO - Loading... -> o1_y_test_los.csv
2025-05-03 15:36:49,935 - INFO - Loading... -> o1_y_test_mortality.csv
2025-05-03 15:36:49,939 - INFO - Loading... -> o1_y_train_los.csv
2025-05-03 15:36:49,980 - INFO - Loading... -> o1_y_train_mortality.csv
2025-05-03 15:36:49,998 - INFO - Loading... -> o1_y_validate_los.csv
2025-05-03 15:36:50,008 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-05-03 15:36:50,014 - INFO - Loading... -

# CIR-23: Implement Iterative Imputer

In [6]:
# Tee class to duplicate stdout
class Tee:
    """Write-only, file-like fan-out: everything written is mirrored to several streams."""

    def __init__(self, *files):
        # Target streams, kept in the order they were passed.
        self.files = files

    def flush(self):
        """Flush every target stream."""
        for target in self.files:
            target.flush()

    def write(self, obj):
        """Write ``obj`` to each target in turn, flushing each one right after its write."""
        for target in self.files:
            target.write(obj)
            target.flush()

def impute_with_iterative(input_df, method, output_path, n_iter, log_verbose_file_path=None):
    """Impute missing values with scikit-learn's IterativeImputer and save the result.

    Args:
        input_df: DataFrame to impute; it is copied, never modified.
        method: "ExtraTrees" or "HistGradientBoosting" - the regressor used by the imputer.
        output_path: CSV path for the imputed data. A "<name>_describe.csv" file with
            the describe() summary is written next to it.
        n_iter: Passed to IterativeImputer as ``max_iter``.
        log_verbose_file_path: Optional file that additionally receives everything
            printed to stdout while fitting (the imputer runs with verbose=2). The
            file is opened in append mode, so it may be the same file a logging
            FileHandler is already writing to without truncating it.

    Returns:
        The imputed DataFrame (same column labels as ``input_df``, fresh RangeIndex).

    Raises:
        ValueError: If ``method`` is not one of the supported names. Raised before
            anything is created on disk.

    NOTE(review): scikit-learn imputers by default drop columns that are entirely
    NaN at fit time; the column labels would then no longer match - confirm the
    input has no all-NaN columns.
    """
    logging.info(f"Starting Iterative Imputer with method={method} on input DataFrame of shape {input_df.shape}.")

    # Estimator selection (done first so an invalid method has no side effects)
    if method == "ExtraTrees":
        estimator = ExtraTreesRegressor(n_estimators=10, random_state=0, n_jobs=-1)
    elif method == "HistGradientBoosting":
        estimator = HistGradientBoostingRegressor(random_state=0)
    else:
        raise ValueError(f"Unsupported method: {method}. Use 'ExtraTrees', or 'HistGradientBoosting'.")

    data_copy = input_df.copy()

    # A bare filename has no directory part; os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # IterativeImputer
    imputer = IterativeImputer(
        estimator=estimator,
        max_iter=n_iter,
        random_state=0,
        verbose=2,
        sample_posterior=False
    )

    start_time = time.time()

    # Handle logging duplication
    if log_verbose_file_path is not None:
        log_dir = os.path.dirname(log_verbose_file_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        original_stdout = sys.stdout  # Save original stdout
        # Append: this path may be the logger's own log file, which "w" would truncate.
        with open(log_verbose_file_path, "a") as log_file:
            # Mirror to the stdout that is active right now (e.g. the notebook
            # cell output). sys.__stdout__ would bypass it and can be None.
            sys.stdout = Tee(original_stdout, log_file)

            try:
                # Fit and transform
                imputed_array = imputer.fit_transform(data_copy)
            finally:
                sys.stdout = original_stdout  # Restore stdout
    else:
        # If no logging redirection
        imputed_array = imputer.fit_transform(data_copy)

    runtime = time.time() - start_time

    # Save imputed data
    imputed_df = pd.DataFrame(imputed_array, columns=data_copy.columns)
    imputed_df.to_csv(output_path, index=False)

    # describe() is computed once and reused for the log and the CSV
    summary_stats = imputed_df.describe()

    # Logging
    logging.info(f"Imputation completed in {runtime:.2f} seconds.")
    nan_count = np.isnan(imputed_df.values).sum()
    logging.info(f"Number of NaNs after imputation: {nan_count}")
    logging.info(f"Imputed dataset saved at {output_path}")
    logging.info(f"Basic statistics after imputation:\n{summary_stats}")

    # Save full describe output to a separate CSV file
    # The logging.info could not represent all the statistics
    describe_output_path = output_path.replace(".csv", "_describe.csv")
    summary_stats.to_csv(describe_output_path)
    logging.info(f"Basic statistics saved at {describe_output_path}")

    return imputed_df

In [9]:
# Create log file
switch_log_file(filename='logs/CIR-23_ExtraTrees.log')
logger.info("This is being logged to CIR-23_ExtraTrees.log")

# ExtraTrees estimator
# The imputer's verbose output is sent to the same file the logger was just switched to.
impute_with_iterative(
    input_df=o4_X_train,
    method="ExtraTrees",
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o4_X_train_Iterative_ExtraTrees.csv",
    n_iter=5,
    log_verbose_file_path="logs/CIR-23_ExtraTrees.log",
)

2025-05-03 16:55:09,688 - INFO - Switched logging to logs/CIR-23_ExtraTrees.log
2025-05-03 16:55:09,690 - INFO - This is being logged to CIR-23_ExtraTrees.log
2025-05-03 16:55:09,691 - INFO - Starting Iterative Imputer with method=ExtraTrees on input DataFrame of shape (30624, 345).
2025-05-03 18:39:40,980 - INFO - Imputation completed in 6262.70 seconds.
2025-05-03 18:39:41,009 - INFO - Number of NaNs after imputation: 0
2025-05-03 18:39:41,011 - INFO - Imputed dataset saved at CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o4_X_train_Iterative_ExtraTrees.csv
2025-05-03 18:39:42,229 - INFO - Basic statistics after imputation:
                age  Alanine_Aminotransferase_(ALT)_(Max)  \
count  30624.000000                          30624.000000   
mean      69.297020                             75.925926   
std       14.930233                            221.373642   
min       18.000000                              1.000000   
25%       60.000000                             29.0000

Unnamed: 0,age,Alanine_Aminotransferase_(ALT)_(Max),Alanine_Aminotransferase_(ALT)_(Mean),Alanine_Aminotransferase_(ALT)_(Median),Alanine_Aminotransferase_(ALT)_(Min),Albumin_(Max),Albumin_(Mean),Albumin_(Median),Albumin_(Min),Alkaline_Phosphatase_(Max),...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,51.0,46.0,46.0,46.0,46.0,2.90,2.900,2.900,2.90,113.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,51.0,46.0,46.0,46.0,46.0,2.90,2.900,2.900,2.90,113.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,51.0,46.0,46.0,46.0,46.0,2.90,2.900,2.900,2.90,113.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,51.0,46.0,46.0,46.0,46.0,2.90,2.900,2.900,2.90,113.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,51.0,46.0,46.0,46.0,46.0,2.90,2.900,2.900,2.90,113.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30619,57.0,63.0,63.0,63.0,63.0,3.66,3.565,3.585,3.47,106.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
30620,57.0,63.0,63.0,63.0,63.0,3.65,3.565,3.590,3.48,106.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
30621,57.0,63.0,63.0,63.0,63.0,3.65,3.535,3.555,3.37,106.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
30622,57.0,63.0,63.0,63.0,63.0,3.69,3.570,3.575,3.45,106.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Create log file
switch_log_file(filename='logs/CIR-23_HistGradientBoosting.log')
logger.info("This is being logged to CIR-23_HistGradientBoosting.log")

# HistGradientBoosting estimator
# The imputer's verbose output is sent to the same file the logger was just switched to.
impute_with_iterative(
    input_df=o4_X_train,
    method="HistGradientBoosting",
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o4_X_train_Iterative_HistGradientBoosting.csv",
    n_iter=5,
    log_verbose_file_path="logs/CIR-23_HistGradientBoosting.log",
)

2025-05-03 18:39:44,959 - INFO - Switched logging to logs/CIR-23_HistGradientBoosting.log
2025-05-03 18:39:44,961 - INFO - This is being logged to CIR-23_HistGradientBoosting.log
2025-05-03 18:39:44,963 - INFO - Starting Iterative Imputer with method=HistGradientBoosting on input DataFrame of shape (30624, 345).
2025-05-03 21:20:16,706 - INFO - Imputation completed in 9622.88 seconds.
2025-05-03 21:20:16,726 - INFO - Number of NaNs after imputation: 0
2025-05-03 21:20:16,727 - INFO - Imputed dataset saved at CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o4_X_train_Iterative_HistGradientBoosting.csv
2025-05-03 21:20:17,549 - INFO - Basic statistics after imputation:
                age  Alanine_Aminotransferase_(ALT)_(Max)  \
count  30624.000000                          30624.000000   
mean      69.297020                             77.049240   
std       14.930233                            221.807205   
min       18.000000                              1.000000   
25%       60.00

Unnamed: 0,age,Alanine_Aminotransferase_(ALT)_(Max),Alanine_Aminotransferase_(ALT)_(Mean),Alanine_Aminotransferase_(ALT)_(Median),Alanine_Aminotransferase_(ALT)_(Min),Albumin_(Max),Albumin_(Mean),Albumin_(Median),Albumin_(Min),Alkaline_Phosphatase_(Max),...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,51.0,46.0,46.0,46.0,46.0,2.900000,2.900000,2.900000,2.900000,113.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,51.0,46.0,46.0,46.0,46.0,2.900000,2.900000,2.900000,2.900000,113.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,51.0,46.0,46.0,46.0,46.0,2.900000,2.900000,2.900000,2.900000,113.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,51.0,46.0,46.0,46.0,46.0,2.900000,2.900000,2.900000,2.900000,113.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,51.0,46.0,46.0,46.0,46.0,2.900000,2.900000,2.900000,2.900000,113.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30619,57.0,63.0,63.0,63.0,63.0,3.490728,3.451212,3.451151,3.446352,106.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
30620,57.0,63.0,63.0,63.0,63.0,3.491351,3.451212,3.451133,3.447935,106.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
30621,57.0,63.0,63.0,63.0,63.0,3.490657,3.451212,3.451161,3.446273,106.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
30622,57.0,63.0,63.0,63.0,63.0,3.491307,3.451212,3.451150,3.447241,106.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# CIR-24: Implement KNN Imputer

In [11]:
# Route file logging for the KNN section (CIR-24) to its own log file
switch_log_file(filename='logs/CIR-24.log')
logger.info("This is being logged to CIR-24.log")

2025-05-03 21:20:19,277 - INFO - Switched logging to logs/CIR-24.log
2025-05-03 21:20:19,279 - INFO - This is being logged to CIR-24.log


In [12]:
def impute_with_knn(input_df, output_path, n_neighbors=5, weights="uniform"):
    """Impute missing values with scikit-learn's KNNImputer and save the result.

    Args:
        input_df: DataFrame to impute; it is copied, never modified.
        output_path: CSV path for the imputed data. A "<name>_describe.csv" file with
            the describe() summary is written next to it.
        n_neighbors: Passed to KNNImputer.
        weights: Passed to KNNImputer ("uniform" or "distance" in this notebook).

    Returns:
        The imputed DataFrame, cast to float32 (same column labels as ``input_df``,
        fresh RangeIndex).

    NOTE(review): scikit-learn imputers by default drop columns that are entirely
    NaN at fit time; the column labels would then no longer match - confirm the
    input has no all-NaN columns.
    """
    logging.info(f"Starting KNN Imputer on input DataFrame of shape {input_df.shape}.")

    # A bare filename has no directory part; os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    data_copy = input_df.copy()

    # Time only the fit/transform step
    start_time = time.time()

    # Create and apply KNNImputer
    imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
    imputed_array = imputer.fit_transform(data_copy)

    runtime = time.time() - start_time

    # Convert to DataFrame and cast to float32
    imputed_df = pd.DataFrame(imputed_array, columns=data_copy.columns).astype(np.float32)
    imputed_df.to_csv(output_path, index=False)

    # Logging
    logging.info(f"KNN Imputation completed in {runtime:.2f} seconds.")
    nan_count = np.isnan(imputed_df.values).sum()
    logging.info(f"Number of NaNs after imputation: {nan_count}")
    logging.info(f"Imputed dataset saved at {output_path}")

    # Save .describe() summary
    describe_output_path = output_path.replace(".csv", "_describe.csv")
    imputed_df.describe().to_csv(describe_output_path)
    logging.info(f"Basic statistics saved at {describe_output_path}")

    return imputed_df

2025-05-03 21:20:19,297 - INFO - Starting KNN Imputer on input DataFrame of shape (30624, 345).
2025-05-03 21:53:24,836 - INFO - KNN Imputation completed in 1976.56 seconds.
2025-05-03 21:53:24,855 - INFO - Number of NaNs after imputation: 0
2025-05-03 21:53:24,857 - INFO - Imputed dataset saved at CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_distance.csv
2025-05-03 21:53:25,685 - INFO - Basic statistics saved at CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_distance_describe.csv


In [13]:
# Run KNN Imputer with weights = uniform
# (5 neighbours, each contributing equally to the imputed value)
o4_X_train_knn_imputed = impute_with_knn(
    o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_uniform.csv",
    weights="uniform",
    n_neighbors=5,
)

2025-05-03 21:53:25,701 - INFO - Starting KNN Imputer on input DataFrame of shape (30624, 345).
2025-05-03 22:26:30,940 - INFO - KNN Imputation completed in 1976.92 seconds.
2025-05-03 22:26:30,961 - INFO - Number of NaNs after imputation: 0
2025-05-03 22:26:30,964 - INFO - Imputed dataset saved at CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_distance.csv
2025-05-03 22:26:31,774 - INFO - Basic statistics saved at CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_distance_describe.csv


In [15]:
# Run KNN Imputer with weights = distance
# Rebinds o4_X_train_knn_imputed, replacing the uniform-weights result held in memory.
o4_X_train_knn_imputed = impute_with_knn(
    o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_distance.csv",
    weights="distance",
    n_neighbors=5,
)

2025-05-03 22:56:30,181 - INFO - Starting KNN Imputer on input DataFrame of shape (30624, 345).
2025-05-03 23:29:56,961 - INFO - KNN Imputation completed in 1997.99 seconds.
2025-05-03 23:29:56,982 - INFO - Number of NaNs after imputation: 0
2025-05-03 23:29:56,983 - INFO - Imputed dataset saved at CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_distance.csv
2025-05-03 23:29:57,803 - INFO - Basic statistics saved at CSV/exports/CRI-02/o1_impute_baselines/02_knn/o4_X_train_KNN_distance_describe.csv


# CIR-25: Implement Mean and Median Imputers

In [16]:
# Route file logging for the mean/median section (CIR-25) to its own log file
switch_log_file(filename='logs/CIR-25.log')
logger.info("This is being logged to CIR-25.log")

2025-05-03 23:29:57,824 - INFO - Switched logging to logs/CIR-25.log
2025-05-03 23:29:57,826 - INFO - This is being logged to CIR-25.log


In [17]:
def impute_with_simple(input_df, strategy, output_path):
    """Impute missing values with scikit-learn's SimpleImputer and save the result.

    Args:
        input_df: DataFrame to impute; it is copied, never modified.
        strategy: Passed to SimpleImputer ("mean" or "median" in this notebook).
        output_path: CSV path for the imputed data. A "<name>_describe.csv" file with
            the describe() summary is written next to it.

    Returns:
        Tuple ``(imputed_df, runtime)``: the imputed DataFrame cast to float32 and
        the fit/transform time in seconds.

    NOTE(review): scikit-learn imputers by default drop columns that are entirely
    NaN at fit time; the column labels would then no longer match - confirm the
    input has no all-NaN columns.
    """
    logging.info(f"Starting {strategy.capitalize()} Imputer on input DataFrame of shape {input_df.shape}.")

    # A bare filename has no directory part; os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    data_copy = input_df.copy()

    # Time only the fit/transform step
    start_time = time.time()

    imputer = SimpleImputer(strategy=strategy)
    imputed_array = imputer.fit_transform(data_copy)

    runtime = time.time() - start_time

    imputed_df = pd.DataFrame(imputed_array, columns=data_copy.columns).astype(np.float32)
    imputed_df.to_csv(output_path, index=False)

    logging.info(f"{strategy.capitalize()} Imputation completed in {runtime:.2f} seconds.")
    logging.info(f"Number of NaNs after imputation: {np.isnan(imputed_df.values).sum()}")
    logging.info(f"Imputed dataset saved at {output_path}")

    # Save statistics
    describe_output_path = output_path.replace(".csv", "_describe.csv")
    imputed_df.describe().to_csv(describe_output_path)
    logging.info(f"Basic statistics saved at {describe_output_path}")

    return imputed_df, runtime

In [18]:
# Mean Imputation
mean_imputed, runtime_mean = impute_with_simple(
    o4_X_train,
    strategy="mean",
    output_path="CSV/exports/CRI-02/o1_impute_baselines/03_mean/o4_X_train_Mean.csv",
)

# Median Imputation
median_imputed, runtime_median = impute_with_simple(
    o4_X_train,
    strategy="median",
    output_path="CSV/exports/CRI-02/o1_impute_baselines/04_median/o4_X_train_Median.csv",
)


2025-05-03 23:29:57,856 - INFO - Starting Mean Imputer on input DataFrame of shape (30624, 345).
2025-05-03 23:30:07,244 - INFO - Mean Imputation completed in 0.27 seconds.
2025-05-03 23:30:07,263 - INFO - Number of NaNs after imputation: 0
2025-05-03 23:30:07,264 - INFO - Imputed dataset saved at CSV/exports/CRI-02/o1_impute_baselines/03_mean/o4_X_train_Mean.csv
2025-05-03 23:30:08,013 - INFO - Basic statistics saved at CSV/exports/CRI-02/o1_impute_baselines/03_mean/o4_X_train_Mean_describe.csv
2025-05-03 23:30:08,021 - INFO - Starting Median Imputer on input DataFrame of shape (30624, 345).
2025-05-03 23:30:17,099 - INFO - Median Imputation completed in 0.93 seconds.
2025-05-03 23:30:17,117 - INFO - Number of NaNs after imputation: 0
2025-05-03 23:30:17,118 - INFO - Imputed dataset saved at CSV/exports/CRI-02/o1_impute_baselines/04_median/o4_X_train_Median.csv
2025-05-03 23:30:17,880 - INFO - Basic statistics saved at CSV/exports/CRI-02/o1_impute_baselines/04_median/o4_X_train_Median

# Histogram Plot

In [19]:
# Route file logging for the plotting section to a log file
# NOTE(review): this targets CIR-24.log, the same file used by the KNN section,
# and the plotting loop further down calls switch_log_file() itself for every
# method - confirm whether this cell is intended or a copy-paste leftover.
switch_log_file(filename='logs/CIR-24.log')
logger.info("This is being logged to CIR-24.log")

2025-05-03 23:30:17,896 - INFO - Switched logging to logs/CIR-24.log
2025-05-03 23:30:17,898 - INFO - This is being logged to CIR-24.log


In [20]:
def sanitize_filename(name):
    """Return ``name`` with each of the characters \\ / * ? : " < > | replaced by "_"."""
    unsafe_chars = re.compile(r'[\\/*?:"<>|]')
    return unsafe_chars.sub("_", name)

def plot_before_after_distributions(input_df_before, input_df_after, output_folder, sample_features=None, random_state=None):
    """Save one before/after histogram PNG per feature into ``output_folder``.

    Args:
        input_df_before: DataFrame before imputation; its columns define the features.
        input_df_after: DataFrame after imputation; must contain the same column labels.
        output_folder: Directory for the PNG files (created when missing).
        sample_features: None to plot every feature, otherwise the number of
            features to draw at random (capped at the number of columns).
        random_state: Optional seed for the random draw so the same subset is
            plotted on every run. None keeps the unseeded module-level sampling.

    Files are named "<sanitized feature>_distribution_comparison.png".
    """
    os.makedirs(output_folder, exist_ok=True)

    features = list(input_df_before.columns)

    # If sample_features is None -> use all features
    if sample_features is None:
        sampled_features = features
    else:
        sampler = random if random_state is None else random.Random(random_state)
        sampled_features = sampler.sample(features, min(sample_features, len(features)))

    logging.info(f"Plotting distributions for features: {sampled_features}")

    for feature in sampled_features:
        # Explicit figure/axes instead of the pyplot state machine; the finally
        # block closes the figure even when plotting or saving fails, so a
        # failure part-way through a long run does not leak open figures.
        fig, ax = plt.subplots(figsize=(10, 5))
        try:
            ax.hist(input_df_before[feature].dropna(), bins=50, alpha=0.5, label='Before Imputation')
            ax.hist(input_df_after[feature].dropna(), bins=50, alpha=0.5, label='After Imputation')

            ax.set_title(f'Distribution of {feature}')
            ax.legend()
            ax.set_xlabel('Value')
            ax.set_ylabel('Frequency')
            fig.tight_layout()

            # Use sanitized feature name for the filename
            safe_feature_name = sanitize_filename(feature)
            plot_path = os.path.join(output_folder, f"{safe_feature_name}_distribution_comparison.png")

            fig.savefig(plot_path)
        finally:
            plt.close(fig)

    logging.info(f"Distribution plots saved in {output_folder}")


def check_extreme_values(input_df_before, input_df_after, threshold=5.0):
    """
    Check for extreme outliers introduced after imputation.

    For every column of ``input_df_before`` the column maximum and minimum are
    compared before and after imputation. An extremum is reported when the ratio
    after/before is greater than ``threshold`` or smaller than ``1/threshold``
    (the latter also covers a sign flip). For positive baselines this is the same
    test as "after > threshold * before or after < before / threshold"; using the
    ratio keeps unchanged negative extrema from being reported.

    A baseline extremum of exactly 0 is never reported (the ratio is undefined),
    and NaN extrema never compare as suspicious.

    Args:
        input_df_before: DataFrame before imputation.
        input_df_after: DataFrame after imputation; must contain the same column labels.
        threshold: Allowed multiplicative change; must be positive.

    Returns:
        DataFrame with columns Feature, Type ('max' or 'min'), Before_Value,
        After_Value - one row per suspicious extremum, empty when none.

    Raises:
        ValueError: If ``threshold`` is not positive.
    """
    if threshold <= 0:
        raise ValueError(f"threshold must be positive, got {threshold}")

    suspicious_features = []

    for feature in input_df_before.columns:
        before_max = input_df_before[feature].max()
        before_min = input_df_before[feature].min()
        after_max = input_df_after[feature].max()
        after_min = input_df_after[feature].min()

        if _extreme_ratio_out_of_band(before_max, after_max, threshold):
            suspicious_features.append((feature, 'max', before_max, after_max))
        if _extreme_ratio_out_of_band(before_min, after_min, threshold):
            suspicious_features.append((feature, 'min', before_min, after_min))

    suspicious_df = pd.DataFrame(
        suspicious_features,
        columns=["Feature", "Type", "Before_Value", "After_Value"]
    )

    if not suspicious_df.empty:
        logging.warning(f"Found {len(suspicious_df)} suspicious extreme values after imputation!")

    return suspicious_df


def _extreme_ratio_out_of_band(before_value, after_value, threshold):
    """Return True when after/before lies outside [1/threshold, threshold]; a zero baseline is never flagged."""
    if before_value == 0:
        return False
    ratio = after_value / before_value
    return bool(ratio > threshold or ratio < 1.0 / threshold)

In [22]:
# Define your base dataset and setup logger
dataset = o4_X_train
dataset_name = "o4_X_train"

# One entry per imputed CSV: label used in output names, file path relative to
# the baselines folder, and the log file (without extension) to write to.
imputation_methods = [
    {
        "method": "ExtraTrees",
        "file": "01_iterative/o4_X_train_Iterative_ExtraTrees.csv",
        "log": "CIR-27"
    },
    {
        "method": "HistGradientBoosting",
        "file": "01_iterative/o4_X_train_Iterative_HistGradientBoosting.csv",
        "log": "CIR-27"
    },
    {
        "method": "knn_distance",
        "file": "02_knn/o4_X_train_KNN_distance.csv",
        "log": "CIR-24"
    },
    {
        "method": "knn_uniform",
        "file": "02_knn/o4_X_train_KNN_uniform.csv",
        "log": "CIR-24"
    },
    {
        "method": "Mean",
        "file": "03_mean/o4_X_train_Mean.csv",
        "log": "CIR-25"
    },
    {
        "method": "Median",
        "file": "04_median/o4_X_train_Median.csv",
        "log": "CIR-25"
    }
]

# The report folder is the same for every method, so create it once up front.
output_folder = "CSV/exports/CRI-02/00_extreme_values/"
os.makedirs(output_folder, exist_ok=True)

for impute in imputation_methods:
    method_imputed = impute["method"]
    log_file = f'logs/{impute["log"]}.log'
    switch_log_file(log_file)
    logger.info(f"This is being logged to {impute['log']}.log")

    # Load imputed data
    file_path = f"CSV/exports/CRI-02/o1_impute_baselines/{impute['file']}"
    data_imputed = pd.read_csv(file_path)

    # Plot distributions before vs after
    plot_before_after_distributions(
        input_df_before=dataset,
        input_df_after=data_imputed,
        output_folder=f"figures/CRI-27/o1_impute_baselines/{os.path.dirname(impute['file'])}/{method_imputed}/"
    )

    # Check for extreme values
    extreme_values_df = check_extreme_values(
        input_df_before=dataset,
        input_df_after=data_imputed,
        threshold=5.0
    )

    # Save extreme values report
    # The report is written even when it is empty (header only): otherwise a
    # report left by an earlier run would stay on disk and be counted as if it
    # described the current imputed data.
    output_path = os.path.join(output_folder, f"{dataset_name}_{method_imputed}_extreme_values.csv")
    extreme_values_df.to_csv(output_path, index=False)

    if not extreme_values_df.empty:
        logging.info("Extreme values report saved.")
    else:
        logging.info("No suspicious extreme values detected.")

    logging.info("+++++++++++++++++++++++++++++++++++++++++++")


2025-05-03 23:43:47,969 - INFO - Switched logging to logs/CIR-27.log
2025-05-03 23:43:47,970 - INFO - This is being logged to CIR-27.log
2025-05-03 23:43:49,328 - INFO - Plotting distributions for features: ['age', 'Alanine_Aminotransferase_(ALT)_(Max)', 'Alanine_Aminotransferase_(ALT)_(Mean)', 'Alanine_Aminotransferase_(ALT)_(Median)', 'Alanine_Aminotransferase_(ALT)_(Min)', 'Albumin_(Max)', 'Albumin_(Mean)', 'Albumin_(Median)', 'Albumin_(Min)', 'Alkaline_Phosphatase_(Max)', 'Alkaline_Phosphatase_(Mean)', 'Alkaline_Phosphatase_(Median)', 'Alkaline_Phosphatase_(Min)', 'Ammonia_(Max)', 'Ammonia_(Mean)', 'Ammonia_(Median)', 'Ammonia_(Min)', 'Amylase_(Max)', 'Amylase_(Mean)', 'Amylase_(Median)', 'Amylase_(Min)', 'Anion_Gap_(Max)', 'Anion_Gap_(Mean)', 'Anion_Gap_(Median)', 'Anion_Gap_(Min)', 'Arterial_Blood_Pressure_diastolic_(mmHg)_(Max)', 'Arterial_Blood_Pressure_diastolic_(mmHg)_(Mean)', 'Arterial_Blood_Pressure_diastolic_(mmHg)_(Median)', 'Arterial_Blood_Pressure_diastolic_(mmHg)_(Min)

# Comparison
## Compare imputation techniques

In [None]:
# Locations of the imputed CSVs written by the sections above
baseline_dir = os.path.join("CSV", "exports", "CRI-02", "o1_impute_baselines")
mean_path = os.path.join(baseline_dir, "03_mean", "o4_X_train_Mean.csv")
median_path = os.path.join(baseline_dir, "04_median", "o4_X_train_Median.csv")
knn_distance = os.path.join(baseline_dir, "02_knn", "o4_X_train_KNN_distance.csv")
# Previously this pointed at the KNN_distance file, so the "uniform" frame was a
# second copy of the distance-weighted result.
knn_uniform = os.path.join(baseline_dir, "02_knn", "o4_X_train_KNN_uniform.csv")
iter_extratrees_path = os.path.join(baseline_dir, "01_iterative", "o4_X_train_Iterative_ExtraTrees.csv")
iter_hgb_path = os.path.join(baseline_dir, "01_iterative", "o4_X_train_Iterative_HistGradientBoosting.csv")

# Load imputed datasets
# The original (un-imputed) o4_X_train is not reloaded here; it comes from the
# "Load datasets" cell at the top of the notebook.
o4_X_train_Mean = pd.read_csv(mean_path).astype(np.float32)
o4_X_train_Median = pd.read_csv(median_path).astype(np.float32)
o4_X_train_KNN_distance = pd.read_csv(knn_distance).astype(np.float32)
o4_X_train_KNN_uniform = pd.read_csv(knn_uniform).astype(np.float32)
o4_X_train_Iterative_ExtraTrees = pd.read_csv(iter_extratrees_path).astype(np.float32)
o4_X_train_Iterative_HistGradientBoosting = pd.read_csv(iter_hgb_path).astype(np.float32)

In [None]:
def summarize_imputation_results(original_df, imputed_df, method_name, extreme_values_path):
    """Summarize one imputation method as a dict suitable for a results table.

    ``mean_change`` measures how far the imputed values sit from each feature's
    observed mean: for every cell that was NaN in ``original_df`` the absolute
    difference between the imputed value and that column's mean over observed
    values is taken, averaged per feature and then across features that had
    missing cells. (Subtracting ``original_df`` itself cannot work here: it is NaN
    at exactly the cells of interest, which made the metric always NaN.) The
    values are on each feature's raw scale, so large-valued features dominate.

    Args:
        original_df: DataFrame before imputation (contains NaNs).
        imputed_df: DataFrame after imputation, with the same index and column labels.
        method_name: Label stored under "method".
        extreme_values_path: CSV report of suspicious extrema; its row count is
            reported. A missing file counts as 0 and logs a warning.

    Returns:
        dict with keys "method", "nan_count", "mean_change" (rounded to 4
        decimals; NaN when ``original_df`` has no missing cells) and
        "n_extreme_values".
    """
    # Detect where original data had NaNs
    mask_missing = original_df.isna()

    # Deviation of imputed values from the observed column mean, kept only at
    # the originally-missing cells (the Series is aligned on column labels).
    observed_mean = original_df.mean()
    deviation = (imputed_df - observed_mean).abs()
    mean_change = float(deviation.where(mask_missing).mean().mean())

    # Count remaining NaNs
    nan_count = np.isnan(imputed_df.values).sum()

    # Load extreme values file and count
    if os.path.exists(extreme_values_path):
        extreme_df = pd.read_csv(extreme_values_path)
        n_extreme = len(extreme_df)
    else:
        n_extreme = 0
        logging.warning(f"Extreme values file not found for method {method_name}: {extreme_values_path}")

    return {
        "method": method_name,
        "nan_count": nan_count,
        "mean_change": round(mean_change, 4),
        "n_extreme_values": n_extreme
    }


In [None]:
base_extreme_dir = os.path.join("CSV", "exports", "CRI-02", "00_extreme_values")

# (imputed frame, label shown in the results table, extreme-values report file)
comparison_specs = [
    (o4_X_train_Mean, "Mean", "o4_X_train_Mean_extreme_values.csv"),
    (o4_X_train_Median, "Median", "o4_X_train_Median_extreme_values.csv"),
    (o4_X_train_KNN_distance, "KNN (distance)", "o4_X_train_knn_distance_extreme_values.csv"),
    (o4_X_train_KNN_uniform, "KNN (uniform)", "o4_X_train_knn_uniform_extreme_values.csv"),
    (o4_X_train_Iterative_ExtraTrees, "Iterative (ExtraTrees)", "o4_X_train_ExtraTrees_extreme_values.csv"),
    (
        o4_X_train_Iterative_HistGradientBoosting,
        "Iterative (HistGradientBoosting)",
        "o4_X_train_HistGradientBoosting_extreme_values.csv",
    ),
]

# One summary dict per method, each compared against the un-imputed o4_X_train
results = [
    summarize_imputation_results(
        o4_X_train, imputed_frame, label,
        os.path.join(base_extreme_dir, report_file)
    )
    for imputed_frame, label, report_file in comparison_specs
]


In [None]:
# Build the comparison table from the per-method summaries, ordered by
# mean change (smallest first) with a fresh 0..n-1 index
results_df = (
    pd.DataFrame(results)
    .sort_values(by="mean_change", ascending=True)
    .reset_index(drop=True)
)

# Display
display(results_df)