# CIR-2 Baseline Imputation Methods

In [1]:
import pandas as pd
import numpy as np
import os
import logging
import time

from tqdm import tqdm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge

In [2]:
# Set up logging
# logging.FileHandler opens its target file as soon as it is constructed and
# does not create missing parent directories, so make sure the log folder
# exists before building the handler (otherwise a fresh checkout fails with
# FileNotFoundError on this cell).
os.makedirs("logs", exist_ok=True)

# Send every INFO-and-above record both to the log file and to the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("logs/CIR-22.log"),
        logging.StreamHandler()
    ]
)

In [None]:
# CSVs Directory
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
# os.listdir returns entries in arbitrary (filesystem-dependent) order; sort
# them so the load order and the resulting log are identical on every run.
all_files = sorted(os.listdir(data_path))

logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Load CSVs into a dictionary of dataframes, keyed by the file name without
# its ".csv" suffix and with "-" replaced by "_" (so the key can double as a
# Python variable name below). Non-CSV entries in the folder are ignored.
dataframes = {}
for file in all_files:
    if file.endswith(".csv"):
        var_name = file.replace(".csv", "").replace("-", "_")
        logging.info(f"Loading... -> {file}")
        # Every column is cast to float32; this raises if a CSV contains a
        # column that cannot be converted to a float.
        dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Log loaded datasets
# Each dataframe is also published as a notebook-level variable named after
# its key; later cells refer to the frames directly by these names.
for var_name, df in dataframes.items():
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-04-27 16:33:53,141 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-04-27 16:33:53,144 - INFO - Start Loading Dataframes.
2025-04-27 16:33:53,146 - INFO - Loading... -> o1_X_external.csv
2025-04-27 16:34:00,392 - INFO - Loading... -> o1_X_test.csv
2025-04-27 16:34:00,910 - INFO - Loading... -> o1_X_train.csv
2025-04-27 16:34:04,959 - INFO - Loading... -> o1_X_validate.csv
2025-04-27 16:34:05,501 - INFO - Loading... -> o1_y_external_los.csv
2025-04-27 16:34:05,549 - INFO - Loading... -> o1_y_external_mortality.csv
2025-04-27 16:34:05,579 - INFO - Loading... -> o1_y_test_los.csv
2025-04-27 16:34:05,588 - INFO - Loading... -> o1_y_test_mortality.csv
2025-04-27 16:34:05,593 - INFO - Loading... -> o1_y_train_los.csv
2025-04-27 16:34:05,631 - INFO - Loading... -> o1_y_train_mortality.csv
2025-04-27 16:34:05,648 - INFO - Loading... -> o1_y_validate_los.csv
2025-04-27 16:34:05,658 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-04-27 16:34:05,662 - INFO - Loading... -> o2_

# CIR-23 Implement Iterative Imputer

In [None]:
"""
Imputes missing values in a dataframe using
IterativeImputer with ExtraTreesRegressor or BayesianRidge.
    
Parameters:
    input_df (pd.DataFrame): Input dataframe to impute.
    output_path (str): Path to save the imputed CSV.
    method (str): "ExtraTrees" or "BayesianRidge".
    n_iter (int): Number of max iterations for the imputer.
"""

def impute_with_iterative(input_df: pd.DataFrame, output_path: str, method: str, n_iter: int) -> pd.DataFrame:
    """
    Impute missing values with scikit-learn's IterativeImputer and save the result as CSV.

    The input dataframe is not modified; the imputer is fitted on a copy.

    Parameters:
        input_df (pd.DataFrame): Numeric dataframe whose NaNs should be imputed.
        output_path (str): Path of the CSV file to write. Its parent folder is
            created if it does not exist; a bare file name (no folder part) is
            written to the current working directory.
        method (str): Estimator used inside the imputer, either "ExtraTrees"
            (ExtraTreesRegressor, 10 trees, all cores) or "BayesianRidge".
        n_iter (int): Maximum number of imputation rounds (passed as `max_iter`).

    Returns:
        pd.DataFrame: The imputed data, carrying the row index of `input_df`.
            Columns that the imputer dropped because they contained no
            observed value at all are absent from the result.

    Raises:
        ValueError: If `method` is not one of the supported names. This is
            checked before any folder is created or any fitting is done.
    """
    # Start
    logging.info(f"Starting Iterative Imputer with method={method} on input DataFrame of shape {input_df.shape}.")

    # Choose estimator first so that an unsupported method fails fast, before
    # touching the filesystem or starting a potentially long fit.
    if method == "ExtraTrees":
        estimator = ExtraTreesRegressor(
            n_estimators=10,
            random_state=0,
            n_jobs=-1
        )
    elif method == "BayesianRidge":
        estimator = BayesianRidge()
    else:
        raise ValueError(f"Unsupported method: {method}. Use 'ExtraTrees' or 'BayesianRidge'.")

    # Create output folder if needed. os.path.dirname returns "" for a bare
    # file name and os.makedirs("") raises, so only create a folder when the
    # path actually has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Copy input
    data_copy = input_df.copy()

    # Create imputer
    imputer = IterativeImputer(
        estimator=estimator,
        max_iter=n_iter,
        random_state=0,
        verbose=2,  # Show progress per iteration
        sample_posterior=False
    )

    # Fit and transform, timed with a monotonic clock so the measurement is
    # not affected by system clock adjustments.
    start_time = time.perf_counter()
    imputed_array = imputer.fit_transform(data_copy)
    runtime = time.perf_counter() - start_time

    # By default IterativeImputer drops features that had no observed value
    # during fit, so the output can have fewer columns than the input. In that
    # case label the result with the surviving column names only, instead of
    # failing on a shape mismatch.
    columns = data_copy.columns
    if imputed_array.shape[1] != len(columns):
        kept_mask = data_copy.notna().any(axis=0).to_numpy()
        logging.warning(
            f"Imputer dropped {int((~kept_mask).sum())} column(s) with no observed values: "
            f"{list(columns[~kept_mask])}"
        )
        columns = columns[kept_mask]

    # Convert back to DataFrame, keeping the caller's row index so the result
    # stays aligned with the input (and with any target frames sharing it).
    imputed_df = pd.DataFrame(
        imputed_array,
        columns=columns,
        index=data_copy.index
    )

    # Save (without the index column, as before)
    imputed_df.to_csv(output_path, index=False)

    # Logs
    logging.info(f"Imputation completed in {runtime:.2f} seconds.")
    nan_count = np.isnan(imputed_df.values).sum()
    logging.info(f"Number of NaNs after imputation: {nan_count}")
    logging.info(f"Imputed dataset saved at {output_path}")
    logging.info(f"Basic statistics after imputation:\n{imputed_df.describe()}")

    return imputed_df

In [None]:
# Quick smoke test before the full-size runs: keep only the first 500 rows
# of the training frame (all columns are retained).
# NOTE(review): the input is o4_X_train but the output file name below says
# "o3_X_train" - confirm which dataset is intended.
small_data = o4_X_train.head(500)

# Run the ExtraTrees-based imputation on the subset; the returned dataframe
# is the cell's last expression, so the notebook displays it.
impute_with_iterative(
    input_df=small_data,
    method="ExtraTrees",
    n_iter=20,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/small_o3_X_train_imputed_Iterative_ExtraTrees.csv",
)

In [None]:
# ExtraTrees estimator
# Full-size run: impute the whole o4_X_train frame with the ExtraTrees-based
# iterative imputer (at most 20 iterations) and write the result to CSV.
# NOTE(review): the input is o4_X_train but the output file name says
# "o3_X_train" - confirm which dataset is intended.
impute_with_iterative(
    input_df=o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o3_X_train_imputed_Iterative_ExtraTrees.csv",
    method="ExtraTrees",
    n_iter=20
)

In [None]:
# BayesianRidge estimator
# Full-size run: impute the whole o4_X_train frame with the BayesianRidge-based
# iterative imputer (at most 20 iterations) and write the result to CSV.
# NOTE(review): the input is o4_X_train but the output file name says
# "o3_X_train" - confirm which dataset is intended.
impute_with_iterative(
    input_df=o4_X_train,
    output_path="CSV/exports/CRI-02/o1_impute_baselines/01_iterative/o3_X_train_imputed_Iterative_BayesianRidge.csv",
    method="BayesianRidge",
    n_iter=20
)