# CIR-03: Hierarchical Imputation Framework

In [1]:
import pandas as pd
import numpy as np
import os
import io
import logging

from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Root-logger configuration shared by every cell in this notebook.
# The formatter is built first because both the console handler below and
# the file handlers created by switch_log_file use the same layout.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Console handler: echoes every record to the notebook output.
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)

# Tracks the file handler currently attached to the logger; None means that
# no log file has been selected yet.
current_file_handler = None

# Configure the root logger itself: INFO and above, console output attached.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(stream_handler)

def switch_log_file(filename):
    """Redirect file logging to *filename*, replacing any active file handler.

    The previously active file handler (if any) is detached from the logger
    and closed, so only one log file receives records at a time. The parent
    directory of *filename* is created when it does not exist yet.

    Args:
        filename: Path of the log file to write to from now on.

    Raises:
        OSError: If the directory or the log file cannot be created/opened.
            In that case the previously active file handler stays attached.
    """
    global current_file_handler

    # FileHandler opens the file immediately and fails when the parent
    # directory is missing (e.g. a fresh checkout without "logs/").
    log_dir = os.path.dirname(filename)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Build the new handler before touching the old one, so a failure here
    # does not leave the logger without any file handler.
    new_handler = logging.FileHandler(filename)
    new_handler.setFormatter(formatter)

    # If a file handler already exists, remove and close it
    if current_file_handler is not None:
        logger.removeHandler(current_file_handler)
        current_file_handler.close()

    current_file_handler = new_handler
    logger.addHandler(new_handler)

    logger.info(f"Switched logging to {filename}")

In [3]:
# Build log file
# Route this section's log records to its own file; switch_log_file detaches
# and closes any file handler that was active before.
# NOTE(review): the section heading reads "CIR-03" while the log file and
# message say "CIR-2" — confirm which identifier is intended.
switch_log_file('logs/CIR-2.log')
logger.info("This is being logged to CIR-2.log")

2025-05-03 14:16:14,819 - INFO - Switched logging to logs/CIR-2.log
2025-05-03 14:16:14,822 - INFO - This is being logged to CIR-2.log


In [None]:
# Load datasets
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
# os.listdir returns entries in arbitrary order; sort them so the load order,
# the order of `dataframes`, and the log output are reproducible across runs
# and machines.
all_files = sorted(os.listdir(data_path))

logging.info("+++++++++++++++++CIR-2+++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Load CSVs into a dictionary of dataframes, keyed by the file name with the
# ".csv" text removed and dashes replaced by underscores.
dataframes = {}
for file in all_files:
    if not file.endswith(".csv"):
        continue  # Ignore anything in the directory that is not a CSV export
    var_name = file.replace(".csv", "").replace("-", "_")
    logging.info(f"Loading... -> {file}")
    # Every column is cast to float32; astype raises if a column cannot be
    # converted to a float.
    dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Log loaded datasets
for var_name, df in dataframes.items():
    # Also bind each frame to a module-level variable named after its key
    # (presumably so later cells can refer to it directly — verify usage).
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-05-03 14:16:14,834 - INFO - +++++++++++++++++CIR-2+++++++++++++++++++++++++
2025-05-03 14:16:14,835 - INFO - Start Loading Dataframes.
2025-05-03 14:16:14,837 - INFO - Loading... -> o1_X_external.csv


# CIR-14: Implement Row Segmentation by Missingness

In [None]:
# Build log file
# Route this section's log records to its own file; switch_log_file detaches
# and closes any file handler that was active before.
switch_log_file('logs/CIR-14.log')
logger.info("This is being logged to CIR-14.log")

In [None]:
def segment_rows_by_missingness(df: pd.DataFrame):
    """Segment the rows of *df* by their fraction of missing values.

    A row's missing fraction is the share of its columns that are null
    (``df.isnull().mean(axis=1)``). The buckets are disjoint and use
    inclusive upper bounds:

    * ``'very_low_missing 0% < 20%'``:   fraction <= 0.20
    * ``'low_missing 21% <= 40%'``:      0.20 < fraction <= 0.40
    * ``'moderate_missing 41% <= 60%'``: 0.40 < fraction <= 0.60
    * ``'high_missing > 60%'``:          fraction > 0.60

    NOTE: the key strings are kept verbatim because callers look segments up
    by these exact labels; the bounds listed above are the ones actually
    applied, even where the label text suggests something slightly different.

    Args:
        df: Dataframe whose rows are to be segmented.

    Returns:
        A tuple ``(segments, row_indices, row_missing_perc)`` where
        ``segments`` maps each label to the matching sub-dataframe,
        ``row_indices`` maps each label to the list of index values of those
        rows, and ``row_missing_perc`` is the per-row missing fraction.
    """
    row_missing_perc = df.isnull().mean(axis=1)

    # Build each boolean mask once; every segment is a plain row selection.
    very_low = row_missing_perc <= 0.20
    low = (row_missing_perc > 0.20) & (row_missing_perc <= 0.40)
    moderate = (row_missing_perc > 0.40) & (row_missing_perc <= 0.60)
    high = row_missing_perc > 0.60

    segments = {
        'very_low_missing 0% < 20%': df[very_low],
        'low_missing 21% <= 40%': df[low],
        'moderate_missing 41% <= 60%': df[moderate],
        'high_missing > 60%': df[high],
    }

    row_indices = {
        name: segment.index.tolist() for name, segment in segments.items()
    }

    return segments, row_indices, row_missing_perc

In [None]:
# Report, for every feature dataframe, how many rows land in each
# missingness segment. A separator line is logged before the first frame and
# after each processed frame.
logging.info("---------------")
for var_name, df in dataframes.items():
    # Only keys that start with "o" and contain "_X_" are processed; every
    # other entry (non-feature or target datasets) is skipped.
    if var_name.startswith("o") and "_X_" in var_name:
        logging.info(f"Segmenting rows by missingness for: {var_name}")
        logging.info(f"{var_name} - Total rows: {len(df)}")

        segments, row_indices, row_missing_perc = segment_rows_by_missingness(df)

        # One log line per segment with its row count.
        for segment_name in segments:
            segment_df = segments[segment_name]
            logging.info(f"{var_name} - {segment_name}: {len(segment_df)} rows")
        logging.info("---------------")

In [None]:
# Ensure output directory exists
base_plot_path = "figures/CIR-14"
os.makedirs(base_plot_path, exist_ok=True)

# Seaborn aesthetic settings
sns.set(style="whitegrid", context="talk", palette="deep")

# Draw one histogram of row-wise missingness per feature dataframe and save
# it as a PNG under base_plot_path.
for var_name, df in dataframes.items():
    if not var_name.startswith("o") or "_X_" not in var_name:
        continue  # Skip targets

    logging.info(f"Processing missing distribution plot for {var_name}")

    # segment_rows_by_missingness already returns the row-wise missing
    # fraction, so reuse it instead of computing it a second time.
    segments, _, row_missing_perc = segment_rows_by_missingness(df)

    # Prepare summary box content
    summary_text = (
        f"Total rows: {len(df):,}\n"
        f"Very low (≤20%): {len(segments['very_low_missing 0% < 20%']):,}\n"
        f"Low (21–40%): {len(segments['low_missing 21% <= 40%']):,}\n"
        f"Moderate (41–60%): {len(segments['moderate_missing 41% <= 60%']):,}\n"
        f"High (>60%): {len(segments['high_missing > 60%']):,}"
    )

    plot_filename = os.path.join(base_plot_path, f"{var_name}_missing_distribution.png")

    # Create the figure
    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        sns.histplot(row_missing_perc, bins=20, kde=True, color='#2c7fb8', edgecolor='black', ax=ax)

        # Customize titles and labels
        ax.set_title(f"Row-wise Missing Value Distribution\n{var_name}", fontsize=18, fontweight='bold')
        ax.set_xlabel("Proportion of Missing Values", fontsize=15)
        ax.set_ylabel("Number of Rows", fontsize=15)

        # Add summary box to top-right
        ax.text(
            0.99, 0.95, summary_text,
            transform=ax.transAxes,
            fontsize=12,
            verticalalignment='top',
            horizontalalignment='right',
            bbox=dict(boxstyle="round,pad=0.4", facecolor='whitesmoke', alpha=0.85, edgecolor='gray')
        )

        # Add grid with transparency
        ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)
        ax.tick_params(axis='both', labelsize=12)

        # Optional: Add watermark tag
        ax.text(0.01, 0.01, "CIR-14", transform=ax.transAxes,
                fontsize=10, color='gray', alpha=0.7, ha='left', va='bottom')

        # Save the figure; lay out this specific figure rather than whatever
        # pyplot currently considers the active one.
        fig.tight_layout()
        fig.savefig(plot_filename, dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed, so
        # open figures do not accumulate across the loop.
        plt.close(fig)

    logging.info(f"Saved professional missingness plot to {plot_filename}")