# Sprint 1: Foundational Setup

## CIR-8: Load and Validate all Datasets

In [18]:
import pandas as pd
import numpy as np
import os
import logging

from tqdm import tqdm

# Plots
import matplotlib.pyplot as plt
import seaborn as sns

from math import ceil
from matplotlib.patches import Patch

In [2]:
# Initial logger setup
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Global variable to hold the active file handler
current_file_handler = None

# Create the stream handler (to console)
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

def switch_log_file(filename):
    """Redirect file logging of the root logger to ``filename``.

    The previously active file handler (if any) is detached and closed, then a
    new ``logging.FileHandler`` using the shared ``formatter`` is attached.
    The parent directory of ``filename`` is created when it does not exist.

    Parameters
    ----------
    filename : str
        Path of the log file that should receive subsequent log records.

    Raises
    ------
    OSError
        If the directory or the log file cannot be created/opened.
    """
    global current_file_handler

    # If a file handler already exists, remove and close it
    if current_file_handler:
        logger.removeHandler(current_file_handler)
        current_file_handler.close()
        # Forget it immediately so a failure below cannot leave a closed
        # handler registered as the "active" one.
        current_file_handler = None

    # FileHandler does not create missing directories, so do it here
    log_dir = os.path.dirname(filename)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Create a new file handler
    new_handler = logging.FileHandler(filename)
    new_handler.setFormatter(formatter)
    logger.addHandler(new_handler)
    current_file_handler = new_handler

    logger.info(f"Switched logging to {filename}")

In [3]:
# Send the log records of this ticket to their own file
switch_log_file(filename='logs/CIR-8.log')
logger.info("This is being logged to CIR-8.log")

2025-04-30 12:17:10,377 - INFO - Switched logging to logs/CIR-8.log
2025-04-30 12:17:10,380 - INFO - This is being logged to CIR-8.log


In [4]:
# CSVs Directory
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
# os.listdir() gives no ordering guarantee; sorting keeps the load order (and
# therefore the key order of `dataframes`) identical from run to run.
all_files = sorted(os.listdir(data_path))

logging.info("++++++++++++++++++CIR-1++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Load CSVs into a dictionary of dataframes, keyed by a Python-friendly name
dataframes = {}
for file in all_files:
    # splitext only strips the final extension, unlike replace(".csv", ""),
    # which would also remove ".csv" occurring in the middle of a name.
    stem, extension = os.path.splitext(file)
    if extension != ".csv":
        continue
    var_name = stem.replace("-", "_")
    logging.info(f"Loading... -> {file}")
    dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Log loaded datasets
for var_name, df in dataframes.items():
    # Each frame is also exposed as a notebook-level variable of the same name.
    # Prefer `dataframes[name]` in new code: it does not rely on injected names.
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-04-30 12:17:10,393 - INFO - ++++++++++++++++++CIR-1++++++++++++++++++++++++
2025-04-30 12:17:10,394 - INFO - Start Loading Dataframes.
2025-04-30 12:17:10,395 - INFO - Loading... -> o1_X_external.csv
2025-04-30 12:17:17,449 - INFO - Loading... -> o1_X_test.csv
2025-04-30 12:17:17,952 - INFO - Loading... -> o1_X_train.csv
2025-04-30 12:17:22,051 - INFO - Loading... -> o1_X_validate.csv
2025-04-30 12:17:22,585 - INFO - Loading... -> o1_y_external_los.csv
2025-04-30 12:17:22,627 - INFO - Loading... -> o1_y_external_mortality.csv
2025-04-30 12:17:22,656 - INFO - Loading... -> o1_y_test_los.csv
2025-04-30 12:17:22,666 - INFO - Loading... -> o1_y_test_mortality.csv
2025-04-30 12:17:22,671 - INFO - Loading... -> o1_y_train_los.csv
2025-04-30 12:17:22,707 - INFO - Loading... -> o1_y_train_mortality.csv
2025-04-30 12:17:22,725 - INFO - Loading... -> o1_y_validate_los.csv
2025-04-30 12:17:22,736 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-04-30 12:17:22,741 - INFO - Loading... -

In [5]:
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Check Datatypes...")
# Report, per dataset, how many columns there are of each dtype
for dataset_name in dataframes:
    dtype_counts = dataframes[dataset_name].dtypes.value_counts()
    logging.info(f"{dataset_name} types:\n{dtype_counts}")
logging.info("Check Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-04-30 12:17:36,723 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-04-30 12:17:36,724 - INFO - Check Datatypes...
2025-04-30 12:17:36,727 - INFO - o1_X_external types:
float32    345
Name: count, dtype: int64
2025-04-30 12:17:36,730 - INFO - o1_X_test types:
float32    345
Name: count, dtype: int64
2025-04-30 12:17:36,732 - INFO - o1_X_train types:
float32    345
Name: count, dtype: int64
2025-04-30 12:17:36,735 - INFO - o1_X_validate types:
float32    345
Name: count, dtype: int64
2025-04-30 12:17:36,737 - INFO - o1_y_external_los types:
float32    1
Name: count, dtype: int64
2025-04-30 12:17:36,742 - INFO - o1_y_external_mortality types:
float32    1
Name: count, dtype: int64
2025-04-30 12:17:36,746 - INFO - o1_y_test_los types:
float32    1
Name: count, dtype: int64
2025-04-30 12:17:36,750 - INFO - o1_y_test_mortality types:
float32    1
Name: count, dtype: int64
2025-04-30 12:17:36,751 - INFO - o1_y_train_los types:
float32    1
Name: count, dtype: int64
2025-04-30 12

In [6]:
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Calculate missing values...")
# Per dataset: total number of missing cells and the columns that contain them
for dataset_name, frame in dataframes.items():
    null_mask = frame.isnull()
    missing_total = null_mask.sum().sum()
    missing_cols = [col for col, has_gap in zip(frame.columns, null_mask.any()) if has_gap]
    logging.info(f"{dataset_name}: {missing_total} missing values across {len(missing_cols)} columns")
logging.info("Missing values calculation complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-04-30 12:17:36,847 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-04-30 12:17:36,864 - INFO - Calculate missing values...
2025-04-30 12:17:37,062 - INFO - o1_X_external: 39527616 missing values across 308 columns
2025-04-30 12:17:37,077 - INFO - o1_X_test: 1985664 missing values across 292 columns
2025-04-30 12:17:37,189 - INFO - o1_X_train: 15876288 missing values across 304 columns
2025-04-30 12:17:37,206 - INFO - o1_X_validate: 2007696 missing values across 296 columns
2025-04-30 12:17:37,208 - INFO - o1_y_external_los: 0 missing values across 0 columns
2025-04-30 12:17:37,211 - INFO - o1_y_external_mortality: 0 missing values across 0 columns
2025-04-30 12:17:37,213 - INFO - o1_y_test_los: 0 missing values across 0 columns
2025-04-30 12:17:37,215 - INFO - o1_y_test_mortality: 0 missing values across 0 columns
2025-04-30 12:17:37,219 - INFO - o1_y_train_los: 0 missing values across 0 columns
2025-04-30 12:17:37,221 - INFO - o1_y_train_mortality: 0 missing values acros

In [7]:
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Calculate unique values.")
# Only datasets whose name contains 'y_' are inspected here
for dataset_name, frame in dataframes.items():
    if 'y_' not in dataset_name:
        continue
    distinct_counts = frame.nunique().to_dict()
    logging.info(f"{dataset_name} unique target values: {distinct_counts}")

logging.info("Unique values calculation complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-04-30 12:17:37,709 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-04-30 12:17:37,711 - INFO - Calculate unique values.
2025-04-30 12:17:37,720 - INFO - o1_y_external_los unique target values: {'los': 830}
2025-04-30 12:17:37,724 - INFO - o1_y_external_mortality unique target values: {'hospital_expire_flag': 2}
2025-04-30 12:17:37,726 - INFO - o1_y_test_los unique target values: {'los': 319}
2025-04-30 12:17:37,728 - INFO - o1_y_test_mortality unique target values: {'hospital_expire_flag': 2}
2025-04-30 12:17:37,732 - INFO - o1_y_train_los unique target values: {'los': 2548}
2025-04-30 12:17:37,736 - INFO - o1_y_train_mortality unique target values: {'hospital_expire_flag': 2}
2025-04-30 12:17:37,739 - INFO - o1_y_validate_los unique target values: {'los': 319}
2025-04-30 12:17:37,743 - INFO - o1_y_validate_mortality unique target values: {'hospital_expire_flag': 2}
2025-04-30 12:17:37,746 - INFO - o2_y_external_los unique target values: {'los': 830}
2025-04-30 12:17:37,7

In [8]:
# Build one summary record per dataset: shape and amount of missing data
summary = []
for name, df in dataframes.items():
    null_mask = df.isnull()  # computed once and reused for both counts
    missing_values = null_mask.sum().sum()
    missing_cols = null_mask.any().sum()
    total_cells = df.shape[0] * df.shape[1]
    summary.append({
        'Dataset': name,
        'Shape': df.shape,
        'Total Missing Values': missing_values,
        # NOTE: the key spelling is kept as-is because it is the column header
        # of the exported CSV.
        'Accross Missing Columns': missing_cols,
        # An empty frame has no cells, so the percentage is undefined (NaN)
        # rather than a division by zero.
        'Missing %': 100 * missing_values / total_cells if total_cells else float('nan')
    })

summary_df = pd.DataFrame(summary)
# Save to file (create the export folder on a fresh checkout)
summary_path = "CSV/exports/01_dataset_summary_CIR-8.csv"
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
summary_df.to_csv(summary_path, index=False)

In [9]:
# Rows whose share of missing cells is at most this percentage are counted
percent = 7

# Dictionary of datasets for iteration.
# The frames are looked up in `dataframes` instead of relying on the
# notebook-level variables injected through globals() by the loading cell.
datasets = {
    train_name: dataframes[train_name]
    for train_name in ("o1_X_train", "o2_X_train", "o3_X_train", "o4_X_train")
}

# Loop through each dataset
for name, df in datasets.items():
    missing_percentage_per_row = df.isnull().mean(axis=1) * 100
    # Number of rows that are (almost) complete, i.e. missing <= `percent` %
    missing_rows = (missing_percentage_per_row <= percent).sum()
    total_rows, total_columns = df.shape
    # Share of such rows in the dataset; undefined (NaN) for an empty dataset
    percent_between = (missing_rows * 100) / total_rows if total_rows else float('nan')

    print(f"\nDataset: {name}")
    print(f"Total Rows: {total_rows}, Total Columns: {total_columns}")
    print(f"Number of rows with missing values up to {percent}%: {missing_rows}")
    print(f"The percentage between total and missing values sets is {percent_between:.2f}%")


Dataset: o1_X_train
Total Rows: 122496, Total Columns: 345
Number of rows with missing values up to 7%: 48
The percentage between total and missing values sets is 0.04%

Dataset: o2_X_train
Total Rows: 61248, Total Columns: 345
Number of rows with missing values up to 7%: 24
The percentage between total and missing values sets is 0.04%

Dataset: o3_X_train
Total Rows: 40832, Total Columns: 345
Number of rows with missing values up to 7%: 16
The percentage between total and missing values sets is 0.04%

Dataset: o4_X_train
Total Rows: 30624, Total Columns: 345
Number of rows with missing values up to 7%: 12
The percentage between total and missing values sets is 0.04%


## CIR-9: Analyze Missingness (per row/column)

In [10]:
# Send the log records of this ticket to their own file
switch_log_file(filename='logs/CIR-9.log')
logger.info("This is being logged to CIR-9.log")

2025-04-30 12:17:38,762 - INFO - Switched logging to logs/CIR-9.log
2025-04-30 12:17:38,763 - INFO - This is being logged to CIR-9.log


In [11]:
# One record per dataset describing overall, per-column and per-row missingness
missingness_summary = []

for name, df in dataframes.items():
    if not isinstance(df, pd.DataFrame):
        continue

    null_mask = df.isnull()  # computed once, reused for every statistic below
    total_cells = df.shape[0] * df.shape[1]
    total_missing = null_mask.sum().sum()

    col_missing = null_mask.mean() * 100        # % missing per column
    row_missing = null_mask.mean(axis=1) * 100  # % missing per row

    # An empty frame has no cells: report NaN instead of dividing by zero
    total_missing_pct = 100 * total_missing / total_cells if total_cells else float('nan')

    # A dedicated name is used so the `summary` list built for CIR-8 is not
    # replaced by a dict of a different shape.
    dataset_summary = {
        'Dataset': name,
        'Shape': df.shape,
        'Total Missing Cells': total_missing,
        'Total Missing %': round(total_missing_pct, 2),
        'Columns with Missing (%)': (col_missing > 0).sum(),
        'Max % Missing in Row': round(row_missing.max(), 2),
        'Mean % Missing in Row': round(row_missing.mean(), 2),
        'Min % Missing in Row': round(row_missing.min(), 2)
    }
    missingness_summary.append(dataset_summary)

# Create DataFrame for summary, sorted by most missing.
# A new sorted frame is assigned (no inplace mutation) so re-running is safe.
missingness_df = pd.DataFrame(missingness_summary).sort_values(
    "Total Missing %", ascending=False
)

# Save to file (create the export folder on a fresh checkout)
missingness_path = "CSV/exports/02_missingness_summary-CIR-9.csv"
os.makedirs(os.path.dirname(missingness_path), exist_ok=True)
missingness_df.to_csv(missingness_path, index=False)

# Show the 16 datasets with the highest missing percentage. No explicit filter
# is applied here; label (y_) files only drop out of view when they sort last.
logging.info(missingness_df.head(16))

2025-04-30 12:17:40,876 - INFO -           Dataset          Shape  Total Missing Cells  Total Missing %  \
0   o1_X_external  (234720, 345)             39527616            48.81   
12  o2_X_external  (117360, 345)             19763572            48.81   
36  o4_X_external   (58680, 345)              9881900            48.81   
24  o3_X_external   (78240, 345)             13175808            48.81   
27  o3_X_validate    (5104, 345)               669232            38.01   
3   o1_X_validate   (15312, 345)              2007696            38.01   
15  o2_X_validate    (7656, 345)              1003848            38.01   
39  o4_X_validate    (3828, 345)               501924            38.01   
25      o3_X_test    (5104, 345)               661888            37.59   
1       o1_X_test   (15312, 345)              1985664            37.59   
13      o2_X_test    (7656, 345)               992832            37.59   
37      o4_X_test    (3828, 345)               496416            37.59   
26   

## CIR-10: Visualize Missingness

In [12]:
# Send the log records of this ticket to their own file
switch_log_file(filename='logs/CIR-10.log')
logger.info("This is being logged to CIR-10.log")

2025-04-30 12:17:40,911 - INFO - Switched logging to logs/CIR-10.log
2025-04-30 12:17:40,912 - INFO - This is being logged to CIR-10.log


In [13]:
# Folder that receives every missingness figure; created if it is absent
output_dir = "figures/CIR-10_missingness"
os.makedirs(name=output_dir, exist_ok=True)

# Functions for plotting
def plot_missing_heatmap(
    df, title, max_rows=200, save_path=None, suffix_filter='(Min)', cmap='Blues'
):
    """Draw a heatmap of missing cells for the first rows of ``df``.

    Only columns whose name ends with ``suffix_filter`` are shown, and only the
    first ``max_rows`` rows. The figure is always closed before returning, so
    it is not rendered inline.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.
    title : str
        Text appended to the figure title (the dataset name at the call site).
    max_rows : int, default 200
        Maximum number of leading rows to draw.
    save_path : str or None, default None
        When given, the figure is written to this path (300 dpi).
    suffix_filter : str, default '(Min)'
        Column-name suffix selecting the features to display.
    cmap : str, default 'Blues'
        Colormap passed to ``seaborn.heatmap``.

    Returns
    -------
    None
    """
    filtered_cols = [
        col for col in df.columns
        if isinstance(col, str) and col.endswith(suffix_filter)
    ]
    df_filtered = df[filtered_cols].head(max_rows)

    # seaborn cannot draw a heatmap without any cell; skip instead of crashing
    if df_filtered.empty:
        logging.warning(
            f"No data to plot for {title}: no rows or no columns ending with {suffix_filter}"
        )
        return

    fig, ax = plt.subplots(figsize=(16, 6))
    try:
        sns.heatmap(df_filtered.isnull(), cbar=False, cmap=cmap, yticklabels=False,
                    linecolor='lightgrey', linewidths=0.001, ax=ax)

        # Report the number of rows actually drawn (may be fewer than max_rows)
        ax.set_title(
            f"Missing Data Heatmap – First {len(df_filtered)} Rows\n({suffix_filter}) – {title}",
            fontsize=14, weight='bold')
        ax.set_xlabel("Features", fontsize=12)
        ax.set_ylabel("Rows", fontsize=12)
        ax.tick_params(axis='x', labelsize=8, labelrotation=90)
        fig.tight_layout()

        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Close this specific figure even if drawing/saving failed, so figures
        # do not accumulate in memory when called in a loop.
        plt.close(fig)


def plot_column_missing_bar(
    df, title, save_path=None, top_n=40, suffix_filter='(Min)', color='#3E8ED0'
):
    """Draw a horizontal bar chart of the features with the most missing values.

    Only columns whose name ends with ``suffix_filter`` and that contain at
    least one missing value are considered; the ``top_n`` with the highest
    missing percentage are plotted. The figure is always closed before
    returning, so it is not rendered inline.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.
    title : str
        Text appended to the figure title (the dataset name at the call site).
    save_path : str or None, default None
        When given, the figure is written to this path (300 dpi).
    top_n : int, default 40
        Maximum number of features to show.
    suffix_filter : str, default '(Min)'
        Column-name suffix selecting the candidate features.
    color : str, default '#3E8ED0'
        Bar color.

    Returns
    -------
    None
    """
    filtered_cols = [
        col for col in df.columns
        if isinstance(col, str) and col.endswith(suffix_filter)
    ]
    if not filtered_cols:
        logging.warning(f"No columns ending with {suffix_filter} in {title}; bar plot skipped")
        return

    missing_perc = df[filtered_cols].isnull().mean() * 100
    missing_perc = missing_perc[missing_perc > 0].sort_values(ascending=False).head(top_n)

    # Without any incomplete feature the figure height would be 0 and the plot
    # call would fail, so there is nothing to draw.
    if missing_perc.empty:
        logging.info(f"No missing values in {suffix_filter} columns of {title}; bar plot skipped")
        return

    # 0.4 inch per bar, with a floor so title and labels fit for short lists
    fig, ax = plt.subplots(figsize=(10, max(2.0, 0.4 * len(missing_perc))))
    try:
        missing_perc.plot(kind='barh', color=color, ax=ax)

        ax.set_title(
            f"Top {len(missing_perc)} Features with Missing Values\n({suffix_filter}) – {title}",
            fontsize=14, weight='bold')
        ax.set_xlabel("Percentage Missing", fontsize=12)
        ax.set_ylabel("Feature", fontsize=12)
        ax.tick_params(axis='x', labelsize=10)
        ax.tick_params(axis='y', labelsize=8)
        ax.grid(axis='x', linestyle='--', linewidth=0.5)
        fig.tight_layout()

        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Close this specific figure even if drawing/saving failed
        plt.close(fig)


# Filter only X datasets from the summary.
# Dataset names carry a prefix (e.g. 'o1_y_train_los'), so a plain
# startswith('y_') never matches; look for the 'y_' token at the start of the
# name or right after an underscore instead.
is_label_dataset = missingness_df['Dataset'].str.contains(r'(?:^|_)y_', regex=True)
is_loaded = missingness_df['Dataset'].isin(list(dataframes))
top_datasets = missingness_df.loc[~is_label_dataset & is_loaded, 'Dataset'].head(16).tolist()

# Generate plots: one heatmap and one bar chart per selected dataset
for dataset_name in top_datasets:
    df = dataframes.get(dataset_name)
    if df is not None:
        logging.info(f"Generating plots for: {dataset_name}")
        heatmap_path = os.path.join(output_dir, f"{dataset_name}_heatmap.png")
        barplot_path = os.path.join(output_dir, f"{dataset_name}_barplot.png")

        plot_missing_heatmap(df, dataset_name, save_path=heatmap_path)
        plot_column_missing_bar(df, dataset_name, save_path=barplot_path)
    else:
        logging.info(f"Dataset not found in dataframes: {dataset_name}")

2025-04-30 12:17:40,947 - INFO - Generating plots for: o1_X_external
2025-04-30 12:17:45,380 - INFO - Generating plots for: o2_X_external
2025-04-30 12:17:49,274 - INFO - Generating plots for: o4_X_external
2025-04-30 12:17:53,336 - INFO - Generating plots for: o3_X_external
2025-04-30 12:17:57,306 - INFO - Generating plots for: o3_X_validate
2025-04-30 12:18:01,239 - INFO - Generating plots for: o1_X_validate
2025-04-30 12:18:05,165 - INFO - Generating plots for: o2_X_validate
2025-04-30 12:18:09,389 - INFO - Generating plots for: o4_X_validate
2025-04-30 12:18:13,208 - INFO - Generating plots for: o3_X_test
2025-04-30 12:18:17,029 - INFO - Generating plots for: o1_X_test
2025-04-30 12:18:20,844 - INFO - Generating plots for: o2_X_test
2025-04-30 12:18:25,298 - INFO - Generating plots for: o4_X_test
2025-04-30 12:18:29,771 - INFO - Generating plots for: o3_X_train
2025-04-30 12:18:33,653 - INFO - Generating plots for: o1_X_train
2025-04-30 12:18:37,544 - INFO - Generating plots for: o

## CIR-11: Define Clinical Value Ranges

In [14]:
# Send the log records of this ticket to their own file
switch_log_file(filename='logs/CIR-11.log')
logger.info("This is being logged to CIR-11.log")

2025-04-30 12:18:45,488 - INFO - Switched logging to logs/CIR-11.log
2025-04-30 12:18:45,489 - INFO - This is being logged to CIR-11.log


In [15]:
# List of suffixes
suffixes = ['(Mean)', '(Median)', '(Min)', '(Max)']

for SUFFIX in suffixes:
    logging.info(f"Processing suffix: {SUFFIX}")

    # Collect the global (across all feature datasets) min/max per feature
    global_min_max = {}

    for name, df in dataframes.items():
        # Skip target datasets. Names carry a prefix such as 'o1_y_train_los',
        # so the 'y_' token is searched after an underscore as well.
        if name.startswith('y_') or '_y_' in name:
            continue
        filtered_cols = [col for col in df.columns if col.endswith(SUFFIX)]

        # For each feature, collect min and max
        for col in filtered_cols:
            col_min = df[col].min(skipna=True)
            col_max = df[col].max(skipna=True)

            if col not in global_min_max:
                global_min_max[col] = {'min': col_min, 'max': col_max}
            else:
                # A column that is entirely missing in one dataset yields NaN.
                # Built-in min()/max() give order-dependent results with NaN;
                # fmin/fmax ignore NaN whenever the other operand is a number.
                global_min_max[col]['min'] = np.fmin(global_min_max[col]['min'], col_min)
                global_min_max[col]['max'] = np.fmax(global_min_max[col]['max'], col_max)

    # Convert to DataFrame (one row per feature, sorted by feature name)
    clinical_ranges_df = pd.DataFrame.from_dict(global_min_max, orient='index')
    clinical_ranges_df = clinical_ranges_df.rename(columns={'min': 'Observed Min', 'max': 'Observed Max'})
    clinical_ranges_df = clinical_ranges_df.sort_index()

    # Save it to a CSV file
    suffix_clean = SUFFIX.strip('()')  # Remove parentheses for a cleaner filename
    output_file_path = f"CSV/exports/CRI-11/observed_clinical_ranges_{suffix_clean}.csv"
    # Create the folder the CSV is written to (not the figures folder that
    # `output_dir` still points to from the plotting cell).
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    clinical_ranges_df.to_csv(output_file_path, index=True)

    # Optional: Preview
    display(clinical_ranges_df)

2025-04-30 12:18:45,504 - INFO - Processing suffix: (Mean)


Unnamed: 0,Observed Min,Observed Max
Alanine_Aminotransferase_(ALT)_(Mean),1.0,7784.666504
Albumin_(Mean),1.0,5.100000
Alkaline_Phosphatase_(Mean),9.0,1448.000000
Ammonia_(Mean),2.2,130.000000
Amylase_(Mean),8.0,4867.000000
...,...,...
Uric_Acid_(Mean),1.3,16.500000
White_Blood_Cells_(Mean),0.2,361.000000
pCO2_(Mean),9.0,101.333336
pH_(Mean),3.0,8.500000


2025-04-30 12:18:45,899 - INFO - Processing suffix: (Median)


Unnamed: 0,Observed Min,Observed Max
Alanine_Aminotransferase_(ALT)_(Median),1.0,7510.0
Albumin_(Median),1.0,5.1
Alkaline_Phosphatase_(Median),9.0,1448.0
Ammonia_(Median),2.2,130.0
Amylase_(Median),8.0,4867.0
...,...,...
Uric_Acid_(Median),1.3,16.5
White_Blood_Cells_(Median),0.2,361.0
pCO2_(Median),9.0,98.0
pH_(Median),5.0,8.5


2025-04-30 12:18:46,285 - INFO - Processing suffix: (Min)


Unnamed: 0,Observed Min,Observed Max
Alanine_Aminotransferase_(ALT)_(Min),1.00,7510.0
Albumin_(Min),1.00,5.1
Alkaline_Phosphatase_(Min),8.00,1448.0
Ammonia_(Min),2.20,130.0
Amylase_(Min),7.00,4867.0
...,...,...
Uric_Acid_(Min),0.30,16.5
White_Blood_Cells_(Min),0.17,361.0
pCO2_(Min),9.00,98.0
pH_(Min),5.00,8.5


2025-04-30 12:18:46,669 - INFO - Processing suffix: (Max)


Unnamed: 0,Observed Min,Observed Max
Alanine_Aminotransferase_(ALT)_(Max),1.0,8776.000000
Albumin_(Max),1.0,5.100000
Alkaline_Phosphatase_(Max),9.0,1448.000000
Ammonia_(Max),2.2,157.000000
Amylase_(Max),8.0,4867.000000
...,...,...
Uric_Acid_(Max),1.3,16.500000
White_Blood_Cells_(Max),0.2,361.000000
pCO2_(Max),9.0,163.399994
pH_(Max),5.0,8.500000


## CIR-50: Identify and Quantify Out-of-Range Values

In [16]:
# Send the log records of this ticket to their own file
switch_log_file(filename='logs/CIR-50.log')
logger.info("This is being logged to CIR-50.log")

2025-04-30 12:18:47,059 - INFO - Switched logging to logs/CIR-50.log
2025-04-30 12:18:47,061 - INFO - This is being logged to CIR-50.log


In [19]:
# Datasets
datasets = [
    "o1_X_train", "o1_X_validate", "o1_X_test", "o1_X_external",
    "o2_X_train", "o2_X_validate", "o2_X_test", "o2_X_external",
    "o3_X_train", "o3_X_validate", "o3_X_test", "o3_X_external",
    "o4_X_train", "o4_X_validate", "o4_X_test", "o4_X_external"
]

# Identify clinical-laboratory features
clinical_suffixes = ('(Min)', '(Max)')
clinical_features = [col for col in dataframes['o1_X_train'].columns if col.endswith(clinical_suffixes)]

# Value of the mortality label that defines each outcome group
group_labels = {"survive": 0.0, "non_survive": 1.0}

# Concatenate all dataframes with mortality label
all_rows = []

for dataset in datasets:
    df = dataframes[dataset].copy()
    # e.g. "o1_X_train" -> "o1_y_train_mortality"
    label_name = dataset.replace("X", "y") + "_mortality"
    df["mortality"] = dataframes[label_name]
    df["dataset"] = dataset  # keep track of original dataset
    all_rows.append(df)

df_all = pd.concat(all_rows)

# Separate groups
groups = {
    group_name: df_all[df_all["mortality"] == label]
    for group_name, label in group_labels.items()
}

# The observed-range CSVs are written directly into this folder
os.makedirs("CSV/exports", exist_ok=True)

# Process each group separately
for group_name, group_df in groups.items():
    logging.info(f"Processing group: {group_name}")

    # --- Part A: Observed ranges (whole group, all datasets pooled) ---
    observed_ranges = []

    for feature in tqdm(clinical_features, desc=f"[{group_name}] Observed Ranges"):
        if feature in group_df.columns and not group_df[feature].isnull().all():
            observed_ranges.append({
                'Feature': feature,
                'Observed Min': group_df[feature].min(),
                'Observed Max': group_df[feature].max()
            })

    observed_ranges_df = pd.DataFrame(observed_ranges)
    observed_ranges_df.to_csv(f"CSV/exports/02_task4_observed_ranges_{group_name}.csv", index=False)
    logging.info(f"Saved observed ranges for {group_name}")

    # --- Part B: Percentile detection (per source dataset within the group) ---
    percentile_summary = []

    for dataset in datasets:
        # Slice the group itself by source dataset. The loop variable
        # `group_df` is deliberately NOT reassigned: deriving the label from
        # the previous subset's first row fails as soon as one subset is empty.
        dataset_group_df = group_df[group_df["dataset"] == dataset]
        if dataset_group_df.empty:
            logging.warning(f"No {group_name} rows in {dataset}; percentile detection skipped")
            continue

        for feature in tqdm(clinical_features, desc=f"[{group_name}] {dataset}", leave=False):
            if feature not in dataset_group_df.columns or dataset_group_df[feature].isnull().all():
                continue

            values = dataset_group_df[feature]
            lower_pct = values.quantile(0.005)
            upper_pct = values.quantile(0.995)

            # Strict comparisons: values equal to a threshold are not extreme
            extreme_low = int((values < lower_pct).sum())
            extreme_high = int((values > upper_pct).sum())

            percentile_summary.append({
                'Source Dataset': dataset,
                'Feature': feature,
                '0.5th Percentile Threshold': lower_pct,
                '99.5th Percentile Threshold': upper_pct,
                'Count Below 0.5th': extreme_low,
                'Count Above 99.5th': extreme_high,
                'Min Value Found': values.min(),
                'Max Value Found': values.max()
            })

    # Define the full output file path
    output_file_path = f"CSV/exports/CRI-50/percentile_extreme_values_{group_name}.csv"

    # Ensure the directory exists
    output_dir = os.path.dirname(output_file_path)
    os.makedirs(output_dir, exist_ok=True)

    # Save the DataFrame.
    # Note: after the loop `percentile_df` holds the LAST processed group only.
    percentile_df = pd.DataFrame(percentile_summary)
    percentile_df.to_csv(output_file_path, index=False)
    logging.info(f"Saved extreme values for {group_name}")

2025-04-30 12:20:37,942 - INFO - Processing group: survive
[survive] Observed Ranges: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 154/154 [00:00<00:00, 327.67it/s]
2025-04-30 12:20:38,430 - INFO - Saved observed ranges for survive
2025-04-30 12:20:47,815 - INFO - Saved extreme values for survive                                                                                                
2025-04-30 12:20:47,819 - INFO - Processing group: non_survive
[non_survive] Observed Ranges: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 154/154 [00:00<00:00, 1750.06it/s]
2025-04-30 12:20:47,914 - INFO - Saved observed ranges for non_survive
2025-04-30 12:20:54,733 - INFO - Saved extreme values for non_survive                                                                                            


In [None]:
"""
survive and non survive together
"""
# Settings
output_root = "figures/CIR-50/grouped_adaptive_plots"
os.makedirs(output_root, exist_ok=True)


suffixes = ['(Min)', '(Max)']
LOWER_COL = '0.5th Percentile Threshold'
UPPER_COL = '99.5th Percentile Threshold'
CHUNK_SIZE = 10  # Features per plot

# NOTE(review): `percentile_df` is whatever the previous cell left behind, which
# looks like the summary of the last processed group only, while the boxplots
# below use all rows of each dataset — confirm this is the intended pairing.

# Plot style is global state; setting it once before any figure is enough
sns.set(style="whitegrid")

# Loop through datasets and suffixes
for dataset in datasets:
    df = dataframes[dataset]
    dataset_dir = os.path.join(output_root, dataset)
    os.makedirs(dataset_dir, exist_ok=True)

    for suffix in suffixes:
        df_suffix = percentile_df[
            (percentile_df['Source Dataset'] == dataset) &
            (percentile_df['Feature'].str.endswith(suffix))
        ].copy()

        if df_suffix.empty:
            continue

        # Features with the most flagged values come first
        df_suffix['Total_Extremes'] = df_suffix['Count Below 0.5th'] + df_suffix['Count Above 99.5th']
        df_sorted = df_suffix.sort_values('Total_Extremes', ascending=False)
        features_to_plot = df_sorted['Feature'].tolist()

        total_chunks = ceil(len(features_to_plot) / CHUNK_SIZE)

        for chunk_idx in range(total_chunks):
            chunk_features = features_to_plot[chunk_idx * CHUNK_SIZE : (chunk_idx + 1) * CHUNK_SIZE]
            # Two stacked axes per feature: boxplot on top, value table below.
            # squeeze=False + ravel() always yields a flat array of Axes, so a
            # chunk with a single feature needs no special handling.
            fig, axes = plt.subplots(nrows=len(chunk_features) * 2, ncols=1,
                                     figsize=(13, len(chunk_features) * 3.0),
                                     squeeze=False)
            axes = axes.ravel()

            for i, feature in enumerate(chunk_features):
                plot_ax = axes[i*2]
                table_ax = axes[i*2 + 1]

                data_clean = df[feature].dropna().values
                if data_clean.size == 0:
                    continue

                row = df_sorted[df_sorted['Feature'] == feature].iloc[0]

                lower_pct = row[LOWER_COL]
                upper_pct = row[UPPER_COL]
                # Zoom to the 1st-99th percentile, widened so both threshold
                # lines stay visible, plus a 5 % margin on each side
                x_min, x_max = np.percentile(data_clean, [1, 99])
                x_min = min(x_min, lower_pct) - 0.05 * abs(min(x_min, lower_pct))
                x_max = max(x_max, upper_pct) + 0.05 * abs(max(x_max, upper_pct))

                sns.boxplot(x=data_clean, ax=plot_ax, orient='h', fliersize=2, color='skyblue',
                            width=0.5, linewidth=1, boxprops=dict(alpha=0.6))
                plot_ax.axvline(lower_pct, color='red', linestyle='--', linewidth=3, zorder=10)
                plot_ax.axvline(upper_pct, color='purple', linestyle='--', linewidth=3, zorder=10)
                plot_ax.set_xlim(x_min, x_max)
                plot_ax.set_title(feature, fontsize=13, weight='bold')
                plot_ax.set_xlabel("Value", fontsize=11)

                # Table
                table_ax.axis('off')
                table_data = [[
                    f"{row[LOWER_COL]:.2f}", f"{row[UPPER_COL]:.2f}",
                    f"{row['Count Below 0.5th']}", f"{row['Count Above 99.5th']}",
                    f"{row['Min Value Found']:.2f}", f"{row['Max Value Found']:.2f}"
                ]]
                col_labels = [
                    '0.5th Percentile', '99.5th Percentile',
                    'Count < 0.5th', 'Count > 99.5th',
                    'Min', 'Max'
                ]
                table = table_ax.table(
                    cellText=table_data,
                    colLabels=col_labels,
                    loc='center',
                    cellLoc='center',
                    colWidths=[0.15] * len(col_labels)
                )
                table.scale(1, 2.2)
                # Row 0 of the table is the header row
                for (row_i, col_i), cell in table.get_celld().items():
                    if row_i == 0:
                        cell.set_text_props(weight='bold', fontsize=14)
                    else:
                        cell.set_text_props(fontsize=13)

            # File naming
            suffix_safe = suffix.replace('(', '').replace(')', '')
            file_index = str(chunk_idx + 1).zfill(2)
            plot_filename = f"{dataset}_{suffix_safe}_adaptive_per_feature_table_{file_index}.png"

            # Title and single legend (positioned slightly above the title)
            fig.suptitle(f'Top Features {suffix} – {dataset} (Set {file_index})',
                         fontsize=15, weight='bold', y=1.035)

            legend_elements = [
                Patch(facecolor='red', edgecolor='red', linestyle='--', label='0.5th Percentile'),
                Patch(facecolor='purple', edgecolor='purple', linestyle='--', label='99.5th Percentile')
            ]
            fig.legend(
                handles=legend_elements,
                loc='upper center',
                fontsize=12,
                ncol=2,
                frameon=False,
                bbox_to_anchor=(0.5, 1.06)
            )

            fig.subplots_adjust(top=0.95)
            fig.tight_layout()
            fig.savefig(os.path.join(dataset_dir, plot_filename), dpi=300, bbox_inches='tight')
            # Close this figure explicitly so figures do not pile up in memory
            plt.close(fig)

print("All adaptive plots saved with a single top legend and clean spacing.")

In [None]:
# Settings
output_root = "figures/CIR-50/grouped_adaptive_plots"
os.makedirs(output_root, exist_ok=True)

# Settings
suffixes = ['(Min)', '(Max)']
LOWER_COL = '0.5th Percentile Threshold'
UPPER_COL = '99.5th Percentile Threshold'
CHUNK_SIZE = 10

# Group info
groups = {
    'survive': 'percentile_extreme_values_survive.csv',
    'non_survive': 'percentile_extreme_values_non_survive.csv'
}

for group, csv_file in groups.items():
    print(f"📊 Processing group: {group}")
    
    # Load percentile summary
    percentile_df = pd.read_csv(f"CSV/exports/CRI-50/{csv_file}")

    # Output path
    output_root = f"figures/CIR-50/grouped_adaptive_plots_{group}"
    os.makedirs(output_root, exist_ok=True)

    datasets = percentile_df['Source Dataset'].unique()

    for dataset in datasets:
        df = dataframes[dataset]
        dataset_dir = os.path.join(output_root, dataset)
        os.makedirs(dataset_dir, exist_ok=True)

        for suffix in suffixes:
            df_suffix = percentile_df[
                (percentile_df['Source Dataset'] == dataset) &
                (percentile_df['Feature'].str.endswith(suffix))
            ].copy()

            if df_suffix.empty:
                continue

            df_suffix['Total_Extremes'] = df_suffix['Count Below 0.5th'] + df_suffix['Count Above 99.5th']
            df_sorted = df_suffix.sort_values('Total_Extremes', ascending=False)
            features_to_plot = df_sorted['Feature'].tolist()

            sns.set(style="whitegrid")
            total_chunks = ceil(len(features_to_plot) / CHUNK_SIZE)

            # One figure per chunk of at most CHUNK_SIZE features. Every feature gets two
            # stacked axes: a horizontal boxplot and, directly below it, a one-row stats table.
            for chunk_idx in range(total_chunks):
                chunk_features = features_to_plot[chunk_idx * CHUNK_SIZE : (chunk_idx + 1) * CHUNK_SIZE]

                # squeeze=False makes subplots() return a 2-D array whatever the grid size;
                # ravel() flattens it so axes[i*2] / axes[i*2 + 1] work for every chunk length,
                # including a chunk that holds a single feature (nrows == 2).
                fig, axes = plt.subplots(nrows=len(chunk_features) * 2, ncols=1,
                                         figsize=(13, len(chunk_features) * 3.0), squeeze=False)
                axes = axes.ravel()

                for i, feature in enumerate(chunk_features):
                    plot_ax = axes[i*2]
                    table_ax = axes[i*2 + 1]

                    data_clean = df[feature].dropna().values
                    if data_clean.size == 0:
                        # Nothing to draw for this feature: hide its two axes so the saved
                        # figure does not contain empty frames.
                        plot_ax.axis('off')
                        table_ax.axis('off')
                        continue

                    # Summary row for this feature (thresholds, counts, min/max)
                    row = df_sorted[df_sorted['Feature'] == feature].iloc[0]

                    # Threshold values; shown in the table under the 0.5th / 99.5th percentile labels
                    lower_pct = row[LOWER_COL]
                    upper_pct = row[UPPER_COL]

                    # x-range: 1st-99th percentile of the data, widened so both threshold
                    # lines are visible, plus 5% padding on each side
                    x_min, x_max = np.percentile(data_clean, [1, 99])
                    x_min = min(x_min, lower_pct) - 0.05 * abs(min(x_min, lower_pct))
                    x_max = max(x_max, upper_pct) + 0.05 * abs(max(x_max, upper_pct))

                    sns.boxplot(x=data_clean, ax=plot_ax, orient='h', fliersize=2, color='skyblue',
                                width=0.5, linewidth=1, boxprops=dict(alpha=0.6))
                    plot_ax.axvline(lower_pct, color='red', linestyle='--', linewidth=3, zorder=10)
                    plot_ax.axvline(upper_pct, color='purple', linestyle='--', linewidth=3, zorder=10)
                    plot_ax.set_xlim(x_min, x_max)
                    plot_ax.set_title(feature, fontsize=13, weight='bold')
                    plot_ax.set_xlabel("Value", fontsize=11)

                    # The second axes only hosts the table, so its frame and ticks are hidden
                    table_ax.axis('off')

                    table_data = [[
                        f"{row[LOWER_COL]:.2f}", f"{row[UPPER_COL]:.2f}",
                        f"{row['Count Below 0.5th']}", f"{row['Count Above 99.5th']}",
                        f"{row['Min Value Found']:.2f}", f"{row['Max Value Found']:.2f}"
                    ]]
                    col_labels = [
                        '0.5th Percentile', '99.5th Percentile',
                        'Count < 0.5th', 'Count > 99.5th',
                        'Min', 'Max'
                    ]
                    table = table_ax.table(
                        cellText=table_data,
                        colLabels=col_labels,
                        loc='center',
                        cellLoc='center',
                        colWidths=[0.15] * len(col_labels)
                    )
                    table.scale(1, 2.2)
                    # Row 0 is the header row: bold and slightly larger than the value row
                    for (row_i, col_i), cell in table.get_celld().items():
                        if row_i == 0:
                            cell.set_text_props(weight='bold', fontsize=14)
                        else:
                            cell.set_text_props(fontsize=13)

                # File name: parentheses are stripped from the suffix, chunk number is zero-padded
                suffix_safe = suffix.replace('(', '').replace(')', '')
                file_index = str(chunk_idx + 1).zfill(2)
                plot_filename = f"{dataset}_{suffix_safe}_adaptive_per_feature_table_{file_index}.png"

                fig.suptitle(f'Top Features {suffix} – {dataset} ({group.capitalize()} – Set {file_index})',
                             fontsize=15, weight='bold', y=1.035)

                # Figure-level legend explaining the two dashed threshold lines
                legend_elements = [
                    Patch(facecolor='red', edgecolor='red', linestyle='--', label='0.5th Percentile'),
                    Patch(facecolor='purple', edgecolor='purple', linestyle='--', label='99.5th Percentile')
                ]
                fig.legend(
                    handles=legend_elements,
                    loc='upper center',
                    fontsize=12,
                    ncol=2,
                    frameon=False,
                    bbox_to_anchor=(0.5, 1.06)
                )

                fig.subplots_adjust(top=0.95)
                fig.tight_layout()
                fig.savefig(os.path.join(dataset_dir, plot_filename), dpi=300, bbox_inches='tight')
                # Close this specific figure so figures do not accumulate across chunks
                plt.close(fig)

print("Plots for both survive and non-survive groups created successfully.")

# Test Field

In [None]:
# Load the item dictionary (d_items) from the MIMIC-IV icu folder.
# Forward slashes keep this relative path valid on every OS (the backslash form only
# resolves on Windows) and match the other dataset paths used in this notebook.
labevents_df = pd.read_csv("../00_Datasets/mimic-iv-3_1/icu/d_items.csv.gz")

In [None]:
# Render the loaded table. Note: despite its name, labevents_df holds the contents
# of d_items.csv.gz (see the read_csv call that defines it).
display(labevents_df)

In [None]:

# Boolean mask: label begins with "Arterial Blood Pressure mean".
# na=False makes rows with a missing label count as non-matching.
label_matches = labevents_df['label'].str.startswith('Arterial Blood Pressure mean', na=False)
abp_df = labevents_df.loc[label_matches]

# Render the matching rows
display(abp_df)


In [None]:
# Load the o04_icu_chartevent.csv export into temp_df for ad-hoc inspection.
temp_df = pd.read_csv(r"../01_MimicIV/CSV/Exports/o04_icu_chartevent.csv")

In [None]:
# Render the loaded export.
display(temp_df)

In [None]:
# Keep only the item id, numeric value and unit columns.
# NOTE(review): the following cell rebuilds filtered_df from temp_df, so this
# assignment is superseded when the cells run in order.
filtered_df = temp_df[['itemid', 'valuenum', 'valueuom']]

In [None]:
# Rows for itemid 220052 only, restricted to the id / value / unit columns.
# NOTE(review): 220052 is presumably the "Arterial Blood Pressure mean" item
# looked up in d_items earlier — confirm.
is_target_item = temp_df['itemid'] == 220052
filtered_df = temp_df.loc[is_target_item, ['itemid', 'valuenum', 'valueuom']]
display(filtered_df)

In [None]:
# Order the rows from the largest valuenum to the smallest.
filtered_df = filtered_df.sort_values('valuenum', ascending=False)

In [None]:
# Render filtered_df in its current (sorted) state.
display(filtered_df)

In [None]:
# Source: the gzip-compressed chartevents table
file_path = "../00_Datasets/mimic-iv-3_1/icu/chartevents.csv.gz"

# Rows read per chunk; tune to the available memory
chunksize = 1000000

# Iterate over the file chunk by chunk, loading only the columns needed here
chunk_reader = pd.read_csv(
    file_path,
    chunksize=chunksize,
    compression='gzip',
    usecols=['subject_id', 'itemid', 'valuenum', 'valueuom'],
)

# Collect, per chunk, the rows whose itemid is 220052
filtered_chunks = []
for chunk in chunk_reader:
    filtered_chunk = chunk.loc[chunk['itemid'] == 220052]
    filtered_chunks.append(filtered_chunk)

# Stitch the per-chunk pieces into a single frame with a fresh index
result_df = pd.concat(filtered_chunks, ignore_index=True)

# Quick look at the first rows
print(result_df.head())

# Optional: Save to CSV
# result_df.to_csv("filtered_chartevents_220052.csv", index=False)


In [None]:
# Largest valuenum first, then render the sorted frame
result_df = result_df.sort_values(by='valuenum', ascending=False)
display(result_df)

In [None]:
# Render result_df again; when the cells run in order this is the same sorted
# frame the previous cell already displayed.
display(result_df)

In [None]:
# Source: the gzip-compressed chartevents table
file_path = "../00_Datasets/mimic-iv-3_1/icu/chartevents.csv.gz"

# Rows read per chunk; tune to the system's RAM
chunksize = 1000000

# Iterate over the file chunk by chunk, loading only the columns needed here
chunk_reader = pd.read_csv(
    file_path,
    chunksize=chunksize,
    compression='gzip',
    usecols=['subject_id', 'itemid', 'value', 'valuenum', 'valueuom'],
)

# Collect, per chunk, the rows with a negative numeric value
negative_valuenum_chunks = []
for chunk in chunk_reader:
    # Discard rows that have no numeric value before comparing
    chunk = chunk.dropna(subset=['valuenum'])

    negative_chunk = chunk.loc[chunk['valuenum'] < 0]
    negative_valuenum_chunks.append(negative_chunk)

# Stitch the per-chunk pieces into a single frame with a fresh index.
# NOTE(review): this reuses the name result_df, replacing the frame built by the
# itemid == 220052 cell above.
result_df = pd.concat(negative_valuenum_chunks, ignore_index=True)

# Quick look at the first rows
print(result_df.head())

# Optional: Save to CSV
# result_df.to_csv("negative_valuenum_chartevents.csv", index=False)


In [None]:
# Smallest (most negative) valuenum first, then render the sorted frame
result_df = result_df.sort_values(by='valuenum', ascending=True)
display(result_df)