# ICU Data Loading and Cube Preparation Script

In [1]:
"""
This script automates the pipeline for loading, reshaping, validating, and preparing 
clinical ICU data for deep learning models (ANN, RNN, etc.) that predict 
**Length of Stay (LOS)** and **Mortality**.

Outputs:
----------------------------------------------------------------------------------------------------
- numpy_cubes/train.npz + train.json
- numpy_cubes/validate.npz + validate.json
- numpy_cubes/test.npz + test.json
- numpy_cubes/external.npz + external.json

Quick Peek Example:
----------------------------------------------------------------------------------------------------
Batch X: (32, 48, 345, 4), Batch y: (32,)

====================================================================================================
"""



In [2]:
import os
import pandas as pd
import numpy as np
import logging
import math
import json

In [3]:
# Configure the root logger: INFO and above, timestamped, written both to
# the file "data_loading.log" and to the notebook's output stream.
logging.basicConfig(
    handlers=[logging.FileHandler("data_loading.log"), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [4]:
path = "CSV/Imports/original"

# Get all files in the directory. os.listdir() returns entries in arbitrary
# order, so sort them to make the load order and the log reproducible.
all_files = sorted(os.listdir(path))

# Store dictionary: variable name (e.g. "o1_X_train") -> float32 DataFrame
dataframes = {}

# Load files one by one
for file in all_files:
    if not file.endswith(".csv"):
        continue
    # Create a variable name from the filename: strip only the trailing
    # extension (str.replace would also remove ".csv" inside the name) and
    # turn hyphens into underscores.
    var_name = os.path.splitext(file)[0].replace("-", "_")
    logging.info(f"Loading... -> {file}")
    # Load the CSV into a pandas dataframe; every column is cast to float32
    dataframes[var_name] = pd.read_csv(os.path.join(path, file)).astype('float32')

# Fail here with a clear message instead of a NameError in a later cell
if not dataframes:
    raise FileNotFoundError(f"No .csv files found in {path!r}")

# Check that they are loaded
for var_name, df in dataframes.items():
    globals()[var_name] = df  # later cells look the frames up by name via globals()
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")

logging.info("Load Complete.")

2025-09-06 12:53:00,166 - INFO - Loading... -> o1_X_external.csv
2025-09-06 12:53:08,444 - INFO - Loading... -> o1_X_test.csv
2025-09-06 12:53:09,073 - INFO - Loading... -> o1_X_train.csv
2025-09-06 12:53:13,777 - INFO - Loading... -> o1_X_validate.csv
2025-09-06 12:53:14,343 - INFO - Loading... -> o1_y_external_los.csv
2025-09-06 12:53:14,397 - INFO - Loading... -> o1_y_external_mortality.csv
2025-09-06 12:53:14,424 - INFO - Loading... -> o1_y_test_los.csv
2025-09-06 12:53:14,433 - INFO - Loading... -> o1_y_test_mortality.csv
2025-09-06 12:53:14,433 - INFO - Loading... -> o1_y_train_los.csv
2025-09-06 12:53:14,478 - INFO - Loading... -> o1_y_train_mortality.csv
2025-09-06 12:53:14,496 - INFO - Loading... -> o1_y_validate_los.csv
2025-09-06 12:53:14,505 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-09-06 12:53:14,514 - INFO - Loading... -> o2_X_external.csv
2025-09-06 12:53:18,571 - INFO - Loading... -> o2_X_test.csv
2025-09-06 12:53:18,905 - INFO - Loading... -> o2_X_train.c

In [5]:
# --- Config ---
# Rows per patient block in each window's CSV, and the repeat factor applied
# to that window's time axis (steps * factor == 48 for every window).
T_STEPS = dict(o1=48, o2=24, o3=16, o4=12)
UPSAMPLE = dict(o1=1, o2=2, o3=3, o4=4)
WINDOWS = list(T_STEPS)
SPLITS = 'train validate test external'.split()
# Expected number of feature columns in every X table
FEATURES = 345

# Save the feature names from one window (all are identical)
FEATURE_NAMES = o1_X_train.columns.to_list()
logging.info(f"Number of features: {len(FEATURE_NAMES)}")

# --- Helpers ---
def _reshape_X(dfX, t_steps, features=FEATURES):
    """Fold a stacked 2-D feature table into a 3-D float32 array.

    Each consecutive run of ``t_steps`` rows becomes one slice along the
    first axis, giving shape ``(rows // t_steps, t_steps, features)``.

    Raises
    ------
    AssertionError
        If the column count differs from ``features`` or the row count is
        not a multiple of ``t_steps``.
    """
    n_rows, n_cols = dfX.shape
    assert n_cols == features, f"Expected {features}, got {n_cols}"
    n_blocks, leftover = divmod(n_rows, t_steps)
    assert leftover == 0, f"Rows {n_rows} not divisible by {t_steps}"
    flat = dfX.to_numpy(dtype=np.float32)
    return flat.reshape(n_blocks, t_steps, features)

def _reshape_y(dfY, t_steps):
    y = dfY.to_numpy(dtype=np.float32).reshape(-1, t_steps, 1)
    # verify consistency
    if not np.allclose(y, y[:, :1, :]):
        logging.warning("Some patients have non-constant target within block. Using first value.")
    return y[:, 0, 0]

def _upsample_time(x3d, factor):
    if factor == 1:
        return x3d
    return np.repeat(x3d, repeats=factor, axis=1)

def build_split_cubes(split):
    """Assemble the 4-D feature cube and the label vectors for one split.

    For every window ``w`` in ``WINDOWS`` the module-level DataFrames
    ``{w}_X_{split}``, ``{w}_y_{split}_los`` and ``{w}_y_{split}_mortality``
    are looked up via ``globals()``, reshaped per patient, repeated along
    the time axis by ``UPSAMPLE[w]`` and finally stacked on a new last axis.

    Parameters
    ----------
    split : str
        Split name used to build the variable names (e.g. an entry of ``SPLITS``).

    Returns
    -------
    dict
        ``"X"``: float32 array of shape (n_patients, time, features, n_windows);
        ``"y_los"`` / ``"y_mort"``: float32 vectors of length n_patients,
        taken from the first window; ``"feature_names"``: ``FEATURE_NAMES``.

    Raises
    ------
    KeyError
        If an expected DataFrame is not defined at module level.
    ValueError
        If a window's columns differ in name or order from ``FEATURE_NAMES``.
    AssertionError
        If patient counts or labels disagree between windows (or a reshape
        precondition in the helpers fails).
    """
    X_by_window = []
    y_los = y_mort = None
    n_patients_ref = None

    for w in WINDOWS:
        Xdf = globals()[f"{w}_X_{split}"]
        Ylos = globals()[f"{w}_y_{split}_los"]
        Ymor = globals()[f"{w}_y_{split}_mortality"]

        # Windows are stacked feature-by-feature on the last axis, so column i
        # must be the same feature in every window; verify before stacking.
        if Xdf.columns.tolist() != FEATURE_NAMES:
            raise ValueError(f"{w}_X_{split}: feature names/order differ from FEATURE_NAMES")

        Xw = _reshape_X(Xdf, T_STEPS[w], FEATURES)
        yl = _reshape_y(Ylos, T_STEPS[w])
        ym = _reshape_y(Ymor, T_STEPS[w])

        Xw48 = _upsample_time(Xw, UPSAMPLE[w])  # (N,48,F)

        if n_patients_ref is None:
            # The first window provides the reference patient count and labels
            n_patients_ref = Xw48.shape[0]
            y_los, y_mort = yl, ym
        else:
            assert Xw48.shape[0] == n_patients_ref, "Patient count mismatch"
            assert np.allclose(yl, y_los), "LOS mismatch"
            assert np.allclose(ym, y_mort), "Mortality mismatch"

        X_by_window.append(Xw48)

    # np.stack adds the window axis: (N,48,F) x n_windows -> (N,48,F,n_windows).
    # The inputs are already float32, so copy=False avoids duplicating the
    # whole cube (the largest split is over 1 GB) just to re-assert the dtype.
    X_cube = np.stack(X_by_window, axis=-1).astype(np.float32, copy=False)

    mb = X_cube.nbytes / (1024**2)
    logging.info(f"{split}: X_cube {X_cube.shape}, ~{mb:.1f} MB, y_los {y_los.shape}, y_mort {y_mort.shape}")

    return {
        "X": X_cube,
        # astype copies here, so the returned label vectors are compact arrays
        # rather than views into the larger reshaped target tables.
        "y_los": y_los.astype(np.float32),
        "y_mort": y_mort.astype(np.float32),
        "feature_names": FEATURE_NAMES
    }

# --- Build all splits ---
# One bundle (dict with "X", "y_los", "y_mort", "feature_names") per split.
# Each call logs the cube shape and approximate size. All four bundles stay
# bound to module-level names, so their cubes are held in memory together.
train_data = build_split_cubes('train')
val_data   = build_split_cubes('validate')
test_data  = build_split_cubes('test')
ext_data   = build_split_cubes('external')

2025-09-06 12:53:29,680 - INFO - Number of features: 345
2025-09-06 12:53:30,868 - INFO - train: X_cube (2552, 48, 345, 4), ~644.9 MB, y_los (2552,), y_mort (2552,)
2025-09-06 12:53:31,051 - INFO - validate: X_cube (319, 48, 345, 4), ~80.6 MB, y_los (319,), y_mort (319,)
2025-09-06 12:53:31,195 - INFO - test: X_cube (319, 48, 345, 4), ~80.6 MB, y_los (319,), y_mort (319,)
2025-09-06 12:53:34,046 - INFO - external: X_cube (4890, 48, 345, 4), ~1235.6 MB, y_los (4890,), y_mort (4890,)


In [6]:
# 1) Sanity checks on features across windows/splits
def check_feature_consistency():
    """Verify that every window/split X table has identical columns.

    The column list of ``o1_X_train`` is the reference; each module-level
    DataFrame ``{w}_X_{split}`` for ``w`` in ``WINDOWS`` and ``split`` in
    ``SPLITS`` must have the same names in the same order.

    Raises
    ------
    ValueError
        Listing every ``(window, split)`` pair whose columns differ.
    """
    ref = o1_X_train.columns.tolist()
    # Iterate the shared config constants rather than re-typing the lists,
    # so this check cannot drift from what the cube builder actually loads.
    problems = [
        (w, split)
        for w in WINDOWS
        for split in SPLITS
        if globals()[f"{w}_X_{split}"].columns.tolist() != ref
    ]
    if problems:
        raise ValueError(f"Feature name/order mismatch in: {problems}")
    print("Feature names/order are identical across all windows & splits.")

check_feature_consistency()

# 2) Persist cubes + metadata to disk (compressed)
os.makedirs("numpy_cubes", exist_ok=True)

def save_split(split_name, data_bundle, cube_dir="numpy_cubes"):
    """Write one split to ``<cube_dir>/<split_name>.npz`` plus a ``.json`` sidecar.

    Parameters
    ----------
    split_name : str
        Base name of the two output files.
    data_bundle : dict
        Mapping with keys ``"X"`` (4-D array), ``"y_los"``, ``"y_mort"`` and
        ``"feature_names"``, as returned by ``build_split_cubes()``.
    cube_dir : str, optional
        Output directory; created if it does not exist.

    Raises
    ------
    ValueError
        If ``X`` is not 4-dimensional.
    KeyError
        If ``data_bundle`` lacks one of the expected keys.
    """
    X = data_bundle["X"]
    y_los = data_bundle["y_los"]
    y_mort = data_bundle["y_mort"]
    feature_names = data_bundle["feature_names"]

    if X.ndim != 4:
        raise ValueError(f"Expected a 4-D cube for split {split_name!r}, got shape {X.shape}")

    # Build the metadata before touching the disk so that a malformed bundle
    # cannot leave an .npz behind without its .json sidecar.
    meta = {
        "split": split_name,
        "shape": tuple(int(v) for v in X.shape),
        "dtype": str(X.dtype),
        "n_patients": int(X.shape[0]),
        "timesteps": int(X.shape[1]),
        "n_features": int(X.shape[2]),
        "n_windows": int(X.shape[3]),
        "feature_names": list(feature_names)
    }

    # Do not rely on the directory having been created elsewhere
    os.makedirs(cube_dir, exist_ok=True)

    np.savez_compressed(os.path.join(cube_dir, f"{split_name}.npz"),
                        X=X, y_los=y_los, y_mort=y_mort)
    with open(os.path.join(cube_dir, f"{split_name}.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

save_split("train",   train_data)
save_split("validate",val_data)
save_split("test",    test_data)
save_split("external",ext_data)

print("Saved: numpy_cubes/{train,validate,test,external}.npz + .json")

# 3) Build a TensorFlow dataset for ANN training (note: each split's full cube is loaded into memory)
import tensorflow as tf

def load_split_npz(split_name, cube_dir="numpy_cubes"):
    """Load the arrays of one saved split from ``<cube_dir>/<split_name>.npz``.

    Parameters
    ----------
    split_name : str
        Base name of the ``.npz`` file (without extension).
    cube_dir : str, optional
        Directory that holds the saved cubes.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y_los, y_mort)``, fully read into memory.
    """
    # np.load returns an NpzFile that keeps the archive open and reads each
    # member on access; read all three inside the context manager so the
    # file handle is closed before returning.
    with np.load(os.path.join(cube_dir, f"{split_name}.npz")) as pack:
        return pack["X"], pack["y_los"], pack["y_mort"]

def make_tf_dataset(split_name, target="los", batch_size=32, shuffle=True, seed=None):
    """Build a batched ``tf.data.Dataset`` of ``(X, y)`` pairs for one saved split.

    The split is read with ``load_split_npz`` and passed to
    ``from_tensor_slices``, so the whole cube is held in memory.

    Parameters
    ----------
    split_name : str
        Name of the saved split to load.
    target : str, optional
        ``"los"`` for the length-of-stay labels, ``"mort"`` or ``"mortality"``
        for the mortality labels.
    batch_size : int, optional
        Number of samples per batch.
    shuffle : bool, optional
        Shuffle samples (reshuffled on every iteration) before batching.
    seed : int or None, optional
        Seed forwarded to ``Dataset.shuffle`` for reproducible ordering.

    Raises
    ------
    ValueError
        If ``target`` is not one of the accepted names.
    """
    X, y_los, y_mort = load_split_npz(split_name)

    # Resolve the label vector explicitly: a misspelt target must not
    # silently fall through to the mortality labels.
    labels_by_target = {"los": y_los, "mort": y_mort, "mortality": y_mort}
    if target not in labels_by_target:
        raise ValueError(
            f"Unknown target {target!r}; expected one of {sorted(labels_by_target)}"
        )
    y = labels_by_target[target]

    ds = tf.data.Dataset.from_tensor_slices((X, y))
    if shuffle:
        ds = ds.shuffle(
            buffer_size=min(8192, X.shape[0]),
            seed=seed,
            reshuffle_each_iteration=True,
        )
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

# Example:
train_ds_los = make_tf_dataset("train", target="los", batch_size=32, shuffle=True)
val_ds_los   = make_tf_dataset("validate", target="los", batch_size=32, shuffle=False)

# (Optional) quick peek to ensure shapes flow into the model:
xb, yb = next(iter(train_ds_los))
print("Batch X:", xb.shape, "Batch y:", yb.shape)  # -> (B,48,345,4), (B,)

Feature names/order are identical across all windows & splits.
Saved: numpy_cubes/{train,validate,test,external}.npz + .json
Batch X: (32, 48, 345, 4) Batch y: (32,)
