In [1]:
import pandas as pd
import numpy as np
import logging
import h5py # Save 3D tensor
import os

In [2]:
# Configure the root logger: INFO and above, timestamped records, sent both
# to the file "data_loading.log" and to a stream handler.
_log_format = '%(asctime)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.FileHandler("data_loading.log"),
    logging.StreamHandler(),
]
logging.basicConfig(level=logging.INFO, format=_log_format, handlers=_log_handlers)

In [3]:
path = "../CSV/exports/impute/o1_GAN/o01"

# Get all files in the directory. os.listdir() returns entries in arbitrary
# order, so sort them to keep the load order (and the log) reproducible.
all_files = sorted(os.listdir(path))

# Loaded frames, keyed by the variable name derived from each filename
dataframes = {}

# Load files one by one
for file in all_files:
    if file.endswith(".csv"):
        # Strip only the trailing extension (str.replace would also remove a
        # ".csv" that appears in the middle of the name), then swap dashes for
        # underscores so the stem can be used as a variable name.
        var_name = os.path.splitext(file)[0].replace("-", "_")
        logging.info(f"Loading... -> {file}")
        # Load the CSV into a pandas dataframe, cast to float32
        dataframes[var_name] = pd.read_csv(os.path.join(path, file)).astype('float32')

if not dataframes:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise FileNotFoundError(f"No .csv files found in {path!r}")

# Expose every frame as a notebook-level variable; later cells refer to the
# frames by these names (e.g. o1_X_train).
for var_name, df in dataframes.items():
    if not var_name.isidentifier():
        # Still stored in globals(), but not reachable with normal syntax.
        logging.warning(f"{var_name!r} is not a valid Python identifier; use dataframes[{var_name!r}] instead.")
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")

logging.info("All files loaded successfully.")

2024-12-16 22:21:40,932 - INFO - Loading... -> o1_X_external.csv
2024-12-16 22:21:49,756 - INFO - Loading... -> o1_X_test.csv
2024-12-16 22:21:50,380 - INFO - Loading... -> o1_X_train.csv
2024-12-16 22:21:55,232 - INFO - Loading... -> o1_X_validate.csv
2024-12-16 22:21:55,845 - INFO - Loading... -> o1_y_external_los.csv
2024-12-16 22:21:55,891 - INFO - Loading... -> o1_y_external_mortality.csv
2024-12-16 22:21:55,921 - INFO - Loading... -> o1_y_test_los.csv
2024-12-16 22:21:55,931 - INFO - Loading... -> o1_y_test_mortality.csv
2024-12-16 22:21:55,935 - INFO - Loading... -> o1_y_train_los.csv
2024-12-16 22:21:55,973 - INFO - Loading... -> o1_y_train_mortality.csv
2024-12-16 22:21:55,991 - INFO - Loading... -> o1_y_validate_los.csv
2024-12-16 22:21:56,000 - INFO - Loading... -> o1_y_validate_mortality.csv
2024-12-16 22:21:56,006 - INFO - Loading... -> o2_X_external.csv
2024-12-16 22:22:00,645 - INFO - Loading... -> o2_X_test.csv
2024-12-16 22:22:00,976 - INFO - Loading... -> o2_X_train.c

In [4]:
# Bring the o2/o3/o4 frames into the column order of the matching o1 frame.
# Selecting by the o1 column index reorders the columns and raises KeyError
# if one of the o1 columns is missing from the other frame.
logging.info(f"Aligning columns for all datasets.")

# Train split
train_columns = o1_X_train.columns
o2_train, o3_train, o4_train = (
    frame.loc[:, train_columns] for frame in (o2_X_train, o3_X_train, o4_X_train)
)

# Validation split
val_columnns = o1_X_validate.columns
o2_validate, o3_validate, o4_validate = (
    frame.loc[:, val_columnns] for frame in (o2_X_validate, o3_X_validate, o4_X_validate)
)

# Test split
test_columnns = o1_X_test.columns
o2_test, o3_test, o4_test = (
    frame.loc[:, test_columnns] for frame in (o2_X_test, o3_X_test, o4_X_test)
)

# External split
external_columnns = o1_X_external.columns
o2_external, o3_external, o4_external = (
    frame.loc[:, external_columnns] for frame in (o2_X_external, o3_X_external, o4_X_external)
)
logging.info(f"Aligning complete.")

2024-12-16 22:22:14,462 - INFO - Aligning columns for all datasets.
2024-12-16 22:22:14,623 - INFO - Aligning complete.


In [5]:
# Multiply rows to match the rows of the o1_ files.
def _repeat_rows(frame, factor, target_rows, label):
    """Repeat every row of ``frame`` ``factor`` times in place order.

    Args:
        frame: DataFrame to expand.
        factor: Number of consecutive copies made of each row.
        target_rows: Row count of the matching o1_ frame.
        label: Name of the frame, used in the error message.

    Returns:
        A DataFrame with a fresh 0..n-1 index. If ``frame`` already has
        ``target_rows`` rows it is returned without repeating, so re-running
        this cell does not multiply the rows a second time.

    Raises:
        ValueError: If repeating by ``factor`` would not yield ``target_rows`` rows.
    """
    if len(frame) == target_rows:
        # Already expanded (typically because the cell was run before).
        return frame.reset_index(drop=True)
    if len(frame) * factor != target_rows:
        raise ValueError(
            f"{label}: {len(frame)} rows x {factor} does not equal the {target_rows} rows of the o1_ frame"
        )
    return frame.loc[frame.index.repeat(factor)].reset_index(drop=True)


logging.info(f"Repeating train rows to match the number of rows in o1_train.")
o2_train = _repeat_rows(o2_train, 2, len(o1_X_train), "o2_train")
o3_train = _repeat_rows(o3_train, 3, len(o1_X_train), "o3_train")
o4_train = _repeat_rows(o4_train, 4, len(o1_X_train), "o4_train")
logging.info(f"Repeating Complete\n")

logging.info(f"Repeating validate rows to match the number of rows in o1_validate.")
o2_validate = _repeat_rows(o2_validate, 2, len(o1_X_validate), "o2_validate")
o3_validate = _repeat_rows(o3_validate, 3, len(o1_X_validate), "o3_validate")
o4_validate = _repeat_rows(o4_validate, 4, len(o1_X_validate), "o4_validate")
logging.info(f"Repeating Complete\n")

logging.info(f"Repeating test rows to match the number of rows in o1_test.")
o2_test = _repeat_rows(o2_test, 2, len(o1_X_test), "o2_test")
o3_test = _repeat_rows(o3_test, 3, len(o1_X_test), "o3_test")
o4_test = _repeat_rows(o4_test, 4, len(o1_X_test), "o4_test")
logging.info(f"Repeating Complete\n")

logging.info(f"Repeating external rows to match the number of rows in o1_external.")
o2_external = _repeat_rows(o2_external, 2, len(o1_X_external), "o2_external")
o3_external = _repeat_rows(o3_external, 3, len(o1_X_external), "o3_external")
o4_external = _repeat_rows(o4_external, 4, len(o1_X_external), "o4_external")
logging.info(f"Repeating Complete")

2024-12-16 22:22:14,639 - INFO - Repeating train rows to match the number of rows in o1_train.
2024-12-16 22:22:15,389 - INFO - Repeating Complete

2024-12-16 22:22:15,390 - INFO - Repeating validate rows to match the number of rows in o1_validate.
2024-12-16 22:22:15,493 - INFO - Repeating Complete

2024-12-16 22:22:15,493 - INFO - Repeating test rows to match the number of rows in o1_test.
2024-12-16 22:22:15,589 - INFO - Repeating Complete

2024-12-16 22:22:15,590 - INFO - Repeating external rows to match the number of rows in o1_external.
2024-12-16 22:22:17,165 - INFO - Repeating Complete


In [6]:
# Each split must have a single common row count across its four frames;
# collecting the lengths in a set leaves exactly one element when they agree.
if len({len(o1_X_train), len(o2_train), len(o3_train), len(o4_train)}) != 1:
    raise ValueError("The Train datasets do not have the same number of rows after alignment!")

if len({len(o1_X_validate), len(o2_validate), len(o3_validate), len(o4_validate)}) != 1:
    raise ValueError("The Validation datasets do not have the same number of rows after alignment!")

if len({len(o1_X_test), len(o2_test), len(o3_test), len(o4_test)}) != 1:
    raise ValueError("The Test datasets do not have the same number of rows after alignment!")

if len({len(o1_X_external), len(o2_external), len(o3_external), len(o4_external)}) != 1:
    raise ValueError("The External datasets do not have the same number of rows after alignment!")

In [7]:
# Build one 3D array per split: the four frames of a split are stacked along
# a new last axis, in the order o1, o2, o3, o4.
logging.info("Stacking datasets to create a 3D tensor.")
train_tensor = np.stack(
    [frame.values for frame in (o1_X_train, o2_train, o3_train, o4_train)],
    axis=-1,
)
validate_tensor = np.stack(
    [frame.values for frame in (o1_X_validate, o2_validate, o3_validate, o4_validate)],
    axis=-1,
)
test_tensor = np.stack(
    [frame.values for frame in (o1_X_test, o2_test, o3_test, o4_test)],
    axis=-1,
)
external_tensor = np.stack(
    [frame.values for frame in (o1_X_external, o2_external, o3_external, o4_external)],
    axis=-1,
)

# Report the resulting shapes
logging.info(f"Train 3D Tensor shape: {train_tensor.shape}")
logging.info(f"Validate 3D Tensor shape: {validate_tensor.shape}")
logging.info(f"Test 3D Tensor shape: {test_tensor.shape}")
logging.info(f"External 3D Tensor shape: {external_tensor.shape}")

2024-12-16 22:22:17,190 - INFO - Stacking datasets to create a 3D tensor.
2024-12-16 22:22:19,061 - INFO - Train 3D Tensor shape: (122496, 346, 4)
2024-12-16 22:22:19,062 - INFO - Validate 3D Tensor shape: (15312, 346, 4)
2024-12-16 22:22:19,063 - INFO - Test 3D Tensor shape: (15312, 346, 4)
2024-12-16 22:22:19,064 - INFO - External 3D Tensor shape: (234720, 346, 4)


In [9]:
save_path = '../CSV/exports/tensors/'
name = 'o1_3D_four_dataframe.h5'

# Make sure the output directory exists before opening the file
os.makedirs(save_path, exist_ok=True)

# Write the tensors and the o1 label frames into a single HDF5 file
# (mode 'w' replaces an existing file of the same name).
with h5py.File(os.path.join(save_path, name), 'w') as hf:
    for ds_name, ds_data in (
        # Tensors
        ('train_tensor', train_tensor),
        ('validate_tensor', validate_tensor),
        ('test_tensor', test_tensor),
        ('external_tensor', external_tensor),
        # Los Labels
        ('train_los_label', o1_y_train_los),
        ('validate_los_label', o1_y_validate_los),
        ('test_los_label', o1_y_test_los),
        ('external_los_label', o1_y_external_los),
        # Mortality Labels
        ('train_mortality_label', o1_y_train_mortality),
        ('validate_mortality_label', o1_y_validate_mortality),
        ('test_mortality_label', o1_y_test_mortality),
        ('external_mortality_label', o1_y_external_mortality),
    ):
        hf.create_dataset(ds_name, data=ds_data)

logging.info(f"Tensors saved to {save_path}{name}.")

2024-12-16 22:24:14,591 - INFO - Tensors saved to ../CSV/exports/tensors/o1_3D_four_dataframe.h5.


# Load Tensors

In [None]:
# Load tensors from the HDF5 file written by the save cell of this notebook.
# The path is spelled out (rather than reusing save_path/name) so this cell
# also works on a fresh kernel where the save cell has not been run.
# NOTE(review): this previously pointed at 'icu_tensors.h5', which nothing in
# this notebook writes; confirm no external file of that name was intended.
load_path = '../CSV/exports/tensors/o1_3D_four_dataframe.h5'

logging.info(f"Loading...")
with h5py.File(load_path, 'r') as hf:
    # [:] reads each dataset fully into memory as a NumPy array, so the
    # arrays stay usable after the file is closed.
    train_tensor = hf['train_tensor'][:]
    validate_tensor = hf['validate_tensor'][:]
    test_tensor = hf['test_tensor'][:]
    external_tensor = hf['external_tensor'][:]
    # los
    train_los_label = hf['train_los_label'][:]
    validate_los_label = hf['validate_los_label'][:]
    test_los_label = hf['test_los_label'][:]
    external_los_label = hf['external_los_label'][:]
    # mortality
    train_mortality_label = hf['train_mortality_label'][:]
    validate_mortality_label = hf['validate_mortality_label'][:]
    test_mortality_label = hf['test_mortality_label'][:]
    external_mortality_label = hf['external_mortality_label'][:]

# Report the loaded shapes for each split alongside its two label arrays
logging.info(f"Train: {train_tensor.shape}, Los Label: {train_los_label.shape}, Mortality Label: {train_mortality_label.shape}")
logging.info(f"Validate: {validate_tensor.shape}, Los Label: {validate_los_label.shape}, Mortality Label: {validate_mortality_label.shape}")
logging.info(f"Test: {test_tensor.shape}, Los Label: {test_los_label.shape}, Mortality Label: {test_mortality_label.shape}")
logging.info(f"External: {external_tensor.shape}, Los Label: {external_los_label.shape}, Mortality Label: {external_mortality_label.shape}")