In [1]:
import pandas as pd
import numpy as np
import logging
import h5py # Save 3D tensor
import os

In [2]:
# Set up logging: INFO-and-above records go to a log file and to a stream handler.
# Note: logging.basicConfig() leaves the root logger untouched when it already
# has handlers, so re-running this cell does not reconfigure logging.
log_format = '%(asctime)s - %(levelname)s - %(message)s'
log_file_handler = logging.FileHandler("data_loading.log")  # opened relative to the working directory
stream_handler = logging.StreamHandler()  # default stream (sys.stderr)

logging.basicConfig(
    level=logging.INFO,
    format=log_format,
    handlers=[log_file_handler, stream_handler],
)

In [4]:
# Load the dataframes
# Every CSV lives in the same directory and is cast to float32 right after reading.
path = "../CSV/exports/impute/o1_GAN/o01"


def _read_float32(file_name):
    """Log which CSV is about to be read from `path` and return it as a float32 DataFrame."""
    logging.info(f"Loading... -> {file_name}")
    return pd.read_csv(f"{path}/{file_name}").astype('float32')


# --- Train split: four feature frames plus the o1 label frame ---
logging.info("Loading Train Sets")
o1_train = _read_float32("o1_X_train.csv")
o2_train = _read_float32("o2_X_train.csv")
o3_train = _read_float32("o3_X_train.csv")
o4_train = _read_float32("o4_X_train.csv")
o1_train_los_label = _read_float32("o1_y_train_los.csv")
logging.info("Train Load Complete\n")

# --- Validation split ---
logging.info("Loading Validation Sets")
o1_validate = _read_float32("o1_X_validate.csv")
o2_validate = _read_float32("o2_X_validate.csv")
o3_validate = _read_float32("o3_X_validate.csv")
o4_validate = _read_float32("o4_X_validate.csv")
o1_validate_los_label = _read_float32("o1_y_validate_los.csv")
logging.info("Validate Load Complete\n")

# --- Test split ---
logging.info("Loading Test Sets")
o1_test = _read_float32("o1_X_test.csv")
o2_test = _read_float32("o2_X_test.csv")
o3_test = _read_float32("o3_X_test.csv")
o4_test = _read_float32("o4_X_test.csv")
o1_test_los_label = _read_float32("o1_y_test_los.csv")
logging.info("Test Load Complete\n")

# --- External split ---
logging.info("Loading External Sets")
o1_external = _read_float32("o1_X_external.csv")
o2_external = _read_float32("o2_X_external.csv")
o3_external = _read_float32("o3_X_external.csv")
o4_external = _read_float32("o4_X_external.csv")
o1_external_los_label = _read_float32("o1_y_external_los.csv")
logging.info("External Load Complete")

2024-12-16 20:32:29,105 - INFO - Loading Train Sets
2024-12-16 20:32:29,106 - INFO - Loading... -> o1_X_train.csv
2024-12-16 20:32:33,883 - INFO - Loading... -> o2_X_train.csv
2024-12-16 20:32:36,172 - INFO - Loading... -> o3_X_train.csv
2024-12-16 20:32:37,709 - INFO - Loading... -> o4_X_train.csv
2024-12-16 20:32:38,863 - INFO - Loading... -> o1_y_train_los.csv
2024-12-16 20:32:38,904 - INFO - Train Load Complete

2024-12-16 20:32:38,906 - INFO - Loading Validation Sets
2024-12-16 20:32:38,906 - INFO - Loading... -> o1_X_validate.csv
2024-12-16 20:32:39,502 - INFO - Loading... -> o2_X_validate.csv
2024-12-16 20:32:39,805 - INFO - Loading... -> o3_X_validate.csv
2024-12-16 20:32:40,016 - INFO - Loading... -> o4_X_validate.csv
2024-12-16 20:32:40,177 - INFO - Loading... -> o1_y_validate_los.csv
2024-12-16 20:32:40,188 - INFO - Validate Load Complete

2024-12-16 20:32:40,189 - INFO - Loading Test Sets
2024-12-16 20:32:40,190 - INFO - Loading... -> o1_X_test.csv
2024-12-16 20:32:40,763 -

In [None]:
# Preview only the first rows of o4_external instead of rendering the whole frame.
display(o4_external.head())

In [5]:
# Ensure columns are aligned (same order and names)
logging.info(f"Aligning columns for all datasets.")

# o1_train defines the column order for EVERY frame, in every split.
# Aligning validate/test/external only against their own o1_* frame (as this
# cell previously did) keeps each split internally consistent but does not
# guarantee that the feature axis of those tensors matches the train tensor.
train_columns = o1_train.columns  # Column names in their reference order


def _align_to_train(frame, frame_name):
    """Return `frame` restricted to `train_columns`, in that order.

    Columns of `frame` that are not in `train_columns` are dropped, exactly as
    plain `frame[train_columns]` selection does.

    Args:
        frame: DataFrame to reorder.
        frame_name: Name used in the error message.

    Raises:
        KeyError: if `frame` lacks one or more of the columns in `train_columns`.
    """
    missing = [column for column in train_columns if column not in frame.columns]
    if missing:
        raise KeyError(f"{frame_name} is missing columns present in o1_train: {missing}")
    return frame[train_columns]


o2_train = _align_to_train(o2_train, "o2_train")
o3_train = _align_to_train(o3_train, "o3_train")
o4_train = _align_to_train(o4_train, "o4_train")

o1_validate = _align_to_train(o1_validate, "o1_validate")
o2_validate = _align_to_train(o2_validate, "o2_validate")
o3_validate = _align_to_train(o3_validate, "o3_validate")
o4_validate = _align_to_train(o4_validate, "o4_validate")

o1_test = _align_to_train(o1_test, "o1_test")
o2_test = _align_to_train(o2_test, "o2_test")
o3_test = _align_to_train(o3_test, "o3_test")
o4_test = _align_to_train(o4_test, "o4_test")

o1_external = _align_to_train(o1_external, "o1_external")
o2_external = _align_to_train(o2_external, "o2_external")
o3_external = _align_to_train(o3_external, "o3_external")
o4_external = _align_to_train(o4_external, "o4_external")
logging.info(f"Aligning complete.")

2024-12-16 20:34:19,900 - INFO - Aligning columns for all datasets.
2024-12-16 20:34:20,095 - INFO - Aligning complete.


In [6]:
# Repeat rows to match the row count of o1
# o2 rows are repeated 2x, o3 rows 3x and o4 rows 4x.
# NOTE: this cell rebinds the o2/o3/o4 names to the expanded frames, so running
# it a second time repeats the already-expanded rows again.


def _repeat_each_row(frame, times):
    """Return `frame` with each row repeated `times` times and a fresh 0..n-1 index."""
    repeated_labels = frame.index.repeat(times)
    return frame.loc[repeated_labels].reset_index(drop=True)


logging.info("Repeating train rows to match the number of rows in o1_train.")
o2_train = _repeat_each_row(o2_train, 2)
o3_train = _repeat_each_row(o3_train, 3)
o4_train = _repeat_each_row(o4_train, 4)
logging.info("Repeating Complete\n")

logging.info("Repeating validate rows to match the number of rows in o1_validate.")
o2_validate = _repeat_each_row(o2_validate, 2)
o3_validate = _repeat_each_row(o3_validate, 3)
o4_validate = _repeat_each_row(o4_validate, 4)
logging.info("Repeating Complete\n")

logging.info("Repeating test rows to match the number of rows in o1_test.")
o2_test = _repeat_each_row(o2_test, 2)
o3_test = _repeat_each_row(o3_test, 3)
o4_test = _repeat_each_row(o4_test, 4)
logging.info("Repeating Complete\n")

logging.info("Repeating external rows to match the number of rows in o1_external.")
o2_external = _repeat_each_row(o2_external, 2)
o3_external = _repeat_each_row(o3_external, 3)
o4_external = _repeat_each_row(o4_external, 4)
logging.info("Repeating Complete")

2024-12-16 20:34:20,978 - INFO - Repeating train rows to match the number of rows in o1_train.
2024-12-16 20:34:21,695 - INFO - Repeating Complete

2024-12-16 20:34:21,696 - INFO - Repeating validate rows to match the number of rows in o1_validate.
2024-12-16 20:34:21,812 - INFO - Repeating Complete

2024-12-16 20:34:21,813 - INFO - Repeating test rows to match the number of rows in o1_test.
2024-12-16 20:34:21,929 - INFO - Repeating Complete

2024-12-16 20:34:21,930 - INFO - Repeating external rows to match the number of rows in o1_external.
2024-12-16 20:34:23,532 - INFO - Repeating Complete


In [7]:
# Verify that, within each split, the four frames have the same number of rows.
# Keys are the split names used in the error messages.
split_frames = {
    "Train": (o1_train, o2_train, o3_train, o4_train),
    "Validation": (o1_validate, o2_validate, o3_validate, o4_validate),
    "Test": (o1_test, o2_test, o3_test, o4_test),
    "External": (o1_external, o2_external, o3_external, o4_external),
}

for split_label, frames in split_frames.items():
    distinct_row_counts = {len(frame) for frame in frames}
    if len(distinct_row_counts) != 1:
        raise ValueError(f"The {split_label} datasets do not have the same number of rows after alignment!")


def _stack_on_last_axis(frames):
    """Stack the frames' value arrays along a new trailing axis: (rows, columns, len(frames))."""
    return np.stack([frame.values for frame in frames], axis=-1)


# Create a 3D tensor per split by stacking its four dataframes
logging.info("Stacking datasets to create a 3D tensor.")
train_tensor = _stack_on_last_axis(split_frames["Train"])
validate_tensor = _stack_on_last_axis(split_frames["Validation"])
test_tensor = _stack_on_last_axis(split_frames["Test"])
external_tensor = _stack_on_last_axis(split_frames["External"])

for tensor_label, stacked in (
    ("Train", train_tensor),
    ("Validate", validate_tensor),
    ("Test", test_tensor),
    ("External", external_tensor),
):
    logging.info(f"{tensor_label} 3D Tensor shape: {stacked.shape}")

2024-12-16 20:34:24,035 - INFO - Stacking datasets to create a 3D tensor.
2024-12-16 20:34:26,059 - INFO - Train 3D Tensor shape: (122496, 346, 4)
2024-12-16 20:34:26,060 - INFO - Validate 3D Tensor shape: (15312, 346, 4)
2024-12-16 20:34:26,061 - INFO - Test 3D Tensor shape: (15312, 346, 4)
2024-12-16 20:34:26,062 - INFO - External 3D Tensor shape: (234720, 346, 4)


In [8]:
save_path = '../CSV/exports/tensors/'
os.makedirs(save_path, exist_ok=True)

# Everything that goes into the HDF5 file, keyed by dataset name.
# Tensors first, then the o1 label sets; datasets are created in this order.
datasets_to_write = {
    'train_tensor': train_tensor,
    'validate_tensor': validate_tensor,
    'test_tensor': test_tensor,
    'external_tensor': external_tensor,
    'o1_train_los_label': o1_train_los_label,
    'o1_validate_los_label': o1_validate_los_label,
    'o1_test_los_label': o1_test_los_label,
    'o1_external_los_label': o1_external_los_label,
}

# Mode 'w' creates the file, replacing any existing icu_tensors.h5
with h5py.File(os.path.join(save_path, 'icu_tensors.h5'), 'w') as hf:
    for dataset_name, payload in datasets_to_write.items():
        hf.create_dataset(dataset_name, data=payload)

# The message concatenates save_path and the file name directly, relying on
# save_path ending with a slash.
logging.info(f"Tensors saved to {save_path}icu_tensors.h5.")

2024-12-16 20:38:07,009 - INFO - Tensors saved to ../CSV/exports/tensors/icu_tensors.h5.


# Load Tensors

In [None]:
# Read the four tensors (no labels) back from the HDF5 file
load_path = '../CSV/exports/tensors/icu_tensors.h5'

logging.info("Loading...")
with h5py.File(load_path, 'r') as hf:
    # [:] reads each dataset fully into memory while the file is still open
    train_tensor, validate_tensor, test_tensor, external_tensor = [
        hf[dataset_name][:]
        for dataset_name in ('train_tensor', 'validate_tensor', 'test_tensor', 'external_tensor')
    ]

shape_summary = (
    f"Train: {train_tensor.shape}, Validate: {validate_tensor.shape}, "
    f"Test: {test_tensor.shape}, External: {external_tensor.shape}"
)
logging.info(shape_summary)


In [9]:
# Read the tensors and the o1 label sets back from the HDF5 file.
# This rebinds the tensor and label names to the arrays stored in the file.
load_path = '../CSV/exports/tensors/icu_tensors.h5'

with h5py.File(load_path, 'r') as hf:
    # [:] reads each dataset fully into memory while the file is still open
    train_tensor, validate_tensor, test_tensor, external_tensor = [
        hf[dataset_name][:]
        for dataset_name in ('train_tensor', 'validate_tensor', 'test_tensor', 'external_tensor')
    ]
    o1_train_los_label, o1_validate_los_label, o1_test_los_label, o1_external_los_label = [
        hf[dataset_name][:]
        for dataset_name in (
            'o1_train_los_label',
            'o1_validate_los_label',
            'o1_test_los_label',
            'o1_external_los_label',
        )
    ]

# One summary line per split: tensor shape next to its label shape
print("Tensors and labels loaded. Shapes:")
for split_label, split_tensor, split_labels in (
    ("Train", train_tensor, o1_train_los_label),
    ("Validate", validate_tensor, o1_validate_los_label),
    ("Test", test_tensor, o1_test_los_label),
    ("External", external_tensor, o1_external_los_label),
):
    print(f"{split_label} Tensor: {split_tensor.shape}, {split_label} Labels: {split_labels.shape}")


Tensors and labels loaded. Shapes:
Train Tensor: (122496, 346, 4), Train Labels: (122496, 1)
Validate Tensor: (15312, 346, 4), Validate Labels: (15312, 1)
Test Tensor: (15312, 346, 4), Test Labels: (15312, 1)
External Tensor: (234720, 346, 4), External Labels: (234720, 1)
