# Imports

In [1]:
import os
import pandas as pd
import numpy as np
import logging

from sklearn.model_selection import train_test_split

In [2]:
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("data_loading.log"),
        logging.StreamHandler()
    ]
)

# Reads | Filter Patients (Phase 01)

In [3]:
for i in range(1, 5):
    # File paths
    eicu_file = f"../CSV/exports/whole_set/o{i}_hour_overlap_window_eicu.csv"
    mimic_file = f"../CSV/exports/whole_set/o{i}_hour_overlap_window_mimic.csv"
    
    # Variable names
    eicu_var_name = f"o{i}_eicu"
    mimic_var_name = f"o{i}_mimic"
    
    try:
        # Read MIMIC file and assign to a variable
        globals()[mimic_var_name] = pd.read_csv(mimic_file)
        logging.info(f"Successfully read {mimic_file} into variable {mimic_var_name}")
    except FileNotFoundError:
        logging.info(f"{mimic_file} not found.")
    except Exception as e:
        logging.info(f"An error occurred while reading {mimic_file}: {e}")
    
    try:
        # Read eICU file and assign to a variable
        globals()[eicu_var_name] = pd.read_csv(eicu_file)
        logging.info(f"Successfully read {eicu_file} into variable {eicu_var_name}")
    except FileNotFoundError:
        logging.info(f"{eicu_file} not found.")
    except Exception as e:
        logging.info(f"An error occurred while reading {eicu_file}: {e}")

2024-12-15 16:29:15,349 - INFO - Successfully read ../CSV/exports/whole_set/o1_hour_overlap_window_mimic.csv into variable o1_mimic
2024-12-15 16:29:22,483 - INFO - Successfully read ../CSV/exports/whole_set/o1_hour_overlap_window_eicu.csv into variable o1_eicu
2024-12-15 16:29:25,074 - INFO - Successfully read ../CSV/exports/whole_set/o2_hour_overlap_window_mimic.csv into variable o2_mimic
2024-12-15 16:29:28,630 - INFO - Successfully read ../CSV/exports/whole_set/o2_hour_overlap_window_eicu.csv into variable o2_eicu
2024-12-15 16:29:30,373 - INFO - Successfully read ../CSV/exports/whole_set/o3_hour_overlap_window_mimic.csv into variable o3_mimic
2024-12-15 16:29:32,711 - INFO - Successfully read ../CSV/exports/whole_set/o3_hour_overlap_window_eicu.csv into variable o3_eicu
2024-12-15 16:29:34,070 - INFO - Successfully read ../CSV/exports/whole_set/o4_hour_overlap_window_mimic.csv into variable o4_mimic
2024-12-15 16:29:35,870 - INFO - Successfully read ../CSV/exports/whole_set/o4_hou

In [None]:
# Making all the dataset to have the same rows by multiplication.
logging.info("Starting row multiplication for all mimic and eicu datasets.")

# Store mimic and eicu dataframes in separate dictionaries
mimic_dataframes = {
    "o2_mimic": o2_mimic,
    "o3_mimic": o3_mimic,
    "o4_mimic": o4_mimic,
}

eicu_dataframes = {
    "o2_eicu": o2_eicu,
    "o3_eicu": o3_eicu,
    "o4_eicu": o4_eicu,
}

# Multiply rows for mimic datasets
for i in range(2, 5):
    df_name = f"o{i}_mimic"
    logging.info(f"Processing mimic dataframe: {df_name}, multiplying rows by {i}.")
    mimic_dataframes[df_name] = mimic_dataframes[df_name].loc[mimic_dataframes[df_name].index.repeat(i)].reset_index(drop=True)
    logging.info(f"Completed multiplication for {df_name}. New row count: {len(mimic_dataframes[df_name])}.")

# Multiply rows for eicu datasets
for i in range(2, 5):
    df_name = f"o{i}_eicu"
    logging.info(f"Processing eicu dataframe: {df_name}, multiplying rows by {i}.")
    eicu_dataframes[df_name] = eicu_dataframes[df_name].loc[eicu_dataframes[df_name].index.repeat(i)].reset_index(drop=True)
    logging.info(f"Completed multiplication for {df_name}. New row count: {len(eicu_dataframes[df_name])}.")

# Access the modified mimic and eicu dataframes
o2_mimic = mimic_dataframes["o2_mimic"]
o3_mimic = mimic_dataframes["o3_mimic"]
o4_mimic = mimic_dataframes["o4_mimic"]

o2_eicu = eicu_dataframes["o2_eicu"]
o3_eicu = eicu_dataframes["o3_eicu"]
o4_eicu = eicu_dataframes["o4_eicu"]

# Logging the end of the process
logging.info("Row multiplication for all mimic and eicu datasets is complete.")

In [4]:
"""
I'm gonna concat and split the mimic and icu
at this point. I must create the same columns
from the tranformation of categorical data.
"""
# Store mimic and eicu datasets in dictionaries
mimic_dataframes = {
    "o1_mimic": o1_mimic,
    "o2_mimic": o2_mimic,
    "o3_mimic": o3_mimic,
    "o4_mimic": o4_mimic,
}

eicu_dataframes = {
    "o1_eicu": o1_eicu,
    "o2_eicu": o2_eicu,
    "o3_eicu": o3_eicu,
    "o4_eicu": o4_eicu,
}

# Loop through datasets to concatenate and split
combined_results = {}
for i in range(1, 5):
    mimic_df_name = f"o{i}_mimic"
    eicu_df_name = f"o{i}_eicu"
    
    logging.info(f"Processing datasets: {mimic_df_name} and {eicu_df_name}")
    
    # Get the mimic and eicu datasets
    mimic_df = mimic_dataframes[mimic_df_name]
    eicu_df = eicu_dataframes[eicu_df_name]
    
    # Get the row count of mimic dataset
    row_count = mimic_df.shape[0]
    logging.info(f"Row count of {mimic_df_name}: {row_count}")
    
    # Concatenate mimic and eicu datasets
    df_combined = pd.concat([mimic_df, eicu_df], ignore_index=True)
    logging.info(f"Combined dataset for {mimic_df_name} and {eicu_df_name} created.")
    
    # Find categorical columns and apply one-hot encoding
    categorical_columns = df_combined.select_dtypes(include=['object', 'category']).columns.tolist()
    df_encoded = pd.get_dummies(df_combined, columns=categorical_columns)
    logging.info(f"One-hot encoding applied to categorical columns for {mimic_df_name} and {eicu_df_name}.")
    
    # Split the encoded dataframe back into mimic and eicu datasets
    mimic_encoded = df_encoded.iloc[:row_count, :]
    eicu_encoded = df_encoded.iloc[row_count:, :]
    
    # Store the split results
    combined_results[mimic_df_name] = mimic_encoded
    combined_results[eicu_df_name] = eicu_encoded
    
    logging.info(f"Splitting completed for {mimic_df_name} and {eicu_df_name}.")

# Access the modified mimic and eicu dataframes
o1_mimic = combined_results["o1_mimic"]
o1_eicu = combined_results["o1_eicu"]
o2_mimic = combined_results["o2_mimic"]
o2_eicu = combined_results["o2_eicu"]
o3_mimic = combined_results["o3_mimic"]
o3_eicu = combined_results["o3_eicu"]
o4_mimic = combined_results["o4_mimic"]
o4_eicu = combined_results["o4_eicu"]

# Logging the end of the process
logging.info("All datasets have been processed successfully.")

2024-12-15 16:30:09,954 - INFO - Processing datasets: o1_mimic and o1_eicu
2024-12-15 16:30:09,955 - INFO - Row count of o1_mimic: 174432
2024-12-15 16:30:10,304 - INFO - Combined dataset for o1_mimic and o1_eicu created.
2024-12-15 16:30:11,900 - INFO - One-hot encoding applied to categorical columns for o1_mimic and o1_eicu.
2024-12-15 16:30:11,901 - INFO - Splitting completed for o1_mimic and o1_eicu.
2024-12-15 16:30:11,902 - INFO - Processing datasets: o2_mimic and o2_eicu
2024-12-15 16:30:11,903 - INFO - Row count of o2_mimic: 87216
2024-12-15 16:30:12,166 - INFO - Combined dataset for o2_mimic and o2_eicu created.
2024-12-15 16:30:12,997 - INFO - One-hot encoding applied to categorical columns for o2_mimic and o2_eicu.
2024-12-15 16:30:12,998 - INFO - Splitting completed for o2_mimic and o2_eicu.
2024-12-15 16:30:12,999 - INFO - Processing datasets: o3_mimic and o3_eicu
2024-12-15 16:30:13,000 - INFO - Row count of o3_mimic: 58144
2024-12-15 16:30:13,187 - INFO - Combined datase

In [5]:
day = 10

# Filter icu stay less than 10 days
o1_mimic = o1_mimic[o1_mimic['los'] < day]
o2_mimic = o2_mimic[o2_mimic['los'] < day]
o3_mimic = o3_mimic[o3_mimic['los'] < day]
o4_mimic = o4_mimic[o4_mimic['los'] < day]

# Filter icu stay less than 10 days
o1_eicu = o1_eicu[o1_eicu['los'] < day]
o2_eicu = o2_eicu[o2_eicu['los'] < day]
o3_eicu = o3_eicu[o3_eicu['los'] < day]
o4_eicu = o4_eicu[o4_eicu['los'] < day]

In [6]:
# Filter Time Zone

#time_zone = 16
#mimic_df = mimic_df[mimic_df['Time_Zone'] == time_zone]
#eicu_df = eicu_df[eicu_df['Time_Zone'] == time_zone]

# Split Training - Validation - Test Set (Phase 02)

In [7]:
# Mimic datasets dictionary
mimic_dataframes = {
    "o1_mimic": o1_mimic,
    "o2_mimic": o2_mimic,
    "o3_mimic": o3_mimic,
    "o4_mimic": o4_mimic,
}

# Parameters for splitting
total_test_val_perc = 0.2  # Total percentage for validation and test sets
split_between_test_val_perc = 0.5  # Percentage split between validation and test sets

# Splitting function
def split_mimic_data(mimic_df, total_test_val_perc, split_between_test_val_perc):
    # Step 1: Group by subject_id and hadm_id
    grouped_df = mimic_df.groupby(['subject_id', 'hadm_id'])
    patient_df = grouped_df['hospital_expire_flag'].first().reset_index()

    # Step 2: Perform stratified split
    train, temp = train_test_split(
        patient_df,
        test_size=total_test_val_perc,
        stratify=patient_df['hospital_expire_flag'],
        random_state=42
    )
    val, test = train_test_split(
        temp,
        test_size=split_between_test_val_perc,
        stratify=temp['hospital_expire_flag'],
        random_state=42
    )

    # Step 3: Merge back to original mimic_df
    train_df = mimic_df.merge(train[['subject_id', 'hadm_id']], on=['subject_id', 'hadm_id'], how='inner')
    val_df = mimic_df.merge(val[['subject_id', 'hadm_id']], on=['subject_id', 'hadm_id'], how='inner')
    test_df = mimic_df.merge(test[['subject_id', 'hadm_id']], on=['subject_id', 'hadm_id'], how='inner')

    return train_df, val_df, test_df

# Loop through mimic datasets and apply the split
split_results = {}
for name, df in mimic_dataframes.items():
    train_df, val_df, test_df = split_mimic_data(df, total_test_val_perc, split_between_test_val_perc)
    split_results[name] = {
        "train": train_df,
        "val": val_df,
        "test": test_df
    }
    # Print the sizes of splits
    logging.info(f"{name}:")
    logging.info(f"  Training set size: {train_df.shape[0]}")
    logging.info(f"  Validation set size: {val_df.shape[0]}")
    logging.info(f"  Test set size: {test_df.shape[0]}")

# Access the splits
o1_train = split_results["o1_mimic"]["train"]
o1_val = split_results["o1_mimic"]["val"]
o1_test = split_results["o1_mimic"]["test"]

o2_train = split_results["o2_mimic"]["train"]
o2_val = split_results["o2_mimic"]["val"]
o2_test = split_results["o2_mimic"]["test"]

o3_train = split_results["o3_mimic"]["train"]
o3_val = split_results["o3_mimic"]["val"]
o3_test = split_results["o3_mimic"]["test"]

o4_train = split_results["o4_mimic"]["train"]
o4_val = split_results["o4_mimic"]["val"]
o4_test = split_results["o4_mimic"]["test"]

2024-12-15 16:30:41,291 - INFO - o1_mimic:
2024-12-15 16:30:41,292 - INFO -   Training set size: 122496
2024-12-15 16:30:41,292 - INFO -   Validation set size: 15312
2024-12-15 16:30:41,294 - INFO -   Test set size: 15312
2024-12-15 16:30:41,513 - INFO - o2_mimic:
2024-12-15 16:30:41,514 - INFO -   Training set size: 61248
2024-12-15 16:30:41,515 - INFO -   Validation set size: 7656
2024-12-15 16:30:41,516 - INFO -   Test set size: 7656
2024-12-15 16:30:41,658 - INFO - o3_mimic:
2024-12-15 16:30:41,658 - INFO -   Training set size: 40832
2024-12-15 16:30:41,659 - INFO -   Validation set size: 5104
2024-12-15 16:30:41,660 - INFO -   Test set size: 5104
2024-12-15 16:30:41,767 - INFO - o4_mimic:
2024-12-15 16:30:41,768 - INFO -   Training set size: 30624
2024-12-15 16:30:41,769 - INFO -   Validation set size: 3828
2024-12-15 16:30:41,770 - INFO -   Test set size: 3828


# Check ratio and unique patients between sets (Phase 03)

In [11]:
# Count on Training set survive and non-survive
survival_counts = o2_train['hospital_expire_flag'].value_counts()
temp_survive = survival_counts.get(0, 0)/48
temp_non_survive = survival_counts.get(1, 0)/48

# Display the results
print(f'Train Set')
print(f'Survive: {temp_survive}')
print(f'Non-survive: {temp_non_survive}')

# Check if temp_non_survive is not zero to avoid division by zero
if temp_non_survive != 0:
    ratio = temp_survive / temp_non_survive
else:
    ratio = float('inf')  # Set ratio to infinity if there are no non-survivors

# Display the ratio
print(f'Ratio Train Set: {ratio:.2f}:1')

"""----------------------------"""

# Count on validation set survive and non-survive
survival_counts = o2_val['hospital_expire_flag'].value_counts()
temp_survive = survival_counts.get(0, 0)/48
temp_non_survive = survival_counts.get(1, 0)/48

# Display the results
print(f'\nValidation Set')
print(f'Survive: {temp_survive}')
print(f'Non-survive: {temp_non_survive}')

# Check if temp_non_survive is not zero to avoid division by zero
if temp_non_survive != 0:
    ratio = temp_survive / temp_non_survive
else:
    ratio = float('inf')  # Set ratio to infinity if there are no non-survivors

# Display the ratio
print(f'Ratio Train Set: {ratio:.2f}:1')

"""----------------------------"""

# Count on validation set survive and non-survive
survival_counts = o2_test['hospital_expire_flag'].value_counts()
temp_survive = survival_counts.get(0, 0)/48
temp_non_survive = survival_counts.get(1, 0)/48

# Display the results
print(f'\nTest Set')
print(f'Survive: {temp_survive}')
print(f'Non-survive: {temp_non_survive}')

# Check if temp_non_survive is not zero to avoid division by zero
if temp_non_survive != 0:
    ratio = temp_survive / temp_non_survive
else:
    ratio = float('inf')  # Set ratio to infinity if there are no non-survivors

# Display the ratio
print(f'Ratio Train Set: {ratio:.2f}:1')

Train Set
Survive: 1014.0
Non-survive: 262.0
Ratio Train Set: 3.87:1

Validation Set
Survive: 126.5
Non-survive: 33.0
Ratio Train Set: 3.83:1

Test Set
Survive: 127.0
Non-survive: 32.5
Ratio Train Set: 3.91:1


In [10]:
# Mine unique subject_id from sets
train_subjects = set(o2_train['subject_id'].unique())
val_subjects = set(o2_val['subject_id'].unique())
test_subjects = set(o2_test['subject_id'].unique())

# Check if there are overlaping subject_id
train_val_overlap = train_subjects.intersection(val_subjects)
train_test_overlap = train_subjects.intersection(test_subjects)
val_test_overlap = val_subjects.intersection(test_subjects)

# Display the results
print(f'Overlap between training and validation sets: {len(train_val_overlap)}')
print(f'Overlap between training and test sets: {len(train_test_overlap)}')
print(f'Overlap between validation and test sets: {len(val_test_overlap)}')

# print overlaping
if train_val_overlap:
    print(f'Subjects in both training and validation: {train_val_overlap}')
if train_test_overlap:
    print(f'Subjects in both training and test: {train_test_overlap}')
if val_test_overlap:
    print(f'Subjects in both validation and test: {val_test_overlap}')

Overlap between training and validation sets: 0
Overlap between training and test sets: 0
Overlap between validation and test sets: 0


# Split label from Train - Validation - Test Sets (Phase 04)

In [12]:
# External validation from eICU
o1_X_external = o1_eicu.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o1_y_external_los = o1_eicu['los']
o1_y_external_mortality = o1_eicu['hospital_expire_flag']

o2_X_external = o2_eicu.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o2_y_external_los = o2_eicu['los']
o2_y_external_mortality = o2_eicu['hospital_expire_flag']

o3_X_external = o3_eicu.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o3_y_external_los = o3_eicu['los']
o3_y_external_mortality = o3_eicu['hospital_expire_flag']

o4_X_external = o4_eicu.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o4_y_external_los = o4_eicu['los']
o4_y_external_mortality = o4_eicu['hospital_expire_flag']


# Separate features and target for the training, validation, and test sets
# Train
o1_X_train = o1_train.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o1_y_train_los = o1_train['los']
o1_y_train_mortality = o1_train['hospital_expire_flag']

o2_X_train = o2_train.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o2_y_train_los = o2_train['los']
o2_y_train_mortality = o2_train['hospital_expire_flag']

o3_X_train = o3_train.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o3_y_train_los = o3_train['los']
o3_y_train_mortality = o3_train['hospital_expire_flag']

o4_X_train = o4_train.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o4_y_train_los = o4_train['los']
o4_y_train_mortality = o4_train['hospital_expire_flag']

# Validation
o1_X_validate = o1_val.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o1_y_validate_los = o1_val['los']
o1_y_validate_mortality = o1_val['hospital_expire_flag']

o2_X_validate = o2_val.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o2_y_validate_los = o2_val['los']
o2_y_validate_mortality = o2_val['hospital_expire_flag']

o3_X_validate = o3_val.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o3_y_validate_los = o3_val['los']
o3_y_validate_mortality = o3_val['hospital_expire_flag']

o4_X_validate = o4_val.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o4_y_validate_los = o4_val['los']
o4_y_validate_mortality = o4_val['hospital_expire_flag']

# Test
o1_X_test = o1_test.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o1_y_test_los = o1_test['los']
o1_y_test_mortality = o1_test['hospital_expire_flag']

o2_X_test = o2_test.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o2_y_test_los = o2_test['los']
o2_y_test_mortality = o2_test['hospital_expire_flag']

o3_X_test = o3_test.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o3_y_test_los = o3_test['los']
o3_y_test_mortality = o3_test['hospital_expire_flag']

o4_X_test = o4_test.drop(columns=['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count'])
o4_y_test_los = o4_test['los']
o4_y_test_mortality = o4_test['hospital_expire_flag']

In [13]:
# Check if the specified columns have the same order across all dataframes
columns_to_check = ['subject_id', 'hadm_id']

# Extract the relevant columns from each dataframe
o1_subset = o1_train[columns_to_check]
o2_subset = o2_train[columns_to_check]
o3_subset = o3_train[columns_to_check]
o4_subset = o4_train[columns_to_check]

# Compare the order and content
same_order = (
    o1_subset.equals(o2_subset) and
    o1_subset.equals(o3_subset) and
    o1_subset.equals(o4_subset)
)

# Print the result
if same_order:
    print("The dataframes have the same order in the columns 'subject_id' and 'hadm_id'.")
else:
    print("The dataframes do NOT have the same order in the columns 'subject_id' and'hadm_id'.")

The dataframes do NOT have the same order in the columns 'subject_id' and'hadm_id'.


In [14]:
output_dir = "../CSV/exports/split_set"
os.makedirs(output_dir, exist_ok=True)
logging.info(f"Output directory set to: {output_dir}")

# Variables to save
variables_to_save = {
    "o1_X_external.csv": o1_X_external,
    "o1_y_external_los.csv": o1_y_external_los,
    "o1_y_external_mortality.csv": o1_y_external_mortality,
    "o2_X_external.csv": o2_X_external,
    "o2_y_external_los.csv": o2_y_external_los,
    "o2_y_external_mortality.csv": o2_y_external_mortality,
    "o3_X_external.csv": o3_X_external,
    "o3_y_external_los.csv": o3_y_external_los,
    "o3_y_external_mortality.csv": o3_y_external_mortality,
    "o4_X_external.csv": o4_X_external,
    "o4_y_external_los.csv": o4_y_external_los,
    "o4_y_external_mortality.csv": o4_y_external_mortality,
    "o1_X_train.csv": o1_X_train,
    "o1_y_train_los.csv": o1_y_train_los,
    "o1_y_train_mortality.csv": o1_y_train_mortality,
    "o2_X_train.csv": o2_X_train,
    "o2_y_train_los.csv": o2_y_train_los,
    "o2_y_train_mortality.csv": o2_y_train_mortality,
    "o3_X_train.csv": o3_X_train,
    "o3_y_train_los.csv": o3_y_train_los,
    "o3_y_train_mortality.csv": o3_y_train_mortality,
    "o4_X_train.csv": o4_X_train,
    "o4_y_train_los.csv": o4_y_train_los,
    "o4_y_train_mortality.csv": o4_y_train_mortality,
    "o1_X_validate.csv": o1_X_validate,
    "o1_y_validate_los.csv": o1_y_validate_los,
    "o1_y_validate_mortality.csv": o1_y_validate_mortality,
    "o2_X_validate.csv": o2_X_validate,
    "o2_y_validate_los.csv": o2_y_validate_los,
    "o2_y_validate_mortality.csv": o2_y_validate_mortality,
    "o3_X_validate.csv": o3_X_validate,
    "o3_y_validate_los.csv": o3_y_validate_los,
    "o3_y_validate_mortality.csv": o3_y_validate_mortality,
    "o4_X_validate.csv": o4_X_validate,
    "o4_y_validate_los.csv": o4_y_validate_los,
    "o4_y_validate_mortality.csv": o4_y_validate_mortality,
    "o1_X_test.csv": o1_X_test,
    "o1_y_test_los.csv": o1_y_test_los,
    "o1_y_test_mortality.csv": o1_y_test_mortality,
    "o2_X_test.csv": o2_X_test,
    "o2_y_test_los.csv": o2_y_test_los,
    "o2_y_test_mortality.csv": o2_y_test_mortality,
    "o3_X_test.csv": o3_X_test,
    "o3_y_test_los.csv": o3_y_test_los,
    "o3_y_test_mortality.csv": o3_y_test_mortality,
    "o4_X_test.csv": o4_X_test,
    "o4_y_test_los.csv": o4_y_test_los,
    "o4_y_test_mortality.csv": o4_y_test_mortality,
}

# Save each variable to its respective CSV file
for file_name, variable in variables_to_save.items():
    file_path = os.path.join(output_dir, file_name)
    variable.to_csv(file_path, index=False)
    logging.info(f"Saved {file_name} to {file_path}")

# Logging the end of the process
logging.info("All datasets have been processed successfully.")

2024-12-15 16:32:01,835 - INFO - Output directory set to: ../CSV/exports/split_set
2024-12-15 16:32:45,897 - INFO - Saved o1_X_external.csv to ../CSV/exports/split_set\o1_X_external.csv
2024-12-15 16:32:46,110 - INFO - Saved o1_y_external_los.csv to ../CSV/exports/split_set\o1_y_external_los.csv
2024-12-15 16:32:46,225 - INFO - Saved o1_y_external_mortality.csv to ../CSV/exports/split_set\o1_y_external_mortality.csv
2024-12-15 16:33:07,853 - INFO - Saved o2_X_external.csv to ../CSV/exports/split_set\o2_X_external.csv
2024-12-15 16:33:07,959 - INFO - Saved o2_y_external_los.csv to ../CSV/exports/split_set\o2_y_external_los.csv
2024-12-15 16:33:08,014 - INFO - Saved o2_y_external_mortality.csv to ../CSV/exports/split_set\o2_y_external_mortality.csv
2024-12-15 16:33:22,090 - INFO - Saved o3_X_external.csv to ../CSV/exports/split_set\o3_X_external.csv
2024-12-15 16:33:22,177 - INFO - Saved o3_y_external_los.csv to ../CSV/exports/split_set\o3_y_external_los.csv
2024-12-15 16:33:22,216 - INF