In [1]:
import pandas as pd
import numpy as np
import os
import logging

In [2]:
# Set up logging: INFO-level records are written both to the file
# "data_loading.log" and to a stream handler (shown in the notebook output).
log_handlers = [
    logging.FileHandler("data_loading.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=log_handlers,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [3]:
# Directory that holds the CSV exports to load.
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
# os.listdir() makes no ordering guarantee; sort so that the load order, the
# log output and the iteration order of `dataframes` are identical on every run.
all_files = sorted(os.listdir(data_path))

# Load every CSV into a dictionary of float32 dataframes. The key is the file
# name without its extension, with dashes replaced by underscores.
dataframes = {}
for file in all_files:
    if not file.endswith(".csv"):
        continue
    # Strip only the trailing extension; str.replace(".csv", "") would also
    # remove a ".csv" that happens to occur in the middle of a file name.
    var_name = file[:-len(".csv")].replace("-", "_")
    logging.info(f"Loading... -> {file}")
    dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Make an empty/mis-typed directory visible instead of silently loading nothing.
if not dataframes:
    logging.warning(f"No CSV files found in {data_path}")

# Publish each dataframe as a notebook-level variable (a later cell refers to
# names such as o1_X_train directly) and log its shape.
for var_name, df in dataframes.items():
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")

2025-03-23 00:01:55,220 - INFO - Loading... -> o1_X_external.csv
2025-03-23 00:02:02,381 - INFO - Loading... -> o1_X_test.csv
2025-03-23 00:02:02,930 - INFO - Loading... -> o1_X_train.csv
2025-03-23 00:02:07,032 - INFO - Loading... -> o1_X_validate.csv
2025-03-23 00:02:07,531 - INFO - Loading... -> o1_y_external_los.csv
2025-03-23 00:02:07,574 - INFO - Loading... -> o1_y_external_mortality.csv
2025-03-23 00:02:07,603 - INFO - Loading... -> o1_y_test_los.csv
2025-03-23 00:02:07,612 - INFO - Loading... -> o1_y_test_mortality.csv
2025-03-23 00:02:07,617 - INFO - Loading... -> o1_y_train_los.csv
2025-03-23 00:02:07,654 - INFO - Loading... -> o1_y_train_mortality.csv
2025-03-23 00:02:07,672 - INFO - Loading... -> o1_y_validate_los.csv
2025-03-23 00:02:07,681 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-03-23 00:02:07,686 - INFO - Loading... -> o2_X_external.csv
2025-03-23 00:02:11,190 - INFO - Loading... -> o2_X_test.csv
2025-03-23 00:02:11,456 - INFO - Loading... -> o2_X_train.c

In [4]:
# Log, for every loaded dataset, how many columns it has of each dtype.
for name, df in dataframes.items():
    dtype_counts = df.dtypes.value_counts()
    logging.info(f"{name} types:\n{dtype_counts}")

2025-03-23 00:02:20,996 - INFO - o1_X_external types:
float32    345
Name: count, dtype: int64
2025-03-23 00:02:20,999 - INFO - o1_X_test types:
float32    345
Name: count, dtype: int64
2025-03-23 00:02:21,001 - INFO - o1_X_train types:
float32    345
Name: count, dtype: int64
2025-03-23 00:02:21,005 - INFO - o1_X_validate types:
float32    345
Name: count, dtype: int64
2025-03-23 00:02:21,008 - INFO - o1_y_external_los types:
float32    1
Name: count, dtype: int64
2025-03-23 00:02:21,010 - INFO - o1_y_external_mortality types:
float32    1
Name: count, dtype: int64
2025-03-23 00:02:21,012 - INFO - o1_y_test_los types:
float32    1
Name: count, dtype: int64
2025-03-23 00:02:21,014 - INFO - o1_y_test_mortality types:
float32    1
Name: count, dtype: int64
2025-03-23 00:02:21,017 - INFO - o1_y_train_los types:
float32    1
Name: count, dtype: int64
2025-03-23 00:02:21,019 - INFO - o1_y_train_mortality types:
float32    1
Name: count, dtype: int64
2025-03-23 00:02:21,020 - INFO - o1_y_val

In [5]:
# Log, for every dataset, the total number of missing cells and the number of
# columns that contain at least one missing value.
for name, df in dataframes.items():
    null_mask = df.isnull()
    column_has_null = null_mask.any()
    missing_total = null_mask.sum().sum()
    missing_cols = df.columns[column_has_null].tolist()
    logging.info(f"{name}: {missing_total} missing values across {len(missing_cols)} columns")

2025-03-23 00:02:21,280 - INFO - o1_X_external: 39527616 missing values across 308 columns
2025-03-23 00:02:21,295 - INFO - o1_X_test: 1985664 missing values across 292 columns
2025-03-23 00:02:21,385 - INFO - o1_X_train: 15876288 missing values across 304 columns
2025-03-23 00:02:21,399 - INFO - o1_X_validate: 2007696 missing values across 296 columns
2025-03-23 00:02:21,402 - INFO - o1_y_external_los: 0 missing values across 0 columns
2025-03-23 00:02:21,404 - INFO - o1_y_external_mortality: 0 missing values across 0 columns
2025-03-23 00:02:21,406 - INFO - o1_y_test_los: 0 missing values across 0 columns
2025-03-23 00:02:21,408 - INFO - o1_y_test_mortality: 0 missing values across 0 columns
2025-03-23 00:02:21,410 - INFO - o1_y_train_los: 0 missing values across 0 columns
2025-03-23 00:02:21,412 - INFO - o1_y_train_mortality: 0 missing values across 0 columns
2025-03-23 00:02:21,414 - INFO - o1_y_validate_los: 0 missing values across 0 columns
2025-03-23 00:02:21,416 - INFO - o1_y_v

In [6]:
# For each dataset whose name contains 'y_', log the number of distinct
# values found in every column.
for name, df in dataframes.items():
    if 'y_' not in name:
        continue
    unique_vals = df.nunique()
    distinct_by_column = unique_vals.to_dict()
    logging.info(f"{name} unique target values: {distinct_by_column}")

2025-03-23 00:02:21,810 - INFO - o1_y_external_los unique target values: {'los': 830}
2025-03-23 00:02:21,815 - INFO - o1_y_external_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 00:02:21,817 - INFO - o1_y_test_los unique target values: {'los': 319}
2025-03-23 00:02:21,820 - INFO - o1_y_test_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 00:02:21,823 - INFO - o1_y_train_los unique target values: {'los': 2548}
2025-03-23 00:02:21,826 - INFO - o1_y_train_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 00:02:21,828 - INFO - o1_y_validate_los unique target values: {'los': 319}
2025-03-23 00:02:21,829 - INFO - o1_y_validate_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 00:02:21,832 - INFO - o2_y_external_los unique target values: {'los': 830}
2025-03-23 00:02:21,835 - INFO - o2_y_external_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 00:02:21,837 - INFO - o2_y_test_los u

In [7]:
# Build one summary record per dataset (shape plus missing-value statistics)
# and export the resulting table as a CSV file.
summary = []
for name, df in dataframes.items():
    null_mask = df.isnull()  # computed once, reused for both statistics below
    missing_values = null_mask.sum().sum()
    missing_cols = null_mask.any().sum()
    total_cells = df.shape[0] * df.shape[1]
    summary.append({
        'Dataset': name,
        'Shape': df.shape,
        'Total Missing Values': missing_values,
        # NOTE(review): "Accross" is misspelled, but this key becomes a column
        # header of the exported CSV; kept as-is so existing readers of that
        # file are not broken.
        'Accross Missing Columns': missing_cols,
        # The ratio is undefined for a dataset without cells; report NaN
        # instead of dividing by zero.
        'Missing %': 100 * missing_values / total_cells if total_cells else float('nan')
    })

summary_df = pd.DataFrame(summary)

# DataFrame.to_csv does not create missing directories, so make sure the
# export folder exists before writing.
summary_path = "CSV/exports/01_dataset_summary.csv"
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
summary_df.to_csv(summary_path, index=False)

In [8]:
# Threshold (in percent): a row is counted when at most this share of its
# columns is missing.
percent = 7

# Training feature sets to inspect, keyed by the name used in the report.
datasets = dict(
    o1_X_train=o1_X_train,
    o2_X_train=o2_X_train,
    o3_X_train=o3_X_train,
    o4_X_train=o4_X_train,
)

# For each dataset, count the rows whose missing-value percentage does not
# exceed `percent` and print that count alongside the dataset dimensions.
for name, df in datasets.items():
    total_rows, total_columns = df.shape
    missing_percentage_per_row = df.isnull().mean(axis=1) * 100
    within_threshold = missing_percentage_per_row <= percent
    missing_rows = within_threshold.sum()
    percent_between = (missing_rows * 100) / total_rows

    # One print call, one argument per output line (sep="\n" keeps the output
    # identical to printing each line separately).
    print(
        f"\nDataset: {name}",
        f"Total Rows: {total_rows}, Total Columns: {total_columns}",
        f"Number of rows with missing values up to {percent}%: {missing_rows}",
        f"The percentage between total and missing values sets is {percent_between:.2f}%",
        sep="\n",
    )


Dataset: o1_X_train
Total Rows: 122496, Total Columns: 345
Number of rows with missing values up to 7%: 48
The percentage between total and missing values sets is 0.04%

Dataset: o2_X_train
Total Rows: 61248, Total Columns: 345
Number of rows with missing values up to 7%: 24
The percentage between total and missing values sets is 0.04%

Dataset: o3_X_train
Total Rows: 40832, Total Columns: 345
Number of rows with missing values up to 7%: 16
The percentage between total and missing values sets is 0.04%

Dataset: o4_X_train
Total Rows: 30624, Total Columns: 345
Number of rows with missing values up to 7%: 12
The percentage between total and missing values sets is 0.04%
