# Sprint 1: Foundational Setup

## Task 1 (CIR-8): Load and Validate all Datasets

In [1]:
import pandas as pd
import numpy as np
import os
import logging

# Plots
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Set up logging
# FileHandler opens its target file as soon as it is constructed and does not
# create missing parent directories, so make sure "logs/" exists first.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("logs/CIR-8.log"),  # persistent copy of every record
        logging.StreamHandler()                 # same records on the default stream
    ]
)

In [3]:
# CSVs Directory
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
# os.listdir returns entries in arbitrary order; sorting makes the load order,
# and with it the iteration order of `dataframes`, reproducible across runs.
all_files = sorted(os.listdir(data_path))

logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Load CSVs into a dictionary of dataframes, keyed by the file name without
# its extension and with "-" replaced by "_".
dataframes = {}
for file in all_files:
    if not file.endswith(".csv"):
        continue
    # splitext removes only the trailing extension; str.replace(".csv", "")
    # would also strip a ".csv" that appears in the middle of the name.
    stem = os.path.splitext(file)[0]
    var_name = stem.replace("-", "_")
    logging.info(f"Loading... -> {file}")
    # Every column is cast to float32 after parsing.
    dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Log loaded datasets
for var_name, df in dataframes.items():
    # Also bind each frame to a module-level variable of the same name so it
    # can be referenced directly (e.g. o1_X_train) in other cells.
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-03-23 01:31:53,384 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-03-23 01:31:53,387 - INFO - Start Loading Dataframes.
2025-03-23 01:31:53,388 - INFO - Loading... -> o1_X_external.csv
2025-03-23 01:32:00,228 - INFO - Loading... -> o1_X_test.csv
2025-03-23 01:32:00,819 - INFO - Loading... -> o1_X_train.csv
2025-03-23 01:32:04,899 - INFO - Loading... -> o1_X_validate.csv
2025-03-23 01:32:05,444 - INFO - Loading... -> o1_y_external_los.csv
2025-03-23 01:32:05,489 - INFO - Loading... -> o1_y_external_mortality.csv
2025-03-23 01:32:05,520 - INFO - Loading... -> o1_y_test_los.csv
2025-03-23 01:32:05,531 - INFO - Loading... -> o1_y_test_mortality.csv
2025-03-23 01:32:05,536 - INFO - Loading... -> o1_y_train_los.csv
2025-03-23 01:32:05,580 - INFO - Loading... -> o1_y_train_mortality.csv
2025-03-23 01:32:05,598 - INFO - Loading... -> o1_y_validate_los.csv
2025-03-23 01:32:05,610 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-03-23 01:32:05,616 - INFO - Loading... -> o2_

In [4]:
# Log, for each loaded dataset, how many columns it has of every dtype.
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Check Datatypes...")
for dataset_name, frame in dataframes.items():
    dtype_counts = frame.dtypes.value_counts()
    logging.info(f"{dataset_name} types:\n{dtype_counts}")
logging.info("Check Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-03-23 01:32:23,553 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-03-23 01:32:23,554 - INFO - Check Datatypes...
2025-03-23 01:32:23,557 - INFO - o1_X_external types:
float32    345
Name: count, dtype: int64
2025-03-23 01:32:23,559 - INFO - o1_X_test types:
float32    345
Name: count, dtype: int64
2025-03-23 01:32:23,561 - INFO - o1_X_train types:
float32    345
Name: count, dtype: int64
2025-03-23 01:32:23,563 - INFO - o1_X_validate types:
float32    345
Name: count, dtype: int64
2025-03-23 01:32:23,565 - INFO - o1_y_external_los types:
float32    1
Name: count, dtype: int64
2025-03-23 01:32:23,567 - INFO - o1_y_external_mortality types:
float32    1
Name: count, dtype: int64
2025-03-23 01:32:23,568 - INFO - o1_y_test_los types:
float32    1
Name: count, dtype: int64
2025-03-23 01:32:23,571 - INFO - o1_y_test_mortality types:
float32    1
Name: count, dtype: int64
2025-03-23 01:32:23,573 - INFO - o1_y_train_los types:
float32    1
Name: count, dtype: int64
2025-03-23 01

In [5]:
# Log the total number of missing cells per dataset and how many columns
# contain at least one missing value.
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Calculate missing values...")
for dataset_name, frame in dataframes.items():
    null_mask = frame.isnull()
    missing_total = null_mask.sum().sum()
    columns_with_nulls = [
        column for column, has_null in zip(frame.columns, null_mask.any()) if has_null
    ]
    logging.info(
        f"{dataset_name}: {missing_total} missing values across {len(columns_with_nulls)} columns"
    )
logging.info("Missing values calculation complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-03-23 01:32:27,924 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-03-23 01:32:27,925 - INFO - Calculate missing values...
2025-03-23 01:32:28,102 - INFO - o1_X_external: 39527616 missing values across 308 columns
2025-03-23 01:32:28,117 - INFO - o1_X_test: 1985664 missing values across 292 columns
2025-03-23 01:32:28,211 - INFO - o1_X_train: 15876288 missing values across 304 columns
2025-03-23 01:32:28,226 - INFO - o1_X_validate: 2007696 missing values across 296 columns
2025-03-23 01:32:28,228 - INFO - o1_y_external_los: 0 missing values across 0 columns
2025-03-23 01:32:28,231 - INFO - o1_y_external_mortality: 0 missing values across 0 columns
2025-03-23 01:32:28,233 - INFO - o1_y_test_los: 0 missing values across 0 columns
2025-03-23 01:32:28,234 - INFO - o1_y_test_mortality: 0 missing values across 0 columns
2025-03-23 01:32:28,237 - INFO - o1_y_train_los: 0 missing values across 0 columns
2025-03-23 01:32:28,239 - INFO - o1_y_train_mortality: 0 missing values acros

In [6]:
# Log the number of distinct values per column, but only for datasets whose
# name contains "y_".
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Calculate unique values.")
for dataset_name, frame in dataframes.items():
    if 'y_' not in dataset_name:
        continue
    unique_counts = frame.nunique().to_dict()
    logging.info(f"{dataset_name} unique target values: {unique_counts}")

logging.info("Unique values calculation complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-03-23 01:32:30,612 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-03-23 01:32:30,613 - INFO - Calculate unique values.
2025-03-23 01:32:30,619 - INFO - o1_y_external_los unique target values: {'los': 830}
2025-03-23 01:32:30,623 - INFO - o1_y_external_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 01:32:30,625 - INFO - o1_y_test_los unique target values: {'los': 319}
2025-03-23 01:32:30,627 - INFO - o1_y_test_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 01:32:30,631 - INFO - o1_y_train_los unique target values: {'los': 2548}
2025-03-23 01:32:30,634 - INFO - o1_y_train_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 01:32:30,635 - INFO - o1_y_validate_los unique target values: {'los': 319}
2025-03-23 01:32:30,637 - INFO - o1_y_validate_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 01:32:30,641 - INFO - o2_y_external_los unique target values: {'los': 830}
2025-03-23 01:32:30,6

In [7]:
# Build a one-row-per-dataset overview (shape and missingness) and export it.
summary = []
for name, df in dataframes.items():
    # Compute the null mask once and derive both counts from it.
    null_mask = df.isnull()
    missing_values = null_mask.sum().sum()
    missing_cols = null_mask.any().sum()
    total_cells = df.shape[0] * df.shape[1]
    summary.append({
        'Dataset': name,
        'Shape': df.shape,
        'Total Missing Values': missing_values,
        # Key spelling is kept as-is: it becomes the header of the exported CSV.
        'Accross Missing Columns': missing_cols,
        # A frame with no cells has no defined missing share; report NaN
        # explicitly instead of dividing by zero.
        'Missing %': 100 * missing_values / total_cells if total_cells else float('nan')
    })

summary_df = pd.DataFrame(summary)
# Save to file (to_csv does not create missing directories, so do it here)
os.makedirs("CSV/exports", exist_ok=True)
summary_df.to_csv("CSV/exports/01_dataset_summary.csv", index=False)

In [8]:
# Upper bound (in percent of columns) on the missing share a row may have to
# be counted below.
percent = 7

# Dictionary of datasets for iteration.
# The frames are taken from `dataframes` by key instead of relying on
# module-level variables with the same names, so this cell only depends on
# the dictionary built by the loading cell.
datasets = {
    key: dataframes[key]
    for key in ("o1_X_train", "o2_X_train", "o3_X_train", "o4_X_train")
}

# Loop through each dataset
for name, df in datasets.items():
    # Share of missing cells in every row, as a percentage of the column count.
    missing_percentage_per_row = df.isnull().mean(axis=1) * 100
    # Number of rows whose missing share is at most `percent`.
    rows_within_limit = (missing_percentage_per_row <= percent).sum()
    total_rows, total_columns = df.shape
    share_within_limit = (rows_within_limit * 100) / total_rows

    print(f"\nDataset: {name}")
    print(f"Total Rows: {total_rows}, Total Columns: {total_columns}")
    print(f"Number of rows with missing values up to {percent}%: {rows_within_limit}")
    print(f"The percentage between total and missing values sets is {share_within_limit:.2f}%")


Dataset: o1_X_train
Total Rows: 122496, Total Columns: 345
Number of rows with missing values up to 7%: 48
The percentage between total and missing values sets is 0.04%

Dataset: o2_X_train
Total Rows: 61248, Total Columns: 345
Number of rows with missing values up to 7%: 24
The percentage between total and missing values sets is 0.04%

Dataset: o3_X_train
Total Rows: 40832, Total Columns: 345
Number of rows with missing values up to 7%: 16
The percentage between total and missing values sets is 0.04%

Dataset: o4_X_train
Total Rows: 30624, Total Columns: 345
Number of rows with missing values up to 7%: 12
The percentage between total and missing values sets is 0.04%


## Task 2 (CIR-9): Analyze Missingness (per row/column)

In [9]:
# Per-dataset missingness statistics: overall share, number of affected
# columns, and max / mean / min missing share per row.
missingness_summary = []

for name, df in dataframes.items():
    if not isinstance(df, pd.DataFrame):
        continue

    # Build the boolean null mask once and derive every statistic from it.
    null_mask = df.isnull()

    total_cells = df.shape[0] * df.shape[1]
    total_missing = null_mask.sum().sum()

    col_missing = null_mask.mean() * 100          # % missing per column
    row_missing = null_mask.mean(axis=1) * 100    # % missing per row

    missingness_summary.append({
        'Dataset': name,
        'Shape': df.shape,
        'Total Missing Cells': total_missing,
        'Total Missing %': round(100 * total_missing / total_cells, 2),
        'Columns with Missing (%)': (col_missing > 0).sum(),
        'Max % Missing in Row': round(row_missing.max(), 2),
        'Mean % Missing in Row': round(row_missing.mean(), 2),
        'Min % Missing in Row': round(row_missing.min(), 2)
    })

# Create DataFrame for summary, sorted by most missing.
# sort_values returns a new frame, so no in-place mutation is needed.
missingness_df = pd.DataFrame(missingness_summary).sort_values(
    "Total Missing %", ascending=False
)

# Save to file (to_csv does not create missing directories, so do it here)
os.makedirs("CSV/exports", exist_ok=True)
missingness_df.to_csv("CSV/exports/02_missingness_summary.csv", index=False)

# Show the 16 rows with the highest missing share. No dataset is filtered out
# here: the y_ (label) files are part of the summary and only drop out of
# this view if they rank below the first 16 rows.
display(missingness_df.head(16))

Unnamed: 0,Dataset,Shape,Total Missing Cells,Total Missing %,Columns with Missing (%),Max % Missing in Row,Mean % Missing in Row,Min % Missing in Row
0,o1_X_external,"(234720, 345)",39527616,48.81,308,89.28,48.81,13.91
12,o2_X_external,"(117360, 345)",19763572,48.81,308,89.28,48.81,13.91
36,o4_X_external,"(58680, 345)",9881900,48.81,308,89.28,48.81,13.91
24,o3_X_external,"(78240, 345)",13175808,48.81,308,89.28,48.81,13.91
27,o3_X_validate,"(5104, 345)",669232,38.01,296,78.84,38.01,9.28
3,o1_X_validate,"(15312, 345)",2007696,38.01,296,78.84,38.01,9.28
15,o2_X_validate,"(7656, 345)",1003848,38.01,296,78.84,38.01,9.28
39,o4_X_validate,"(3828, 345)",501924,38.01,296,78.84,38.01,9.28
25,o3_X_test,"(5104, 345)",661888,37.59,292,80.0,37.59,10.43
1,o1_X_test,"(15312, 345)",1985664,37.59,292,80.0,37.59,10.43


## Task 3 (CIR-10): Visualize Missingness

In [11]:
# Directory that receives the Task 3 figures; created on demand, and left
# untouched when it already exists.
output_dir = "figures/task3_missingness"
os.makedirs(name=output_dir, exist_ok=True)

# Functions for plotting
def plot_missing_heatmap(
    df, title, max_rows=200, save_path=None, suffix_filter='(Min)', cmap='Blues'
):
    """Draw a heatmap of missing cells for the first rows of selected columns.

    Only columns whose name ends with ``suffix_filter`` are included, and only
    the first ``max_rows`` rows. The figure is closed before returning, so it
    is only persisted when ``save_path`` is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.
    title : str
        Label appended to the figure title (the caller passes the dataset name).
    max_rows : int, default 200
        Maximum number of leading rows to draw.
    save_path : str or None, default None
        Where to write the figure (300 dpi). Nothing is written when falsy.
    suffix_filter : str, default '(Min)'
        Column-name suffix used to select the features to draw.
    cmap : str, default 'Blues'
        Colormap passed to ``seaborn.heatmap``.

    Returns
    -------
    None
    """
    filtered_cols = [col for col in df.columns if col.endswith(suffix_filter)]
    df_filtered = df[filtered_cols].head(max_rows)

    # seaborn cannot draw a heatmap of an empty frame; skip with a clear
    # message instead of failing inside the plotting call.
    if df_filtered.empty:
        logging.warning(
            f"No data to plot for '{title}': no rows, or no columns ending with '{suffix_filter}'."
        )
        return

    # The default suffix already carries parentheses; strip them so the title
    # shows "(Min)" rather than "((Min))".
    suffix_label = suffix_filter.strip("()")
    # Report the number of rows actually drawn (may be fewer than max_rows).
    shown_rows = len(df_filtered)

    fig, ax = plt.subplots(figsize=(16, 6))
    try:
        sns.heatmap(df_filtered.isnull(), cbar=False, cmap=cmap, yticklabels=False,
                    linecolor='lightgrey', linewidths=0.001, ax=ax)

        ax.set_title(f"Missing Data Heatmap – First {shown_rows} Rows\n({suffix_label}) – {title}",
                     fontsize=14, weight='bold')
        ax.set_xlabel("Features", fontsize=12)
        ax.set_ylabel("Rows", fontsize=12)
        plt.setp(ax.get_xticklabels(), fontsize=8, rotation=90)
        fig.tight_layout()

        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if plotting or saving fails, so
        # figures do not accumulate when this is called in a loop.
        plt.close(fig)


def plot_column_missing_bar(
    df, title, save_path=None, top_n=40, suffix_filter='(Min)', color='#3E8ED0'
):
    """Draw a horizontal bar chart of the columns with the most missing values.

    Only columns whose name ends with ``suffix_filter`` are considered. Columns
    without any missing value are dropped, and the ``top_n`` columns with the
    highest missing percentage are plotted. The figure is closed before
    returning, so it is only persisted when ``save_path`` is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.
    title : str
        Label appended to the figure title (the caller passes the dataset name).
    save_path : str or None, default None
        Where to write the figure (300 dpi). Nothing is written when falsy.
    top_n : int, default 40
        Maximum number of columns to show.
    suffix_filter : str, default '(Min)'
        Column-name suffix used to select the features to rank.
    color : str, default '#3E8ED0'
        Bar colour.

    Returns
    -------
    None
    """
    filtered_cols = [col for col in df.columns if col.endswith(suffix_filter)]
    missing_perc = df[filtered_cols].isnull().mean() * 100
    missing_perc = missing_perc[missing_perc > 0].sort_values(ascending=False).head(top_n)

    # With nothing to plot the figure height below would be 0 and pandas has
    # no data to draw; skip with a clear message instead of failing.
    if missing_perc.empty:
        logging.warning(
            f"No bar plot for '{title}': no column ending with '{suffix_filter}' has missing values."
        )
        return

    # The default suffix already carries parentheses; strip them so the title
    # shows "(Min)" rather than "((Min))".
    suffix_label = suffix_filter.strip("()")

    # 0.4 inch per bar, with a floor so that a chart with only a few bars
    # still has room for the title and axis labels.
    fig_height = max(2.0, 0.4 * len(missing_perc))
    fig, ax = plt.subplots(figsize=(10, fig_height))
    try:
        missing_perc.plot(kind='barh', color=color, ax=ax)

        ax.set_title(f"Top {len(missing_perc)} Features with Missing Values\n({suffix_label}) – {title}",
                     fontsize=14, weight='bold')
        ax.set_xlabel("Percentage Missing", fontsize=12)
        ax.set_ylabel("Feature", fontsize=12)
        plt.setp(ax.get_xticklabels(), fontsize=10)
        plt.setp(ax.get_yticklabels(), fontsize=8)
        ax.grid(axis='x', linestyle='--', linewidth=0.5)
        fig.tight_layout()

        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if plotting or saving fails, so
        # figures do not accumulate when this is called in a loop.
        plt.close(fig)


# Filter only X datasets from the summary.
# Dataset names carry a prefix before the label marker (e.g.
# "o1_y_train_los"), so "y_" has to be matched anywhere in the name; a
# startswith('y_') test never matches such names.
is_label_set = missingness_df['Dataset'].str.contains('y_', regex=False)
is_loaded = missingness_df['Dataset'].isin(dataframes.keys())
top_datasets = missingness_df[~is_label_set & is_loaded].head(16)['Dataset'].tolist()

# Generate plots: one heatmap and one bar chart per selected dataset,
# written to output_dir as "<dataset>_heatmap.png" / "<dataset>_barplot.png".
for dataset_name in top_datasets:
    df = dataframes.get(dataset_name)
    if df is not None:
        print(f"Generating plots for: {dataset_name}")
        heatmap_path = os.path.join(output_dir, f"{dataset_name}_heatmap.png")
        barplot_path = os.path.join(output_dir, f"{dataset_name}_barplot.png")

        plot_missing_heatmap(df, dataset_name, save_path=heatmap_path)
        plot_column_missing_bar(df, dataset_name, save_path=barplot_path)
    else:
        print(f"Dataset not found in dataframes: {dataset_name}")

Generating plots for: o1_X_external
Generating plots for: o2_X_external
Generating plots for: o4_X_external
Generating plots for: o3_X_external
Generating plots for: o3_X_validate
Generating plots for: o1_X_validate
Generating plots for: o2_X_validate
Generating plots for: o4_X_validate
Generating plots for: o3_X_test
Generating plots for: o1_X_test
Generating plots for: o2_X_test
Generating plots for: o4_X_test
Generating plots for: o3_X_train
Generating plots for: o1_X_train
Generating plots for: o2_X_train
Generating plots for: o4_X_train
