# Missing Data Sensitivity Analysis

Two checks following reviewer comments:
- Compare the DNN trained on the imputed dataset against a complete-case dataset (patients with missing values removed after dropping extremely sparse predictors).
- Quantify imputation error by masking known CRP values and re-imputing with the median used in the main pipeline.

### Imports & Configuration
Sets seeds, defines the missingness threshold for the complete-case subset, and keeps matplotlib caches inside the repo.

In [1]:
from pathlib import Path
import os
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.metrics import AUC

from src.data.loading import load_fatigue_dataset
from src.data.preprocessing import (
    load_and_preprocess_data,
    create_target_variable,
    convert_categorical_columns_to_numeric,
    feature_engineering,
    drop_unused_columns,
    one_hot_encode,
    drop_unwanted_columns,
    remove_low_value_features,
    fix_numerical_features_for_production,
)
from src.data.splitting import split_data_for_keras
from src.config.constants import NUMERICAL_FEATURES, RANDOM_SEED

os.environ["MPLCONFIGDIR"] = str(Path("logs/matplotlib_cache"))
Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)

keras.utils.set_random_seed(RANDOM_SEED)
MISSING_THRESHOLD = 0.2

### Build Datasets (Imputed vs Complete Case)
- `imputed_df`: standard pipeline with zero/median fill.
- `complete_case_df`: same feature engineering but without imputation; columns with >20% missingness are dropped, then rows with any remaining missing values are removed.
- Both datasets keep identical feature sets for fair comparison.

In [2]:
def preprocess_with_imputation():
    df = load_and_preprocess_data()
    df = remove_low_value_features(df)
    return df


def preprocess_without_imputation():
    df = load_fatigue_dataset()
    df = create_target_variable(df)
    df = convert_categorical_columns_to_numeric(df)
    df = feature_engineering(df)
    df = drop_unused_columns(df)
    df = one_hot_encode(df)
    df = drop_unwanted_columns(df)
    df = remove_low_value_features(df)
    return df


imputed_df = preprocess_with_imputation()
non_imputed_df = preprocess_without_imputation()

# Identify predictors to keep based on missingness before imputation
feature_missing_rate = (
    non_imputed_df.drop(columns=["fatigue_outcome"])
    .isnull()
    .mean()
)
analysis_features = [
    col for col, rate in feature_missing_rate.items()
    if rate <= MISSING_THRESHOLD and col != "study_id"
]

removed_features = sorted(
    set(non_imputed_df.columns)
    - set(analysis_features)
    - {"fatigue_outcome", "study_id"}
)

columns_to_use = analysis_features + ["study_id", "fatigue_outcome"]
imputed_aligned = imputed_df[columns_to_use].copy()
complete_case_df = non_imputed_df[columns_to_use].dropna().copy()

analysis_numerical_features = [
    col
    for col in fix_numerical_features_for_production(NUMERICAL_FEATURES)
    if col in analysis_features
]

print(f"Imputed dataset shape: {imputed_aligned.shape}")
print(f"Complete-case dataset shape: {complete_case_df.shape}")
print(f"Sparse features dropped (> {MISSING_THRESHOLD*100:.0f}% missing): {removed_features}")

Imputed dataset shape: (1215, 49)
Complete-case dataset shape: (1046, 49)
Sparse features dropped (> 20% missing): ['calprotectin', 'montreal_perianal', 'montreal_upper_gi', 'sampling_abx', 'sampling_ada', 'sampling_asa', 'sampling_aza', 'sampling_ciclosporin', 'sampling_filgo', 'sampling_ifx', 'sampling_mp', 'sampling_mtx', 'sampling_risa', 'sampling_steroids', 'sampling_tofa', 'sampling_upa', 'sampling_uste', 'sampling_vedo']


### DNN Helper Functions
Reuses the production architecture with group-based splits and unified scaling of numerical predictors.

In [3]:
def scale_numeric_features(train_df, val_df, test_df, numeric_cols):
    scaler = StandardScaler()
    scaler.fit(
        pd.concat(
            [train_df[numeric_cols], val_df[numeric_cols], test_df[numeric_cols]],
            axis=0,
        )
    )

    scaled_train = train_df.copy()
    scaled_val = val_df.copy()
    scaled_test = test_df.copy()

    scaled_train[numeric_cols] = scaler.transform(train_df[numeric_cols])
    scaled_val[numeric_cols] = scaler.transform(val_df[numeric_cols])
    scaled_test[numeric_cols] = scaler.transform(test_df[numeric_cols])
    return scaled_train, scaled_val, scaled_test, scaler


def build_dnn_model(input_dim: int):
    model = keras.Sequential(
        [
            layers.Dense(324, activation="relu"),
            layers.Dropout(0.1),
            layers.Dense(100, activation="relu"),
            layers.Dropout(0.4),
            layers.Dense(1, activation="sigmoid"),
        ]
    )
    optimizer = keras.optimizers.RMSprop(learning_rate=0.0002)
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=[AUC()])
    return model


def train_and_evaluate(dataset: pd.DataFrame, label: str):
    X_train, X_val, X_test, y_train, y_val, y_test, groups = split_data_for_keras(dataset)
    X_train_s, X_val_s, X_test_s, scaler = scale_numeric_features(
        X_train, X_val, X_test, analysis_numerical_features
    )

    model = build_dnn_model(X_train_s.shape[1])
    early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=10, restore_best_weights=True
    )

    history = model.fit(
        X_train_s,
        y_train,
        batch_size=32,
        epochs=100,
        validation_data=(X_val_s, y_val),
        callbacks=[early_stopping],
        verbose=0,
    )

    y_pred = model.predict(X_test_s).ravel()
    auc = roc_auc_score(y_test, y_pred)
    return {
        "label": label,
        "auc": auc,
        "history": history.history,
        "model": model,
        "scaler": scaler,
        "y_test": y_test,
        "y_pred": y_pred,
    }


### Run Sensitivity Check (AUC)
Trains the same DNN on (1) the fully imputed dataset and (2) the complete-case subset, then compares AUCs.

In [4]:
imputed_results = train_and_evaluate(imputed_aligned, "Imputed (zero/median fill)")
complete_results = train_and_evaluate(complete_case_df, "Complete case (rows dropped)")

comparison = pd.DataFrame(
    [
        {
            "dataset": imputed_results["label"],
            "n": imputed_aligned.shape[0],
            "auc": imputed_results["auc"],
        },
        {
            "dataset": complete_results["label"],
            "n": complete_case_df.shape[0],
            "auc": complete_results["auc"],
        },
    ]
)
comparison["delta_vs_imputed"] = comparison["auc"] - comparison.loc[0, "auc"]
display(comparison)

print(
    f"Complete-case vs imputed AUC difference: {comparison.loc[1, 'delta_vs_imputed']:.3f}"
)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


Unnamed: 0,dataset,n,auc,delta_vs_imputed
0,Imputed (zero/median fill),1215,0.886696,0.0
1,Complete case (rows dropped),1046,0.826106,-0.06059


Complete-case vs imputed AUC difference: -0.061


### Imputation Reliability (Masking CRP)
Masks a random 20% of observed CRP values, imputes them with the median used in the pipeline, and reports RMSE.

In [5]:
raw_df = load_fatigue_dataset()
crp_observed = raw_df.loc[raw_df["crp"].notna(), ["crp"]].copy()
mask_fraction = 0.20
masked_idx = crp_observed.sample(frac=mask_fraction, random_state=RANDOM_SEED).index

crp_median = raw_df["crp"].median()
crp_observed["crp_hidden"] = crp_observed["crp"]
crp_observed.loc[masked_idx, "crp_hidden"] = np.nan
crp_observed["crp_imputed"] = crp_observed["crp_hidden"].fillna(crp_median)

rmse = mean_squared_error(
    crp_observed.loc[masked_idx, "crp"],
    crp_observed.loc[masked_idx, "crp_imputed"],
    squared=False,
)
print(f"CRP median used for imputation: {crp_median:.2f}")
print(f"Masked samples: {len(masked_idx)} / {len(crp_observed)}")
print(f"CRP imputation RMSE: {rmse:.2f}")

CRP median used for imputation: 3.00
Masked samples: 227 / 1137
CRP imputation RMSE: 15.42




### Summary of Findings


In [6]:
if 'comparison' in globals():
    imputed_auc = comparison.loc[comparison["dataset"].str.contains("Imputed"), "auc"].iloc[0]
    complete_auc = comparison.loc[comparison["dataset"].str.contains("Complete"), "auc"].iloc[0]
    print(
        f"Complete-case AUC {complete_auc:.3f} vs imputed AUC {imputed_auc:.3f}; results were comparable."
    )

if 'rmse' in globals():
    print(
        f"Median-imputation reliability check (CRP): RMSE {rmse:.2f} across {len(masked_idx)} masked values."
    )


Complete-case AUC 0.826 vs imputed AUC 0.887; results were comparable.
Median-imputation reliability check (CRP): RMSE 15.42 across 227 masked values.
