In [6]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    recall_score,
    confusion_matrix,
    balanced_accuracy_score,
)


# Per-split feature tables. Each is read with pd.read_csv, cast to float32 and
# asserted to have EXPECTED_NUM_COLS columns.
# NOTE(review): absolute, machine-specific Windows paths -- the notebook will
# not run elsewhere without editing these.
TRAIN_CSV = r"C:\Users\Brightons\Downloads\Files_breast\Newest_file\Nonormalization_pneumonia_imagemasked_normcount_train.csv"
VAL_CSV   = r"C:\Users\Brightons\Downloads\Files_breast\Newest_file\Nonormalization_pneumonia_imagemasked_normcount_val.csv"
TEST_CSV  = r"C:\Users\Brightons\Downloads\Files_breast\Newest_file\Nonormalization_pneumonia_imagemasked_normcount_test.csv"

# NPZ archive supplying the "train_labels", "val_labels" and "test_labels" arrays.
LABELS_NPZ = r"C:\Users\Brightons\Downloads\pneumoniamnist_224.npz"

# Label treated as the positive class by sens_spec() when exactly two labels
# are present; it is not used on the multi-class path.
POS_LABEL_FOR_BINARY = 1

# Results file, written relative to the current working directory.
# NOTE(review): the name says "breast" but the inputs above are pneumonia
# files -- confirm this does not overwrite results from another dataset.
OUT_CSV = "mlp_feature_config_results_breast.csv"

# Number of columns every feature CSV must contain (asserted after loading).
EXPECTED_NUM_COLS = 101


# Load the three feature tables. read_csv is called with its defaults, so the
# first line of each file is taken as the header row.
df_train = pd.read_csv(TRAIN_CSV)
df_val   = pd.read_csv(VAL_CSV)
df_test  = pd.read_csv(TEST_CSV)

# Labels live in a separate NPZ archive; flatten each to 1-D so they can be
# passed straight to scikit-learn estimators and metrics.
data = np.load(LABELS_NPZ)
y_train, y_val, y_test = data["train_labels"], data["val_labels"], data["test_labels"]
y_train = np.ravel(y_train)
y_val   = np.ravel(y_val)
y_test  = np.ravel(y_test)

# Every column is treated as a numeric feature.
X_train_raw = df_train.values.astype(np.float32)
X_val_raw   = df_val.values.astype(np.float32)
X_test_raw  = df_test.values.astype(np.float32)


def _check_split(name, X, y):
    """Fail fast if a split has the wrong width or does not line up with its labels.

    Features and labels come from different files, so a row-count mismatch
    (for example a missing or extra header line in the CSV) would otherwise
    only surface later, inside model fitting or metric computation.
    """
    assert X.shape[1] == EXPECTED_NUM_COLS, (
        f"{name}: expected {EXPECTED_NUM_COLS} feature columns, got {X.shape[1]}"
    )
    assert X.shape[0] == y.shape[0], (
        f"{name}: {X.shape[0]} feature rows but {y.shape[0]} labels"
    )


_check_split("train", X_train_raw, y_train)
_check_split("val", X_val_raw, y_val)
_check_split("test", X_test_raw, y_test)


def macro_specificity_ovr(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Macro-average specificity (TNR) via one-vs-rest per class.

    For every label that occurs in ``y_true`` or ``y_pred`` the task is reduced
    to "this label" versus "everything else" and TN / (TN + FP) is computed.
    Labels for which no sample is a true negative candidate (every sample in
    ``y_true`` carries that label) have no defined specificity and are left out
    of the average.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; flattened to 1-D.
    y_pred : array-like
        Predicted labels; flattened to 1-D, same length as ``y_true``.

    Returns
    -------
    float
        Mean of the per-label specificities, or NaN when it is undefined for
        every label (e.g. empty input, or a single label throughout).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    y_true = np.ravel(y_true)
    y_pred = np.ravel(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    per_class = []
    for label in np.unique(np.concatenate([y_true, y_pred])):
        # Count directly instead of unpacking a confusion matrix: with a single
        # label present the matrix would be 1x1 and a 4-way unpack would fail.
        negatives = y_true != label
        n_negatives = int(np.count_nonzero(negatives))
        if n_negatives == 0:
            continue  # TN + FP == 0 -> specificity undefined for this label
        true_negatives = int(np.count_nonzero(negatives & (y_pred != label)))
        per_class.append(true_negatives / n_negatives)

    return float(np.mean(per_class)) if per_class else float("nan")

def macro_auc_valid_classes(y_true: np.ndarray, y_proba: np.ndarray, class_order: np.ndarray):
    """Macro-average one-vs-rest ROC AUC over the classes that can be scored.

    Column ``i`` of ``y_proba`` is scored against the indicator
    ``y_true == class_order[i]``. A class whose indicator is all ones or all
    zeros has no defined AUC; it is skipped and reported instead.

    Returns
    -------
    (auc, skipped)
        ``auc`` is the mean AUC over the scored classes as a float, or
        ``np.nan`` if none could be scored. ``skipped`` lists the omitted
        class labels converted with ``int``.
    """
    per_class_auc = []
    skipped = []
    for column, label in enumerate(class_order):
        indicator = (y_true == label).astype(int)
        n_pos = indicator.sum()
        # Both positives and negatives are needed for a ROC curve.
        if n_pos == 0 or n_pos == len(indicator):
            skipped.append(int(label))
        else:
            per_class_auc.append(roc_auc_score(indicator, y_proba[:, column]))

    if per_class_auc:
        return float(np.mean(per_class_auc)), skipped
    return np.nan, skipped

def sens_spec(y_true: np.ndarray, y_pred: np.ndarray, pos_label=None):
    """Return ``(sensitivity, specificity)`` as floats.

    The labels found in ``y_true`` and ``y_pred`` together decide the mode:

    * exactly two labels -> binary metrics. ``pos_label`` names the positive
      class; when omitted it is 1 if 1 is among the labels, otherwise the
      larger label. Specificity is NaN if there are no negatives in ``y_true``.
    * any other number of labels -> macro-averaged recall and macro-averaged
      one-vs-rest specificity; ``pos_label`` is ignored.
    """
    observed = np.unique(np.concatenate([y_true, y_pred]))

    if observed.size != 2:
        macro_recall = recall_score(y_true, y_pred, average="macro", zero_division=0)
        macro_spec = macro_specificity_ovr(y_true, y_pred)
        return float(macro_recall), float(macro_spec)

    if pos_label is None:
        pos_label = 1 if 1 in observed else observed.max()
    low, high = observed
    # The negative class is whichever of the two labels is not the positive one.
    neg_label = low if high == pos_label else high

    sensitivity = recall_score(
        y_true, y_pred, average="binary", pos_label=pos_label, zero_division=0
    )
    # Fixing the label order makes the flattened matrix read tn, fp, fn, tp.
    tn, fp, _fn, _tp = confusion_matrix(
        y_true, y_pred, labels=[neg_label, pos_label]
    ).ravel()
    n_negatives = tn + fp
    specificity = tn / n_negatives if n_negatives > 0 else np.nan
    return float(sensitivity), float(specificity)


# The final model is fit on train + validation together; the test split is
# only used for evaluation below.
X_train_full_all101 = np.vstack([X_train_raw, X_val_raw])
y_train_full = np.concatenate([y_train, y_val])
X_test_all101 = X_test_raw

# Run settings, named once so the label used in log lines / the results table
# and the model hyperparameters are each defined in a single place.
FEATURE_CONFIG = "all_101"
MLP_HIDDEN_LAYERS = (100, 64)
MLP_MAX_ITER = 500
RANDOM_STATE = 42
BAL_ACC_TOL = 1e-8  # tolerance for the balanced-accuracy consistency check

# Scale features (statistics come from the training data only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full_all101)
X_test_scaled  = scaler.transform(X_test_all101)

# Train MLP
clf = MLPClassifier(
    hidden_layer_sizes=MLP_HIDDEN_LAYERS,
    max_iter=MLP_MAX_ITER,
    random_state=RANDOM_STATE,
)
clf.fit(X_train_scaled, y_train_full)

# Predict
y_pred  = clf.predict(X_test_scaled)
y_proba = clf.predict_proba(X_test_scaled)  # columns align with clf.classes_

# Metrics
acc = accuracy_score(y_test, y_pred)
sens, spec = sens_spec(y_test, y_pred, pos_label=POS_LABEL_FOR_BINARY)
f1_macro = f1_score(y_test, y_pred, average="macro", zero_division=0)
auc_macro, skipped = macro_auc_valid_classes(y_test, y_proba, clf.classes_)
if skipped:
    print(f"[{FEATURE_CONFIG}] AUC skipped classes (no pos/neg in y_test): {skipped}")

# Sanity check for the binary case: balanced accuracy is the mean of
# sensitivity and specificity, so a mismatch points at a label-mapping problem.
if np.unique(y_test).size == 2:
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    diff = abs(bal_acc - 0.5 * (sens + spec))
    if diff > BAL_ACC_TOL:
        print(f"[WARN] Balanced acc mismatch by {diff:.3e}")

results_df = pd.DataFrame([{
    "Feature_Config": FEATURE_CONFIG,
    "AUC_macro": auc_macro,
    "Accuracy": acc,
    "Sensitivity": sens,
    "Specificity": spec,
    "F1_macro": f1_macro,
}])

print(f"\nResults ({FEATURE_CONFIG} only):\n", results_df.to_string(index=False))
results_df.to_csv(OUT_CSV, index=False)
print(f"\nSaved results to: {os.path.abspath(OUT_CSV)}")



Results (all_101 only):
 Feature_Config  AUC_macro  Accuracy  Sensitivity  Specificity  F1_macro
       all_101   0.999825  0.995192          1.0     0.987179  0.994858

Saved results to: C:\Users\Brightons\Downloads\Files_breast\mlp_feature_config_results_breast.csv
