In [11]:
import os
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import classification_report




In [12]:

# Load dataset
# The project folder can be redirected by setting the ANA680_DIR environment
# variable; when it is unset, the original local folder is used, so existing
# runs behave exactly as before.

data_dir = os.environ.get("ANA680_DIR", r"C:\Users\dkast\ANA 680\ana680mid")
data_path = os.path.join(data_dir, "combined_clean.csv")
df = pd.read_csv(data_path)




In [13]:

# Prepare target variable (DIABETE3 -> binary)
#
# Rows whose DIABETE3 label is missing are removed *before* binarising.
# np.where never produces NaN, so dropping on "diabetes_binary" afterwards
# removes nothing and every unlabeled row would silently become class 0.
#
# NOTE(review): any non-missing code other than 1.0 is still mapped to 0.
# If the data contains "don't know" / "refused" style codes (presumably 7 / 9
# in this survey - confirm against the codebook), decide whether those rows
# should be excluded as well.

df = (
    df
    .dropna(subset=["DIABETE3"])
    .assign(diabetes_binary=lambda frame: np.where(frame["DIABETE3"] == 1.0, 1, 0))
)



In [14]:

# Build the label vector and the feature matrix.

# The label is the binary column created from DIABETE3.
y = df["diabetes_binary"]

# Identifier and label columns must not be used as predictors; any of these
# names that are absent from df are skipped rather than raising.
drop_cols = ["PERSONID", "DIABETE3", "diabetes_binary"]

# Remove the excluded columns, then retain numeric predictors only.
X = (
    df
    .drop(columns=drop_cols, errors="ignore")
    .select_dtypes(include=[np.number])
)



In [15]:
# Restrict X to numeric dtypes. X is already filtered to numeric columns when
# it is built, so this repeat acts only as a safeguard and changes nothing.
X = X.select_dtypes(include=np.number)


In [16]:

# Treat +inf / -inf as missing so they are handled together with the
# other NaN values by the imputation that follows.
infinite_values = [np.inf, -np.inf]
X = X.replace(to_replace=infinite_values, value=np.nan)



In [17]:
# Impute every remaining NaN with the median of its own column.
# NOTE(review): the medians are computed over all rows of X, i.e. before the
# train/test split below, so test rows influence the imputed values.
column_medians = X.median()
X = X.fillna(value=column_medians)



In [18]:

# Train/test split
# 25% of the rows are held out for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# partition repeatable.

split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ---------------------------------------------------------


In [19]:
# Select top 4 features


def _seeded_mutual_info(features, target):
    """Score each feature by mutual information with the target, using a fixed seed.

    mutual_info_classif draws random numbers (see its ``random_state``
    parameter), so an unseeded call can rank features differently from one
    run to the next. Pinning the seed to 42 - the same value used for the
    split and the classifier in this notebook - makes the selected feature
    list reproducible.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Candidate predictors, passed through by SelectKBest.
    target : array-like of shape (n_samples,)
        Class labels.

    Returns
    -------
    ndarray of shape (n_features,)
        Estimated mutual information between each feature and the target.
    """
    return mutual_info_classif(features, target, random_state=42)


# The selector is fitted on the training split only, so the test rows do not
# influence which features are chosen.
selector = SelectKBest(_seeded_mutual_info, k=4)
selector.fit(X_train, y_train)

# get_support() is a boolean mask over the training columns; use it to
# recover the names of the retained features.
selected_features = X_train.columns[selector.get_support()].tolist()
print("Top 4 selected features:", selected_features)

X_train_sel = X_train[selected_features]
X_test_sel = X_test[selected_features]



Top 4 selected features: ['EMPLOY1', 'GENHLTH', '_AGEG5YR', '_BMI5CAT']


In [20]:

# Scale + train MLP

# Standardise the selected features. The scaler is fitted on the training
# split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train_sel)
X_train_scaled = scaler.transform(X_train_sel)
X_test_scaled = scaler.transform(X_test_sel)

# Two hidden layers (32 and 16 units) with ReLU activations; the fixed seed
# makes weight initialisation repeatable.
mlp_settings = {
    "hidden_layer_sizes": (32, 16),
    "activation": "relu",
    "max_iter": 500,
    "random_state": 42,
}
mlp = MLPClassifier(**mlp_settings)
mlp.fit(X_train_scaled, y_train)

# Evaluate on the held-out split.
test_predictions = mlp.predict(X_test_scaled)
print("\nMLP Performance:\n")
print(classification_report(y_test, test_predictions))




MLP Performance:

              precision    recall  f1-score   support

           0       0.88      0.99      0.93      1093
           1       0.50      0.07      0.12       157

    accuracy                           0.87      1250
   macro avg       0.69      0.53      0.53      1250
weighted avg       0.83      0.87      0.83      1250



In [21]:

# Save model, scaler, and feature list
# The output folder can be redirected by setting the ANA680_DIR environment
# variable; when it is unset, the original local folder is used.

out_dir = os.environ.get("ANA680_DIR", r"C:\Users\dkast\ANA 680\ana680mid")
os.makedirs(out_dir, exist_ok=True)

# File name -> object to serialise.
artifacts = {
    "mlp_model.pkl": mlp,
    "scaler.pkl": scaler,
    "selected_features.pkl": selected_features,
}

for file_name, artifact in artifacts.items():
    # `with` closes (and therefore flushes) each file even if pickling fails;
    # passing a bare open(...) to pickle.dump leaves the handle open.
    with open(os.path.join(out_dir, file_name), "wb") as handle:
        pickle.dump(artifact, handle)

print("\nSaved mlp_model.pkl, scaler.pkl, selected_features.pkl")


Saved mlp_model.pkl, scaler.pkl, selected_features.pkl
