#  MLP


- Baseline: Encoded + Functional Not In

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

df = pd.read_csv('/Users/chenliu/Desktop/CU Files/5205/project/Processed new/encoded/no functional/train_meta_encoded2.csv')
X = df.drop(columns=['participant_id','ADHD_Outcome','Sex_F'])
y_sex = df['Sex_F']
y_adhd = df['ADHD_Outcome']

# 80/20 split dataset
X_tr, X_hold, y_sex_tr, y_sex_hold, y_adhd_tr, y_adhd_hold = train_test_split(
    X, y_sex, y_adhd,
    test_size=0.2,
    stratify=y_sex,
    random_state=42
)

# Standardize
scaler = StandardScaler()
X_tr_scaled   = scaler.fit_transform(X_tr)
X_hold_scaled = scaler.transform(X_hold)

# build a Baseline MLP
mlp_base = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=500,
    random_state=42
)

# Fit Sex Model
cv_scores_sex = cross_val_score(
    mlp_base,
    X_tr_scaled,
    y_sex_tr,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)
print(f"Baseline MLP Sex 5‑fold CV AUC: {cv_scores_sex.mean():.3f} ± {cv_scores_sex.std():.3f}")

mlp_base.fit(X_tr_scaled, y_sex_tr)
p_sex_hold = mlp_base.predict_proba(X_hold_scaled)[:,1]
hold_auc_sex = roc_auc_score(y_sex_hold, p_sex_hold)
print(f"Baseline MLP Sex Hold‑out AUC: {hold_auc_sex:.3f}")

# Fit ADHD Model
cv_scores_adhd = cross_val_score(
    mlp_base,
    X_tr_scaled,
    y_adhd_tr,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)
print(f"Baseline MLP ADHD 5‑fold CV AUC: {cv_scores_adhd.mean():.3f} ± {cv_scores_adhd.std():.3f}")

mlp_base.fit(X_tr_scaled, y_adhd_tr)
p_adhd_hold = mlp_base.predict_proba(X_hold_scaled)[:,1]
hold_auc_adhd = roc_auc_score(y_adhd_hold, p_adhd_hold)
print(f"Baseline MLP ADHD Hold‑out AUC: {hold_auc_adhd:.3f}")


Baseline MLP Sex 5‑fold CV AUC: 0.579 ± 0.021
Baseline MLP Sex Hold‑out AUC: 0.540
Baseline MLP ADHD 5‑fold CV AUC: 0.730 ± 0.022
Baseline MLP ADHD Hold‑out AUC: 0.793


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

# split out labels and features
df_base = pd.read_csv('/Users/chenliu/Desktop/CU Files/5205/project/Processed new/encoded/no functional/train_meta_encoded2.csv')
X_base = df_base.drop(columns=['participant_id','ADHD_Outcome','Sex_F'])
y_sex  = df_base['Sex_F']

# 80/20 stratified hold‑out split by Sex
X_tr, X_hold, y_tr, y_hold = train_test_split(
    X_base, y_sex,
    test_size=0.2,
    stratify=y_sex,
    random_state=42)

#  
scaler = StandardScaler()
X_tr_scaled   = scaler.fit_transform(X_tr)
X_hold_scaled = scaler.transform(X_hold)

# Baseline MLP setup
mlp_base = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=500,
    random_state=42
)

# 5‑fold CV
cv_scores_sex = cross_val_score(
    mlp_base,
    X_tr_scaled,
    y_sex_tr,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)
print(f"Baseline MLP Sex 5‑fold CV AUC: {cv_scores_sex.mean():.3f} ± {cv_scores_sex.std():.3f}")

# Fit + Hold‑out
mlp_base.fit(X_tr_scaled, y_sex_tr)
p_sex_hold = mlp_base.predict_proba(X_hold_scaled)[:,1]
hold_auc_sex = roc_auc_score(y_sex_hold, p_sex_hold)
print(f"Baseline MLP Sex Hold‑out AUC: {hold_auc_sex:.3f}")

# —— ADHD 任务 —— 
# 5‑fold CV
cv_scores_adhd = cross_val_score(
    mlp_base,
    X_tr_scaled,
    y_adhd_tr,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)
print(f"Baseline MLP ADHD 5‑fold CV AUC: {cv_scores_adhd.mean():.3f} ± {cv_scores_adhd.std():.3f}")

# Fit + Hold‑out
mlp_base.fit(X_tr_scaled, y_adhd_tr)
p_adhd_hold = mlp_base.predict_proba(X_hold_scaled)[:,1]
hold_auc_adhd = roc_auc_score(y_adhd_hold, p_adhd_hold)
print(f"Baseline MLP ADHD Hold‑out AUC: {hold_auc_adhd:.3f}")


Baseline MLP Sex 5‑fold CV AUC: 0.579 ± 0.021
Baseline MLP Sex Hold‑out AUC: 0.540


- Not Encoded + Functional In （PCA）

In [27]:
import numpy as np
from sklearn.decomposition import PCA


file_path = '/Users/chenliu/Desktop/CU Files/5205/project/Processed new/not encoded/include functional/train_final_full (1).csv'
df = pd.read_csv(file_path)

# Separate Meta and FC Features
label_cols = ['participant_id', 'ADHD_Outcome', 'Sex_F']
prefixes = [
    'EHQ_', 'ColorVision_', 'APQ_P_APQ_P_', 'SDQ_SDQ_',
    'Barratt_Barratt_', 'Basic_Demos_', 'PreInt_Demos_', 'MRI_Track_'
]
all_cols = df.columns.tolist()
meta_cols = [
    c for c in all_cols
    if any(c.startswith(pref) for pref in prefixes)
    and c not in label_cols
]
fc_cols = [c for c in all_cols if c.startswith('FC_')]

# get label
X_meta = df[meta_cols].values
X_fc   = df[fc_cols].values
y_sex  = df['Sex_F']
y_adhd = df['ADHD_Outcome']

# Standardize
X_meta_s = StandardScaler().fit_transform(X_meta)
X_fc_s   = StandardScaler().fit_transform(X_fc)

# PCA on FC → 50
X_fc_pca = PCA(n_components=50, random_state=42).fit_transform(X_fc_s)
X_all = np.hstack([X_meta_s, X_fc_pca])

# 7) 80/20 split datasets
X_tr, X_val, y_sex_tr, y_sex_val, y_adhd_tr, y_adhd_val = train_test_split(
    X_all, y_sex, y_adhd,
    test_size=0.2, stratify=y_sex, random_state=42
)

# Training Model
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# Sex Model 
cv_sex   = cross_val_score(mlp, X_tr, y_sex_tr, cv=5, scoring='roc_auc')
mlp.fit(X_tr, y_sex_tr)
p_sex    = mlp.predict_proba(X_val)[:,1]
hold_sex = roc_auc_score(y_sex_val, p_sex)

# ADHD Model 
cv_adhd   = cross_val_score(mlp, X_tr, y_adhd_tr, cv=5, scoring='roc_auc')
mlp.fit(X_tr, y_adhd_tr)
p_adhd    = mlp.predict_proba(X_val)[:,1]
hold_adhd = roc_auc_score(y_adhd_val, p_adhd)

print(f"PCA MLP Sex  5‑fold CV AUC: {cv_sex.mean():.3f}")
print(f"PCA MLP Sex  Hold‑out AUC:  {hold_sex:.3f}\n")
print(f"PCA MLP ADHD 5‑fold CV AUC: {cv_adhd.mean():.3f}")
print(f"PCA MLP ADHD Hold‑out AUC:  {hold_adhd:.3f}")


PCA MLP Sex  5‑fold CV AUC: 0.635
PCA MLP Sex  Hold‑out AUC:  0.600

PCA MLP ADHD 5‑fold CV AUC: 0.734
PCA MLP ADHD Hold‑out AUC:  0.802


- Encoded+functional in with Feature Engineering

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif

# custom Feature Engineering transformer
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        # bin EHQ_EHQ_Total into three quantiles and one-hot encode
        X['EHQ_bin'] = pd.qcut(X['EHQ_EHQ_Total'], q=3, labels=False, duplicates='drop')
        X = pd.get_dummies(X, columns=['EHQ_bin'], prefix='EHQ_bin')

        # Sum_APQ & interaction term
        apq_cols = [c for c in X.columns if c.startswith('APQ_P_APQ_P_')]
        X['Sum_APQ'] = X[apq_cols].sum(axis=1)
        X['CVxAPQ'] = X['ColorVision_CV_Score'] * X['Sum_APQ']

        # polynomial expansion (EHQ, CV, Sum_APQ)
        poly = PolynomialFeatures(degree=2, include_bias=False)
        feats = poly.fit_transform(X[['EHQ_EHQ_Total','ColorVision_CV_Score','Sum_APQ']])
        names = poly.get_feature_names_out()  
        df_poly = pd.DataFrame(feats, columns=names, index=X.index)
        X = pd.concat([X, df_poly], axis=1)

        return X.values
   
# split dataset
df = pd.read_csv('/Users/chenliu/Desktop/CU Files/5205/project/Processed new/encoded/include functional/train_encoded_full (1).csv')
X = df.drop(columns=['participant_id','ADHD_Outcome','Sex_F'])
y = df['ADHD_Outcome']

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42)

# Build a Pipeline
pipe = Pipeline([
    ('fe',     FeatureEngineer()),
    ('scaler', StandardScaler()),
    ('sel',    SelectKBest(score_func=f_classif)),
    ('clf',    MLPClassifier(max_iter=1000, random_state=42))
])

param_grid = {
    'sel__k':                 [10, 20, 30],
    'clf__hidden_layer_sizes': [(50,), (100,), (50,50)],
    'clf__alpha':             [1e-4, 1e-3],
    'clf__learning_rate_init':[1e-3, 1e-2],}

grid = GridSearchCV(
    pipe,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1)

grid.fit(X_tr, y_tr)

print(">>> FE MLP ADHD CV AUC:", grid.best_score_)
print(">>> FE MLP ADHD Params:",  grid.best_params_)

# test ADHD Model
p_val = grid.predict_proba(X_val)[:,1]
print(">>> FE MLP ADHD Hold‑out AUC:", roc_auc_score(y_val, p_val))

# same process for sex model
df2 = pd.read_csv('/Users/chenliu/Desktop/CU Files/5205/project/Processed data/encoded/functional not in/train_meta_encoded.csv')
X2 = df2.drop(columns=['participant_id','ADHD_Outcome','Sex_F'])
y2 = df2['Sex_F']

X2_tr, X2_val, y2_tr, y2_val = train_test_split(
    X2, y2, test_size=0.2, stratify=y2, random_state=42)

grid2 = GridSearchCV(
    pipe,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1)
grid2.fit(X2_tr, y2_tr)

print(">>> FE MLP Sex CV AUC:", grid2.best_score_)
print(">>> FE MLP Sex Params:",  grid2.best_params_)
p2_val = grid2.predict_proba(X2_val)[:,1]
print(">>> FE MLP Sex Hold‑out AUC:", roc_auc_score(y2_val, p2_val))


Fitting 5 folds for each of 36 candidates, totalling 180 fits


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = ms

>>> FE MLP ADHD CV AUC: 0.7835731510851838
>>> FE MLP ADHD Params: {'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (50,), 'clf__learning_rate_init': 0.001, 'sel__k': 10}
>>> FE MLP ADHD Hold‑out AUC: 0.8097826086956522
Fitting 5 folds for each of 36 candidates, totalling 180 fits




>>> FE MLP Sex CV AUC: 0.5799278788368466
>>> FE MLP Sex Params: {'clf__alpha': 0.0001, 'clf__hidden_layer_sizes': (50,), 'clf__learning_rate_init': 0.001, 'sel__k': 10}
>>> FE MLP Sex Hold‑out AUC: 0.6598644578313253


## Test Comparison

| Pipeline                     | Sex CV AUC | Sex Hold-out AUC | ADHD CV AUC | ADHD Hold-out AUC |
|------------------------------|------------|------------------|-------------|-------------------|
| **Baseline MLP**             | 0.579      | 0.540            | 0.730       | 0.793             |
| **PCA + MLP**                | 0.635      | 0.600            | 0.734       | 0.802             |
| **FE + MLP**                 | 0.600      | 0.660            | 0.784       | 0.810             |



### Summary 

- **Baseline MLP**
  - Utilized only original Meta features (excluding Functional). Sex task performed poorly (CV 0.579 / Hold-out 0.540), ADHD performed reasonably well (CV 0.730 / Hold-out 0.793).

- **PCA + MLP**
  - Functional data was reduced via PCA and combined with Meta features, significantly improving Sex performance (CV 0.635 / Hold-out 0.600), and slightly improving ADHD performance.

- **Feature Engineering + MLP**
  - Custom feature engineering (binning, interactions, polynomial expansions) followed by SelectKBest further increased Sex Hold-out AUC to 0.660 and achieved the best ADHD performance (CV 0.784 / Hold-out 0.810).



### Next Step 
- Construct new candidate feature sets using the Intersection and Union strategies.
- Feed these candidate feature sets into an MLP model for modeling and comparison, further validating the effectiveness of feature selection strategies and determining the optimal feature combination.


In [None]:
# Feature Selection Procedure for Sex Binary Task（encoded + function not in）
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
df = pd.read_csv('/Users/chenliu/Desktop/CU Files/5205/project/Processed new/encoded/no functional/train_meta_encoded2.csv')
df2 = pd.read_csv('/Users/chenliu/Desktop/CU Files/5205/project/Processed new/encoded/no functional/train_meta_encoded2.csv')
X2 = df.drop(columns=['participant_id','ADHD_Outcome','Sex_F'])
y2 = df2['Sex_F']

# split into train / hold‑out
from sklearn.model_selection import train_test_split
X2_tr, X2_val, y2_tr, y2_val = train_test_split(
    X2, y2, test_size=0.2, stratify=y2, random_state=42)

# Univariate selection: top 20 by ANOVA F‑test
skb = SelectKBest(f_classif, k=20).fit(X2_tr, y2_tr)
top20_skb = pd.Series(skb.scores_, index=X2.columns).nlargest(20)
print("SelectKBest Top 20:\n", top20_skb)

# Tree‑based importance: top 20 from Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X2_tr, y2_tr)
imp = pd.Series(rf.feature_importances_, index=X2.columns).nlargest(20)
print("RandomForest Top 20:\n", imp)


  f = msb / msw


SelectKBest Top 20:
 SDQ_SDQ_Prosocial                  9.413701
APQ_P_APQ_P_INV                    6.956522
SDQ_SDQ_Emotional_Problems         5.803828
APQ_P_APQ_P_PP                     5.577504
Barratt_Barratt_P1_Edu_15.0        4.031148
MRI_Track_Scan_Location_4.0        3.812393
PreInt_Demos_Fam_Child_Race_2.0    3.596210
Barratt_Barratt_P1_Occ_10.0        3.492388
Barratt_Barratt_P2_Edu_12.0        3.329232
Barratt_Barratt_P1_Occ_5.0         3.180116
Basic_Demos_Enroll_Year_2016       3.071502
Barratt_Barratt_P1_Occ_30.0        3.029016
Barratt_Barratt_P2_Occ_25.0        2.757455
Barratt_Barratt_P2_Edu_6.0         2.569500
Basic_Demos_Study_Site_2           2.569500
Basic_Demos_Study_Site_3           2.466119
Basic_Demos_Enroll_Year_2019       2.295570
Barratt_Barratt_P2_Edu_9.0         2.191526
Barratt_Barratt_P1_Edu_21.0        1.833486
Barratt_Barratt_P1_Occ_45.0        1.831102
dtype: float64
RandomForest Top 20:
 MRI_Track_Age_at_Scan                   0.061889
APQ_P_APQ_P_I

In [None]:
# Intersection Experiment
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Intersection of the two top‐20 lists 
skb_top = top20_skb.index    
rf_top  = imp.index           
sex_feats_inter = skb_top.intersection(rf_top).tolist()
print("Intersection features：", sex_feats_inter)

# Build a Pipeline
pipe_inter = Pipeline([
    ('scale', StandardScaler()),
    ('sel',   SelectKBest(f_classif)),  
    ('clf',   MLPClassifier(max_iter=1000, random_state=42))
])

# Grid‐search only that many features, plus a few MLP hyper‐parameters
param_grid = {
    'sel__k': [len(sex_feats_inter)],            
    'clf__hidden_layer_sizes': [(50,),(100,),(50,50)],
    'clf__alpha': [1e-4, 1e-3, 1e-2],
    'clf__learning_rate_init': [1e-3, 1e-2],
}
grid_inter = GridSearchCV(
    pipe_inter, param_grid, scoring='roc_auc',
    cv=5, n_jobs=-1, verbose=1, error_score='raise')
grid_inter.fit(X2_tr[sex_feats_inter], y2_tr)

# test
print(">>> Intersection Sex CV AUC:", grid_inter.best_score_)
print(">>> Intersection Sex Params:", grid_inter.best_params_)

p_inter = grid_inter.predict_proba(X2_val[sex_feats_inter])[:,1]
print(">>> IntersectionSex Hold‑out AUC:", roc_auc_score(y2_val, p_inter))


Intersection features： ['SDQ_SDQ_Prosocial', 'APQ_P_APQ_P_INV', 'SDQ_SDQ_Emotional_Problems', 'APQ_P_APQ_P_PP', 'Basic_Demos_Study_Site_3', 'Barratt_Barratt_P1_Edu_21.0', 'Barratt_Barratt_P1_Occ_45.0']
Fitting 5 folds for each of 18 candidates, totalling 90 fits




>>> Intersection CV AUC: 0.5542584776954408
>>> Intersection Params: {'clf__alpha': 0.0001, 'clf__hidden_layer_sizes': (50,), 'clf__learning_rate_init': 0.001, 'sel__k': 7}
>>> Intersection Hold‑out AUC: 0.5994712491738269




In [34]:
# Union experiment
# Union of the two top‐20 lists
skb_top = top20_skb.index
rf_top  = imp.index
sex_feats_union = skb_top.union(rf_top).tolist()
print("Union features counts：", len(sex_feats_union))

# Build a Pipeline
pipe_union = Pipeline([
    ('scale', StandardScaler()),
    ('sel',   SelectKBest(f_classif)),  
    ('clf',   MLPClassifier(max_iter=1000, random_state=42))
])

# Grid‐search over different k values
param_grid = {
    'sel__k': [5, 10, 15, 20],                  
    'clf__hidden_layer_sizes': [(50,),(100,),(50,50)],
    'clf__alpha': [1e-4, 1e-3, 1e-2],
    'clf__learning_rate_init': [1e-3, 1e-2],
}
grid_union = GridSearchCV(
    pipe_union, param_grid, scoring='roc_auc',
    cv=5, n_jobs=-1, verbose=1, error_score='raise')

grid_union.fit(X2_tr[sex_feats_union], y2_tr)

# test
print(">>> Union Sex CV AUC:", grid_union.best_score_)
print(">>> Union Sex Params:", grid_union.best_params_)

p_union = grid_union.predict_proba(X2_val[sex_feats_union])[:,1]
print(">>> Union Sex Hold‑out AUC:", roc_auc_score(y2_val, p_union))


Union features counts： 33
Fitting 5 folds for each of 72 candidates, totalling 360 fits




>>> Union Sex CV AUC: 0.5713699268783432
>>> Union Sex Params: {'clf__alpha': 0.01, 'clf__hidden_layer_sizes': (50,), 'clf__learning_rate_init': 0.001, 'sel__k': 15}
>>> Union Sex Hold‑out AUC: 0.46662260409781897




## Test Results Comparison

| Pipeline                            | CV Sex AUC | Hold‑out Sex AUC |
|-------------------------------------|------------|------------------|
| **Baseline MLP**                    | 0.579      | 0.540            |
| **Intersection FE + MLP**           | 0.599      | 0.554            |
| **Union FE + MLP**                  | 0.571      | 0.467            |


### Summary

- **Intersection FE + MLP**:  
  Achieved a slight edge over the baseline (CV 0.599 vs 0.579; Hold‑out 0.554 vs 0.540), but the improvement is marginal.

- **Union FE + MLP**:  
  Underperformed the baseline (CV 0.571 vs 0.579; Hold‑out 0.467 vs 0.540), suggesting that combining too many candidates dilutes the key signals.

- **High variance**:  
  Noticeable drop from CV to Hold‑out (e.g. Intersection: 0.599 → 0.554) indicates instability across folds.

- **Selection pitfalls**:  
  Both univariate F-test and tree‑based importances struggle to capture complex, higher‑order nonlinear interactions.

- **Signal loss/distortion**:  
  Aggressive pruning either filters out or distorts crucial sex‑related signals, leading to worse performance than using all features.


### Next Step

- **Switch to LightGBM** for Sex classification using the **full original feature set**.  
- LightGBM naturally models high‑order nonlinear interactions across all features, retains all potential signals, and typically yields more stable performance without manual over‑pruning.
  
