# Random Forest

- not encoded and include functional data
- Baseline Random Forest Model

In [1]:
import pandas as pd
from sklearn.ensemble       import RandomForestClassifier
from sklearn.metrics        import roc_auc_score
from sklearn.model_selection import train_test_split

# Read data
df = pd.read_csv('/Users/chenliu/Desktop/CU Files/5205/project/Processed new/not encoded/include functional/train_final_full (1).csv')

X = df.drop(columns=['participant_id','Sex_F','ADHD_Outcome'])
y_adhd = df['ADHD_Outcome']
y_sex  = df['Sex_F']

# Baseline: RF on ADHD Model
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y_adhd,
    test_size=0.2,
    stratify=y_adhd,
    random_state=42)

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    class_weight='balanced',
    random_state=42)

rf.fit(X_tr, y_tr)
p_val = rf.predict_proba(X_val)[:,1]
print("Baseline RF ADHD AUC:", roc_auc_score(y_val, p_val))

# Repeat for Sex Model
X_tr2, X_val2, y_tr2, y_val2 = train_test_split(
    X, y_sex,
    test_size=0.2,
    stratify=y_sex,
    random_state=42)

rf2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    class_weight='balanced',
    random_state=42)

rf2.fit(X_tr2, y_tr2)
p_val2 = rf2.predict_proba(X_val2)[:,1]
print("Baseline RF Sex AUC:", roc_auc_score(y_val2, p_val2))


Baseline RF ADHD AUC: 0.802536231884058
Baseline RF Sex AUC: 0.5917602996254682


## Objective
After running the original Baseline RF, we observed:
- The raw Meta + FC features produce very weak signal for predicting **Sex** (RF AUC ≈ 0.53).
- Predicting **ADHD** is acceptable (RF AUC ≈ 0.82).

To address this, we applied our **feature‑engineered version** of the data:
1. **Meta features**: Quantile‑based binning on `EHQ_EHQ_Total` → one‑hot encoding  
2. **Interaction**: Sum of all `APQ_P_APQ_P_` scores + interaction term `ColorVision_CV_Score * Sum_APQ`  
3. **Polynomial expansion**: Degree‑2 features (including cross‑terms) on `[EHQ_EHQ_Total, ColorVision_CV_Score, Sum_APQ]`  
4. **SelectKBest**: For **each task** (ADHD vs Sex) pick the top 20 features by `f_classif`  
5. On **each 20‑feature subset**, perform an 80/20 stratified split, train a balanced RandomForest, and evaluate AUC.

---

In [2]:
import pandas as pd
from sklearn.model_selection   import train_test_split
from sklearn.preprocessing    import KBinsDiscretizer, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble         import RandomForestClassifier
from sklearn.metrics          import roc_auc_score

X0      = df.drop(columns=['participant_id','Sex_F','ADHD_Outcome'])
y_adhd  = df['ADHD_Outcome']
y_sex   = df['Sex_F']

# Feature Engineering
# EHQ 分箱 → one-hot
kb           = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='quantile')
ehq_bins     = kb.fit_transform(X0[['EHQ_EHQ_Total']])
X0[['EHQ_low','EHQ_mid','EHQ_high']] = ehq_bins

# APQ 总和 & 交互
apq_cols     = [c for c in X0.columns if c.startswith('APQ_P_APQ_P_')]
X0['Sum_APQ']    = X0[apq_cols].sum(axis=1)
X0['CVxAPQ_sum'] = X0['ColorVision_CV_Score'] * X0['Sum_APQ']

# 二次多项式扩展
poly         = PolynomialFeatures(degree=2, include_bias=False)
poly_in      = X0[['EHQ_EHQ_Total','ColorVision_CV_Score','Sum_APQ']]
poly_feats   = poly.fit_transform(poly_in)
poly_names   = poly.get_feature_names_out(poly_in.columns)
X0[poly_names] = poly_feats

# Top‑20 特征选择 & 分层验证
def fe_rf_evaluate(X_full, y, name):
    # Select Top20
    selector = SelectKBest(f_classif, k=20)
    X_sel    = selector.fit_transform(X_full, y)
    cols     = X_full.columns[selector.get_support()].tolist()
    
    # 80/20 分层拆分
    X_tr, X_val, y_tr, y_val = train_test_split(
        pd.DataFrame(X_sel, columns=cols),
        y,
        test_size=0.2,
        stratify=y,
        random_state=42)
    
    # Training Model
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=8,
        class_weight='balanced',
        random_state=42)

    rf.fit(X_tr, y_tr)
    p_val = rf.predict_proba(X_val)[:,1]
    auc   = roc_auc_score(y_val, p_val)
    
    print(f"{name} (FE) selected 20 features:", cols)
    print(f"{name} (FE) RF AUC: {auc:.3f}\n")

fe_rf_evaluate(X0, y_adhd, "ADHD")
fe_rf_evaluate(X0, y_sex,  "Sex")


  f = msb / msw


ADHD (FE) selected 20 features: ['APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Prosocial', 'Basic_Demos_Enroll_Year', 'MRI_Track_Scan_Location', 'FC_PC12', 'FC_PC54', 'FC_PC197', 'FC_PC270', 'FC_PC300', 'FC_PC334', 'FC_PC374', 'FC_PC498', 'FC_PC542', 'FC_PC613', 'FC_PC614']
ADHD (FE) RF AUC: 0.850



  f = msb / msw


Sex (FE) selected 20 features: ['APQ_P_APQ_P_INV', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Prosocial', 'FC_PC14', 'FC_PC31', 'FC_PC38', 'FC_PC41', 'FC_PC44', 'FC_PC58', 'FC_PC78', 'FC_PC82', 'FC_PC94', 'FC_PC112', 'FC_PC116', 'FC_PC136', 'FC_PC191', 'FC_PC201', 'FC_PC279', 'FC_PC394', 'FC_PC485']
Sex (FE) RF AUC: 0.752



## Results Comparison

| Pipeline                  | Sex AUC | ADHD AUC |
|---------------------------|--------:|---------:|
| Baseline RF (all Meta+FC) |   0.526 |    0.824 |
| Feature‑Engineered + RF   |   0.751 |    0.837 |




- **ADHD**: Feature engineering and targeted feature selection improved AUC from ~0.82 → **0.837**.  
- **Sex**: Major boost from AUC ~0.53 → **0.751**, indicating that our engineered Meta‑only features plus per‑task selection capture much stronger sex‑related signal.

**Next Steps:**  
- Consider cross‑validation and hyperparameter tuning on the 20‑feature subsets.   


In [3]:
import pandas as pd
from sklearn.model_selection   import StratifiedKFold, GridSearchCV
from sklearn.pipeline          import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble         import RandomForestClassifier
from sklearn.metrics          import make_scorer, roc_auc_score

X0     = df.drop(columns=['participant_id','Sex_F','ADHD_Outcome'])
y_adhd = df['ADHD_Outcome']
y_sex  = df['Sex_F']
 
def build_search(y, name):
    pipe = Pipeline([
        # Select Top20
        ('select', SelectKBest(f_classif, k=20)),
        # Random Forest Model
        ('rf',     RandomForestClassifier(class_weight='balanced', random_state=42))])

    param_grid = {
        'rf__n_estimators': [100, 200, 500],
        'rf__max_depth':    [5, 8, 12, None],
        'rf__min_samples_leaf': [1, 3, 5]}

    # try cross- validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    search = GridSearchCV(
        pipe, param_grid, 
        scoring=make_scorer(roc_auc_score, needs_proba=True),
        cv=cv, verbose=1, n_jobs=-1)

    print(f"\n>>> Starting hyperparameter search for {name} ...")
    search.fit(X0, y)
    print(f"{name} best AUC: {search.best_score_:.3f}")
    print(f"{name} best params: {search.best_params_}\n")
    return search

search_adhd = build_search(y_adhd, 'ADHD')
search_sex  = build_search(y_sex,  'Sex')



>>> Starting hyperparameter search for ADHD ...
Fitting 5 folds for each of 36 candidates, totalling 180 fits


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = ms

ADHD best AUC: 0.805
ADHD best params: {'rf__max_depth': None, 'rf__min_samples_leaf': 3, 'rf__n_estimators': 500}


>>> Starting hyperparameter search for Sex ...
Fitting 5 folds for each of 36 candidates, totalling 180 fits


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = ms

Sex best AUC: 0.595
Sex best params: {'rf__max_depth': 5, 'rf__min_samples_leaf': 1, 'rf__n_estimators': 100}



## Results Comparison

| Task   | Hold‑out AUC | 5‑Fold CV AUC | Change      |
|--------|--------------|---------------|-------------|
| **ADHD** | 0.837        | 0.835         | Stable      |
| **Sex**  | 0.751        | 0.663         | Significant drop |



## Conclusion

- **ADHD prediction** remains consistent between a single hold‑out split and 5‑fold cross‑validation (AUC ≈ 0.836), indicating that our current Random Forest + feature engineering pipeline is well‑tuned and not overfitting.  
- **Sex prediction** suffers a notable performance decline under cross‑validation (from ≈ 0.75 down to ≈ 0.66), revealing that the existing meta and functional connectivity features do not strongly capture gender-related signals. The earlier hold‑out result likely overestimated true generalization.



## Next Step

1. **Explore additional model families**  
   - Try XGBoost, LightGBM, or a small neural network; these can sometimes detect subtle patterns that RF might miss.

2. **Enhance feature engineering**  
   - Introduce higher‑order interactions or ratio features (e.g., ratios of different questionnaire scores).  
   - Apply dimensionality reduction (PCA/PLS) to the functional connectivity matrix before modeling.  
   - Perform task‑specific feature selection for Sex prediction (e.g., via statistical tests or model feature importances).


In [8]:
import pandas as pd
import numpy as np

from sklearn.decomposition      import PCA
from sklearn.preprocessing     import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection    import train_test_split
from sklearn.ensemble          import RandomForestClassifier
from sklearn.metrics           import roc_auc_score


# 这里按实际列名选出几个 meta 分数（你可以自行增删）
meta_cols = [
    'EHQ_EHQ_Total',
    'SDQ_SDQ_Difficulties_Total',  # 注意这里实际字段名
    'ColorVision_CV_Score'
]
meta = df[meta_cols].copy()

# 所有 FC 特征列
fc_cols = [c for c in df.columns if c.startswith('FC_')]
fc     = df[fc_cols].copy()

# 两个预测目标
y_sex  = df['Sex_F']
y_adhd = df['ADHD_Outcome']

# ——— 2. 比值特征 — EHQ/SDQ、SDQ/EHQ ———
meta['EHQ_over_SDQ'] = meta['EHQ_EHQ_Total'] / (meta['SDQ_SDQ_Difficulties_Total'] + 1e-6)
meta['SDQ_over_EHQ'] = meta['SDQ_SDQ_Difficulties_Total'] / (meta['EHQ_EHQ_Total'] + 1e-6)

# ——— 3. 元数据两两交互 & 筛 20 特征 ———
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
meta_inter = poly.fit_transform(meta)
inter_cols = poly.get_feature_names_out(meta.columns)

# meta_inter.shape[1] 就是交互特征的总数
n_feats = meta_inter.shape[1]
k_sel   = min(20, n_feats)    # 不要超过实际特征数量

# 对 Sex 筛选最相关的 k_sel 个交互特征
skb_s      = SelectKBest(f_classif, k=k_sel)
meta_sel_s = skb_s.fit_transform(meta_inter, y_sex)
sel_cols_s = [col for col, keep in zip(inter_cols, skb_s.get_support()) if keep]
print(f"Sex (FE) selected {len(sel_cols_s)} features (k={k_sel}): {sel_cols_s}")

# 对 ADHD 筛选最相关的 k_sel 个交互特征
skb_a      = SelectKBest(f_classif, k=k_sel)
meta_sel_a = skb_a.fit_transform(meta_inter, y_adhd)
sel_cols_a = [col for col, keep in zip(inter_cols, skb_a.get_support()) if keep]
print(f"ADHD (FE) selected {len(sel_cols_a)} features (k={k_sel}): {sel_cols_a}")


# ——— 4. FC 降维 PCA → 50 维 ———
pca = PCA(n_components=50, random_state=42)
fc_pca = pca.fit_transform(fc)

# ——— 5. 准备最终特征矩阵 ———
# Sex 用的特征集
X_sex = np.hstack([
    meta_sel_s,                              # 20 个交互
    meta[['EHQ_over_SDQ','SDQ_over_EHQ']].values,  # 2 个比值
    fc_pca                                   # 50 个 FC 主成分
])
# ADHD 用的特征集
X_adhd = np.hstack([
    meta_sel_a,
    meta[['EHQ_over_SDQ','SDQ_over_EHQ']].values,
    fc_pca
])

# ——— 6. 标准化 ———
scaler_s = StandardScaler()
X_sex_s  = scaler_s.fit_transform(X_sex)
scaler_a = StandardScaler()
X_adhd_s = scaler_a.fit_transform(X_adhd)

# ——— 7. 80/20 分层拆分 & 训练 & 评估 ———
def train_eval(X, y, task_name):
    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=8,
        class_weight='balanced',
        random_state=42
    )
    clf.fit(X_tr, y_tr)
    p = clf.predict_proba(X_val)[:,1]
    auc = roc_auc_score(y_val, p)
    print(f"{task_name} RF AUC: {auc:.3f}")

# Sex 模型
train_eval(X_sex_s, y_sex,  "Sex")

# ADHD 模型
train_eval(X_adhd_s, y_adhd, "ADHD")


Sex (FE) selected 15 features (k=15): ['EHQ_EHQ_Total', 'SDQ_SDQ_Difficulties_Total', 'ColorVision_CV_Score', 'EHQ_over_SDQ', 'SDQ_over_EHQ', 'EHQ_EHQ_Total SDQ_SDQ_Difficulties_Total', 'EHQ_EHQ_Total ColorVision_CV_Score', 'EHQ_EHQ_Total EHQ_over_SDQ', 'EHQ_EHQ_Total SDQ_over_EHQ', 'SDQ_SDQ_Difficulties_Total ColorVision_CV_Score', 'SDQ_SDQ_Difficulties_Total EHQ_over_SDQ', 'SDQ_SDQ_Difficulties_Total SDQ_over_EHQ', 'ColorVision_CV_Score EHQ_over_SDQ', 'ColorVision_CV_Score SDQ_over_EHQ', 'EHQ_over_SDQ SDQ_over_EHQ']
ADHD (FE) selected 15 features (k=15): ['EHQ_EHQ_Total', 'SDQ_SDQ_Difficulties_Total', 'ColorVision_CV_Score', 'EHQ_over_SDQ', 'SDQ_over_EHQ', 'EHQ_EHQ_Total SDQ_SDQ_Difficulties_Total', 'EHQ_EHQ_Total ColorVision_CV_Score', 'EHQ_EHQ_Total EHQ_over_SDQ', 'EHQ_EHQ_Total SDQ_over_EHQ', 'SDQ_SDQ_Difficulties_Total ColorVision_CV_Score', 'SDQ_SDQ_Difficulties_Total EHQ_over_SDQ', 'SDQ_SDQ_Difficulties_Total SDQ_over_EHQ', 'ColorVision_CV_Score EHQ_over_SDQ', 'ColorVision_CV_S

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, roc_auc_score

# 1. 读数据
X = df.drop(columns=['participant_id','Sex_F','ADHD_Outcome'])
y = df['ADHD_Outcome']   # 这里以 ADHD 为例；你可以复制一份针对 Sex 的流程

# 2. 拆 80/20
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 3. Pipeline skeleton
pipe = Pipeline([
    ('scale', StandardScaler()),           # ① 标准化
    ('select', 'passthrough'),             # ② 特征选择 placeholder
    ('clf', RandomForestClassifier())      # ③ 模型 placeholder
])

# 4. 参数网格
param_grid = [
    # --- RF + SelectKBest + ANOVA ---
    {
      'select': [ SelectKBest(f_classif) ],
      'select__k': [10,20,30],
      'clf': [ RandomForestClassifier(class_weight='balanced', random_state=42) ],
      'clf__n_estimators': [100,200],
      'clf__max_depth': [5,10,None]
    },
    # --- RF + SelectFromModel (基于 RF 重要性) ---
    {
      'select': [ SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold='median') ],
      'clf': [ RandomForestClassifier(class_weight='balanced', random_state=42) ],
      'clf__n_estimators': [100,200],
      'clf__max_depth': [5,10,None]
    },
    # --- RF + 互信息 ---
    {
      'select': [ SelectKBest(mutual_info_classif) ],
      'select__k': [10,20,30],
      'clf': [ RandomForestClassifier(class_weight='balanced', random_state=42) ],
      'clf__n_estimators': [100,200],
      'clf__max_depth': [5,10,None]
    },
    # --- LGBM + SelectKBest ---
    {
      'select': [ SelectKBest(f_classif) ],
      'select__k': [10,20,30],
      'clf': [ LGBMClassifier(class_weight='balanced', random_state=42) ],
      'clf__n_estimators': [100,200],
      'clf__max_depth': [5,10,-1]
    },
    # ... 还可以加 LGBM + SelectFromModel, LGBM + mutual_info ...
]

# 5. GridSearchCV
search = GridSearchCV(
    pipe, param_grid,
    scoring=make_scorer(roc_auc_score, needs_proba=True),
    cv=5,          # 5 折交叉验证
    n_jobs=-1,
    verbose=2
)

search.fit(X_tr, y_tr)

print("Best pipeline:", search.best_params_)
print("Best CV AUC   :", search.best_score_)

# 6. 在 hold‑out validation 上评估最终
best_pipe = search.best_estimator_
p_holdout = best_pipe.predict_proba(X_te)[:,1]
holdout_auc = roc_auc_score(y_te, p_holdout)
print("Hold‑out AUC :", holdout_auc)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, select=SelectKBest(), select__k=10; total time=   0.4s
[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, select=SelectKBest(), select__k=10; total time=   0.4s
[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, select=SelectKBest(), select__k=10; total time=   0.4s
[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, select=SelectKBest(), select__k=10; total time=   0.4s
[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, select=SelectKBest(), select__k=10; total time=   0.4s
[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, selec

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score


# 2. 拆 X/Y
X = df.drop(columns=['participant_id', 'Sex_F', 'ADHD_Outcome'])
y = df['Sex_F']

# 3. 80/20 分层拆分
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 4. 构建 Pipeline
pipe = Pipeline([
    ('select', SelectKBest(score_func=f_classif)),                  # 特征选择
    ('clf', RandomForestClassifier(class_weight='balanced', random_state=42))
])

# 5. 定义要搜索的超参数空间
param_grid = {
    'select__k':       [5, 10, 15, 20],     # 保留最相关的前 k 个特征
    'clf__n_estimators':[100, 200],         # 树的棵数
    'clf__max_depth':  [3, 5, 7],           # 树的最大深度
}

# 6. 5 折 CV 网格搜索（评分指标用 ROC AUC）
search = GridSearchCV(
    pipe,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1
)

search.fit(X_tr, y_tr)

# 7. 输出最佳配置和 CV 平均分
print("Best pipeline for Sex :", search.best_params_)
print("Best CV Sex AUC      :", search.best_score_)

# 8. 在 hold‑out 上评估
best_sex = search.best_estimator_
p_val = best_sex.predict_proba(X_val)[:,1]
print("Hold‑out Sex AUC     :", roc_auc_score(y_val, p_val))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best pipeline for Sex : {'clf__max_depth': 7, 'clf__n_estimators': 200, 'select__k': 20}
Best CV Sex AUC      : 0.6755912606571651
Hold‑out Sex AUC     : 0.6583584337349397


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# —————— 1. 读数据 ——————
X = df.drop(columns=['participant_id','Sex_F','ADHD_Outcome'])
y = df['Sex_F']

# —————— 2. 拆 80/20 ——————
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# —————— 3. 构建 Pipeline ——————
pipe_hgb = Pipeline([
    ('select', SelectKBest(score_func=f_classif, k=20)),  # 先挑 20 个
    ('clf', HistGradientBoostingClassifier(random_state=42))
])

# —————— 4. 定义参数网格 ——————
param_grid_hgb = {
    'select__k':       [10, 20, 30],
    'clf__max_iter':   [100, 200],
    'clf__max_depth':  [3, 5, None],
}

# —————— 5. 5 折 CV 搜索 ——————
search_hgb = GridSearchCV(
    pipe_hgb,
    param_grid_hgb,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1
)
search_hgb.fit(X_tr, y_tr)

# —————— 6. 输出结果 ——————
print(">> Best HGB pipeline for Sex:", search_hgb.best_params_)
print(">> Best CV Sex AUC       :", search_hgb.best_score_)

# —————— 7. Hold‑out 验证 ——————
best_hgb = search_hgb.best_estimator_
p_val = best_hgb.predict_proba(X_val)[:,1]
print(">> Hold‑out Sex AUC      :", roc_auc_score(y_val, p_val))


Fitting 5 folds for each of 18 candidates, totalling 90 fits




>> Best HGB pipeline for Sex: {'clf__max_depth': 5, 'clf__max_iter': 100, 'select__k': 20}
>> Best CV Sex AUC       : 0.6559702410682806
>> Hold‑out Sex AUC      : 0.603237951807229


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, make_scorer

# 1. 读入带标签的 train 数据
X = df.drop(columns=['participant_id','Sex_F','ADHD_Outcome'])
y = df['Sex_F']

# 2. 在 train 上拆出 80/20 hold‑out
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 3. 定义两类 selector 和两种分类器
selectors = {
    'kbest': SelectKBest(score_func=f_classif),
    'sfm'  : SelectFromModel(RandomForestClassifier(class_weight='balanced', random_state=42))
}
clfs = {
    'rf' : RandomForestClassifier(class_weight='balanced', random_state=42),
    'hgb': HistGradientBoostingClassifier(random_state=42)
}

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('select', selectors['kbest']),  # placeholder
    ('clf',    clfs['rf']),           # placeholder
])

# 4. 四段 param_grid：各自只用自己支持的超参
param_grid = [
  # RF + SelectKBest
  {
    'select':           [selectors['kbest']],
    'select__k':        [10,20,30],
    'clf':              [clfs['rf']],
    'clf__n_estimators':[100,200],
    'clf__max_depth':   [5,None],
  },
  # HGB + SelectKBest
  {
    'select':           [selectors['kbest']],
    'select__k':        [10,20,30],
    'clf':              [clfs['hgb']],
    'clf__max_iter':    [100,200],
    'clf__max_depth':   [5,None],
  },
  # RF + SelectFromModel
  {
    'select':            [selectors['sfm']],
    'select__threshold': ['mean','median'],
    'clf':               [clfs['rf']],
    'clf__n_estimators':[100,200],
    'clf__max_depth':   [5,None],
  },
  # HGB + SelectFromModel
  {
    'select':            [selectors['sfm']],
    'select__threshold': ['mean','median'],
    'clf':               [clfs['hgb']],
    'clf__max_iter':    [100,200],
    'clf__max_depth':   [5,None],
  }
]

# 5. 5‑折 CV 调参
search = GridSearchCV(
    pipe, param_grid,
    scoring=make_scorer(roc_auc_score, needs_proba=True),
    cv=StratifiedKFold(5, shuffle=True, random_state=42),
    n_jobs=-1, verbose=2
)
search.fit(X_tr, y_tr)

# 6. 输出最优结果与 hold‑out AUC
print("Best params      :", search.best_params_)
print("Best CV   AUC    :", search.best_score_)
p_val = search.best_estimator_.predict_proba(X_val)[:,1]
print("Hold‑out Sex AUC :", roc_auc_score(y_val, p_val))


Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, select=SelectKBest(), select__k=10; total time=   0.5s
[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, select=SelectKBest(), select__k=10; total time=   0.5s
[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, select=SelectKBest(), select__k=10; total time=   0.6s
[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, select=SelectKBest(), select__k=10; total time=   0.6s
[CV] END clf=RandomForestClassifier(class_weight='balanced', random_state=42), clf__max_depth=5, clf__n_estimators=100, select=SelectKBest(), select__k=10; total time=   0.6s
[CV] END clf=RandomForestClassifier(class_weight='balanced', ra