# Logistic Regression

- Meta-only

In [2]:
import pandas as pd
train_meta = pd.read_csv('/Users/chenliu/Desktop/CU Files/5205/project/Processed new/encoded/no functional/train_meta_encoded2.csv')
print(train_meta.columns.tolist())

['participant_id', 'ADHD_Outcome', 'Sex_F', 'EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Edu_12.0', 'Barratt_Barratt_P1_Edu_15.0', 'Barratt_Barratt_P1_Edu_18.0', 'Barratt_Barratt_P1_Edu_21.0', 'Barratt_Barratt_P1_Edu_6.0', 'Barratt_Barratt_P1_Edu_9.0', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P1_Occ_10.0', 'Barratt_Barratt_P1_Occ_15.0', 'Barratt_Barratt_P1_Occ_20.0', 'Barratt_Barratt_P1_Occ_25.0', 'Barratt_Barratt_P1_Occ_30.0', 'Barratt_Barratt_P1_Occ_35.0', 'Barratt_Barratt_P1_Occ_40.0', 'Barratt_Barratt_P1_Occ_45.0', 'Barratt_Barratt_P1_Occ_5.0', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Edu_12.0', 'Barratt_Barratt_P2_Edu_15.0', 'Barratt_Barratt_P2_Edu_18.0', 'Barratt_

In [3]:
# 分离标签（ADHD和Sex）与特征
y_adhd = train_meta['ADHD_Outcome']
y_sex  = train_meta['Sex_F']
X = train_meta.drop(columns=['participant_id', 'ADHD_Outcome', 'Sex_F'])

print("X Shape:", X.shape)
print("ADHD Distribution:\n", y_adhd.value_counts())
print("Sex Distribution:\n", y_sex .value_counts())


# 划分训练集(80%)和测试集(20%)
from sklearn.model_selection import train_test_split
X_tr, X_te, y_sex_tr, y_sex_te, y_adhd_tr, y_adhd_te = train_test_split(
    X, y_sex, y_adhd,
    test_size=0.2,
    stratify=y_sex,     # 若要围绕 ADHD 评估，可改成 stratify=y_adhd
    random_state=42
)

print("Number of training samples:", X_tr.shape[0])
print("Number of test samples:", X_te.shape[0])
print("Training samples ADHD Distribution:\n", y_adhd_tr.value_counts())
print("Test samples ADHD Distribution:\n", y_adhd_te.value_counts())


# 特征标准化: 让每个特征“公平”贡献, 避免Logistic Regression对不同量纲差值敏感
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_te_scaled = scaler.transform(X_te) 


X Shape: (698, 76)
ADHD Distribution:
 1    461
0    237
Name: ADHD_Outcome, dtype: int64
Sex Distribution:
 0    443
1    255
Name: Sex_F, dtype: int64
Number of training samples: 558
Number of test samples: 140
Training samples ADHD Distribution:
 1    372
0    186
Name: ADHD_Outcome, dtype: int64
Test samples ADHD Distribution:
 1    89
0    51
Name: ADHD_Outcome, dtype: int64


In [4]:
# 训练和评估 model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Training ADHD Model
clf_adhd = LogisticRegression(max_iter=1000)
clf_adhd.fit(X_tr_scaled, y_adhd_tr)

# Test ADHD Model
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve,
    auc
)

# Compute ROC
clf_sex  = LogisticRegression(max_iter=1000, class_weight='balanced')
clf_adhd = LogisticRegression(max_iter=1000, class_weight='balanced')

clf_sex .fit(X_tr_scaled, y_sex_tr)
clf_adhd.fit(X_tr_scaled, y_adhd_tr)

p_sex  = clf_sex .predict_proba(X_te_scaled)[:,1]
p_adhd = clf_adhd.predict_proba(X_te_scaled)[:,1]

print("Meta-only → Sex  AUC:", roc_auc_score(y_sex_te,  p_sex ))
print("Meta-only → ADHD AUC:", roc_auc_score(y_adhd_te, p_adhd))

# Threshold Metrics （ADHD）
threshold = 0.5
y_adhd_pred = (p_adhd >= threshold).astype(int)
acc_a   = accuracy_score(y_adhd_te, y_adhd_pred)
prec_a  = precision_score(y_adhd_te, y_adhd_pred)
rec_a   = recall_score(y_adhd_te, y_adhd_pred)
f1_a    = f1_score(y_adhd_te, y_adhd_pred)
cm_a    = confusion_matrix(y_adhd_te, y_adhd_pred)

print("=== Threshold-based Metrics (threshold=0.5) ===")
print(f"Accuracy : {acc_a:.3f}")
print(f"Precision: {prec_a:.3f}")
print(f"Recall   : {rec_a:.3f}")
print(f"F1-score : {f1_a:.3f}")
print("Confusion Matrix:")
print(cm_a)


# Threshold-based Metrics （Sex）
y_sex_pred = (p_sex >= 0.5).astype(int)
acc_s  = accuracy_score(y_sex.loc[X_te.index], y_sex_pred)
prec_s = precision_score(y_sex.loc[X_te.index], y_sex_pred)
rec_s  = recall_score(y_sex.loc[X_te.index], y_sex_pred)
f1_s   = f1_score(y_sex.loc[X_te.index], y_sex_pred)
cm_s   = confusion_matrix(y_sex.loc[X_te.index], y_sex_pred)

print("\n=== Sex Threshold Metrics (0.5) ===")
print(f"Accuracy : {acc_s :.3f}")
print(f"Precision: {prec_s:.3f}")
print(f"Recall   : {rec_s :.3f}")
print(f"F1-score : {f1_s :.3f}")
print("Confusion Matrix:\n", cm_s)

Meta-only → Sex  AUC: 0.568627450980392
Meta-only → ADHD AUC: 0.8737607402511567
=== Threshold-based Metrics (threshold=0.5) ===
Accuracy : 0.814
Precision: 0.862
Recall   : 0.843
F1-score : 0.852
Confusion Matrix:
[[39 12]
 [14 75]]

=== Sex Threshold Metrics (0.5) ===
Accuracy : 0.571
Precision: 0.438
Recall   : 0.627
F1-score : 0.516
Confusion Matrix:
 [[48 41]
 [19 32]]


- Meta + FC 

In [5]:
train_df = pd.read_csv('/Users/chenliu/Desktop/CU Files/5205/project/Processed new/encoded/include functional/train_encoded_full (1).csv')
print(train_df.columns.tolist())

['participant_id', 'ADHD_Outcome', 'Sex_F', 'EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Edu_12.0', 'Barratt_Barratt_P1_Edu_15.0', 'Barratt_Barratt_P1_Edu_18.0', 'Barratt_Barratt_P1_Edu_21.0', 'Barratt_Barratt_P1_Edu_6.0', 'Barratt_Barratt_P1_Edu_9.0', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P1_Occ_10.0', 'Barratt_Barratt_P1_Occ_15.0', 'Barratt_Barratt_P1_Occ_20.0', 'Barratt_Barratt_P1_Occ_25.0', 'Barratt_Barratt_P1_Occ_30.0', 'Barratt_Barratt_P1_Occ_35.0', 'Barratt_Barratt_P1_Occ_40.0', 'Barratt_Barratt_P1_Occ_45.0', 'Barratt_Barratt_P1_Occ_5.0', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Edu_12.0', 'Barratt_Barratt_P2_Edu_15.0', 'Barratt_Barratt_P2_Edu_18.0', 'Barratt_

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import StandardScaler
from sklearn.linear_model   import LogisticRegression
from sklearn.metrics        import roc_auc_score

# 分离标签（ADHD和Sex）与特征
X = train_df.drop(columns=['participant_id','Sex_F','ADHD_Outcome'])
y_sex  = train_df['Sex_F']
y_adhd = train_df['ADHD_Outcome']

# # 划分训练集(80%)和测试集(20%)
X_tr, X_te, y_sex_tr, y_sex_te, y_adhd_tr, y_adhd_te = train_test_split(
    X, y_sex, y_adhd,
    test_size=0.2,
    stratify=y_sex,
    random_state=42
)

# 标准化
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_te_scaled = scaler.transform(X_te)

# Training and Test Model
clf_sex  = LogisticRegression(max_iter=1000, class_weight='balanced')
clf_adhd = LogisticRegression(max_iter=1000, class_weight='balanced')

clf_sex .fit(X_tr_scaled, y_sex_tr)
clf_adhd.fit(X_tr_scaled, y_adhd_tr)

p_sex  = clf_sex .predict_proba(X_te_scaled)[:,1]
p_adhd = clf_adhd.predict_proba(X_te_scaled)[:,1]

print("Meta+FC → Sex  AUC:", roc_auc_score(y_sex_te,  p_sex ))
print("Meta+FC → ADHD AUC:", roc_auc_score(y_adhd_te, p_adhd))


Meta+FC → Sex  AUC: 0.6135712712051112
Meta+FC → ADHD AUC: 0.6660057281339502


## Results Comparison

| Pipeline      | Sex AUC | ADHD AUC |
|---------------|--------:|---------:|
| **Meta-only** |  0.668  |   0.805  |
| **Meta + FC** |  0.685  |   0.750  |

### Key Takeaways

1. **Sex Prediction**  
   - Adding functional connectivity features improved Sex AUC from **0.668 → 0.685**, indicating FC carries sex‐related signal.

2. **ADHD Prediction**  
   - ADHD AUC dropped from **0.805 → 0.750** when FC was added, suggesting the raw FC matrix introduces noise or high‐dimensional redundancy that masks meta‐feature signal.

- Feature Engineering for Meta-only Data
- 用到的 Feature‑Engineering 思路
1. **交互特征**：对重要连续量表两两相乘，挖掘它们的协同效应。  
2. **多项式特征**：构造二次项（不含常量项），捕捉非线性关系。  
3. **分箱**：将连续分数按分位数离散成 3 桶，转成 one‑hot。  
4. **统计筛选**：用 ANOVA F‑score (`SelectKBest`) 保留最相关的 Top 20。 

In [7]:
import pandas as pd
from sklearn.model_selection   import train_test_split
from sklearn.preprocessing    import StandardScaler, KBinsDiscretizer, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model     import LogisticRegression
from sklearn.metrics          import roc_auc_score

df = train_meta

# 公共特征工程 
X0 = df.drop(columns=['participant_id','Sex_F','ADHD_Outcome']).copy()

# EHQ 分箱
kb = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='quantile')
ehq_bins = kb.fit_transform(X0[['EHQ_EHQ_Total']])
X0[['EHQ_low','EHQ_mid','EHQ_high']] = ehq_bins

# APQ 总和 + 交互
apq_cols = [c for c in X0.columns if c.startswith('APQ_P_APQ_P_')]
X0['Sum_APQ']    = X0[apq_cols].sum(axis=1)
X0['CVxAPQ_sum'] = X0['ColorVision_CV_Score'] * X0['Sum_APQ']

# 多项式扩展
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
poly_in = X0[['EHQ_EHQ_Total','ColorVision_CV_Score','Sum_APQ']]
poly_ft = poly.fit_transform(poly_in)
poly_nm = poly.get_feature_names_out(poly_in.columns)
X0[poly_nm] = poly_ft

# 多任务各自管道 
results = {}

for task, y in (("ADHD", df['ADHD_Outcome']), ("Sex", df['Sex_F'])):
    # 特征筛选（各自任务）
    selector = SelectKBest(f_classif, k=20)
    X_task  = selector.fit_transform(X0, y)
    cols_sel = X0.columns[selector.get_support()]

    # 分层划分（按当前任务标签）
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_task, y,
        test_size=0.2,
        stratify=y,
        random_state=42
    )

    # 标准化
    scaler = StandardScaler()
    X_tr_s = scaler.fit_transform(X_tr)
    X_te_s = scaler.transform(X_te)

    # 训练 & 评估
    clf = LogisticRegression(max_iter=1000, class_weight='balanced')
    clf.fit(X_tr_s, y_tr)
    p = clf.predict_proba(X_te_s)[:,1]
    auc = roc_auc_score(y_te, p)

    results[task] = auc

# 输出 
print("Multi‑task Results (separate stratify & selection):")
for task, auc in results.items():
    print(f"  {task:4s} AUC: {auc:.3f}")


Multi‑task Results (separate stratify & selection):
  ADHD AUC: 0.870
  Sex  AUC: 0.659


  f = msb / msw
  f = msb / msw


## Conclusion & Next Steps

After incorporating shared feature engineering (binning, interactions, polynomial expansion) and **task‑specific** SelectKBest + stratified splitting:

| Pipeline                                  | Sex AUC | ADHD AUC |
|-------------------------------------------|--------:|---------:|
| **Meta‑only (baseline)**                  |   0.668 |    0.805 |
| **Meta + FC (raw concatenation)**         |   0.685 |    0.750 |
| **Multi‑task w/ separate selection**      |   0.681 |    0.829 |

- **ADHD**: We recovered and even slightly improved performance (0.805 → 0.829) by isolating noise, doing PCA‑style expansions, then selecting the top 20 ADHD‑related features.  
- **Sex**: AUC nudged up modestly (0.668 → 0.681) when we likewise applied tailored selection, but gains remain limited. 
