In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.model_selection import cross_val_score 
import time
import matplotlib.pyplot as plt

In [3]:
# Load the raw dataset into `df` and report its dimensions.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will only run where this exact file exists.
raw_csv_path = r"C:\Users\04ama\Downloads\raw adhd data\raw_dataset.csv"
df = pd.read_csv(raw_csv_path)
print(f"Original dataset shape: {df.shape}")


Original dataset shape: (1213, 19930)


🔄 Preparing Data for ADASYN
📊 Original imbalance ratio: 2.175:1

🎯 Applying ADASYN...


ValueError: Input X contains NaN.
ADASYN does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [6]:
print("ðŸ”„ Preparing Data for ADASYN")

# Target vector: the ADHD outcome column.
y = df['ADHD_Outcome']

# Feature frame: everything except the target and the participant id.
# errors='ignore' means a missing column is skipped instead of raising.
X = df.drop(
    columns=['ADHD_Outcome', 'participant_id'],
    errors='ignore',
)

🔄 Preparing Data for ADASYN


In [7]:
# Column groups, identified purely by column-name prefix.
# str.startswith accepts a tuple and matches if any prefix matches.
quant_cols = [
    col for col in df.columns
    if col.startswith(('APQ_', 'SDQ_', 'EHQ_', 'ColorVision'))
]
cat_cols = [
    col for col in df.columns
    if col.startswith(('PreInt_', 'Basic_', 'Handedness', 'Sex_F'))
]

# Positional slice: the columns at positions 1 through 19901 of `df`.
# NOTE(review): presumably the connectome features, judging by the name;
# the hard-coded bound depends on the exact column order of the CSV -- confirm.
conn_cols = list(df.columns[1:19902])

In [8]:
# Two-stage stratified split.
# Stage 1 holds out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Stage 2 takes 25% of the remaining 80% (i.e. 20% of all rows) as validation,
# leaving 60% of the rows for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp,
)

In [9]:
# Preprocessing transformers, fitted later on the training split only.
scaler = StandardScaler()              # standardises the quantitative columns
imputer = KNNImputer(n_neighbors=5)    # fills missing values from the 5 nearest rows

In [10]:
# Work on copies so the raw splits (X_train / X_val / X_test) stay untouched
# by the in-place column assignments done during scaling and imputation.
X_train_processed, X_val_processed, X_test_processed = (
    raw_split.copy() for raw_split in (X_train, X_val, X_test)
)

In [11]:
print("Scaling quantitative features")

# Learn the scaling statistics from the training split only, then reuse the
# fitted scaler on the validation and test splits.
X_train_processed[quant_cols] = scaler.fit_transform(X_train_processed[quant_cols])
for holdout_frame in (X_val_processed, X_test_processed):
    holdout_frame[quant_cols] = scaler.transform(holdout_frame[quant_cols])

Scaling quantitative features


In [12]:
# Quantitative columns first, then categorical ones.
all_feature_cols = [*quant_cols, *cat_cols]

# Fit the KNN imputer on the training split only and apply the fitted imputer
# to the validation and test splits.
# NOTE(review): the categorical columns are imputed by the same KNN imputer,
# so filled-in values may not be valid category codes -- confirm this is intended.
X_train_processed[all_feature_cols] = imputer.fit_transform(
    X_train_processed[all_feature_cols]
)
for holdout_frame in (X_val_processed, X_test_processed):
    holdout_frame[all_feature_cols] = imputer.transform(
        holdout_frame[all_feature_cols]
    )

In [14]:
# =============================================================================
# ADASYN OVERSAMPLING OF THE TRAINING SPLIT
# =============================================================================
# Reports the class imbalance of y_train, resamples the imputed training
# features with ADASYN, and reports the imbalance again.
# NOTE(review): the SFS cell further down fits on X_train_processed / y_train,
# not on X_train_balanced / y_train_balanced produced here -- confirm intent.

from collections import Counter
from imblearn.over_sampling import ADASYN

# Imbalance before resampling: largest class count over smallest class count.
print("\nðŸ“Š Original training imbalance:")
original_counts = Counter(y_train)
class_sizes = original_counts.values()
original_ratio = max(class_sizes) / min(class_sizes)
print(f"   â€¢ Original imbalance ratio: {original_ratio:.3f}:1")

# Resample only the training split; ADASYN needs NaN-free input, hence the
# imputed feature columns are used.
print("\nðŸŽ¯ Applying ADASYN to training data...")
adasyn = ADASYN(
    n_neighbors=15,
    random_state=42,
    sampling_strategy='auto',
)
X_train_balanced, y_train_balanced = adasyn.fit_resample(
    X_train_processed[all_feature_cols],
    y_train,
)

# Imbalance after resampling, computed the same way.
new_counts = Counter(y_train_balanced)
balanced_sizes = new_counts.values()
new_ratio = max(balanced_sizes) / min(balanced_sizes)
print(f"   â€¢ New imbalance ratio: {new_ratio:.3f}:1")


📊 Original training imbalance:
   • Original imbalance ratio: 2.175:1

🎯 Applying ADASYN to training data...
   • New imbalance ratio: 1.004:1


In [22]:


# Forward Sequential Feature Selection (SFS) experiment.
#
# For every requested subset size this cell:
#   1. runs forward SFS with a class-weighted logistic regression,
#   2. cross-validates the same model on the selected columns,
#   3. refits on the full training split and scores validation and test sets,
#   4. stores everything in `sfs_results`, keyed by the requested size.
# A summary table (`comparison_df`) is printed at the end.

# Requested subset sizes to try
n_features_list = [5,10, 15, 20, 21]
sfs_results = {}

# Base estimator shared by SFS and cross-validation (both clone it internally)
base_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')

# Loop-invariant inputs, hoisted out of the loop.
# Training data: scaled quantitative + categorical columns, imputed.
X_train_sfs = X_train_processed[all_feature_cols]
n_candidate_features = len(all_feature_cols)

# Maximum number of selected features listed in the printout
max_features_shown = 15

for n_features in n_features_list:

    # SequentialFeatureSelector requires n_features_to_select to be strictly
    # smaller than the number of candidate columns, so cap at one less.
    n_to_select = min(n_features, n_candidate_features - 1)

    print(f"TESTING SFS WITH {n_features} FEATURES")

    start_time = time.time()

    # Initialize Sequential Feature Selector
    sfs = SequentialFeatureSelector(
        estimator=base_model,
        n_features_to_select=n_to_select,
        direction='forward',
        scoring='f1_macro',
        cv=5,
        n_jobs=-1,
    )

    print(f"Running forward selection on {n_candidate_features} features...")
    print(f"Target: Select {n_to_select} best features")

    # Fit SFS on the training split only
    sfs.fit(X_train_sfs, y_train)

    # Map the boolean support mask back to column names
    selected_mask = sfs.get_support()
    selected_features = [col for col, selected in zip(all_feature_cols, selected_mask) if selected]

    sfs_time = time.time() - start_time

    print(f"\nSFS completed in {sfs_time:.1f} seconds")
    print(f"Selected {len(selected_features)} features:")

    # Categorize selected features
    selected_quant = [f for f in selected_features if f in quant_cols]
    selected_cat = [f for f in selected_features if f in cat_cols]

    print(f"  - Quantitative: {len(selected_quant)}")
    print(f"  - Categorical: {len(selected_cat)}")

    # List at most `max_features_shown` features; the "more" count uses the
    # same limit so it matches what was actually left out.
    print(f"\nTop selected features:")
    for i, feat in enumerate(selected_features[:max_features_shown]):
        feat_type = "Quantitative" if feat in quant_cols else "Categorical"
        print(f"  {i+1:2d}. {feat} ({feat_type})")
    n_not_shown = len(selected_features) - max_features_shown
    if n_not_shown > 0:
        print(f"      ... and {n_not_shown} more")

    print(f"\n--- Cross-Validation Evaluation ---")

    # Restrict every split to the selected columns
    X_train_selected = X_train_processed[selected_features]
    X_val_selected = X_val_processed[selected_features]
    X_test_selected = X_test_processed[selected_features]

    # Cross-validation F1 scores.
    # NOTE: the features were chosen by CV on this same training split, so
    # these scores are optimistically biased.
    cv_f1_scores = cross_val_score(
        base_model,
        X_train_selected,
        y_train,
        cv=5,
        scoring='f1_macro',
        n_jobs=-1
    )

    mean_f1 = np.mean(cv_f1_scores)
    median_f1 = np.median(cv_f1_scores)
    std_f1 = np.std(cv_f1_scores)

    print(f"Cross-validation F1-Macro scores: {cv_f1_scores}")
    print(f"Mean F1-Macro: {mean_f1:.4f}")
    print(f"Median F1-Macro: {median_f1:.4f}")
    print(f"Std Dev F1-Macro: {std_f1:.4f}")

    # Train final model for validation/test evaluation
    final_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
    final_model.fit(X_train_selected, y_train)

    # Validation set evaluation
    y_val_pred = final_model.predict(X_val_selected)
    val_report = classification_report(y_val, y_val_pred, output_dict=True)

    # Test set evaluation. These numbers are for reporting only; comparing
    # configurations should rely on the CV / validation scores.
    y_test_pred = final_model.predict(X_test_selected)
    y_test_proba = final_model.predict_proba(X_test_selected)[:, 1]
    test_report = classification_report(y_test, y_test_pred, output_dict=True)
    # roc_curve returns (fpr, tpr, thresholds); auc needs only the first two
    test_auc = auc(*roc_curve(y_test, y_test_proba)[:2])

    # NOTE(review): the '1' key assumes the positive class label renders as
    # the string '1' (e.g. integer 0/1 labels) -- confirm for this dataset.
    print(f"\nValidation F1-Macro: {val_report['macro avg']['f1-score']:.4f}")
    print(f"Test F1-Macro: {test_report['macro avg']['f1-score']:.4f}")
    print(f"Test AUC: {test_auc:.4f}")
    print(f"Test Accuracy: {test_report['accuracy']:.4f}")
    print(f"ADHD Precision: {test_report['1']['precision']:.4f}")
    print(f"ADHD Recall: {test_report['1']['recall']:.4f}")

    # Keyed by the requested size; 'n_features' holds the number actually selected
    sfs_results[n_features] = {
        'selected_features': selected_features,
        'selected_quant': selected_quant,
        'selected_cat': selected_cat,
        'cv_f1_scores': cv_f1_scores,
        'mean_f1': mean_f1,
        'std_f1': std_f1,
        'val_f1': val_report['macro avg']['f1-score'],
        'test_f1': test_report['macro avg']['f1-score'],
        'test_auc': test_auc,
        'adhd_precision': test_report['1']['precision'],
        'adhd_recall': test_report['1']['recall'],
        'sfs_time': sfs_time,
        'n_features': len(selected_features)
    }

# One row per requested size, ordered by that size
comparison_df = pd.DataFrame.from_dict(sfs_results, orient='index')
comparison_df.index.name = 'n_features'
comparison_df = comparison_df.sort_index()
print("\n=== SFS Experiment Summary ===")
print(comparison_df[['mean_f1', 'val_f1', 'test_f1', 'test_auc', 'n_features']])


TESTING SFS WITH 5 FEATURES
Running forward selection on 22 features...
Target: Select 5 best features

SFS completed in 3.8 seconds
Selected 5 features:
  - Quantitative: 4
  - Categorical: 1

Top selected features:
   1. ColorVision_CV_Score (Quantitative)
   2. APQ_P_APQ_P_OPD (Quantitative)
   3. SDQ_SDQ_Emotional_Problems (Quantitative)
   4. SDQ_SDQ_Hyperactivity (Quantitative)
   5. Basic_Demos_Study_Site (Categorical)

--- Cross-Validation Evaluation ---
Cross-validation F1-Macro scores: [0.74822479 0.72918233 0.76881378 0.77105263 0.78631579]
Mean F1-Macro: 0.7607
Median F1-Macro: 0.7688
Std Dev F1-Macro: 0.0199

Validation F1-Macro: 0.7562
Test F1-Macro: 0.7139
Test AUC: 0.8037
Test Accuracy: 0.7325
ADHD Precision: 0.8633
ADHD Recall: 0.7229
TESTING SFS WITH 10 FEATURES
Running forward selection on 22 features...
Target: Select 10 best features

SFS completed in 8.0 seconds
Selected 10 features:
  - Quantitative: 7
  - Categorical: 3

Top selected features:
   1. ColorVision_

In [23]:
# Report the best SFS configuration.
#
# The winner is chosen by validation F1, not test F1: picking the model on the
# test set would make the test metrics printed below optimistically biased.
# The index label of `comparison_df` is the key used in `sfs_results`
# (the requested size), so it is used for the lookup directly; the
# 'n_features' column holds the number of features actually selected.
best_key = comparison_df['val_f1'].idxmax()
best_results = sfs_results[best_key]
best_n_features = best_results['n_features']

print(f"\nBEST CONFIGURATION: {best_n_features} features")
print(f"Cross-validation F1-Macro: {best_results['mean_f1']:.4f} Â± {best_results['std_f1']:.4f}")
print(f"Test F1-Macro: {best_results['test_f1']:.4f}")
print(f"Test AUC: {best_results['test_auc']:.4f}")
print(f"ADHD Precision: {best_results['adhd_precision']:.4f}")
print(f"ADHD Recall: {best_results['adhd_recall']:.4f}")

# List the winning features, grouped by type and numbered from 1
print(f"\nBest selected features ({len(best_results['selected_features'])}):")
print(f"\nQuantitative features ({len(best_results['selected_quant'])}):")
for rank, feat in enumerate(best_results['selected_quant'], start=1):
    print(f"  {rank:2d}. {feat}")

print(f"\nCategorical features ({len(best_results['selected_cat'])}):")
for rank, feat in enumerate(best_results['selected_cat'], start=1):
    print(f"  {rank:2d}. {feat}")



BEST CONFIGURATION: 20 features
Cross-validation F1-Macro: 0.7519 ± 0.0134
Test F1-Macro: 0.7314
Test AUC: 0.7978
ADHD Precision: 0.8630
ADHD Recall: 0.7590

Best selected features (20):

Quantitative features (16):
   1. ColorVision_CV_Score
   2. APQ_P_APQ_P_CP
   3. APQ_P_APQ_P_ID
   4. APQ_P_APQ_P_INV
   5. APQ_P_APQ_P_OPD
   6. APQ_P_APQ_P_PM
   7. APQ_P_APQ_P_PP
   8. SDQ_SDQ_Conduct_Problems
   9. SDQ_SDQ_Difficulties_Total
  10. SDQ_SDQ_Emotional_Problems
  11. SDQ_SDQ_Externalizing
  12. SDQ_SDQ_Generating_Impact
  13. SDQ_SDQ_Hyperactivity
  14. SDQ_SDQ_Internalizing
  15. SDQ_SDQ_Peer_Problems
  16. SDQ_SDQ_Prosocial

Categorical features (4):
   1. Basic_Demos_Enroll_Year
   2. Basic_Demos_Study_Site
   3. PreInt_Demos_Fam_Child_Ethnicity
   4. Sex_F
