In [1]:
import json
import os
from collections import Counter

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import ADASYN
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Location of the merged raw dataset. Set the ADHD_RAW_DATA_PATH environment
# variable to run the notebook on another machine; the default is the original
# author's local path.
RAW_DATA_PATH = os.environ.get(
    "ADHD_RAW_DATA_PATH",
    r"C:\Users\04ama\Downloads\raw adhd data\raw_dataset.csv",
)
df = pd.read_csv(RAW_DATA_PATH)

In [3]:


# Drop the scan-age column when the raw file carries it.
if 'MRI_Track_Age_at_Scan' in df.columns:
    df = df.drop(columns=['MRI_Track_Age_at_Scan'])

# Group feature columns by name prefix. The target and participant_id match
# none of these prefixes, so they are excluded automatically.
QUANT_PREFIXES = ('APQ_', 'SDQ_', 'EHQ_', 'ColorVision')
CAT_PREFIXES = ('PreInt_', 'Basic_', 'Sex_F')
quant_cols = [col for col in df.columns if col.startswith(QUANT_PREFIXES)]
cat_cols = [col for col in df.columns if col.startswith(CAT_PREFIXES)]

# Connectome columns are named '<i>throw_<j>thcolumn'. Selecting them by name
# (instead of by position) keeps the list correct if the column order changes
# and guarantees no metadata column is swept into the PCA input.
conn_mask = df.columns.str.match(r'^\d+throw_\d+thcolumn$')
conn_cols = list(df.columns[conn_mask])
if not conn_cols:
    # Fallback: original positional slice, for files using another naming scheme.
    conn_cols = list(df.iloc[:, 1:19902].columns)

# Remove quantitative columns that are highly correlated (>= threshold) with an
# earlier quantitative column. Only the strict upper triangle is inspected so
# that one column of each correlated pair is kept.
# NOTE(review): the correlation is computed on the full dataset, i.e. before
# the train/validation/test split performed later in the notebook.
CORR_THRESHOLD = 0.7
quant_df = df[quant_cols].copy()
corr = quant_df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if (upper[column] >= CORR_THRESHOLD).any()]
df = df.drop(columns=to_drop)
quant_cols = [col for col in quant_cols if col not in to_drop]

In [4]:
# Preview the first rows of the frame after the column pruning in the previous cell.
df.head()

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,ADHD_Outcome,Sex_F
0,70z8Q2xdTXM3,0.22293,0.527903,0.429966,0.060457,0.566489,0.315342,0.508408,-0.07829,0.525692,...,1,0.0,1.0,2.0,21.0,45.0,21.0,45.0,1,0
1,WHWymJu6zNZi,0.614765,0.577255,0.496127,0.496606,0.404686,0.439724,0.12259,-0.085452,0.120673,...,1,1.0,8.0,1.0,6.0,5.0,,15.0,1,1
2,4PAQp1M6EyAo,-0.116833,0.458408,0.260703,0.639031,0.769337,0.442528,0.63711,0.19201,0.520379,...,1,0.0,0.0,2.0,18.0,35.0,9.0,20.0,1,1
3,obEacy4Of68I,0.199688,0.752714,0.658283,0.575096,0.692867,0.645789,0.52275,0.412188,0.530843,...,1,0.0,0.0,2.0,21.0,40.0,21.0,40.0,1,1
4,s7WzzDcmDOhF,0.227321,0.613268,0.621447,0.562673,0.736709,0.589813,0.266676,0.359668,0.300771,...,1,2.0,8.0,2.0,9.0,35.0,,,1,1


In [5]:
# Persist the three feature-name lists as JSON, one file per column group.
for out_name, columns in (
    ('quant_cols.json', quant_cols),
    ('cat_cols.json', cat_cols),
    ('conn_cols.json', conn_cols),
):
    with open(out_name, 'w') as handle:
        json.dump(columns, handle)

In [None]:
# STEP 1: SPLIT DATA FIRST (BEFORE ANY PREPROCESSING)
print("🔄 SPLITTING DATA BEFORE PREPROCESSING")
print("=" * 50)

target_col = 'ADHD_Outcome'
SPLIT_SEED = 42
TEST_FRACTION = 0.2           # share of all rows held out for the final test
VAL_FRACTION_OF_REST = 0.25   # 25% of the remaining 80% == 20% of all rows

# Features are every column except the target and the participant identifier.
X = df.drop(columns=[target_col, 'participant_id'], errors='ignore')
y = df[target_col]

# First split: train+val vs test (80% vs 20%), stratified on the target.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED, stratify=y
)

# Second split: train vs val (60% vs 20% of total), stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=VAL_FRACTION_OF_REST, random_state=SPLIT_SEED, stratify=y_temp
)

# The three splits must partition the data exactly.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

print(f"Training set: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X):.1%})")
print(f"Validation set: {X_val.shape[0]:,} samples ({X_val.shape[0]/len(X):.1%})")
print(f"Test set: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X):.1%})")

# Class counts per split, to confirm stratification kept them comparable.
print(f"\nOriginal distributions:")
print(f"Training: {dict(Counter(y_train))}")
print(f"Validation: {dict(Counter(y_val))}")
print(f"Test: {dict(Counter(y_test))}")


2. AFTER IMPUTATION:
Quant cols NaN: 0
Cat cols NaN: 0


In [None]:
# STEP 2: APPLY PREPROCESSING TO ALL SPLITS
# Every transformer is fitted on the training split only and then applied to
# the training, validation and test splits; each fitted transformer is saved
# with joblib.
print("\n🔧 APPLYING PREPROCESSING")
print("=" * 50)

N_CONN_COMPONENTS = 10  # principal components kept for the connectome block
PCA_SEED = 42           # PCA may use a randomized solver on wide data


def _swap_columns(frame, old_cols, values, new_cols):
    """Return `frame` with `old_cols` removed and `values` appended as `new_cols`.

    `values` is a 2-D array with one row per row of `frame`; the new columns
    keep `frame`'s index so the rows stay aligned.
    """
    block = pd.DataFrame(values, columns=new_cols, index=frame.index)
    return pd.concat([frame.drop(columns=old_cols), block], axis=1)


# Work on explicit copies so the column assignments below never write into a
# frame pandas still tracks as derived from `X` (avoids SettingWithCopyWarning).
splits = {'train': X_train.copy(), 'val': X_val.copy(), 'test': X_test.copy()}

# 2.1 Impute missing values (fit on training, transform all)
print("Imputing missing values...")
impute_cols = quant_cols + cat_cols
imputer = KNNImputer(n_neighbors=5)
imputer.fit(splits['train'][impute_cols])

for frame in splits.values():
    frame[impute_cols] = imputer.transform(frame[impute_cols])
    if cat_cols:
        # KNN imputation averages neighbour values, which yields fractional
        # category codes; round them back to whole codes so the one-hot
        # encoder does not learn spurious categories such as "0.4".
        frame[cat_cols] = frame[cat_cols].round()

joblib.dump(imputer, 'imputer.joblib')
print("✅ Imputation complete")

# 2.2 Scale quantitative columns (fit on training, transform all)
print("Scaling quantitative features...")
scaler = StandardScaler()
scaler.fit(splits['train'][quant_cols])

for frame in splits.values():
    frame[quant_cols] = scaler.transform(frame[quant_cols])

joblib.dump(scaler, 'scaler.joblib')
print("✅ Scaling complete")

# 2.3 One-hot encode categorical columns (fit on training, transform all)
print("Encoding categorical features...")
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(splits['train'][cat_cols])

# Replace the raw categorical columns by their one-hot expansion.
encoded_cols = encoder.get_feature_names_out(cat_cols)
splits = {
    split_name: _swap_columns(frame, cat_cols, encoder.transform(frame[cat_cols]), encoded_cols)
    for split_name, frame in splits.items()
}

joblib.dump(encoder, 'encoder.joblib')
print("✅ Encoding complete")

# 2.4 PCA for connectome (fit on training, transform all)
if len(conn_cols) > 0:
    print("Applying PCA to connectome features...")
    # A fixed random_state keeps the components reproducible when the
    # 'auto' solver selects the randomized SVD.
    pca = PCA(n_components=N_CONN_COMPONENTS, random_state=PCA_SEED)
    pca.fit(splits['train'][conn_cols])

    # Replace the raw connectome columns by the PCA components.
    pca_cols = [f'conn_pca_{i+1}' for i in range(pca.n_components_)]
    splits = {
        split_name: _swap_columns(frame, conn_cols, pca.transform(frame[conn_cols]), pca_cols)
        for split_name, frame in splits.items()
    }

    joblib.dump(pca, 'pca_connectome.joblib')
    print("✅ PCA complete")

# Publish the processed frames under the names the following cells use.
X_train, X_val, X_test = splits['train'], splits['val'], splits['test']

print(f"\nPreprocessing complete:")
print(f"Training features: {X_train.shape[1]}")
print(f"Validation features: {X_val.shape[1]}")
print(f"Test features: {X_test.shape[1]}")

In [None]:
# Final NaN check and cleanup for all splits
print("\n🧹 FINAL CLEANUP")
print("=" * 30)

# Fill values are learned from the training split only and reused for the
# validation and test splits, so no statistic of the held-out data is used.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
train_medians = X_train[numeric_cols].median()

cleaned = {}
for name, X_split in [('Training', X_train), ('Validation', X_val), ('Test', X_test)]:
    nan_count = X_split.isnull().sum().sum()
    print(f"{name} NaN count: {nan_count}")

    if nan_count > 0:
        print(f"Cleaning {name} set...")
        # Numeric columns get the training median; anything still missing
        # (non-numeric columns, or columns whose training median is NaN) gets 0.
        X_split = X_split.fillna(train_medians).fillna(0)

    cleaned[name] = X_split

X_train, X_val, X_test = cleaned['Training'], cleaned['Validation'], cleaned['Test']

print("✅ All splits cleaned")

Final NaN check: 469
Filling remaining NaN values...
NaN after final cleanup: 0


In [None]:
# STEP 3: APPLY ADASYN ONLY TO TRAINING DATA
print("\n🎯 APPLYING ADASYN TO TRAINING DATA ONLY")
print("=" * 50)

ADASYN_TARGET_RATIO = 0.7  # desired minority/majority ratio after oversampling

# Check original training distribution
original_counts = Counter(y_train)
original_ratio = max(original_counts.values()) / min(original_counts.values())
print(f"Original training distribution: {dict(original_counts)}")
print(f"Original imbalance ratio: {original_ratio:.3f}:1")

minority_share = min(original_counts.values()) / max(original_counts.values())
if minority_share >= ADASYN_TARGET_RATIO:
    # The oversampler cannot generate samples when the target ratio is already
    # met (e.g. when this cell is re-run on already balanced data), so the
    # training data is passed through unchanged.
    print(f"\nMinority/majority ratio is already {minority_share:.3f} "
          f"(target {ADASYN_TARGET_RATIO}); skipping ADASYN")
    X_train_balanced, y_train_balanced = X_train, y_train
else:
    # Apply ADASYN with specified parameters
    adasyn = ADASYN(n_neighbors=15, random_state=42, sampling_strategy=ADASYN_TARGET_RATIO)
    X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)
    print(f"\nADASYN applied successfully:")

# Check new training distribution
new_counts = Counter(y_train_balanced)
new_ratio = max(new_counts.values()) / min(new_counts.values())
print(f"New training distribution: {dict(new_counts)}")
print(f"New imbalance ratio: {new_ratio:.3f}:1")
print(f"Samples added: {len(X_train_balanced) - len(X_train):,}")
print(f"Balance improvement: {original_ratio - new_ratio:.3f}")

# Validation and test sets remain unchanged (no data leakage!)
print(f"\n✅ DATA PREPARATION COMPLETE (NO DATA LEAKAGE)")
print(f"Training (balanced): {X_train_balanced.shape}")
print(f"Validation (original): {X_val.shape}")
print(f"Test (original): {X_test.shape}")

# Update training data to use balanced version.
# NOTE: this overwrites X_train / y_train, so re-running this cell operates on
# the already resampled data rather than on the original training split.
X_train = X_train_balanced
y_train = y_train_balanced

In [None]:
# STEP 4: TRAIN MODEL WITH BALANCED TRAINING DATA
print("\n🤖 TRAINING MODEL")
print("=" * 30)

# Logistic regression with fixed hyper-parameters (L2 strength C=0.1,
# class-weighted loss, liblinear solver).
lr = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
    solver='liblinear',
    C=0.1
)

lr.fit(X_train, y_train)
print("✅ Model training complete")

# Decision threshold applied to the predicted probability of class 1.
threshold = 0.45


def _evaluate(split_name, features, labels):
    """Print thresholded metrics for one split.

    Returns (probabilities, predictions, classification report,
    confusion matrix, macro F1) for that split.
    """
    proba = lr.predict_proba(features)[:, 1]
    pred = (proba >= threshold).astype(int)
    split_report = classification_report(labels, pred)
    split_cm = confusion_matrix(labels, pred)
    split_f1 = f1_score(labels, pred, average='macro')

    print(f"\n📊 {split_name} SET EVALUATION (threshold={threshold})")
    print("=" * 50)
    print(split_report)
    print("Confusion Matrix:")
    print(split_cm)
    print(f"Macro F1-score: {split_f1:.4f}")
    return proba, pred, split_report, split_cm, split_f1


# Report the validation split first: it is the place to judge the threshold
# without consuming the test set.
_evaluate("VALIDATION", X_val, y_val)

# Evaluate on test set with custom threshold
y_test_proba, y_test_pred_custom, report, cm, macro_f1 = _evaluate("TEST", X_test, y_test)

# Save model and threshold
joblib.dump(lr, 'adhd_logistic_model.joblib')

with open('adhd_lr_threshold.json', 'w') as f:
    json.dump({'threshold': threshold}, f)

print(f"\n✅ Model and threshold saved")
print(f"Model file: adhd_logistic_model.joblib")
print(f"Threshold file: adhd_lr_threshold.json")