In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA
import xgboost as xgb
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore')
# Excel handling
import openpyxl
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import (
    Alignment, 
    Font, 
    PatternFill, 
    Border, 
    Side
)

# Set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# 1. Load all datasets

In [6]:
# Training data
train_categorical = pd.read_excel("widsdatathon2025/TRAIN_NEW/TRAIN_CATEGORICAL_METADATA_new.xlsx")
train_quantitative = pd.read_excel("widsdatathon2025/TRAIN_NEW/TRAIN_QUANTITATIVE_METADATA_new.xlsx")
train_connectome = pd.read_csv("widsdatathon2025/TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv")
train_solutions = pd.read_excel("widsdatathon2025/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx")

# Test data
test_categorical = pd.read_excel("widsdatathon2025/TEST/TEST_CATEGORICAL.xlsx")
test_quantitative = pd.read_excel("widsdatathon2025/TEST/TEST_QUANTITATIVE_METADATA.xlsx")
test_connectome = pd.read_csv("widsdatathon2025/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv")

print(f"Training data shapes:")
print(f"Categorical metadata: {train_categorical.shape}")
print(f"Quantitative metadata: {train_quantitative.shape}")
print(f"Functional connectomes: {train_connectome.shape}")
print(f"Target solutions: {train_solutions.shape}")

Training data shapes:
Categorical metadata: (1213, 10)
Quantitative metadata: (1213, 19)
Functional connectomes: (1213, 19901)
Target solutions: (1213, 3)


# 2. Exploratory Data Analysis

In [7]:
print("\nPerforming exploratory data analysis...")

# Class distribution analysis
train_solutions_dist = train_solutions.copy()
class_dist = pd.crosstab(train_solutions_dist['ADHD_Outcome'], train_solutions_dist['Sex_F'], 
                         rownames=['ADHD'], colnames=['Female'])
print("\nClass distribution:")
print(class_dist)

# Visualize class distribution
plt.figure(figsize=(10, 6))
sns.countplot(data=train_solutions_dist, x='ADHD_Outcome', hue='Sex_F')
plt.title('Distribution of ADHD and Sex in Training Data')
plt.xlabel('ADHD Diagnosis (1=yes, 0=no)')
plt.ylabel('Count')
plt.legend(['Male', 'Female'])
plt.savefig('class_distribution.png')
plt.close()




Performing exploratory data analysis...

Class distribution:
Female    0    1
ADHD            
0       216  166
1       581  250


# 3. Data Preprocessing

In [8]:
print("\nPreprocessing data...")

# 3.1 Set participant_id as index
train_categorical_clean = train_categorical.set_index('participant_id')
train_quantitative_clean = train_quantitative.set_index('participant_id')
train_connectome_clean = train_connectome.set_index('participant_id')
train_solutions_clean = train_solutions.set_index('participant_id')

# Find common participants across all datasets
common_participants = set(train_categorical_clean.index) & set(train_quantitative_clean.index) & \
                     set(train_connectome_clean.index) & set(train_solutions_clean.index)
print(f"Number of common participants across all training datasets: {len(common_participants)}")

# Filter all dataframes to include only common participants
train_categorical_clean = train_categorical_clean.loc[list(common_participants)]
train_quantitative_clean = train_quantitative_clean.loc[list(common_participants)]
train_connectome_clean = train_connectome_clean.loc[list(common_participants)]
train_solutions_clean = train_solutions_clean.loc[list(common_participants)]

# Setup test data
test_categorical_clean = test_categorical.set_index('participant_id')
test_quantitative_clean = test_quantitative.set_index('participant_id')
test_connectome_clean = test_connectome.set_index('participant_id')

# 3.2 Advanced missing value handling
print("\nHandling missing values...")

# Track missing value percentages
missing_cat = train_categorical_clean.isna().mean().sort_values(ascending=False)
missing_quant = train_quantitative_clean.isna().mean().sort_values(ascending=False)

print(f"Top 5 categorical features with missing values:")
print(missing_cat.head(5))
print(f"Top 5 quantitative features with missing values:")
print(missing_quant.head(5))

# For quantitative data - use KNN imputation for better accuracy
quant_features = train_quantitative_clean.columns
knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
train_quantitative_imputed = pd.DataFrame(
    knn_imputer.fit_transform(train_quantitative_clean), 
    index=train_quantitative_clean.index,
    columns=quant_features
)

# Impute test quantitative data
test_quantitative_imputed = pd.DataFrame(
    knn_imputer.transform(test_quantitative_clean),
    index=test_quantitative_clean.index,
    columns=quant_features
)

# For categorical data - use most frequent imputation
cat_features = train_categorical_clean.columns
cat_imputer = SimpleImputer(strategy='most_frequent')
train_categorical_imputed = pd.DataFrame(
    cat_imputer.fit_transform(train_categorical_clean),
    index=train_categorical_clean.index,
    columns=cat_features
)

# Impute test categorical data
test_categorical_imputed = pd.DataFrame(
    cat_imputer.transform(test_categorical_clean),
    index=test_categorical_clean.index,
    columns=cat_features
)

# 3.3 Process connectome data
print("\nProcessing connectome data...")

# Check for missing values in connectome data
connectome_na_counts = train_connectome_clean.isna().sum()
print(f"Are there missing values in connectome data? {(connectome_na_counts > 0).any()}")

# Impute if there are missing values
connectome_features = train_connectome_clean.columns
if (connectome_na_counts > 0).any():
    connectome_imputer = SimpleImputer(strategy='mean')
    train_connectome_imputed = pd.DataFrame(
        connectome_imputer.fit_transform(train_connectome_clean),
        index=train_connectome_clean.index,
        columns=connectome_features
    )
    test_connectome_imputed = pd.DataFrame(
        connectome_imputer.transform(test_connectome_clean),
        index=test_connectome_clean.index,
        columns=connectome_features
    )
else:
    train_connectome_imputed = train_connectome_clean.copy()
    test_connectome_imputed = test_connectome_clean.copy()

# 3.4 Advanced feature engineering from connectome data
print("\nAdvanced feature engineering on connectome data...")

def extract_advanced_connectome_features(connectome_df):
    """
    Extract advanced connectivity features from the connectome matrices.
    Assumes each row represents a participant's flattened connectivity matrix.
    """
    summary_df = pd.DataFrame(index=connectome_df.index)
    
    # Basic statistics
    summary_df['connectome_mean'] = connectome_df.mean(axis=1)
    summary_df['connectome_std'] = connectome_df.std(axis=1)
    summary_df['connectome_min'] = connectome_df.min(axis=1)
    summary_df['connectome_max'] = connectome_df.max(axis=1)
    summary_df['connectome_median'] = connectome_df.median(axis=1)
    
    # Quantiles
    for q in [0.1, 0.25, 0.5, 0.75, 0.9]:
        summary_df[f'connectome_q{int(q*100)}'] = connectome_df.quantile(q, axis=1)
    
    # Advanced metrics - calculate from matrix shape if possible
    # Assuming the connectome is a flattened connectivity matrix
    num_features = connectome_df.shape[1]
    
    # Try to infer the number of brain regions (assuming square matrix)
    # For perfect square, n_regions is the square root of feature count
    # Otherwise, use a reasonable default
    n_perfect_square = int(np.sqrt(num_features))
    if n_perfect_square * n_perfect_square == num_features:
        n_regions = n_perfect_square
        print(f"Inferred {n_regions} brain regions from connectome data")
        
        # Matrix-based features for each participant
        for idx in summary_df.index:
            # Reconstruct the connectivity matrix
            flat_matrix = connectome_df.loc[idx].values
            matrix = flat_matrix.reshape(n_regions, n_regions)
            
            # Connectivity density at different thresholds
            for threshold in [0.2, 0.3, 0.4, 0.5, 0.6]:
                density = np.sum(np.abs(matrix) > threshold) / (n_regions * n_regions)
                summary_df.loc[idx, f'density_t{threshold}'] = density
            
            # Positive/negative connectivity
            pos_connections = matrix[matrix > 0]
            neg_connections = matrix[matrix < 0]
            
            if len(pos_connections) > 0:
                summary_df.loc[idx, 'avg_pos_conn'] = np.mean(pos_connections)
                summary_df.loc[idx, 'std_pos_conn'] = np.std(pos_connections)
                summary_df.loc[idx, 'max_pos_conn'] = np.max(pos_connections)
            else:
                summary_df.loc[idx, 'avg_pos_conn'] = 0
                summary_df.loc[idx, 'std_pos_conn'] = 0
                summary_df.loc[idx, 'max_pos_conn'] = 0
                
            if len(neg_connections) > 0:
                summary_df.loc[idx, 'avg_neg_conn'] = np.mean(neg_connections)
                summary_df.loc[idx, 'std_neg_conn'] = np.std(neg_connections)
                summary_df.loc[idx, 'min_neg_conn'] = np.min(neg_connections)
            else:
                summary_df.loc[idx, 'avg_neg_conn'] = 0
                summary_df.loc[idx, 'std_neg_conn'] = 0
                summary_df.loc[idx, 'min_neg_conn'] = 0
                
            # Network metrics - degree distribution
            degree = np.sum(np.abs(matrix) > 0.3, axis=0)
            summary_df.loc[idx, 'degree_mean'] = np.mean(degree)
            summary_df.loc[idx, 'degree_std'] = np.std(degree)
            summary_df.loc[idx, 'degree_max'] = np.max(degree)
    else:
        print(f"Could not infer brain regions from connectome shape {num_features}")
            
    # Calculate skewness and kurtosis from scipy
    try:
        from scipy.stats import skew, kurtosis
        summary_df['connectome_skew'] = connectome_df.apply(skew, axis=1)
        summary_df['connectome_kurtosis'] = connectome_df.apply(kurtosis, axis=1)
    except ImportError:
        print("scipy not available, skipping skewness and kurtosis calculations")
    
    return summary_df

# Extract advanced connectome features
train_connectome_summary = extract_advanced_connectome_features(train_connectome_imputed)
test_connectome_summary = extract_advanced_connectome_features(test_connectome_imputed)

# 3.5 Apply PCA to connectome data to reduce dimensionality while preserving patterns
print("\nApplying PCA to connectome data...")

pca = PCA(n_components=100, random_state=RANDOM_STATE)
train_connectome_pca = pd.DataFrame(
    pca.fit_transform(train_connectome_imputed),
    index=train_connectome_imputed.index,
    columns=[f'PC_{i+1}' for i in range(100)]
)

test_connectome_pca = pd.DataFrame(
    pca.transform(test_connectome_imputed),
    index=test_connectome_imputed.index,
    columns=[f'PC_{i+1}' for i in range(100)]
)

print(f"Explained variance by PCA: {np.sum(pca.explained_variance_ratio_):.4f}")

# Plot cumulative explained variance
plt.figure(figsize=(10, 6))
cum_var = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cum_var) + 1), cum_var)
plt.grid(True)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by PCA Components')
plt.savefig('pca_explained_variance.png')
plt.close()

# 3.6 One-hot encode categorical variables
print("\nOne-hot encoding categorical features...")
# Identify categorical columns
categorical_columns = train_categorical_imputed.select_dtypes(include=['object', 'category']).columns.tolist()

# Apply one-hot encoding
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoder.fit(pd.concat([train_categorical_imputed[categorical_columns], 
                       test_categorical_imputed[categorical_columns]]))

# Transform train data
train_encoded_cats = pd.DataFrame(
    encoder.transform(train_categorical_imputed[categorical_columns]),
    index=train_categorical_imputed.index,
    columns=encoder.get_feature_names_out(categorical_columns)
)

# Transform test data
test_encoded_cats = pd.DataFrame(
    encoder.transform(test_categorical_imputed[categorical_columns]),
    index=test_categorical_imputed.index,
    columns=encoder.get_feature_names_out(categorical_columns)
)

# Keep non-categorical columns as is
non_cat_cols = [col for col in train_categorical_imputed.columns if col not in categorical_columns]
train_non_cat_data = train_categorical_imputed[non_cat_cols]
test_non_cat_data = test_categorical_imputed[non_cat_cols]

# Combine encoded categorical and non-categorical data
train_categorical_encoded = pd.concat([train_encoded_cats, train_non_cat_data], axis=1)
test_categorical_encoded = pd.concat([test_encoded_cats, test_non_cat_data], axis=1)

# 3.7 Scale numerical features
print("\nScaling numerical features...")
scaler = StandardScaler()
# Fit on combined train and test data for consistent scaling
scaler.fit(pd.concat([train_quantitative_imputed, test_quantitative_imputed]))

train_quantitative_scaled = pd.DataFrame(
    scaler.transform(train_quantitative_imputed),
    index=train_quantitative_imputed.index,
    columns=train_quantitative_imputed.columns
)

test_quantitative_scaled = pd.DataFrame(
    scaler.transform(test_quantitative_imputed),
    index=test_quantitative_imputed.index,
    columns=test_quantitative_imputed.columns
)

# 3.8 Create interaction features between important behavioral metrics and brain features
print("\nCreating interaction features...")

def create_interaction_features(quantitative_df, connectome_summary_df, connectome_pca_df):
    """Create interaction features between behavioral and brain measures"""
    interaction_df = pd.DataFrame(index=quantitative_df.index)
    
    # Check for key behavioral features related to ADHD
    behavior_features = []
    for feature in ['SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Total', 'SWAN_SWAN_Inattentive', 
                    'SWAN_SWAN_Hyperactive', 'SDQ_SDQ_Emotionality', 'SDQ_SDQ_Conduct']:
        if feature in quantitative_df.columns:
            behavior_features.append(feature)
    
    # Check for key connectivity features - using only a subset for efficiency
    connect_features = ['connectome_mean', 'connectome_std', 'avg_pos_conn', 'avg_neg_conn'] \
                     + [f'density_t{t}' for t in [0.2, 0.4, 0.6]] \
                     + ['degree_mean', 'degree_std']
                     
    connect_features = [f for f in connect_features if f in connectome_summary_df.columns]
    
    # Add top PCA components
    pca_features = ['PC_1', 'PC_2', 'PC_3', 'PC_4', 'PC_5']
    pca_features = [f for f in pca_features if f in connectome_pca_df.columns]
    
    # Create interactions for behavioral x connectome summary
    for beh in behavior_features:
        for conn in connect_features:
            feature_name = f"{beh}_{conn}"
            interaction_df[feature_name] = quantitative_df[beh] * connectome_summary_df[conn]
    
    # Create interactions for behavioral x PCA components
    for beh in behavior_features:
        for pc in pca_features:
            feature_name = f"{beh}_{pc}"
            interaction_df[feature_name] = quantitative_df[beh] * connectome_pca_df[pc]
    
    return interaction_df

# Create interaction features if behavioral features exist
train_interaction_features = create_interaction_features(
    train_quantitative_scaled, 
    train_connectome_summary, 
    train_connectome_pca
)

test_interaction_features = create_interaction_features(
    test_quantitative_scaled, 
    test_connectome_summary, 
    test_connectome_pca
)

# 3.9 Combine all processed features
print("\nCombining all features...")
X_train_all = pd.concat([
    train_categorical_encoded, 
    train_quantitative_scaled, 
    train_connectome_pca,  # Use PCA instead of raw connectome features
    train_connectome_summary,
    train_interaction_features
], axis=1)
y_train = train_solutions_clean

# Same for test data
X_test_all = pd.concat([
    test_categorical_encoded, 
    test_quantitative_scaled, 
    test_connectome_pca,  # Use PCA instead of raw connectome features
    test_connectome_summary,
    test_interaction_features
], axis=1)

print(f"Final training data shape: {X_train_all.shape}")
print(f"Final test data shape: {X_test_all.shape}")
print(f"Number of features: {X_train_all.shape[1]}")

# Create a validation set for later testing
X_train_model, X_val, y_train_model, y_val = train_test_split(
    X_train_all, y_train, 
    test_size=0.15, 
    random_state=RANDOM_STATE,
    stratify=y_train[['ADHD_Outcome', 'Sex_F']]
)



Preprocessing data...
Number of common participants across all training datasets: 1213
Number of common participants across all training datasets: 1213

Handling missing values...
Top 5 categorical features with missing values:
Barratt_Barratt_P2_Occ              0.183017
Barratt_Barratt_P2_Edu              0.163232
PreInt_Demos_Fam_Child_Race         0.044518
PreInt_Demos_Fam_Child_Ethnicity    0.035449
Barratt_Barratt_P1_Occ              0.025556
dtype: float64
Top 5 quantitative features with missing values:
MRI_Track_Age_at_Scan    0.296785
ColorVision_CV_Score     0.018961
EHQ_EHQ_Total            0.010717
APQ_P_APQ_P_ID           0.009893
APQ_P_APQ_P_INV          0.009893
dtype: float64

Handling missing values...
Top 5 categorical features with missing values:
Barratt_Barratt_P2_Occ              0.183017
Barratt_Barratt_P2_Edu              0.163232
PreInt_Demos_Fam_Child_Race         0.044518
PreInt_Demos_Fam_Child_Ethnicity    0.035449
Barratt_Barratt_P1_Occ              0.025

# 4. Feature Selection

In [9]:
print("\nPerforming feature selection...")

# 4.1 Feature selection for ADHD prediction
print("\nSelecting features for ADHD prediction...")
# Use Random Forest for feature importance
adhd_rf_selector = RandomForestClassifier(
    n_estimators=200, 
    max_depth=15,
    random_state=RANDOM_STATE,
    class_weight='balanced'
)
adhd_rf_selector.fit(X_train_model, y_train_model['ADHD_Outcome'])

# Get feature importance
adhd_importances = adhd_rf_selector.feature_importances_
adhd_feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': adhd_importances
}).sort_values('Importance', ascending=False)

# Select top features
adhd_top_n = 200  # Select more features initially
adhd_selected_features = adhd_feature_importance.head(adhd_top_n)['Feature'].tolist()

# 4.2 Feature selection for Sex prediction
print("\nSelecting features for Sex prediction...")
sex_rf_selector = RandomForestClassifier(
    n_estimators=200, 
    max_depth=15,
    random_state=RANDOM_STATE,
    class_weight='balanced'
)
sex_rf_selector.fit(X_train_model, y_train_model['Sex_F'])

# Get feature importance
sex_importances = sex_rf_selector.feature_importances_
sex_feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': sex_importances
}).sort_values('Importance', ascending=False)

# Select top features
sex_top_n = 200  # Select more features initially
sex_selected_features = sex_feature_importance.head(sex_top_n)['Feature'].tolist()

# Visualize top 20 features for ADHD and Sex
plt.figure(figsize=(12, 8))
plt.subplot(2, 1, 1)
sns.barplot(x='Importance', y='Feature', data=adhd_feature_importance.head(20))
plt.title('Top 20 Features for ADHD Prediction')
plt.tight_layout()

plt.subplot(2, 1, 2)
sns.barplot(x='Importance', y='Feature', data=sex_feature_importance.head(20))
plt.title('Top 20 Features for Sex Prediction')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# 4.3 Final feature sets
X_train_adhd = X_train_model[adhd_selected_features]
X_train_sex = X_train_model[sex_selected_features]

X_val_adhd = X_val[adhd_selected_features]
X_val_sex = X_val[sex_selected_features]

X_test_adhd = X_test_all[adhd_selected_features]
X_test_sex = X_test_all[sex_selected_features]

# Check for feature overlap
common_features = set(adhd_selected_features).intersection(set(sex_selected_features))
print(f"Number of common features: {len(common_features)}")



Performing feature selection...

Selecting features for ADHD prediction...

Selecting features for Sex prediction...

Selecting features for Sex prediction...
Number of common features: 146
Number of common features: 146


# 5. Handle Class Imbalance with Advanced Techniques

In [10]:
print("\nHandling class imbalance with advanced techniques...")

# 5.1 Prepare for multi-task learning
# We'll create a combined X_train with all important features from both tasks
all_selected_features = list(set(adhd_selected_features + sex_selected_features))
X_train_combined = X_train_model[all_selected_features]
X_val_combined = X_val[all_selected_features]
X_test_combined = X_test_all[all_selected_features]

# 5.2 Create sample weights emphasizing female ADHD cases
sample_weights = np.ones(len(y_train_model))
female_adhd_indices = (y_train_model['ADHD_Outcome'] == 1) & (y_train_model['Sex_F'] == 1)
sample_weights[female_adhd_indices] = 3.0  # Triple weight for female ADHD cases

# 5.3 Apply SMOTETomek for balanced data generation (combines oversampling and undersampling)
print("\nApplying SMOTETomek with class weights...")
# Combined class approach (ADHD_Sex combinations)
y_combined = y_train_model['ADHD_Outcome'].astype(str) + '_' + y_train_model['Sex_F'].astype(str)

# Apply SMOTETomek with adjusted sampling strategy
smote_tomek = SMOTETomek(
    sampling_strategy={
        '0_0': len(y_combined[y_combined == '0_0']),  # Keep non-ADHD males the same
        '0_1': len(y_combined[y_combined == '0_1']),  # Keep non-ADHD females the same
        '1_0': len(y_combined[y_combined == '1_0']),  # Keep ADHD males the same
        '1_1': max(len(y_combined[y_combined == '1_1']) * 2, 
                  len(y_combined[y_combined == '0_0']))  # Double ADHD females or match majority
    },
    random_state=RANDOM_STATE
)
X_resampled, y_combined_resampled = smote_tomek.fit_resample(X_train_combined, y_combined)

# Convert back to separate targets
y_resampled = pd.DataFrame({'combined': y_combined_resampled})
y_resampled['ADHD_Outcome'] = y_resampled['combined'].str.split('_').str[0].astype(int)
y_resampled['Sex_F'] = y_resampled['combined'].str.split('_').str[1].astype(int)

print("Class distribution after resampling:")
print(pd.crosstab(y_resampled['ADHD_Outcome'], y_resampled['Sex_F'], 
                 rownames=['ADHD'], colnames=['Female']))



Handling class imbalance with advanced techniques...

Applying SMOTETomek with class weights...
Class distribution after resampling:
Female    0    1
ADHD            
0       168  131
1       479  421
Class distribution after resampling:
Female    0    1
ADHD            
0       168  131
1       479  421


# 6. Model Building

In [14]:
print("\nBuilding models...")

# 6.1 Multi-task Neural Network Model
print("\nBuilding multi-task neural network...")

def build_multitask_nn(input_dim):
    """Build a neural network for multi-task learning of ADHD and Sex"""
    # Input layer
    inputs = Input(shape=(input_dim,))
    
    # Shared layers
    x = Dense(256, activation='relu')(inputs)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    
    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)

    # Task-specific layers for ADHD
    adhd_branch = Dense(64, activation='relu')(x)
    adhd_branch = BatchNormalization()(adhd_branch)
    adhd_branch = Dropout(0.2)(adhd_branch)
    adhd_output = Dense(1, activation='sigmoid', name='ADHD_Outcome')(adhd_branch)
    
    # Task-specific layers for Sex
    sex_branch = Dense(64, activation='relu')(x)
    sex_branch = BatchNormalization()(sex_branch)
    sex_branch = Dropout(0.2)(sex_branch)
    sex_output = Dense(1, activation='sigmoid', name='Sex_F')(sex_branch)
    
    # Create and compile model
    model = Model(inputs=inputs, outputs=[adhd_output, sex_output])
    model.compile(
        optimizer='adam',
        loss={
            'ADHD_Outcome': 'binary_crossentropy', 
            'Sex_F': 'binary_crossentropy'
        },
        loss_weights={
            'ADHD_Outcome': 1.5,  # Higher weight for ADHD
            'Sex_F': 1.0
        },
        metrics={
            'ADHD_Outcome': 'accuracy',
            'Sex_F': 'accuracy'
        }
    )
    return model

# Build and train multitask neural network
multitask_nn = build_multitask_nn(X_resampled.shape[1])

# Setup callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=7,
)

# Train with validation split
print("\nTraining multi-task neural network...")
multitask_history = multitask_nn.fit(
    X_resampled, 
    {
        'ADHD_Outcome': y_resampled['ADHD_Outcome'],
        'Sex_F': y_resampled['Sex_F']
    },
    epochs=100,
    batch_size=32,
    validation_data=(
        X_val_combined, 
        {
            'ADHD_Outcome': y_val['ADHD_Outcome'],
            'Sex_F': y_val['Sex_F']
        }
    ),
    callbacks=[early_stopping, reduce_lr],
    verbose=2
)

# 6.2 ADHD-specific stacking model
print("\nBuilding ADHD-specific stacking model...")

# Base estimators
adhd_base_estimators = [
    ('xgb', xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=2,  # Adjust for class imbalance
        random_state=RANDOM_STATE
    )),
    ('rf', RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        min_samples_split=10,
        min_samples_leaf=4,
        class_weight='balanced',
        random_state=RANDOM_STATE
    )),
    ('lgbm', LGBMClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        num_leaves=31,
        class_weight='balanced',
        random_state=RANDOM_STATE
    )),
    ('gb', GradientBoostingClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        min_samples_split=10,
        subsample=0.8,
        random_state=RANDOM_STATE
    ))
]

# Meta-learner
adhd_stacking = StackingClassifier(
    estimators=adhd_base_estimators,
    final_estimator=xgb.XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.05,
        random_state=RANDOM_STATE
    ),
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

# Replace the sample_weights code with this before training
resampled_weights = np.ones(len(y_resampled))
female_adhd_indices_resampled = (y_resampled['ADHD_Outcome'] == 1) & (y_resampled['Sex_F'] == 1)
resampled_weights[female_adhd_indices_resampled] = 3.0  # Triple weight for female ADHD cases

# Train ADHD stacking model
print("\nTraining ADHD stacking model...")
adhd_stacking.fit(
    X_resampled, 
    y_resampled['ADHD_Outcome'],
    sample_weight=resampled_weights
)
# Train ADHD stacking model
print("\nTraining ADHD stacking model...")
# Train on both original data and resampled data
adhd_stacking.fit(
    X_resampled, 
    y_resampled['ADHD_Outcome']
)

# 6.3 Sex-specific stacking model
print("\nBuilding Sex-specific stacking model...")

# Base estimators for sex prediction
sex_base_estimators = [
    ('xgb', xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE
    )),
    ('rf', RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        min_samples_split=10,
        min_samples_leaf=4,
        class_weight='balanced',
        random_state=RANDOM_STATE
    )),
    ('lgbm', LGBMClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        num_leaves=31,
        class_weight='balanced',
        random_state=RANDOM_STATE
    )),
    ('gb', GradientBoostingClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        min_samples_split=10,
        subsample=0.8,
        random_state=RANDOM_STATE
    ))
]

# Meta-learner for sex prediction
sex_stacking = StackingClassifier(
    estimators=sex_base_estimators,
    final_estimator=xgb.XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.05,
        random_state=RANDOM_STATE
    ),
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

# Train Sex stacking model
print("\nTraining Sex stacking model...")
sex_stacking.fit(
    X_resampled, 
    y_resampled['Sex_F']
)



Building models...

Building multi-task neural network...

Training multi-task neural network...
Epoch 1/100
Epoch 1/100
38/38 - 10s - 251ms/step - ADHD_Outcome_accuracy: 0.5405 - ADHD_Outcome_loss: 0.8304 - Sex_F_accuracy: 0.5371 - Sex_F_loss: 0.8729 - loss: 2.1211 - val_ADHD_Outcome_accuracy: 0.3132 - val_ADHD_Outcome_loss: 23.7650 - val_Sex_F_accuracy: 0.6538 - val_Sex_F_loss: 3.1415 - val_loss: 38.8042 - learning_rate: 0.0010
Epoch 2/100
38/38 - 10s - 251ms/step - ADHD_Outcome_accuracy: 0.5405 - ADHD_Outcome_loss: 0.8304 - Sex_F_accuracy: 0.5371 - Sex_F_loss: 0.8729 - loss: 2.1211 - val_ADHD_Outcome_accuracy: 0.3132 - val_ADHD_Outcome_loss: 23.7650 - val_Sex_F_accuracy: 0.6538 - val_Sex_F_loss: 3.1415 - val_loss: 38.8042 - learning_rate: 0.0010
Epoch 2/100
38/38 - 1s - 18ms/step - ADHD_Outcome_accuracy: 0.6539 - ADHD_Outcome_loss: 0.6347 - Sex_F_accuracy: 0.5638 - Sex_F_loss: 0.7443 - loss: 1.6972 - val_ADHD_Outcome_accuracy: 0.3132 - val_ADHD_Outcome_loss: 8.4577 - val_Sex_F_accu

# 7. Calibrate probabilities for better confidence estimates

In [16]:
print("\nCalibrating model probabilities...")

# 7.1 Calibrate ADHD model
# First, align the validation features to match the training order
X_val_adhd_aligned = X_val_adhd[adhd_stacking.feature_names_in_]

adhd_calibrator = CalibratedClassifierCV(
    adhd_stacking, 
    method='isotonic', 
    cv='prefit'
)
adhd_calibrator.fit(X_val_adhd_aligned, y_val['ADHD_Outcome'])

# 7.2 Calibrate Sex model
# First, align the validation features to match the training order
X_val_sex_aligned = X_val_sex[sex_stacking.feature_names_in_]

sex_calibrator = CalibratedClassifierCV(
    sex_stacking, 
    method='isotonic', 
    cv='prefit'
)
sex_calibrator.fit(X_val_sex_aligned, y_val['Sex_F'])


Calibrating model probabilities...


# 8. Model Evaluation on Validation Set

In [18]:
print("\nEvaluating models on validation set...")

# 8.1 Evaluate multitask neural network
nn_adhd_preds = multitask_nn.predict(X_val_combined)[0].flatten() > 0.5
nn_sex_preds = multitask_nn.predict(X_val_combined)[1].flatten() > 0.5

print("\nMulti-task Neural Network Performance:")
print("ADHD Classification Report:")
print(classification_report(y_val['ADHD_Outcome'], nn_adhd_preds))
print("Sex Classification Report:")
print(classification_report(y_val['Sex_F'], nn_sex_preds))

# 8.2 Evaluate ADHD stacking model
# Use the aligned validation features that match the model's feature order
adhd_stack_preds = adhd_calibrator.predict(X_val_adhd_aligned)
adhd_stack_probs = adhd_calibrator.predict_proba(X_val_adhd_aligned)[:, 1]

print("\nADHD Stacking Model Performance:")
print("Classification Report:")
print(classification_report(y_val['ADHD_Outcome'], adhd_stack_preds))

# 8.3 Evaluate Sex stacking model
# Use the aligned validation features that match the model's feature order
sex_stack_preds = sex_calibrator.predict(X_val_sex_aligned)
sex_stack_probs = sex_calibrator.predict_proba(X_val_sex_aligned)[:, 1]

print("\nSex Stacking Model Performance:")
print("Classification Report:")
print(classification_report(y_val['Sex_F'], sex_stack_preds))


Evaluating models on validation set...
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 

Multi-task Neural Network Performance:
ADHD Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.70      0.63        57
           1       0.85      0.76      0.80       125

    accuracy                           0.74       182
   macro avg       0.71      0.73      0.72       182
weighted avg       0.76      0.74      0.75       182

Sex Classification Report:
              precision    recall  f1-score   support

           0       0.65      1.00      0.79       119
           1       0.00      0.00      0.00        63

    accuracy                           0.65       182
   macro avg       0.33      0.5

# 9. Final Ensemble for Best Predictions

In [20]:
print("\nCreating final ensemble...")

# 9.1 Ensemble ADHD predictions through weighted average
def ensemble_adhd_predictions(X_val_combined, X_val_adhd_aligned):
    """Ensemble ADHD predictions from neural network and stacking model"""
    # Get probabilities from both models
    nn_probs = multitask_nn.predict(X_val_combined)[0].flatten()
    stack_probs = adhd_calibrator.predict_proba(X_val_adhd_aligned)[:, 1]
    
    # Weighted average (giving more weight to the stacking model)
    ensemble_probs = 0.4 * nn_probs + 0.6 * stack_probs
    ensemble_preds = ensemble_probs > 0.5
    
    return ensemble_preds, ensemble_probs

# 9.2 Ensemble Sex predictions through weighted average
def ensemble_sex_predictions(X_val_combined, X_val_sex_aligned):
    """Ensemble Sex predictions from neural network and stacking model"""
    # Get probabilities from both models
    nn_probs = multitask_nn.predict(X_val_combined)[1].flatten()
    stack_probs = sex_calibrator.predict_proba(X_val_sex_aligned)[:, 1]
    
    # Weighted average (giving more weight to the stacking model)
    ensemble_probs = 0.4 * nn_probs + 0.6 * stack_probs
    ensemble_preds = ensemble_probs > 0.5
    
    return ensemble_preds, ensemble_probs

# Get ensemble predictions for validation set - USE ALIGNED DATA
adhd_ensemble_preds, adhd_ensemble_probs = ensemble_adhd_predictions(X_val_combined, X_val_adhd_aligned)
sex_ensemble_preds, sex_ensemble_probs = ensemble_sex_predictions(X_val_combined, X_val_sex_aligned)

# Evaluate ensembles
print("\nADHD Ensemble Performance:")
print(classification_report(y_val['ADHD_Outcome'], adhd_ensemble_preds))

print("\nSex Ensemble Performance:")
print(classification_report(y_val['Sex_F'], sex_ensemble_preds))


Creating final ensemble...
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 

ADHD Ensemble Performance:
              precision    recall  f1-score   support

           0       0.71      0.68      0.70        57
           1       0.86      0.87      0.87       125

    accuracy                           0.81       182
   macro avg       0.78      0.78      0.78       182
weighted avg       0.81      0.81      0.81       182


Sex Ensemble Performance:
              precision    recall  f1-score   support

           0       0.68      1.00      0.81       119
           1       1.00      0.13      0.23        63

    accuracy                           0.70       182
   macro avg       0.84      0.56      0.52       182
weighted avg       0.79      0.70      0.61       182



# 10. Generate Test Predictions

In [22]:
print("\nGenerating test predictions...")

# Align the test features to match the training order
X_test_adhd_aligned = X_test_adhd[adhd_stacking.feature_names_in_]
X_test_sex_aligned = X_test_sex[sex_stacking.feature_names_in_]

# 10.1 Generate ADHD predictions for test set
test_adhd_nn_probs = multitask_nn.predict(X_test_combined)[0].flatten()
test_adhd_stack_probs = adhd_calibrator.predict_proba(X_test_adhd_aligned)[:, 1]
test_adhd_ensemble_probs = 0.4 * test_adhd_nn_probs + 0.6 * test_adhd_stack_probs
test_adhd_preds = test_adhd_ensemble_probs > 0.5

# 10.2 Generate Sex predictions for test set
test_sex_nn_probs = multitask_nn.predict(X_test_combined)[1].flatten()
test_sex_stack_probs = sex_calibrator.predict_proba(X_test_sex_aligned)[:, 1]
test_sex_ensemble_probs = 0.4 * test_sex_nn_probs + 0.6 * test_sex_stack_probs
test_sex_preds = test_sex_ensemble_probs > 0.5



Generating test predictions...
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


# 11. Create submission file

In [23]:
print("\nCreating submission file...")
# Make sure to use the original participant_id format from the test datasets
submission = pd.DataFrame({
    'participant_id': X_test_all.index,  # Keep the original participant_id format which should be alphanumeric
    'ADHD_Outcome': test_adhd_preds.astype(int),
    'Sex_F': test_sex_preds.astype(int)
})

# Ensure the participant_id is formatted correctly (no index conversion)
# If needed, you can verify that the participant_id is in the correct format:
print("Sample of participant_ids in submission:")
print(submission['participant_id'].head())

# Save submission file with correct format
submission.to_csv('wids_datathon_submission.csv', index=False)
print("Submission file created with format:")
print("participant_id,ADHD_Outcome,Sex_F")
print(f"{submission.iloc[0]['participant_id']}, {submission.iloc[0]['ADHD_Outcome']}, {submission.iloc[0]['Sex_F']}")
print("Submission file created!")

# Save submission file
submission.to_csv('wids_datathon_submission.csv', index=False)
print("Submission file created!")



Creating submission file...
Sample of participant_ids in submission:
0    Cfwaf5FX7jWK
1    vhGrzmvA3Hjq
2    ULliyEXjy4OV
3    LZfeAb1xMtql
4    EnFOUv0YK1RG
Name: participant_id, dtype: object
Submission file created with format:
participant_id,ADHD_Outcome,Sex_F
Cfwaf5FX7jWK, 1, 0
Submission file created!
Submission file created!


# 12. Calibration and Performance Visualization

In [24]:
print("\nVisualizing model calibration and performance...")

# 12.1 Calibration plots
plt.figure(figsize=(16, 8))

# ADHD calibration plot
plt.subplot(1, 2, 1)
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_val['ADHD_Outcome'], adhd_ensemble_probs, n_bins=10
)
plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="Ensemble")

fraction_of_positives, mean_predicted_value = calibration_curve(
    y_val['ADHD_Outcome'], adhd_stack_probs, n_bins=10
)
plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="Stacking")

plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
plt.title('Calibration Plot for ADHD Prediction')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.legend()
plt.grid()

# Sex calibration plot
plt.subplot(1, 2, 2)
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_val['Sex_F'], sex_ensemble_probs, n_bins=10
)
plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="Ensemble")

fraction_of_positives, mean_predicted_value = calibration_curve(
    y_val['Sex_F'], sex_stack_probs, n_bins=10
)
plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="Stacking")

plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
plt.title('Calibration Plot for Sex Prediction')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.legend()
plt.grid()

plt.tight_layout()
plt.savefig('model_calibration.png')
plt.close()

# 12.2 Precision-Recall curves
plt.figure(figsize=(16, 8))

# ADHD precision-recall curve
plt.subplot(1, 2, 1)
precision, recall, _ = precision_recall_curve(y_val['ADHD_Outcome'], adhd_ensemble_probs)
plt.step(recall, precision, 'b-', label='Ensemble')

precision, recall, _ = precision_recall_curve(y_val['ADHD_Outcome'], adhd_stack_probs)
plt.step(recall, precision, 'g-', label='Stacking')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve for ADHD Prediction')
plt.legend()
plt.grid()

# Sex precision-recall curve
plt.subplot(1, 2, 2)
precision, recall, _ = precision_recall_curve(y_val['Sex_F'], sex_ensemble_probs)
plt.step(recall, precision, 'b-', label='Ensemble')

precision, recall, _ = precision_recall_curve(y_val['Sex_F'], sex_stack_probs)
plt.step(recall, precision, 'g-', label='Stacking')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve for Sex Prediction')
plt.legend()
plt.grid()

plt.tight_layout()
plt.savefig('precision_recall_curves.png')
plt.close()

# 12.3 Confusion matrices
plt.figure(figsize=(16, 8))

# ADHD confusion matrix
plt.subplot(1, 2, 1)
adhd_cm = confusion_matrix(y_val['ADHD_Outcome'], adhd_ensemble_preds)
sns.heatmap(adhd_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No ADHD', 'ADHD'], 
            yticklabels=['No ADHD', 'ADHD'])
plt.title('Confusion Matrix for ADHD Prediction')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# Sex confusion matrix
plt.subplot(1, 2, 2)
sex_cm = confusion_matrix(y_val['Sex_F'], sex_ensemble_preds)
sns.heatmap(sex_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Male', 'Female'], 
            yticklabels=['Male', 'Female'])
plt.title('Confusion Matrix for Sex Prediction')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

plt.tight_layout()
plt.savefig('confusion_matrices.png')
plt.close()

print("\nExtracting feature importance from final models...")




Visualizing model calibration and performance...

Extracting feature importance from final models...



# 13. Feature Importance for Final Models

In [25]:
print("\nExtracting feature importance from final models...")

# 13.1 Feature importance from stacking models (using the first base estimator)
adhd_feature_imp = pd.DataFrame({
    'Feature': adhd_selected_features,
    'Importance': adhd_stacking.estimators_[0].feature_importances_
}).sort_values('Importance', ascending=False)

sex_feature_imp = pd.DataFrame({
    'Feature': sex_selected_features,
    'Importance': sex_stacking.estimators_[0].feature_importances_
}).sort_values('Importance', ascending=False)

# 13.2 Visualize top 20 features for final models
plt.figure(figsize=(16, 12))

plt.subplot(2, 1, 1)
sns.barplot(x='Importance', y='Feature', data=adhd_feature_imp.head(20))
plt.title('Top 20 Features for ADHD Prediction (Final Model)')
plt.tight_layout()

plt.subplot(2, 1, 2)
sns.barplot(x='Importance', y='Feature', data=sex_feature_imp.head(20))
plt.title('Top 20 Features for Sex Prediction (Final Model)')
plt.tight_layout()

plt.savefig('final_feature_importance.png')
plt.close()

# 14. Summary of Results
print("\nSummary of Final Results:")

# Calculate F1 scores
adhd_f1 = f1_score(y_val['ADHD_Outcome'], adhd_ensemble_preds)
sex_f1 = f1_score(y_val['Sex_F'], sex_ensemble_preds)
combined_f1 = (adhd_f1 + sex_f1) / 2

print(f"ADHD F1 Score: {adhd_f1:.4f}")
print(f"Sex F1 Score: {sex_f1:.4f}")
print(f"Combined F1 Score: {combined_f1:.4f}")

# Create a summary table
summary_df = pd.DataFrame({
    'Model': ['ADHD (Ensemble)', 'Sex (Ensemble)', 'Combined'],
    'F1 Score': [adhd_f1, sex_f1, combined_f1],
    'Accuracy': [
        np.mean(y_val['ADHD_Outcome'] == adhd_ensemble_preds), 
        np.mean(y_val['Sex_F'] == sex_ensemble_preds),
        np.mean((y_val['ADHD_Outcome'] == adhd_ensemble_preds) & 
                (y_val['Sex_F'] == sex_ensemble_preds))
    ]
})

print("\nPerformance Summary:")
print(summary_df)

if combined_f1 >= 0.8:
    print("\nTarget score of 0.8 achieved! Final combined F1 score:", combined_f1)
else:
    print("\nTarget score not yet achieved. Current combined F1 score:", combined_f1)
    print("Consider further optimization through:")
    print("1. Hyperparameter tuning of individual models")
    print("2. Adjusting ensemble weights")
    print("3. More advanced feature engineering")
    print("4. Alternative resampling strategies")

print("\nWiDS Datathon solution completed!")


Extracting feature importance from final models...

Summary of Final Results:
ADHD F1 Score: 0.8651
Sex F1 Score: 0.2254
Combined F1 Score: 0.5452

Performance Summary:
             Model  F1 Score  Accuracy
0  ADHD (Ensemble)  0.865079  0.813187
1   Sex (Ensemble)  0.225352  0.697802
2         Combined  0.545216  0.582418

Target score not yet achieved. Current combined F1 score: 0.5452157388777107
Consider further optimization through:
1. Hyperparameter tuning of individual models
2. Adjusting ensemble weights
3. More advanced feature engineering
4. Alternative resampling strategies

WiDS Datathon solution completed!
