# Logistic regression 


#### outperformed LGBM due to the linear nature of the relationships in the data, achieving better results on the private leaderboard : 0.79463 compared to the public leaderboard : 0.66145 .




This code builds a two-step machine learning pipeline using logistic regression with repeated stratified cross-validation to predict a child’s sex and ADHD outcome. First, it trains a model to predict the probability of being female (Sex_F), then uses that prediction to create interaction features with other variables to improve the ADHD prediction. It handles imbalanced data by giving more weight to ADHD-positive samples and evaluates performance using F1 score and ROC-AUC across multiple folds.

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import ndcg_score
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import lightgbm as lgb
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path
from scipy.stats import hmean
import scipy
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LassoCV



### preprocessing

In [None]:


path = r'C:\Users\Family\Downloads\widsdatathon2025 (3)'

def read_data(base_path:str) -> pd.DataFrame :
    path = Path(base_path)
    trc=pd.read_excel(path /'TRAIN_NEW'  / 'TRAIN_CATEGORICAL_METADATA_new.xlsx')
    trq=pd.read_excel(path /'TRAIN_NEW'  / 'TRAIN_QUANTITATIVE_METADATA_new.xlsx')
    trf=pd.read_csv(path   /'TRAIN_NEW'  / 'TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv')
    trs=pd.read_excel(path /'TRAIN_NEW'  / 'TRAINING_SOLUTIONS.xlsx')  
    tsc=pd.read_excel(path /'TEST'      / 'TEST_CATEGORICAL.xlsx')
    tsq=pd.read_excel(path /'TEST'       / 'TEST_QUANTITATIVE_METADATA.xlsx')    
    tsf=pd.read_csv(path   /'TEST'       / 'TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')    
    sub=pd.read_excel(path / 'SAMPLE_SUBMISSION.xlsx')    
    dic=pd.read_excel(path /'Data Dictionary.xlsx')
    return trc, trq, trf, trs, tsc, tsq, tsf, sub, dic

trc, trq, trf, trs, tsc, tsq, tsf, sub, dic = read_data(base_path=path)

# Data Merging 
cq = pd.merge(trc, trq, on='participant_id', how='left')
feat = pd.merge(cq, trf, on='participant_id', how='left')  
qc = pd.merge(tsc, tsq, on='participant_id', how='left')
train = pd.merge(feat, trs, on='participant_id', how='left') 
test = pd.merge(qc, tsf, on='participant_id', how='left')
train_sex =train

In [3]:
train_ids = train['participant_id']
test_ids = test['participant_id'] # I will store them for later usage in grouping in validation why?  I don't want the same user to appear in both train and test. 
num_feats = trq # numerical features
cat_feats = trc # seperate categorical and numerical features help me reteriving them later easily for preprocessing.
target_cols = ['ADHD_Outcome', 'Sex_F']
groups = train_ids



In [4]:
'''
Train has 25 features/columns with missing values: ['PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race', 'MRI_Track_Scan_Location', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ', 'EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan']
Test  has 23 features/columns with missing values: ['PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ', 'EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial']
fMRI has no missing values
Extra columns in train: ['MRI_Track_Age_at_Scan', 'MRI_Track_Scan_Location']
'''

# Find columns with missing values only
train_missing_features_to_impute = train.columns[train.isnull().any()].tolist() # List of features with missing values in train, only 25 and no missing data in fMRI data
test_missing_features_to_impute = test.columns[test.isnull().any()].tolist() # List of features with missing values in test, only 23 and no missing data in fMRI data


# Initialize the imputer
imputer = IterativeImputer(estimator=LassoCV(random_state=42), max_iter=5, random_state=42)

# Impute in-place for train
if train_missing_features_to_impute:
	train[train_missing_features_to_impute] = imputer.fit_transform(train[train_missing_features_to_impute])

# Impute in-place for test
if test_missing_features_to_impute:
	test[test_missing_features_to_impute] = imputer.fit_transform(test[test_missing_features_to_impute])

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_

In [None]:
train.isnull().sum().sum(), test.isnull().sum().sum() 

(0, 0)

In [8]:
scaler = StandardScaler()

# Only apply scaling to numerical columns that are not part of the target or categorical features
numerical_features = [col for col in train.columns if col not in target_cols and col not in cat_feats]

# Fit scaler on the numerical features of the train set and transform train and test sets
train[numerical_features] = scaler.fit_transform(train[numerical_features])  # Fit and transform for train set
test[numerical_features] = scaler.transform(test[numerical_features]) 

In [10]:
y_sex = train['Sex_F']  
y_adhd = train['ADHD_Outcome']  


# feature importance in sex

In [12]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

# Assuming 'train' and 'test' are your DataFrames
# and target_cols is defined as ['ADHD_Outcome', 'Sex_F']

# Prepare features (X) - drop targets and participant_id
X = train.drop(columns=target_cols + ['participant_id'])

# Convert categorical features to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Encode target (y_sex should be your target Series)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_sex)

# Initialize and fit the LGBM model for Sex prediction
model_sex = lgb.LGBMClassifier(
    class_weight='balanced',  # Important for imbalanced data
    random_state=42
)
model_sex.fit(X, y)

# Get feature importances and create a DataFrame
importance_df_sex = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model_sex.feature_importances_
}).sort_values('Importance', ascending=False)

# Filter features with importance > 0 (or set a higher threshold)
important_features_sex = importance_df_sex[importance_df_sex['Importance'] > 0]['Feature'].tolist()

# Filter the data - ensure test has same features
train_sex = train[important_features_sex]
test_sex = test[important_features_sex]

# Verify the filtered data
print(f"\nNumber of important features for Sex: {len(important_features_sex)}")
print("\nTop 10 important features for Sex prediction:")
print(importance_df_sex.head(10))

[LightGBM] [Info] Number of positive: 416, number of negative: 797
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.603047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5075431
[LightGBM] [Info] Number of data points in the train set: 1213, number of used features: 19927
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

Number of important features for Sex: 2310

Top 10 important features for Sex prediction:
                    Feature  Importance
19098  158throw_191thcolumn          21
16356  114throw_199thcolumn          15
13249   83throw_192thcolumn          14
17753  133throw_171thcolumn          11
18830  152throw_184thcolumn          10
14901   99throw_124thcolumn          10
19896  191throw_197thcolumn           9
19321  164throw_189thcolumn           9
7861     44throw_69thcolumn           8
12304    76thr

In [57]:
# Drop only the columns from `trf` that are present in `train`
columns_to_drop = [col for col in trf.columns if col in test.columns]
test_adhd = test.drop(columns=columns_to_drop)

# Drop the target columns
test_adhd = test_adhd

In [None]:
# Drop only the columns from `trf` that are present in `train`
columns_to_drop = [col for col in trf.columns if col in train.columns]
train_adhd = train.drop(columns=columns_to_drop)

# Drop the target columns
train_adhd = train_adhd.drop(columns=target_cols)

Important features of ADHD

In [None]:

features_adhd = ['Basic_Demos_Enroll_Year', 'Barratt_Barratt_P1_Edu',
       'Barratt_Barratt_P2_Edu', 'EHQ_EHQ_Total', 'ColorVision_CV_Score',
       'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV',
       'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP',
       'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total',
       'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing',
       'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
       'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial',
       'MRI_Track_Age_at_Scan']

# Features to be interacted with predicted probability of Sex_F = 1
interactions = [
    "APQ_P_APQ_P_INV", "APQ_P_APQ_P_PP", "SDQ_SDQ_Hyperactivity",
    "MRI_Track_Age_at_Scan", "SDQ_SDQ_Generating_Impact"
]

thersholed uesed here to solve the imblanased data problem

# ADHD and SEX using Logistic Regression

In [None]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score, f1_score
import numpy as np

# Constants
SEED = 42
REPEATS = 5
FOLDS = 5
t_sex = 0.3
t_adhd = 0.4

# Evaluation function
def eval_metrics(y_true, y_pred, weights, label="None", thresh=0.3):
    """Evaluate predictions using F1 Score and ROC-AUC."""
    roc_auc = roc_auc_score(y_true, y_pred)
    f1 = f1_score(y_true, (y_pred > thresh).astype(int), sample_weight=weights)
    print(f"{label} -> F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")
    return f1, roc_auc

# Initialize lists to store scores and out-of-fold predictions
scores_sex = []
scores_adhd = []
sex_oof = np.zeros(len(y_sex))
adhd_oof = np.zeros(len(y_adhd))

# Cross-validation setup
rskf = RepeatedStratifiedKFold(n_splits=FOLDS, n_repeats=REPEATS, random_state=SEED)
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# Logistic RegressionCV parameters
logreg_params = {
    "penalty": "l2",
    "cv": skf,
    "fit_intercept": True,
    "scoring": "f1",
    "random_state": SEED,
    "solver": "saga"
}

model_sex = LogisticRegressionCV(**logreg_params)
model_adhd = LogisticRegressionCV(**logreg_params)

# Start cross-validation loop
for fold, (train_idx, val_idx) in enumerate(rskf.split(train_adhd, y_adhd), 1):
    print(f"\n=== Fold {fold} ===")

    # Split data
    X_train, X_val = train_adhd.iloc[train_idx].copy(), train_adhd.iloc[val_idx].copy()
    y_train_sex, y_val_sex = y_sex.iloc[train_idx], y_sex.iloc[val_idx]
    y_train_adhd, y_val_adhd = y_adhd.iloc[train_idx], y_adhd.iloc[val_idx]

    # Sample weights
    weights_train = np.where(y_train_adhd == "11", 2, 1)  # Assign higher weight to ADHD positive samples
    weights_val = np.where(y_val_adhd == "11", 2, 1)  # Assign higher weight to ADHD positive samples

    # Train model to predict Sex_F
    model_sex.fit(X_train, y_train_sex, sample_weight=weights_train)
    sex_train_pred = model_sex.predict_proba(X_train)[:, 1]
    sex_val_pred = model_sex.predict_proba(X_val)[:, 1]
    sex_oof[val_idx] += sex_val_pred / REPEATS

    sex_f1, sex_roc_auc = eval_metrics(y_val_sex, sex_val_pred, weights_val, "Sex_F", thresh=t_sex)
    scores_sex.append((sex_f1, sex_roc_auc))

    # Add predicted sex probabilities
    X_train = X_train.assign(sex_proba=sex_train_pred)
    X_val = X_val.assign(sex_proba=sex_val_pred)

    # Create interaction features
    for col in interactions:
        X_train[f"I_{col}"] = X_train[col] * X_train["sex_proba"]
        X_val[f"I_{col}"] = X_val[col] * X_val["sex_proba"]

    # Train model to predict ADHD outcome
    model_adhd.fit(X_train[features_adhd], y_train_adhd, sample_weight=weights_train)
    adhd_val_pred = model_adhd.predict_proba(X_val[features_adhd])[:, 1]
    adhd_oof[val_idx] += adhd_val_pred / REPEATS

    adhd_f1, adhd_roc_auc = eval_metrics(y_val_adhd, adhd_val_pred, weights_val, "Outcome ADHD", thresh=t_adhd)
    scores_adhd.append((adhd_f1, adhd_roc_auc))

# Print final results
print(f"\n=== CV Results ===")
print(f"Sex Mean roc auc Score: {np.mean([b for b, _ in scores_sex]):.4f}")
print(f"Sex Mean F1: {np.mean([f for _, f in scores_sex]):.4f}")
print(f"ADHD Mean roc auc Score: {np.mean([b for b, _ in scores_adhd]):.4f}")
print(f"ADHD Mean F1: {np.mean([f for _, f in scores_adhd]):.4f}")


=== Fold 1 ===
Sex_F -> F1: 0.5139, ROC-AUC: 0.6287
Sex_F -> F1: 0.5139, ROC-AUC: 0.6287
Outcome ADHD -> F1: 0.8146, ROC-AUC: 0.8465

=== Fold 2 ===
Outcome ADHD -> F1: 0.8146, ROC-AUC: 0.8465

=== Fold 2 ===
Sex_F -> F1: 0.4946, ROC-AUC: 0.5423
Sex_F -> F1: 0.4946, ROC-AUC: 0.5423
Outcome ADHD -> F1: 0.8117, ROC-AUC: 0.8256

=== Fold 3 ===
Outcome ADHD -> F1: 0.8117, ROC-AUC: 0.8256

=== Fold 3 ===
Sex_F -> F1: 0.5357, ROC-AUC: 0.6349
Sex_F -> F1: 0.5357, ROC-AUC: 0.6349
Outcome ADHD -> F1: 0.8117, ROC-AUC: 0.7383

=== Fold 4 ===
Outcome ADHD -> F1: 0.8117, ROC-AUC: 0.7383

=== Fold 4 ===
Sex_F -> F1: 0.5235, ROC-AUC: 0.6267
Sex_F -> F1: 0.5235, ROC-AUC: 0.6267
Outcome ADHD -> F1: 0.8137, ROC-AUC: 0.8133

=== Fold 5 ===
Outcome ADHD -> F1: 0.8137, ROC-AUC: 0.8133

=== Fold 5 ===
Sex_F -> F1: 0.4453, ROC-AUC: 0.4910
Sex_F -> F1: 0.4453, ROC-AUC: 0.4910
Outcome ADHD -> F1: 0.8137, ROC-AUC: 0.7506

=== Fold 6 ===
Outcome ADHD -> F1: 0.8137, ROC-AUC: 0.7506

=== Fold 6 ===
Sex_F -> F1: 0

## only features_adhd

I tried it using only the features that I thought were important

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import brier_score_loss, f1_score
import numpy as np

FOLDS = 5
SEED = 42

# Initialize storage
scores_f1 = []
scores_brier = []

# Store OOF predictions
adhd_oof = np.zeros(len(y_adhd))

# Classification threshold
t_adhd = 0.4

# Stratified K-Fold
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

params = {
    "penalty": "l1",
    "cv": skf,
    "fit_intercept": True,
    "scoring": "f1",
    "random_state": SEED,
    "solver": "saga"
    }

model = LogisticRegressionCV(**params)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_adhd, y_adhd), 1):
    print(f"\n=== Fold {fold} ===")

    # Data split
    X_train, X_val = train_adhd.iloc[train_idx], train_adhd.iloc[val_idx]
    y_train = y_adhd.iloc[train_idx]
    y_val = y_adhd.iloc[val_idx]
    
    # Sample weights as in the first script
    weights_train = ((y_sex.iloc[train_idx] == 1) & (y_train == 1)).astype(int) + 1
    weights_val = ((y_sex.iloc[val_idx] == 1) & (y_val == 1)).astype(int) + 1

    # Column check
    missing = [col for col in features_adhd if col not in X_train.columns]
    if missing:
        print(f"Missing columns: {missing}")
    features_adhd_valid = [col for col in features_adhd if col in X_train.columns]

    # Fit and predict
    model.fit(X_train[features_adhd_valid], y_train, sample_weight=weights_train)
    pred_proba = model.predict_proba(X_val[features_adhd_valid])[:, 1]
    adhd_oof[val_idx] = pred_proba

    # Calculate scores
    brier = brier_score_loss(y_val, pred_proba)
    f1 = f1_score(y_val, (pred_proba > t_adhd).astype(int), sample_weight=weights_val)

    print(f"F1 Score: {f1:.4f}, Brier Score: {brier:.4f}")

    scores_f1.append(f1)
    scores_brier.append(brier)

# ===== Results Summary =====

scores_f1 = np.array(scores_f1)
scores_brier = np.array(scores_brier)

print("\n=== Final Evaluation ===")
print(f"Mean F1: {scores_f1.mean():.4f}")
print(f"Std F1: {scores_f1.std():.4f}")
print(f"Mean Brier: {scores_brier.mean():.4f}")
print(f"Std Brier: {scores_brier.std():.4f}")



=== Fold 1 ===
F1 Score: 0.8498, Brier Score: 0.2172

=== Fold 2 ===
F1 Score: 0.8498, Brier Score: 0.2172

=== Fold 2 ===
F1 Score: 0.8469, Brier Score: 0.2192

=== Fold 3 ===
F1 Score: 0.8469, Brier Score: 0.2192

=== Fold 3 ===
F1 Score: 0.8481, Brier Score: 0.2192

=== Fold 4 ===
F1 Score: 0.8481, Brier Score: 0.2192

=== Fold 4 ===
F1 Score: 0.8527, Brier Score: 0.2177

=== Fold 5 ===
F1 Score: 0.8527, Brier Score: 0.2177

=== Fold 5 ===
F1 Score: 0.8516, Brier Score: 0.2177

=== Final Evaluation ===
Mean F1: 0.8498
Std F1: 0.0021
Mean Brier: 0.2182
Std Brier: 0.0008
F1 Score: 0.8516, Brier Score: 0.2177

=== Final Evaluation ===
Mean F1: 0.8498
Std F1: 0.0021
Mean Brier: 0.2182
Std Brier: 0.0008


## all adhd data

Here are all the features

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import brier_score_loss, f1_score
import numpy as np

FOLDS = 5
SEED = 42

# Initialize storage
scores_f1 = []
scores_brier = []

# Store OOF predictions
adhd_oof = np.zeros(len(y_adhd))

# Classification threshold
t_adhd = 0.4

# Stratified K-Fold
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

params = {
    "penalty": "l1",
    "cv": skf,
    "fit_intercept": True,
    "scoring": "f1",
    "random_state": SEED,
    "solver": "saga"
}

model = LogisticRegressionCV(**params)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_adhd, y_adhd), 1):
    print(f"\n=== Fold {fold} ===")

    # Data split
    X_train, X_val = train_adhd.iloc[train_idx], train_adhd.iloc[val_idx]
    y_train = y_adhd.iloc[train_idx]
    y_val = y_adhd.iloc[val_idx]
    
    # Sample weights as in the first script
    weights_train = ((y_sex.iloc[train_idx] == 1) & (y_train == 1)).astype(int) + 1
    weights_val = ((y_sex.iloc[val_idx] == 1) & (y_val == 1)).astype(int) + 1

    
    # Fit and predict
    model.fit(X_train, y_train, sample_weight=weights_train)
    pred_proba = model.predict_proba(X_val)[:, 1]
    adhd_oof[val_idx] = pred_proba

    # Calculate scores
    brier = brier_score_loss(y_val, pred_proba)
    f1 = f1_score(y_val, (pred_proba > t_adhd).astype(int), sample_weight=weights_val)

    print(f"F1 Score: {f1:.4f}, Brier Score: {brier:.4f}")

    scores_f1.append(f1)
    scores_brier.append(brier)

# ===== Results Summary =====

scores_f1 = np.array(scores_f1)
scores_brier = np.array(scores_brier)

print("\n=== Final Evaluation ===")
print(f"Mean F1: {scores_f1.mean():.4f}")
print(f"Std F1: {scores_f1.std():.4f}")
print(f"Mean Brier: {scores_brier.mean():.4f}")
print(f"Std Brier: {scores_brier.std():.4f}")



=== Fold 1 ===
F1 Score: 0.8498, Brier Score: 0.2172

=== Fold 2 ===
F1 Score: 0.8498, Brier Score: 0.2172

=== Fold 2 ===
F1 Score: 0.8469, Brier Score: 0.1961

=== Fold 3 ===
F1 Score: 0.8469, Brier Score: 0.1961

=== Fold 3 ===
F1 Score: 0.8481, Brier Score: 0.2192

=== Fold 4 ===
F1 Score: 0.8481, Brier Score: 0.2192

=== Fold 4 ===
F1 Score: 0.8527, Brier Score: 0.2177

=== Fold 5 ===
F1 Score: 0.8527, Brier Score: 0.2177

=== Fold 5 ===
F1 Score: 0.8516, Brier Score: 0.2177

=== Final Evaluation ===
Mean F1: 0.8498
Std F1: 0.0021
Mean Brier: 0.2136
Std Brier: 0.0088
F1 Score: 0.8516, Brier Score: 0.2177

=== Final Evaluation ===
Mean F1: 0.8498
Std F1: 0.0021
Mean Brier: 0.2136
Std Brier: 0.0088


## prediction

In [None]:
test_adhd = test[features_adhd]
test_sex = test[important_features_sex]
# Drop only the columns from `trf` that are present in `train`
columns_to_drop = [col for col in trf.columns if col in test.columns]
test_adhd = train.drop(columns=columns_to_drop)

# Drop the target columns
test_adhd = test_adhd.drop(columns=target_cols)

## Submission file

In [None]:
# Ensure all arrays have the same length
if len(test_ids) == len(sex_pred_binary):
    # Create a DataFrame for submission
    submission = pd.DataFrame({
        'participant_id': test_ids,
         'ADHD_Outcome' : adhd_pred_binary ,
        'Sex_F': sex_pred_binary  
    })

    # Save the submission file
    submission.to_csv('submission_Sex_0.7416_adhd_0.76417.csv', index=False)
    print("Submission file for Sex created successfully!")

Submission file for Sex created successfully!
