In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import json

# Load and preprocess data
# Load the CSV files
df_solutions = pd.read_csv("TRAINING_SOLUTIONS.csv")
df_categorical = pd.read_csv("TRAIN_CATEGORICAL_METADATA.csv")
df_quantitative = pd.read_csv("TRAIN_QUANTITATIVE_METADATA.csv")

# Merge the metadata dataframes first
merged_df = df_categorical.merge(df_quantitative, on="participant_id", how="inner")

# Merge the solutions dataframe last
merged_df = merged_df.merge(df_solutions, on="participant_id", how="inner")

# Display the first few rows of the merged dataframe
merged_df.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,...,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45,...,1,5,0,5,1,0,10,,1,1
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0,...,6,8,7,8,10,4,5,,1,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0,...,2,8,5,7,6,4,9,8.239904,1,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0,...,4,16,9,10,8,4,6,,1,1
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0,...,4,11,4,10,7,3,9,8.940679,1,1


In [2]:
# Create separate dataframes
X = merged_df.iloc[:, 2:-2]  # Drop first 2 and last 2 columns
y_adhd = merged_df.iloc[:, -2]  # Second-to-last column
y_f = merged_df.iloc[:, -1]  # Last column

In [3]:
# Fill NaN values with column means
X = X.fillna(X.mean())

X.head()

Unnamed: 0,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,EHQ_EHQ_Total,ColorVision_CV_Score,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,1,0.0,0,1,21,45,21,45,40.0,13,...,0,6,1,5,0,5,1,0,10,11.245678
1,3,1.0,2,3,15,15,0,0,-94.47,14,...,0,18,6,8,7,8,10,4,5,11.245678
2,1,1.0,8,1,18,40,0,0,-46.67,14,...,1,14,2,8,5,7,6,4,9,8.239904
3,3,0.0,8,3,15,30,18,0,-26.68,10,...,6,24,4,16,9,10,8,4,6,11.245678
4,3,0.0,1,3,15,20,0,0,0.0,14,...,1,18,4,11,4,10,7,3,9,8.940679


In [4]:
# Scale the features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# More conservative hyperparameter grid
param_grid = {
    'objective': ['binary'],
    'metric': ['binary_logloss'],
    'boosting_type': ['gbdt'],
    # Reduce model complexity
    'num_leaves': [3, 5, 7],  # Much smaller trees
    'max_depth': [2, 3],  # Shallower trees
    'min_child_samples': [50, 100, 200],  # Require more samples per leaf
    # Stronger regularization
    'reg_alpha': [1.0, 2.0, 5.0],  # Stronger L1
    'reg_lambda': [1.0, 2.0, 5.0],  # Stronger L2
    # More aggressive feature and data sampling
    'colsample_bytree': [0.5, 0.6, 0.7],  # Use fewer features per tree
    'subsample': [0.5, 0.6, 0.7],  # Use less data per tree
    'bagging_freq': [1],  # Perform bagging at every iteration
    # Learning rate and iterations
    'learning_rate': [0.005, 0.01],  # Smaller learning rate
    'n_estimators': [500, 1000],  # More iterations to compensate
    # Additional anti-overfitting parameters
    'min_data_in_leaf': [100, 200],  # Require more data in leaves
    'min_split_gain': [0.5, 1.0],  # Require larger gains to split
    'path_smooth': [1.0, 5.0],  # Add path smoothing
    'feature_fraction_seed': [42]
}

In [5]:
def evaluate_model(X, y):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_roc_scores = []
    fold_val_roc_scores = []
    fold_acc_scores = []
    fold_val_acc_scores = []
    
    # More conservative base model
    model = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        boosting_type='gbdt',
        verbose=-1,
        class_weight='balanced',
        importance_type='gain',  # Track feature importance
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),  # More patient early stopping
            lgb.log_evaluation(period=0)
        ]
    )
    
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=30,  # More hyperparameter search iterations
        cv=skf,
        scoring='roc_auc',
        random_state=42,
        n_jobs=-1,
        refit=True
    )
    
    # Add feature selection based on variance
    from sklearn.feature_selection import VarianceThreshold
    selector = VarianceThreshold(threshold=0.01)  # Remove low-variance features
    X_selected = pd.DataFrame(selector.fit_transform(X), 
                            columns=X.columns[selector.get_support()])
    
    final_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=43)
    for fold, (train_idx, val_idx) in enumerate(final_skf.split(X_selected, y)):
        X_train, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Calculate and apply class weights
        class_counts = np.bincount(y_train)
        class_weights = len(y_train) / (2 * class_counts)
        sample_weights = np.array([class_weights[yi] for yi in y_train])
        
        eval_set = [(X_val.values, y_val.values)]
        
        random_search.fit(
            X_train,
            y_train,
            sample_weight=sample_weights,
            eval_set=eval_set
        )
        
        best_model = random_search.best_estimator_
        
        # Get predictions
        train_pred_proba = best_model.predict_proba(X_train)[:, 1]
        val_pred_proba = best_model.predict_proba(X_val)[:, 1]
        train_pred = best_model.predict(X_train)
        val_pred = best_model.predict(X_val)
        
        # Calculate metrics
        train_roc = roc_auc_score(y_train, train_pred_proba)
        val_roc = roc_auc_score(y_val, val_pred_proba)
        train_acc = accuracy_score(y_train, train_pred)
        val_acc = accuracy_score(y_val, val_pred)
        
        fold_roc_scores.append(train_roc)
        fold_val_roc_scores.append(val_roc)
        fold_acc_scores.append(train_acc)
        fold_val_acc_scores.append(val_acc)
        
        print(f'Fold {fold + 1}:')
        print(f'Training ROC-AUC: {train_roc:.4f}')
        print(f'Validation ROC-AUC: {val_roc:.4f}')
        print(f'Training Accuracy: {train_acc:.4f}')
        print(f'Validation Accuracy: {val_acc:.4f}')
        print(f'ROC-AUC Gap: {train_roc - val_roc:.4f}')
        print(f'Accuracy Gap: {train_acc - val_acc:.4f}')
        print(f'Best parameters for this fold: {random_search.best_params_}\n')
    
    print('Average Scores:')
    print(f'Training ROC-AUC: {np.mean(fold_roc_scores):.4f} ± {np.std(fold_roc_scores):.4f}')
    print(f'Validation ROC-AUC: {np.mean(fold_val_roc_scores):.4f} ± {np.std(fold_val_roc_scores):.4f}')
    print(f'Training Accuracy: {np.mean(fold_acc_scores):.4f} ± {np.std(fold_acc_scores):.4f}')
    print(f'Validation Accuracy: {np.mean(fold_val_acc_scores):.4f} ± {np.std(fold_val_acc_scores):.4f}')
    
    # Print best accuracies
    print(f'Best Training Accuracy: {max(fold_acc_scores):.4f}')
    print(f'Best Validation Accuracy: {max(fold_val_acc_scores):.4f}')
    
    # Get feature importances
    feature_imp = pd.DataFrame({
        'feature': X_selected.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    print('\nTop 10 Most Important Features:')
    print(feature_imp.head(10))
    
    results = {
        'roc_auc': np.mean(fold_val_roc_scores),
        'accuracy': np.mean(fold_val_acc_scores),
        'model': random_search.best_estimator_,
        'params': random_search.best_params_,
        'feature_importance': feature_imp
    }
    
    return results

In [6]:
# Evaluate models for both target variables
print("Evaluating ADHD model...")
results_adhd = evaluate_model(X, y_adhd)

print("\nEvaluating F model...")
results_f = evaluate_model(X, y_f)

# Save best parameters
with open('best_params_adhd.json', 'w') as f:
    json.dump(results_adhd['params'], f, indent=4)

with open('best_params_f.json', 'w') as f:
    json.dump(results_f['params'], f, indent=4)

print("\nFinal Results:")
print("ADHD Model:")
print(f"ROC-AUC: {results_adhd['roc_auc']:.4f}")
print(f"Accuracy: {results_adhd['accuracy']:.4f}")
print("\nF Model:")
print(f"ROC-AUC: {results_f['roc_auc']:.4f}")
print(f"Accuracy: {results_f['accuracy']:.4f}")
print("\nBest hyperparameters saved to best_params_adhd.json and best_params_f.json")

Evaluating ADHD model...


  _data = np.array(data, dtype=dtype, copy=copy,


Fold 1:
Training ROC-AUC: 0.8524
Validation ROC-AUC: 0.8569
Training Accuracy: 0.6969
Validation Accuracy: 0.6872
ROC-AUC Gap: -0.0045
Accuracy Gap: 0.0097
Best parameters for this fold: {'subsample': 0.7, 'reg_lambda': 1.0, 'reg_alpha': 1.0, 'path_smooth': 1.0, 'objective': 'binary', 'num_leaves': 3, 'n_estimators': 500, 'min_split_gain': 0.5, 'min_data_in_leaf': 100, 'min_child_samples': 100, 'metric': 'binary_logloss', 'max_depth': 2, 'learning_rate': 0.005, 'feature_fraction_seed': 42, 'colsample_bytree': 0.6, 'boosting_type': 'gbdt', 'bagging_freq': 1}



  _data = np.array(data, dtype=dtype, copy=copy,


Fold 2:
Training ROC-AUC: 0.8537
Validation ROC-AUC: 0.8422
Training Accuracy: 0.6866
Validation Accuracy: 0.6831
ROC-AUC Gap: 0.0115
Accuracy Gap: 0.0035
Best parameters for this fold: {'subsample': 0.7, 'reg_lambda': 2.0, 'reg_alpha': 1.0, 'path_smooth': 1.0, 'objective': 'binary', 'num_leaves': 3, 'n_estimators': 500, 'min_split_gain': 1.0, 'min_data_in_leaf': 100, 'min_child_samples': 50, 'metric': 'binary_logloss', 'max_depth': 3, 'learning_rate': 0.005, 'feature_fraction_seed': 42, 'colsample_bytree': 0.6, 'boosting_type': 'gbdt', 'bagging_freq': 1}



  _data = np.array(data, dtype=dtype, copy=copy,


Fold 3:
Training ROC-AUC: 0.8724
Validation ROC-AUC: 0.8158
Training Accuracy: 0.7227
Validation Accuracy: 0.6708
ROC-AUC Gap: 0.0567
Accuracy Gap: 0.0519
Best parameters for this fold: {'subsample': 0.7, 'reg_lambda': 5.0, 'reg_alpha': 2.0, 'path_smooth': 1.0, 'objective': 'binary', 'num_leaves': 7, 'n_estimators': 1000, 'min_split_gain': 0.5, 'min_data_in_leaf': 100, 'min_child_samples': 200, 'metric': 'binary_logloss', 'max_depth': 2, 'learning_rate': 0.005, 'feature_fraction_seed': 42, 'colsample_bytree': 0.6, 'boosting_type': 'gbdt', 'bagging_freq': 1}

Fold 4:
Training ROC-AUC: 0.8592
Validation ROC-AUC: 0.8342
Training Accuracy: 0.7034
Validation Accuracy: 0.6983
ROC-AUC Gap: 0.0250
Accuracy Gap: 0.0051
Best parameters for this fold: {'subsample': 0.7, 'reg_lambda': 2.0, 'reg_alpha': 1.0, 'path_smooth': 1.0, 'objective': 'binary', 'num_leaves': 3, 'n_estimators': 500, 'min_split_gain': 1.0, 'min_data_in_leaf': 100, 'min_child_samples': 50, 'metric': 'binary_logloss', 'max_depth'