In [1]:
import pandas as pd

# Load the CSV files
df_solutions = pd.read_csv("TRAINING_SOLUTIONS.csv")
df_categorical = pd.read_csv("TRAIN_CATEGORICAL_METADATA.csv")
df_quantitative = pd.read_csv("TRAIN_QUANTITATIVE_METADATA.csv")

# Merge the metadata dataframes first
merged_df = df_categorical.merge(df_quantitative, on="participant_id", how="inner")

# Merge the solutions dataframe last
merged_df = merged_df.merge(df_solutions, on="participant_id", how="inner")

# Save the merged dataframe to a new CSV file
merged_df.to_csv("merged_output.csv", index=False)

# Display the first few rows of the merged dataframe
merged_df.head()


Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,...,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45,...,1,5,0,5,1,0,10,,1,1
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0,...,6,8,7,8,10,4,5,,1,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0,...,2,8,5,7,6,4,9,8.239904,1,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0,...,4,16,9,10,8,4,6,,1,1
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0,...,4,11,4,10,7,3,9,8.940679,1,1


In [2]:
# Create separate dataframes
X = merged_df.iloc[:, 2:-2]  # Drop first 2 and last 2 columns
y_adhd = merged_df.iloc[:, -2]  # Second-to-last column
y_f = merged_df.iloc[:, -1]  # Last column

In [3]:
from sklearn.preprocessing import StandardScaler

# Fill NaN values in X with the average of each column
X = X.fillna(X.mean())

X.head()

Unnamed: 0,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,EHQ_EHQ_Total,ColorVision_CV_Score,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,1,0.0,0,1,21,45,21,45,40.0,13,...,0,6,1,5,0,5,1,0,10,11.245678
1,3,1.0,2,3,15,15,0,0,-94.47,14,...,0,18,6,8,7,8,10,4,5,11.245678
2,1,1.0,8,1,18,40,0,0,-46.67,14,...,1,14,2,8,5,7,6,4,9,8.239904
3,3,0.0,8,3,15,30,18,0,-26.68,10,...,6,24,4,16,9,10,8,4,6,11.245678
4,3,0.0,1,3,15,20,0,0,0.0,14,...,1,18,4,11,4,10,7,3,9,8.940679


In [4]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_adhd, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Create separate dataframes
X = merged_df.iloc[:, 2:-2]  # Drop first 2 and last 2 columns
y_adhd = merged_df.iloc[:, -2]  # Second-to-last column
y_f = merged_df.iloc[:, -1]  # Last column

# Scale the data
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Number of folds
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Function to perform K-Fold cross-validation
def evaluate_model(X, y):
    metrics = []
    params = {
        'objective': 'binary',  # Binary classification
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
    }

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

        num_round = 100
        bst = lgb.train(params, train_data, num_round)

        y_pred = bst.predict(X_test)
        y_pred_labels = (y_pred > 0.5).astype(int)  # Convert probabilities to class labels

        accuracy = accuracy_score(y_test, y_pred_labels)
        metrics.append(accuracy)

    return np.max(metrics)

# Evaluate models for both target variables
best_accuracy_adhd = evaluate_model(X, y_adhd)
best_accuracy_f = evaluate_model(X, y_f)

print(f'Best Accuracy (ADHD): {best_accuracy_adhd:.4f}')
print(f'Best Accuracy (F): {best_accuracy_f:.4f}')


[LightGBM] [Info] Number of positive: 653, number of negative: 317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 684
[LightGBM] [Info] Number of data points in the train set: 970, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.673196 -> initscore=0.722675
[LightGBM] [Info] Start training from score 0.722675
[LightGBM] [Info] Number of positive: 666, number of negative: 304
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 684
[LightGBM] [Info] Number of data points in the train set: 970, number of used features: 26
[LightGBM] [Info] [binary:BoostFro

In [11]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import json


# Define hyperparameter grid
param_grid = {
    'objective': ['binary'],  # Fixed to 'binary'
    'metric': ['binary_logloss'],  # Fixed to 'binary_logloss'
    'boosting_type': ['gbdt', 'dart', 'goss'], # Experiment with different boosting types
    'num_leaves': [15, 31, 63, 127],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
}

# Function to perform K-Fold cross-validation with randomized search
def evaluate_model(X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Create LightGBM model
    model = lgb.LGBMClassifier(objective='binary', metric='binary_logloss', boosting_type='gbdt')

    # Perform randomized search
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=10,  # Number of iterations
        cv=kf,  # Use KFold object directly
        scoring='accuracy',
        random_state=42
    )

    random_search.fit(X, y)  # Fit on the entire data for hyperparameter tuning

    # Get best model and evaluate on the entire data
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X)
    y_pred_labels = (y_pred > 0.5).astype(int)
    best_accuracy = accuracy_score(y, y_pred_labels)

    return best_accuracy, best_model  # Return accuracy and best model

# Evaluate models for both target variables
best_accuracy_adhd, best_model_adhd = evaluate_model(X, y_adhd)
best_accuracy_f, best_model_f = evaluate_model(X, y_f)

# Get best hyperparameters
best_params_adhd = best_model_adhd.get_params()
best_params_f = best_model_f.get_params()

# Save hyperparameters to JSON files
with open('best_params_adhd.json', 'w') as f:
    json.dump(best_params_adhd, f, indent=4)

with open('best_params_f.json', 'w') as f:
    json.dump(best_params_f, f, indent=4)

print(f'Best Accuracy (ADHD): {best_accuracy_adhd:.4f}')
print(f'Best Accuracy (F): {best_accuracy_f:.4f}')
print("Best hyperparameters saved to best_params_adhd.json and best_params_f.json")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 344, number of negative: 626
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 684
[LightGBM] [Info] Number of data points in the train set: 970, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354639 -> initscore=-0.598709
[LightGBM] [Info] Start training from score -0.598709
[LightGBM] [Info] Number of positive: 316, number of negative: 654
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 684
[LightGBM] [Info] Number of data points in the train set: 