In [44]:
import pandas as pd

# Read the three training tables. Each one carries a participant_id column,
# which is the join key used below.
df_solutions, df_categorical, df_quantitative = (
    pd.read_csv(csv_path)
    for csv_path in (
        "TRAINING_SOLUTIONS.csv",
        "TRAIN_CATEGORICAL_METADATA.csv",
        "TRAIN_QUANTITATIVE_METADATA.csv",
    )
)

# Join order matters for column order: categorical metadata first, then the
# quantitative metadata, and the solution (label) columns last. Inner joins
# keep only participants present in all three tables.
merged_df = (
    df_categorical
    .merge(df_quantitative, how="inner", on="participant_id")
    .merge(df_solutions, how="inner", on="participant_id")
)

# Persist the joined table (without the positional index) for reuse.
merged_df.to_csv("merged_output.csv", index=False)

# Preview the first rows of the joined table.
merged_df.head()


Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,...,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,2016,1,0.0,0,1,21,45,21,45,...,1,5,0,5,1,0,10,,1,1
1,CPaeQkhcjg7d,2019,3,1.0,2,3,15,15,0,0,...,6,8,7,8,10,4,5,,1,0
2,Nb4EetVPm3gs,2016,1,1.0,8,1,18,40,0,0,...,2,8,5,7,6,4,9,8.239904,1,0
3,p4vPhVu91o4b,2018,3,0.0,8,3,15,30,18,0,...,4,16,9,10,8,4,6,,1,1
4,M09PXs7arQ5E,2019,3,0.0,1,3,15,20,0,0,...,4,11,4,10,7,3,9,8.940679,1,1


In [45]:
# Split the merged table into the feature matrix and the two targets.
#
# Columns are selected by name rather than by position: positional slicing
# silently picks the wrong columns (or leaks a label into X) as soon as the
# CSV column order changes or a column is added. With the column layout
# shown in the preview of merged_df, this selects exactly the same columns
# as the former `iloc[:, 2:-2]`, `iloc[:, -2]` and `iloc[:, -1]`.
NON_FEATURE_COLUMNS = ["participant_id", "Basic_Demos_Enroll_Year"]
TARGET_COLUMNS = ["ADHD_Outcome", "Sex_F"]

# drop() raises KeyError if any of the named columns is missing, so a schema
# change fails loudly here instead of producing a mis-shaped X.
X = merged_df.drop(columns=NON_FEATURE_COLUMNS + TARGET_COLUMNS)
y_adhd = merged_df["ADHD_Outcome"]  # ADHD label
y_f = merged_df["Sex_F"]  # sex label

In [46]:
from sklearn.preprocessing import StandardScaler

# Mean imputation: every missing entry is replaced by the mean of its own
# column.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split performed in the following cell, so held-out rows
# contribute to the imputed values. Consider fitting the imputation on the
# training split only.
column_means = X.mean()
X = X.fillna(value=column_means)

X.head()

Unnamed: 0,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ,EHQ_EHQ_Total,ColorVision_CV_Score,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,1,0.0,0,1,21,45,21,45,40.0,13,...,0,6,1,5,0,5,1,0,10,11.245678
1,3,1.0,2,3,15,15,0,0,-94.47,14,...,0,18,6,8,7,8,10,4,5,11.245678
2,1,1.0,8,1,18,40,0,0,-46.67,14,...,1,14,2,8,5,7,6,4,9,8.239904
3,3,0.0,8,3,15,30,18,0,-26.68,10,...,6,24,4,16,9,10,8,4,6,11.245678
4,3,0.0,1,3,15,20,0,0,0.0,14,...,1,18,4,11,4,10,7,3,9,8.940679


In [47]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
# Hold out 20% of the rows for evaluation of the ADHD target; the fixed seed
# keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_adhd,
    test_size=0.2,
    random_state=42,
)

# Learn the standardization statistics from the training rows only, then
# apply the same transform to both splits. From here on X_train and X_test
# are the standardized arrays returned by the scaler, not the original
# DataFrames.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [62]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Standardize the splits for the scale-sensitive models.
# NOTE(review): X_train / X_test were already standardized in the split
# cell, so this second scaler should leave them essentially unchanged, and
# the "unscaled" branch of the loop below also receives standardized data.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers. Apart from the arguments spelled out here (seeds,
# verbosity switches, iteration caps, MLP layer size, SVC probability, XGBoost
# eval metric) every model runs with its library defaults.
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Ridge Classifier": RidgeClassifier(),
    "Perceptron": Perceptron(max_iter=1000, tol=1e-3, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "MLP Neural Network": MLPClassifier(
        hidden_layer_sizes=(50,), max_iter=1000, random_state=42
    ),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Bagging": BaggingClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
    "LightGBM": LGBMClassifier(verbose=-1, random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
}

# Models that are fitted on the *_scaled arrays; all others get X_train / X_test.
scaled_model_names = {
    "Logistic Regression",
    "SVM",
    "KNN",
    "Perceptron",
    "MLP Neural Network",
    "Ridge Classifier",
}

# Fit each model on the training split and record its F1 score (positive
# class = 1) on the held-out split. Models are fitted in dictionary order.
f1_scores = {}
for name, model in models.items():
    wants_scaled = name in scaled_model_names
    fit_features = X_train_scaled if wants_scaled else X_train
    eval_features = X_test_scaled if wants_scaled else X_test

    model.fit(fit_features, y_train)
    y_pred = model.predict(eval_features)
    f1_scores[name] = f1_score(y_test, y_pred, pos_label=1)

# Report one line per model, four decimals.
print("F1 Scores:")
for name, score in f1_scores.items():
    print(f"{name}: {score:.4f}")

print("All models trained and evaluated successfully.")




F1 Scores:
Logistic Regression: 0.8712
Ridge Classifier: 0.8681
Perceptron: 0.8306
Decision Tree: 0.8136
Random Forest: 0.8564
SVM: 0.8686
Naive Bayes: 0.8257
KNN: 0.8602
MLP Neural Network: 0.8286
AdaBoost: 0.8618
Bagging: 0.8389
XGBoost: 0.8392
LightGBM: 0.8787
CatBoost: 0.8750
All models trained and evaluated successfully.
