In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from google.colab import drive

In [None]:
# Mount Google Drive (forcing a fresh mount) and switch the working directory
# to the TRAIN folder, so later cells can open the data files by bare filename.
DRIVE_MOUNT_POINT = '/content/drive'
TRAIN_DIR = "/content/drive/MyDrive/TeamLimbic-BTT-SpringStudio/widsdatathon2025/TRAIN"

drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
os.chdir(TRAIN_DIR)

Mounted at /content/drive


In [None]:
# Read the four training tables from the current working directory.
# The first three are Excel workbooks; the connectome matrices are a CSV.
train_targets, train_categorical, train_quant = (
    pd.read_excel(workbook_name)
    for workbook_name in (
        "TRAINING_SOLUTIONS.xlsx",
        "TRAIN_CATEGORICAL_METADATA.xlsx",
        "TRAIN_QUANTITATIVE_METADATA.xlsx",
    )
)
train_connectome = pd.read_csv("TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv")

In [None]:
# Assemble one wide table: start from the categorical metadata and left-join,
# in order, the quantitative metadata, the fMRI connectome columns and the
# targets, all keyed on participant_id. The key is dropped afterwards because
# it only identifies the row.
train_data = (
    train_categorical
    .merge(train_quant, on="participant_id", how="left")
    .merge(train_connectome, on="participant_id", how="left")
    .merge(train_targets, on="participant_id", how="left")
    .drop(columns=["participant_id"])
)

# Replace missing values in every numeric column with that column's median.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split performed later in this notebook, so held-out rows
# contribute to the imputation values.
train_data = train_data.fillna(train_data.median(numeric_only=True))

In [None]:
from sklearn.model_selection import train_test_split

# Separate the two label columns from everything else.
label_columns = ["ADHD_Outcome", "Sex_F"]
y = train_data[label_columns]               # labels
X = train_data.drop(columns=label_columns)  # features only

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report (rows, columns) of the feature and label frames for each split.
for shape_label, split_X, split_y in (
    ("Train shape:", X_train, y_train),
    ("Test shape:", X_test, y_test),
):
    print(shape_label, split_X.shape, split_y.shape)

Train shape: (970, 19927) (970, 2)
Test shape: (243, 19927) (243, 2)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both the training and the held-out features.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# Baseline: one logistic regression per label column, trained on the scaled
# features. NOTE: despite the `rf` / `multi_rf` names, the estimator is a
# LogisticRegression wrapped in a MultiOutputClassifier.
rf = LogisticRegression(random_state=0)
multi_rf = MultiOutputClassifier(rf).fit(X_train_scaled, y_train)

# Predicted labels for the held-out rows; column order follows y_train's
# columns, as relied on by the evaluation cell.
y_pred = multi_rf.predict(X_test_scaled)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Per-target classification reports: each label column of y_test is compared
# with the matching column of the prediction array.
for report_title, label_name, pred_col in (
    ("ADHD Prediction Report:", "ADHD_Outcome", 0),
    ("Sex Prediction Report:", "Sex_F", 1),
):
    print(report_title)
    print(classification_report(y_test[label_name], y_pred[:, pred_col]))

# Accuracy over the two-column label frame.
# NOTE(review): for multi-label input scikit-learn documents this as subset
# (exact-match) accuracy, i.e. both targets must be right for a row to count.
overall_accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {overall_accuracy:.2f}")

ADHD Prediction Report:
              precision    recall  f1-score   support

           0       0.50      0.17      0.25        65
           1       0.76      0.94      0.84       178

    accuracy                           0.73       243
   macro avg       0.63      0.55      0.54       243
weighted avg       0.69      0.73      0.68       243

Sex Prediction Report:
              precision    recall  f1-score   support

           0       0.77      0.94      0.84       171
           1       0.69      0.33      0.45        72

    accuracy                           0.76       243
   macro avg       0.73      0.63      0.65       243
weighted avg       0.74      0.76      0.73       243

Overall Accuracy: 0.56


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the
# MRI_Track_Age_at_Scan column.
age_at_scan_summary = train_data["MRI_Track_Age_at_Scan"].describe()
print(age_at_scan_summary)

count    1213.000000
mean       11.095369
std         2.721666
min         0.000000
25%         9.583960
50%        10.739219
75%        11.966005
max        21.564453
Name: MRI_Track_Age_at_Scan, dtype: float64


In [None]:
# Show the full rows whose MRI_Track_Age_at_Scan value is exactly 0.
zero_age_mask = train_data["MRI_Track_Age_at_Scan"].eq(0)
print(train_data.loc[zero_age_mask])

      Basic_Demos_Enroll_Year  Basic_Demos_Study_Site  \
64                       2016                       1   
1192                     2016                       1   

      PreInt_Demos_Fam_Child_Ethnicity  PreInt_Demos_Fam_Child_Race  \
64                                 0.0                            0   
1192                               0.0                            0   

      MRI_Track_Scan_Location  Barratt_Barratt_P1_Edu  Barratt_Barratt_P1_Occ  \
64                          0                      15                       0   
1192                        0                      18                      40   

      Barratt_Barratt_P2_Edu  Barratt_Barratt_P2_Occ  EHQ_EHQ_Total  ...  \
64                        21                      35          40.00  ...   
1192                      18                      45          86.67  ...   

      195throw_198thcolumn  195throw_199thcolumn  196throw_197thcolumn  \
64               -0.008837             -0.116215              0.161

In [None]:
from scipy.stats import skew

# Sample skewness of MRI_Track_Age_at_Scan, computed on the non-missing values.
age_values = train_data["MRI_Track_Age_at_Scan"].dropna()
skewness = skew(age_values)
print(f"Skewness: {skewness:.4f}")

Skewness: 0.8045


In [None]:
# Fill missing values in two columns: the median for age at scan, and the
# most frequent value for child ethnicity.
# NOTE(review): an earlier cell already median-fills every numeric column of
# train_data, so these fills may find nothing left to replace. fillna only
# touches missing entries, so rows where the age is 0 are left unchanged.
age_median = train_data["MRI_Track_Age_at_Scan"].median()
train_data["MRI_Track_Age_at_Scan"] = train_data["MRI_Track_Age_at_Scan"].fillna(age_median)

ethnicity_mode = train_data["PreInt_Demos_Fam_Child_Ethnicity"].mode()[0]
train_data["PreInt_Demos_Fam_Child_Ethnicity"] = train_data["PreInt_Demos_Fam_Child_Ethnicity"].fillna(ethnicity_mode)

In [None]:
from scipy.stats import zscore
# Screen rows for outliers with a |z| > 3 rule.
#
# Why the screen is limited to the quantitative-metadata columns: train_data
# holds roughly 19.9k numeric columns (connectome features, coded categorical
# fields and the two targets). Asking whether ANY of those exceeds |z| > 3
# flags every single row, which makes the result meaningless.
# NOTE(review): assumes the columns of train_quant are the continuous
# measures worth screening -- confirm against the data dictionary.
numerical_columns = train_data.select_dtypes(include=['number']).columns  # kept: read by the IQR cell

# Columns that came from the quantitative table and are still present as
# numeric columns in train_data (participant_id has been dropped by then).
outlier_columns = [col for col in train_quant.columns if col in numerical_columns]

# Column-wise z-scores for the screened columns only.
z_scores = train_data[outlier_columns].apply(zscore)

# A row is an outlier if any screened column lies more than 3 standard
# deviations from that column's mean.
outliers = (z_scores.abs() > 3).any(axis=1)
outliers_data = train_data[outliers]

print(f"Number of outliers detected: {outliers_data.shape[0]}")


Number of outliers detected: 1213


In [None]:
# Drop rows that fall outside the 1.5 * IQR fences.
#
# Why the fences are limited to the quantitative-metadata columns: applying
# the rule to all ~19.9k numeric columns (connectome features, coded
# categoricals, targets) and dropping a row if ANY column is out of range
# removes every row, leaving an empty frame.
# NOTE(review): assumes the columns of train_quant are the continuous
# measures worth screening -- confirm against the data dictionary.
iqr_columns = [col for col in train_quant.columns if col in numerical_columns]

Q1 = train_data[iqr_columns].quantile(0.25)
Q3 = train_data[iqr_columns].quantile(0.75)
IQR = Q3 - Q1

# Per-column lower/upper fences; comparisons align on column labels.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outside_fences = train_data[iqr_columns].lt(lower_fence) | train_data[iqr_columns].gt(upper_fence)

# Keep only the rows where no screened column is outside its fences.
filtered_data = train_data[~outside_fences.any(axis=1)]
print(f"Data shape after removing outliers: {filtered_data.shape}")

Data shape after removing outliers: (0, 19929)


In [None]:
# Class balance check: for each target column, print how many rows carry
# each label value.
for target_column in ('ADHD_Outcome', 'Sex_F'):
    print(train_data[target_column].value_counts())


ADHD_Outcome
1    831
0    382
Name: count, dtype: int64
Sex_F
0    797
1    416
Name: count, dtype: int64


In [None]:
# Recreate the 80/20 split with the same seed as the earlier split cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One separately configured logistic regression per target, instead of a
# single MultiOutputClassifier sharing one base model (the alternative that
# was previously tried in this cell).
adhd_model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=10000, random_state=0)
sex_model = LogisticRegression(penalty=None, solver='newton-cg', max_iter=10000, random_state=0)

# Fit each model on its own label column, then predict the held-out rows.
# Note that the unscaled X_train / X_test frames are used here.
y_pred_adhd = adhd_model.fit(X_train, y_train['ADHD_Outcome']).predict(X_test)
y_pred_sex = sex_model.fit(X_train, y_train['Sex_F']).predict(X_test)

# Per-target accuracy.
accuracy_adhd = accuracy_score(y_test['ADHD_Outcome'], y_pred_adhd)
accuracy_sex = accuracy_score(y_test['Sex_F'], y_pred_sex)
print(f"Accuracy for ADHD_Outcome: {accuracy_adhd:.4f}")
print(f"Accuracy for Sex_F: {accuracy_sex:.4f}")

# Overall accuracy: the share of held-out rows where both targets are
# predicted correctly.
adhd_hits = y_test['ADHD_Outcome'] == y_pred_adhd
sex_hits = y_test['Sex_F'] == y_pred_sex
overall_correct = adhd_hits & sex_hits
overall_accuracy = overall_correct.mean()
print(f"Overall Accuracy: {overall_accuracy:.2f}")

Accuracy for ADHD_Outcome: 0.8025
Accuracy for Sex_F: 0.7037
Overall Accuracy: 0.58


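Testing different solver/penalty combinations (C=1.0). Each entry lists, in order: ADHD_Outcome accuracy, Sex_F accuracy, overall accuracy.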


---


newton-cg (l2): 0.7984, 0.7695, 0.61

newton-cg (None): 0.7901, 0.7860, 0.61


---


lbfgs (l2): 0.7942, 0.7695, 0.60

lbfgs (None): 0.778, 0.7449, 0.58


---


sag (l2): 0.7984, 0.6996, 0.57

sag (None): 0.7984, 0.6996, 0.57


---


newton-cholesky (l2): 0.7901, 0.7695, 0.61

newton-cholesky (None): 0.7778, 0.7449, 0.58

---


saga (l1): 0.8025, 0.7078, 0.58

saga (l2): 0.8025, 0.7160, 0.59

saga (elasticnet, l1_ratio=0.5): 0.8025, 0.7078, 0.58

saga (None): 0.8025, 0.7119, 0.58



---


liblinear (l1): 0.8066, 0.6955, 0.58

liblinear (l2): 0.7942, 0.7695, 0.60