# Import important libraries

In [76]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Preprocessing
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder

# Models
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.base import BaseEstimator, ClassifierMixin, clone

# Model Evalutaion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Read Files

In [87]:
df_connectcome = pd.read_csv("TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv")
df_cat = pd.read_excel("TRAIN_CATEGORICAL_METADATA_new.xlsx")
df_quant = pd.read_excel("TRAIN_QUANTITATIVE_METADATA_new.xlsx")
df_solutions = pd.read_excel("TRAINING_SOLUTIONS.xlsx")

df_cat.drop(columns = ['Basic_Demos_Enroll_Year', 'PreInt_Demos_Fam_Child_Ethnicity'], inplace = True)

tmp = pd.merge(df_cat, df_quant, on = 'participant_id')
tmp1 = pd.merge(tmp, df_connectcome, on = 'participant_id')
df = pd.merge(tmp1, df_solutions, on = 'participant_id')
df.set_index('participant_id', inplace = True)
y = df[['ADHD_Outcome', 'Sex_F']]
x = df.drop(columns = ['ADHD_Outcome', 'Sex_F'])

X_train, X_test, y_train, y_test, = train_test_split(x, y, test_size=0.3)

cat_col = df_cat.drop(columns = ['participant_id']).columns
quant_col = df_quant.drop(columns = 'participant_id').columns
df_connectcome.set_index('participant_id', inplace = True)

labels = ['ADHD', 'Sex']

# Handling missing data

In [91]:
def impute_miss_cat_mode (df):
    missing_percentage = df[cat_col].isna().mean() * 100
    for column, miss in missing_percentage.items():
        if miss < 5.0:
        # since the missing values are less than 5% of the data, the are considered to be missinf completely at randaom and therefore imputed by the mean
            mode_value = df[column].mode(dropna=True)[0]
            df[column].fillna(mode_value, inplace=True)
    return df

In [93]:
# percentage of missing data is > 5%
def custom_impute_cat(df):
    # Impute occupation if it's missing but education is available
    occ_by_edu = df.groupby('Barratt_Barratt_P2_Edu')['Barratt_Barratt_P2_Occ'].agg(lambda x: x.mode().iloc[0])
    mask_occ_missing = df['Barratt_Barratt_P2_Occ'].isna() & df['Barratt_Barratt_P2_Edu'].notna()
    df.loc[mask_occ_missing, 'Barratt_Barratt_P2_Occ'] = df.loc[mask_occ_missing, 'Barratt_Barratt_P2_Edu'].map(occ_by_edu)

    # Impute education if it's missing but occupation is available
    edu_by_occ = df.groupby('Barratt_Barratt_P2_Occ')['Barratt_Barratt_P2_Edu'].agg(lambda x: x.mode().iloc[0])
    mask_edu_missing = df['Barratt_Barratt_P2_Edu'].isna() & df['Barratt_Barratt_P2_Occ'].notna()
    df.loc[mask_edu_missing, 'Barratt_Barratt_P2_Edu'] = df.loc[mask_edu_missing, 'Barratt_Barratt_P2_Occ'].map(edu_by_occ)

    # If both are missing, fill with default values
    missing_both = df['Barratt_Barratt_P2_Edu'].isna() & df['Barratt_Barratt_P2_Occ'].isna()
    df.loc[missing_both, 'Barratt_Barratt_P2_Edu'] = 0
    df.loc[missing_both, 'Barratt_Barratt_P2_Occ'] = -1
    return df

In [94]:
def impute_miss_quant (df):
    missing_percentage = df[quant_col].isna().mean() * 100
    for column, miss in missing_percentage.items():
    # since the missing values are less than 5% of the data, the are considered to be missinf completely at randaom and therefore imputed by the mean
        if miss < 5.0:
            median_value = df[column].median(skipna=True)
            df[column] = df[column].fillna(median_value)
    return df

In [95]:
# percentage of missing data is > 5%
def custom_impute_quant (df):

    missing_indices = df[df['MRI_Track_Age_at_Scan'].isna()].index
    shuffled_indices = np.random.permutation(missing_indices)
    
    half = len(shuffled_indices) // 2
    first_half = shuffled_indices[:half]
    second_half = shuffled_indices[half:]
    
    df.loc[first_half, 'MRI_Track_Age_at_Scan'] = 9
    df.loc[second_half, 'MRI_Track_Age_at_Scan'] = 10
    return df

In [97]:
def handling_missing (df):
    df = impute_miss_cat_mode(df)
    df = custom_impute_cat(df)
    df = impute_miss_quant(df)
    df = custom_impute_quant(df)
    return df

# Scale the data

In [101]:
# scale quantitavie data only
def st_scale (df):
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[quant_col])
    tmp = pd.DataFrame(scaled_values, columns=quant_col, index=df.index)
    return pd.concat([df.drop(columns=quant_col), tmp] , axis = 1)

# SMOTE

In [103]:
# combine the output labels to one target label and balance the "not majority" class which all the classes that are males with ADHD = 10
def smote (x,y):

    y_combined = y['ADHD_Outcome'].astype(str) + "_" + y['Sex_F'].astype(str)
    smote = SMOTE(sampling_strategy='not majority', k_neighbors=5)
    X_res, y_combined_res = smote.fit_resample(x, y_combined)
    # after balancing the labels split back the target output into two columns
    y_split = y_combined_res.str.split("_", expand=True)
    y_res = pd.DataFrame({
        'ADHD_Outcome': y_split[0].astype(int),
        'Sex_F': y_split[1].astype(int) })

    return X_res, y_res

# ONE_HOT ENCODING

In [105]:
# one-hot encoding for all categoricak features
def one_hot_encoding(df):
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    df_enc = encoder.fit_transform(df[cat_col])
    encoded_cols = encoder.get_feature_names_out(cat_col)
    df_encoded = pd.DataFrame(df_enc, columns=encoded_cols, index=df.index)
    df_final = pd.concat([df.drop(columns=cat_col), df_encoded], axis=1)
    return df_final , encoder

# PREPROCESS

In [108]:
X_train = handling_missing(X_train)
X_train, enc = one_hot_encoding(X_train)
X_train = st_scale(X_train)
X_train,y_train = smote(X_train,y_train)

X_test = handling_missing(X_test)
t = pd.DataFrame(
    enc.transform(X_test[cat_col]),
    columns=enc.get_feature_names_out(cat_col),
    index=X_test.index
)
X_test= pd.concat([X_test.drop(columns=cat_col), t], axis = 1)
X_test = st_scale(X_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value, inplace=True)


# Custome Multioutput Classifier

In [115]:
class CustomMultiOutputClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators):
        """
        estimators: list of (name, estimator) tuples
        Each estimator will predict one column of the target.
        """
        self.estimators = estimators

    def fit(self, X, Y):
        """
        Fit each estimator to its respective output label (column of Y).
        """
        self.fitted_estimators_ = []
        for i, (name, estimator) in enumerate(self.estimators):
            est = clone(estimator)
            est.fit(X, Y.iloc[:, i])
            self.fitted_estimators_.append((name, est))
        return self

    def predict(self, X):
        """
        Predict each output label using its respective estimator.
        Returns a 2D numpy array: [n_samples, n_outputs]
        """
        preds = []
        for name, est in self.fitted_estimators_:
            pred = est.predict(X)
            preds.append(pred.reshape(-1, 1))  
        return np.hstack(preds)

lr_model = LogisticRegression(class_weight={0: 4, 1: 1 } , verbose=1, solver='saga', max_iter=700,penalty='l1') # solver='saga', max_iter=1000,penalty='l1'
xgb = xgb.XGBClassifier(scale_pos_weight= 0.25)

custom_model = CustomMultiOutputClassifier([
    ('ADHD_Outcome', xgb),    
    ('Sex_F', lr_model) 
])

custom_model.fit(X_train, y_train)

y_pred = custom_model.predict(X_test)

print("=== ADHD Classification Report ===")
print(classification_report(y_test['ADHD_Outcome'], y_pred[:, 0]))
print(confusion_matrix(y_test['ADHD_Outcome'], y_pred[:, 0]))

print("=== Gender Classification Report ===")
print(classification_report(y_test['Sex_F'], y_pred[:, 1]))
print(confusion_matrix(y_test['Sex_F'], y_pred[:, 1]))

max_iter reached after 541 seconds




=== ADHD Classification Report ===
              precision    recall  f1-score   support

           0       0.54      0.77      0.63        97
           1       0.90      0.76      0.82       267

    accuracy                           0.76       364
   macro avg       0.72      0.76      0.73       364
weighted avg       0.80      0.76      0.77       364

[[ 75  22]
 [ 65 202]]
=== Gender Classification Report ===
              precision    recall  f1-score   support

           0       0.75      0.90      0.82       233
           1       0.72      0.46      0.56       131

    accuracy                           0.74       364
   macro avg       0.74      0.68      0.69       364
weighted avg       0.74      0.74      0.72       364

[[210  23]
 [ 71  60]]
