In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

# --- STEP 1: Load and Process MRI Data ---
mri_file_path = "TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv"
mri_data_numeric = pd.read_csv(mri_file_path)

# Set the participant IDs aside when the column exists, leaving only the
# remaining columns in the frame; the IDs are reused for merging later.
# When the column is absent, participant_ids stays None.
participant_ids = None
if "participant_id" in mri_data_numeric.columns:
    # pop() removes the column from the frame and returns it as a Series
    participant_ids = mri_data_numeric.pop("participant_id")

# --- STEP 2: Load and Merge Categorical & Quantitative Data ---
df_categorical = pd.read_csv("TRAIN_CATEGORICAL_METADATA.csv")
df_quantitative = pd.read_csv("TRAIN_QUANTITATIVE_METADATA.csv")
df_solutions = pd.read_csv("TRAINING_SOLUTIONS.csv")

# Inner-join the two metadata tables, then the labels, on participant_id.
# Only participants present in all three tables are kept.
merged_metadata = (
    df_categorical
    .merge(df_quantitative, on="participant_id", how="inner")
    .merge(df_solutions, on="participant_id", how="inner")
)

# --- STEP 3: Train-Test Split (Prevents Data Leakage) ---
# Put the participant IDs back onto the MRI table so it can be joined by key
if participant_ids is not None:
    mri_data_numeric["participant_id"] = participant_ids

# Inner-join the metadata (labels included) with the MRI data
merged_df = pd.merge(merged_metadata, mri_data_numeric, on="participant_id", how="inner")

# Pull out the two label columns
y_adhd, y_f = merged_df["ADHD_Outcome"], merged_df["Sex_F"]

# Features are everything except the ID and the label columns
X_data = merged_df.drop(columns=["participant_id", "ADHD_Outcome", "Sex_F"])

# Hold out 20% of the rows, stratified on the ADHD label, before any
# scaler or model is fitted so test rows never influence fitting.
split_options = dict(test_size=0.2, random_state=42, stratify=y_adhd)
X_train, X_test, y_train, y_test = train_test_split(X_data, y_adhd, **split_options)

# --- STEP 4: Apply RBM for Feature Extraction ---
# Identify the MRI columns. Columns carrying the "mri_" prefix are used when
# present; otherwise fall back to every feature column that came from the MRI
# table, so the step does not depend on one particular naming scheme.
mri_features = [col for col in X_train.columns if col.startswith("mri_")]
if not mri_features:
    mri_features = [col for col in mri_data_numeric.columns if col in X_train.columns]
if not mri_features:
    raise ValueError(
        "No MRI feature columns found: expected columns prefixed with 'mri_' "
        "or columns originating from the MRI table."
    )

# BernoulliRBM treats its inputs as binary values or probabilities in [0, 1]
# (see the scikit-learn documentation). Standardized data is unbounded and
# contains negative values, so rescale each feature to [0, 1] instead.
scaler = MinMaxScaler()

# Fit the scaler on the training rows only; test values that fall outside the
# training min/max would map outside [0, 1], so clip them back into range.
X_train_mri = scaler.fit_transform(X_train[mri_features])
X_test_mri = np.clip(scaler.transform(X_test[mri_features]), 0.0, 1.0)

# Learn 100 hidden components on the training rows, then project both splits
rbm = BernoulliRBM(n_components=100, learning_rate=0.01, n_iter=10, random_state=42)
X_train_rbm_features = rbm.fit_transform(X_train_mri)
X_test_rbm_features = rbm.transform(X_test_mri)

# Wrap the RBM outputs in DataFrames that reuse the original row indices so
# the index-based join below lines rows up correctly.
rbm_feature_names = [f"rbm_feature_{i}" for i in range(X_train_rbm_features.shape[1])]
X_train_rbm_df = pd.DataFrame(X_train_rbm_features, columns=rbm_feature_names, index=X_train.index)
X_test_rbm_df = pd.DataFrame(X_test_rbm_features, columns=rbm_feature_names, index=X_test.index)

# Replace the raw MRI columns with the extracted RBM features
X_train = X_train.drop(columns=mri_features).join(X_train_rbm_df)
X_test = X_test.drop(columns=mri_features).join(X_test_rbm_df)

# --- STEP 5: Preprocess Categorical and Numeric Features ---
# Take both column lists from X_train, the frame the transformer is fitted
# on. X_data still contains the raw MRI columns that were dropped from
# X_train, so selecting from it could name columns that no longer exist.
categorical_features = X_train.select_dtypes(include=["object"]).columns.tolist()
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Standardize numeric columns and one-hot encode object columns. Categories
# seen only at transform time are ignored rather than raising an error.
# NOTE(review): columns that are neither numeric nor object dtype appear in
# neither list - confirm that leaving them out is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Fit on training data and transform both train and test
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# --- STEP 6: Train Binary Classifier ---
# fit() returns the estimator itself, so construction and training are chained
classifier = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_processed, y_train
)

# --- STEP 7: Evaluate Model ---
# Predict on the held-out rows and score the predictions against y_test
y_pred = classifier.predict(X_test_processed)
accuracy, report = (
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
)

print("Accuracy on Test Set:", accuracy)
print("Classification Report:\n", report)


KeyError: 'adhd_label'