# In [None]:
import pandas as pd
import numpy as np
import joblib
import json
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Read the raw dataset from its CSV export into a DataFrame.
# NOTE(review): the location is an absolute, machine-specific path.
raw_data_path = r"C:\Users\04ama\Downloads\raw adhd data\raw_data.csv"
df = pd.read_csv(raw_data_path)

# Feature engineering
def handedness_group(score):
    """Bucket a handedness score into a numeric group code.

    Args:
        score: Scalar handedness score; may be missing (NaN/None).

    Returns:
        0.0 (left) when ``score <= -0.5``, 2.0 (right) when
        ``score >= 0.5``, 1.0 (ambidextrous) for anything in between, and
        NaN when the score is missing, so that the KNN imputation step
        later in the script can fill the group in.
    """
    # NaN compares False against both thresholds, so without this guard a
    # missing score would silently be labelled ambidextrous (1.0) instead
    # of being left for the imputer.
    if pd.isna(score):
        return np.nan
    if score <= -0.5:
        return 0.0  # Left
    if score >= 0.5:
        return 2.0  # Right
    return 1.0  # Ambidextrous

# Derive the numeric handedness bucket from the EHQ total, row by row.
df['Handedness_Group'] = df['EHQ_EHQ_Total'].apply(handedness_group)

# Discard the scan-age column; errors='ignore' makes this a no-op when the
# column is not present.
df = df.drop(columns=['MRI_Track_Age_at_Scan'], errors='ignore')

# Group the feature columns by name prefix.
quant_prefixes = ('APQ_', 'SDQ_', 'EHQ_', 'ColorVision')
cat_prefixes = ('PreInt_', 'Basic_')
quant_cols = [name for name in df.columns if name.startswith(quant_prefixes)]
# Any column whose name contains 'Handedness' is treated as categorical,
# which includes the Handedness_Group column created above.
cat_cols = [
    name for name in df.columns
    if name.startswith(cat_prefixes) or 'Handedness' in name
]
# Connectome columns are selected purely by position (columns 1..19901).
# NOTE(review): presumably column 0 is the participant id and the connectome
# features follow it - confirm the slice bounds against the CSV layout.
conn_cols = df.columns[1:19902].tolist()

# Prune quantitative features whose absolute correlation with a feature
# listed earlier in quant_cols is at least 0.7.
quant_df = df[quant_cols].copy()
corr = quant_df.corr().abs()
# Keep only the strict upper triangle so each pair is inspected once and
# the diagonal (self-correlation) is ignored; masked cells become NaN.
upper_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
upper = corr.where(upper_mask)
exceeds_threshold = (upper >= 0.7).any(axis=0)
to_drop = [column for column, flagged in exceeds_threshold.items() if flagged]
df = df.drop(columns=to_drop)
dropped = set(to_drop)
quant_cols = [col for col in quant_cols if col not in dropped]

# Persist the three column lists as JSON, one file per list.
# NOTE(review): conn_cols was built before the correlated columns were
# dropped, so it is written out exactly as originally sliced.
for _filename, _columns in (
    ('quant_cols.json', quant_cols),
    ('cat_cols.json', cat_cols),
    ('conn_cols.json', conn_cols),
):
    with open(_filename, 'w') as f:
        json.dump(_columns, f)

# 1. Impute missing values
# A single 5-neighbour KNN imputer is fitted on the quantitative and
# categorical columns together, then saved to disk.
# NOTE(review): the imputer is fitted on the whole frame, before the
# train/test split performed further down, so test rows take part in the fit.
# NOTE(review): KNNImputer fills with an average over neighbours, so imputed
# categorical codes may be fractional - confirm this is intended before they
# are one-hot encoded.
impute_cols = quant_cols + cat_cols
imputer = KNNImputer(n_neighbors=5)
df[impute_cols] = imputer.fit_transform(df[impute_cols])
joblib.dump(imputer, 'imputer.joblib')

# Report how many missing values remain in each column group.
print("\n2. AFTER IMPUTATION:")
print(f"Quant cols NaN: {df[quant_cols].isnull().sum().sum()}")
print(f"Cat cols NaN: {df[cat_cols].isnull().sum().sum()}")

# 2. Scale quantitative columns
# Fit the scaler on the quantitative columns, overwrite them with the
# standardized values, and save the fitted scaler.
scaler = StandardScaler()
scaler.fit(df[quant_cols])
df[quant_cols] = scaler.transform(df[quant_cols])
joblib.dump(scaler, 'scaler.joblib')

# 3. One-hot encode categorical columns
# The encoder returns a dense array and ignores categories it has not seen
# at transform time; the encoded columns replace the originals in df.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[cat_cols])
encoded_df = pd.DataFrame(
    encoded,
    index=df.index,
    columns=encoder.get_feature_names_out(cat_cols),
)
df = pd.concat([df.drop(columns=cat_cols), encoded_df], axis=1)
joblib.dump(encoder, 'encoder.joblib')

# 4. PCA for connectome
# Replace the connectome columns with their first 10 principal components
# (named conn_pca_1 .. conn_pca_10) and save the fitted PCA. When there are
# no connectome columns nothing is fitted and pca_cols stays empty.
# NOTE(review): no random_state is passed to PCA; if sklearn's automatic
# solver choice selects a randomized solver the components may vary between
# runs - confirm reproducibility requirements.
n_conn_components = 10
if not conn_cols:
    pca_cols = []
else:
    pca = PCA(n_components=n_conn_components)
    conn_pca = pca.fit_transform(df[conn_cols])
    pca_cols = [f'conn_pca_{i+1}' for i in range(n_conn_components)]
    conn_pca_df = pd.DataFrame(conn_pca, index=df.index, columns=pca_cols)
    df = pd.concat([df.drop(columns=conn_cols), conn_pca_df], axis=1)
    joblib.dump(pca, 'pca_connectome.joblib')

# Final NaN check
# Report the number of missing cells left in the whole frame and, if there
# are any, fill them: numeric columns with their column median, then
# whatever is still missing with 0.
# NOTE(review): this runs over every column of df, including the target
# column if it contains missing values.
print(f"Final NaN check: {df.isnull().sum().sum()}")
if df.isnull().to_numpy().any():
    print("Filling remaining NaN values...")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_medians = df[numeric_cols].median()
    df[numeric_cols] = df[numeric_cols].fillna(numeric_medians)
    df = df.fillna(0)  # covers non-numeric columns
    print(f"NaN after final cleanup: {df.isnull().sum().sum()}")

# 5. Modelling
# Separate the target from the features; the participant id is dropped from
# the features when present (errors='ignore' tolerates its absence).
target_col = 'ADHD_Outcome'
y = df[target_col]
X = df.drop(columns=[target_col, 'participant_id'], errors='ignore')

# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Tune a class-weight-balanced logistic regression with 5-fold
# cross-validation over regularisation strength and solver, scored by F1.
lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'lbfgs'],
)
grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

# Score the held-out set with the best estimator, turning the probability
# of class 1 into a label with a custom cut-off instead of the default 0.5.
threshold = 0.45
best_model = grid_search.best_estimator_
y_test_proba = best_model.predict_proba(X_test)[:, 1]
y_test_pred_custom = (y_test_proba >= threshold).astype(int)

# Evaluation: per-class report, confusion matrix and macro-averaged F1.
# NOTE(review): the grid search optimised scoring='f1' while macro F1 is
# what gets reported here - confirm which metric is the one of interest.
report = classification_report(y_test, y_test_pred_custom)
cm = confusion_matrix(y_test, y_test_pred_custom)
macro_f1 = f1_score(y_test, y_test_pred_custom, average='macro')
print(report)
print("Confusion Matrix:\n", cm)
print("Macro F1-score:", macro_f1)

# Persist the tuned estimator and, alongside it, the decision threshold
# that was used for the evaluation above.
joblib.dump(grid_search.best_estimator_, 'logistic_regression_model.joblib')
threshold_payload = {'threshold': threshold}
with open('adhd_lr_threshold.json', 'w') as f:
    f.write(json.dumps(threshold_payload))