In [5]:


# `train.py`
import argparse
import json
import os
from pathlib import Path
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, RocCurveDisplay
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Installed at import time: from here on, every warning whose category is
# UserWarning (or a subclass of it) is ignored for the whole process.
warnings.filterwarnings("ignore", category=UserWarning)

def load_dataset(path: str):
    """Read the CSV at *path* and split it into numeric features and target.

    The optional 'name' column (an identifier in the standard UCI file) is
    discarded, the 'status' column becomes the integer target
    (1 = PD, 0 = healthy), and every remaining non-numeric column is dropped.

    Returns:
        A ``(X, y, num_cols)`` tuple: the numeric feature frame, the target
        series, and the list of feature column names in frame order.

    Raises:
        ValueError: If the file has no 'status' column.
    """
    frame = pd.read_csv(path)

    # The identifier column is not a feature; ignore silently when absent.
    frame = frame.drop(columns=['name'], errors='ignore')

    if 'status' not in frame.columns:
        raise ValueError("Expected target column 'status' not found.")

    # pop() removes the target from the frame and hands it back in one step.
    target = frame.pop('status').astype(int)

    numeric_names = list(frame.select_dtypes(include=np.number).columns)
    features = frame[numeric_names]
    return features, target, numeric_names

def build_pipelines(numeric_features, random_state=42):
    """Build the candidate classification pipelines, keyed by model name.

    Each pipeline median-imputes and standard-scales the given numeric
    columns (all other columns are dropped) and then applies its classifier.

    Every pipeline receives its OWN preprocessor instance. Sharing a single
    ColumnTransformer object between pipelines means that fitting one
    pipeline refits the preprocessor embedded in the other, because a
    Pipeline fits the step objects it was given rather than copies of them.

    Args:
        numeric_features: Column names fed to the numeric preprocessing.
        random_state: Seed passed to both classifiers. Defaults to 42.

    Returns:
        dict mapping "svm_rbf" and "random_forest" to unfitted pipelines.
    """
    columns = list(numeric_features)

    def make_preprocessor():
        # Fresh, unshared imputer + scaler + column selector on every call.
        numeric_proc = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ])
        return ColumnTransformer(
            transformers=[("num", numeric_proc, list(columns))],
            remainder="drop",
        )

    svm_clf = Pipeline(steps=[
        ("pre", make_preprocessor()),
        ("clf", SVC(
            kernel="rbf",
            probability=True,
            class_weight="balanced",
            random_state=random_state,
        )),
    ])

    rf_clf = Pipeline(steps=[
        ("pre", make_preprocessor()),
        ("clf", RandomForestClassifier(
            n_estimators=400,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            class_weight="balanced",
            random_state=random_state,
            n_jobs=-1,
        )),
    ])

    return {
        "svm_rbf": svm_clf,
        "random_forest": rf_clf,
    }

def evaluate_model(model, X_train, X_test, y_train, y_test, name, outdir, plot=False):
    """Fit *model* on the training split and score it on the test split.

    The pipeline is fitted in place. Positive-class scores come from
    ``predict_proba`` when the final "clf" step offers it, otherwise from a
    min-max rescaled ``decision_function``; when neither exists no ROC-AUC
    is computed. With ``plot=True`` a confusion-matrix image and (if scores
    are available) a ROC-curve image are written into *outdir*.

    Returns:
        ``(metrics, y_pred, y_proba, model)`` where *metrics* is a dict of
        floats, *y_proba* may be ``None``, and *model* is the fitted pipeline.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Scores for the positive class, used for ROC-AUC and the ROC plot.
    classifier = model.named_steps["clf"]
    positive_scores = None
    if hasattr(classifier, "predict_proba"):
        positive_scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(classifier, "decision_function"):
        raw = model.decision_function(X_test)
        lowest, highest = raw.min(), raw.max()
        # Rescale margins to [0, 1]; the epsilon guards a zero range.
        positive_scores = (raw - lowest) / (highest - lowest + 1e-12)

    metrics = {"accuracy": float(accuracy_score(y_test, predictions))}
    for key, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        metrics[key] = float(scorer(y_test, predictions, zero_division=0))
    if positive_scores is not None:
        metrics["roc_auc"] = float(roc_auc_score(y_test, positive_scores))

    if plot:
        # Confusion matrix with the count written into every cell.
        matrix = confusion_matrix(y_test, predictions)
        cm_fig, cm_axes = plt.subplots()
        cm_axes.imshow(matrix, interpolation="nearest")
        cm_axes.set_title(f"Confusion Matrix - {name}")
        cm_axes.set_xlabel("Predicted")
        cm_axes.set_ylabel("True")
        for (row, col), count in np.ndenumerate(matrix):
            cm_axes.text(col, row, str(count), ha="center", va="center")
        cm_fig.tight_layout()
        cm_fig.savefig(os.path.join(outdir, f"confusion_matrix_{name}.png"), dpi=150)
        plt.close(cm_fig)

        # ROC curve, only when positive-class scores exist.
        if positive_scores is not None:
            roc_fig, roc_axes = plt.subplots()
            RocCurveDisplay.from_predictions(
                y_test, positive_scores, name=name, ax=roc_axes
            )
            roc_axes.set_title(f"ROC Curve - {name}")
            roc_fig.tight_layout()
            roc_fig.savefig(os.path.join(outdir, f"roc_{name}.png"), dpi=150)
            plt.close(roc_fig)

    return metrics, predictions, positive_scores, model

def cross_validate(model, X, y, cv):
    """Return the mean and standard deviation of the per-fold F1 scores.

    Scores come from ``cross_val_score`` with ``scoring="f1"`` over *cv*
    stratified, shuffled folds; the shuffle seed is fixed at 42.
    """
    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    fold_f1 = cross_val_score(model, X, y, cv=splitter, scoring="f1")
    mean_f1 = float(np.mean(fold_f1))
    std_f1 = float(np.std(fold_f1))
    return mean_f1, std_f1

def save_feature_importance_if_any(model, X_columns, outdir, tag):
    """Write the classifier's feature importances to CSV when it has any.

    This is a best-effort export: it never raises for the failure modes
    handled below, but unlike a silent ``except: pass`` it reports why an
    export that should have worked did not.

    Args:
        model: Pipeline-like object exposing ``named_steps["clf"]``.
        X_columns: Feature names, expected to line up one-to-one with the
            classifier's ``feature_importances_``.
        outdir: Directory receiving ``feature_importance_<tag>.csv``.
        tag: Suffix used in the output file name.

    Returns:
        True if the CSV was written. False if the model has no "clf" step,
        the classifier exposes no ``feature_importances_``, or building or
        writing the table failed (a RuntimeWarning is emitted in that case).
    """
    try:
        clf = model.named_steps["clf"]
    except (AttributeError, KeyError, TypeError):
        # Not a pipeline with a "clf" step: nothing to export.
        return False

    importances = getattr(clf, "feature_importances_", None)
    if importances is None:
        # Only some estimators (e.g. tree ensembles) expose importances.
        return False

    out_path = os.path.join(outdir, f"feature_importance_{tag}.csv")
    try:
        table = pd.DataFrame({
            "feature": list(X_columns),
            "importance": importances,
        }).sort_values("importance", ascending=False)
        table.to_csv(out_path, index=False)
    except (ValueError, TypeError, OSError) as exc:
        # E.g. the name list and the importance vector differ in length, or
        # the output directory is not writable. RuntimeWarning is used so
        # the message is not hidden by a UserWarning filter.
        warnings.warn(
            f"Could not save feature importances for '{tag}': {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return False
    return True

def main(argv=None):
    """Train the candidate models, compare them, and save the best one.

    Workflow: load the CSV, make a stratified train/test split, and for each
    pipeline run cross-validation on the training split followed by a fit
    and evaluation on the holdout split. Writes ``metrics.json``,
    ``report.txt`` and ``best_model.joblib`` (plus optional plots and a
    feature-importance CSV) into the output directory and prints the report.

    The best model is chosen by its cross-validated F1 on the training
    split. The holdout split is used only for reporting, so its score is
    not biased by having also driven the selection.

    Args:
        argv: Optional list of argument strings, e.g.
            ``["--data", "parkinsons.csv", "--plot"]``. ``None`` (default)
            lets argparse read the process's own command line. Passing a
            list makes the function callable from a notebook, where the
            process arguments belong to the kernel launcher.

    Note:
        ``--seed`` is passed only to the train/test split here.
    """
    parser = argparse.ArgumentParser(description="Train PD detection models on voice features.")
    parser.add_argument("--data", type=str, required=True, help="Path to parkinsons.csv")
    parser.add_argument("--outdir", type=str, default="./artifacts", help="Output directory")
    parser.add_argument("--test-size", type=float, default=0.2, help="Test size ratio")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument("--cv", type=int, default=5, help="K-folds for cross-validation")
    parser.add_argument("--plot", action="store_true", help="Save confusion matrix and ROC plots")
    args = parser.parse_args(argv)

    Path(args.outdir).mkdir(parents=True, exist_ok=True)

    X, y, num_cols = load_dataset(args.data)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size, random_state=args.seed, stratify=y
    )

    models = build_pipelines(num_cols)

    all_results = {}
    best_name = None
    best_model = None
    best_cv_f1 = float("-inf")

    for name, model in models.items():
        # Cross-val on train for robust selection
        cv_mean, cv_std = cross_validate(model, X_train, y_train, cv=args.cv)

        # Fit and evaluate on holdout test (reporting only)
        metrics, _, _, fitted = evaluate_model(
            model, X_train, X_test, y_train, y_test, name, args.outdir, plot=args.plot
        )

        all_results[name] = {
            "cv_f1_mean": cv_mean,
            "cv_f1_std": cv_std,
            **metrics,
        }

        # A NaN CV score must never beat a real one; the first model is
        # always accepted so that something is saved.
        selection_score = float("-inf") if np.isnan(cv_mean) else cv_mean
        if best_name is None or selection_score > best_cv_f1:
            best_cv_f1 = selection_score
            best_name = name
            best_model = fitted

    # Save metrics
    metrics_path = os.path.join(args.outdir, "metrics.json")
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2)

    # Save a text report
    lines = ["Model comparison (higher is better):\n"]
    for name, res in all_results.items():
        lines.append(
            f"- {name}: "
            f"F1={res['f1']:.4f}, "
            f"Acc={res['accuracy']:.4f}, "
            f"Prec={res['precision']:.4f}, "
            f"Rec={res['recall']:.4f}, "
            f"ROC-AUC={res.get('roc_auc', float('nan')):.4f}, "
            f"CV(F1)={res['cv_f1_mean']:.4f}±{res['cv_f1_std']:.4f}"
        )
    best = all_results[best_name]
    lines.append(
        f"\nBest model: {best_name} "
        f"(CV F1={best['cv_f1_mean']:.4f}, holdout F1={best['f1']:.4f})"
    )
    report_path = os.path.join(args.outdir, "report.txt")
    # The report contains a non-ASCII '±', so pin the encoding instead of
    # relying on the platform default.
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    # Save best model
    model_path = os.path.join(args.outdir, "best_model.joblib")
    joblib.dump(best_model, model_path)

    # Try to save feature importance if RF won
    saved_fi = save_feature_importance_if_any(best_model, num_cols, args.outdir, best_name)

    print("\n".join(lines))
    print(f"\nSaved best model to: {model_path}")
    print(f"Saved metrics to: {metrics_path}")
    print(f"Saved report to: {report_path}")
    if args.plot:
        print("Saved confusion matrix and ROC plots.")
    if saved_fi:
        print(f"Saved feature_importance_{best_name}.csv")

# Script entry point. main() parses the process's command-line arguments and
# --data is required, so running this inside a notebook kernel (where the
# process is ipykernel_launcher.py) makes argparse print its usage message
# and exit with status 2, as the captured output below shows.
if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] --data DATA [--outdir OUTDIR]
                             [--test-size TEST_SIZE] [--seed SEED] [--cv CV]
                             [--plot]
ipykernel_launcher.py: error: the following arguments are required: --data


SystemExit: 2