In [79]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [80]:
# 1. LOAD THE DATASET
# ============================================================
# Keep the untouched frame under its own name so the raw data stays
# inspectable after the identifier column is removed.
raw_df = pd.read_csv("../dataset/parkinson_disease/parkinsons.csv")
print(raw_df.shape)

# "name" is dropped because it is not used as a predictor.
df = raw_df.drop(columns=["name"])

# Preview goes last: a notebook cell only renders its final expression.
df.head()

(195, 24)


In [81]:
# 2. SELECT TOP 15 FEATURES
# ============================================================
# Hand-listed column subset used as model input. The same list is stored
# with the saved model so the column order can be reproduced at inference.
top15_features = [
    "MDVP:Fo(Hz)", "MDVP:Fhi(Hz)", "MDVP:Flo(Hz)",
    "MDVP:Jitter(%)", "MDVP:Jitter(Abs)", "MDVP:RAP", "MDVP:PPQ", "Jitter:DDP",
    "MDVP:Shimmer", "MDVP:Shimmer(dB)", "Shimmer:APQ5", "MDVP:APQ",
    "NHR", "HNR",
    "PPE",
]

# Target column: 1 = Parkinson's, 0 = healthy
y = df["status"]
X = df.loc[:, top15_features]

print("\nUsing features:", top15_features)
print("X shape:", X.shape, "y shape:", y.shape)


Using features: ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ5', 'MDVP:APQ', 'NHR', 'HNR', 'PPE']
X shape: (195, 15) y shape: (195,)


In [82]:
# 5. DEFINE BASE MODELS
# ============================================================
# Both base learners are wrapped in a Pipeline so that standardization is
# fitted only on the data each estimator is trained on. Nothing is fitted in
# this cell; training happens in the cross-validation and evaluation cells.

# SVM (RBF). probability=True is required because the stacking model
# consumes predict_proba outputs from its base estimators.
svm = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", SVC(kernel="rbf", C=3, gamma="scale", probability=True, random_state=42))
])

# XGBoost
xgb = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        tree_method="hist",
        random_state=42
    ))
])

In [83]:
# 6. STACKING MODEL
# ============================================================
# Level-0 learners: the two scaled pipelines defined earlier.
estimators = [("svm", svm), ("xgb", xgb)]

# Level-1 learner: logistic regression trained on the base models'
# predicted class probabilities.
meta_learner = LogisticRegression(max_iter=1000, random_state=42)

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    stack_method="predict_proba",
    cv=5,
)

In [84]:
# Pipeline that is scored by cross-validation: standardize, then stack.
# NOTE(review): each base estimator inside `stack` already standardizes its
# own input, so this outer scaler looks redundant - confirm before removing,
# since the step names ("scaler", "stack") may be relied on elsewhere.
model = Pipeline([
    ("scaler", StandardScaler()),
    ("stack", stack)
])

In [85]:
# Stratified 5-fold CV on the full dataset, with a fixed seed so the fold
# assignment is reproducible.
# NOTE(review): if the data holds several recordings per subject, a grouped
# splitter would be needed to keep one subject out of both train and
# validation folds - confirm against the dataset description.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One multi-metric pass: every fold is fitted once and scored with all three
# metrics, instead of refitting the whole stacked ensemble once per metric.
cv_results = cross_validate(model, X, y, cv=skf, scoring=["accuracy", "roc_auc", "f1"])

# Per-fold score arrays, kept under the same names as before.
cv_accuracy = cv_results["test_accuracy"]
cv_auc = cv_results["test_roc_auc"]
cv_f1 = cv_results["test_f1"]

print("CV Accuracy:", cv_accuracy.mean())
print("CV ROC-AUC:", cv_auc.mean())
print("CV F1:", cv_f1.mean())

CV Accuracy: 0.887179487179487
CV ROC-AUC: 0.9511877394636015
CV F1: 0.9298793485234164


In [86]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both partitions, and the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [87]:
# Fit the stacking ensemble on the training split.
# NOTE(review): the evaluation loop further down calls fit() again on every
# entry of `models`, including `stack`, so this fit appears to be redundant.
stack.fit(X_train, y_train)

In [88]:
# 7. EVALUATE ALL MODELS
# ============================================================
# Display name -> estimator. Insertion order matters: it is the order in which
# results are printed and the order later iterated when picking a winner.
models = {
    "SVM": svm,
    "XGBoost": xgb,
    "Stacking Model": stack
}

# The loop variable is intentionally not called `model`: that name already
# holds the cross-validation pipeline, and rebinding it here would make a
# re-run of the CV cell silently score a different estimator.
for name, estimator in models.items():
    # Train on the training split, then score on the held-out test split.
    estimator.fit(X_train, y_train)
    preds = estimator.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"\n=== {name} ===")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
    print("Classification Report:\n", classification_report(y_test, preds))


=== SVM ===
Accuracy: 0.8717948717948718
Confusion Matrix:
 [[ 6  4]
 [ 1 28]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.60      0.71        10
           1       0.88      0.97      0.92        29

    accuracy                           0.87        39
   macro avg       0.87      0.78      0.81        39
weighted avg       0.87      0.87      0.86        39


=== XGBoost ===
Accuracy: 0.9230769230769231
Confusion Matrix:
 [[ 8  2]
 [ 1 28]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.80      0.84        10
           1       0.93      0.97      0.95        29

    accuracy                           0.92        39
   macro avg       0.91      0.88      0.90        39
weighted avg       0.92      0.92      0.92        39


=== Stacking Model ===
Accuracy: 0.9230769230769231
Confusion Matrix:
 [[ 8  2]
 [ 1 28]]
Classification Report:
               preci

In [89]:
# 8. PICK BEST MODEL
# ============================================================
# Test-set accuracy per fitted model, in the insertion order of `models`.
# NOTE(review): choosing the winner on the test split makes the reported
# test accuracy optimistic - consider selecting on the CV scores instead.
test_scores = {
    label: accuracy_score(y_test, fitted.predict(X_test))
    for label, fitted in models.items()
}

# max() returns the first key with the highest score, so ties go to the
# model listed earliest in `models`.
best_name = max(test_scores, key=test_scores.get)
best_model = models[best_name]

print("\nBest model:", best_name)

# Refit the winner on the training split (it is already fitted on the same
# data by the evaluation loop).
best_model.fit(X_train, y_train)


Best model: XGBoost


In [None]:
# 9. SAVE FINAL MODEL ARTIFACTS
# ============================================================
# Bundle the fitted estimator with the feature list it was trained on, so a
# consumer can rebuild the input columns in the right order.
artifacts = {
    "model": best_model,
    "features": top15_features,
}

# Single source of truth for the destination, so the directory that is
# created, the file that is written and the message that is printed cannot
# drift apart.
output_dir = "../saved_mdl"
output_path = f"{output_dir}/parkinsons_model.pkl"

# os.makedirs (not os.makedir, which does not exist) creates the directory
# and any missing parents; exist_ok=True makes the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)
joblib.dump(artifacts, output_path)

print(f"\nSaved 15-feature Parkinson's model to {output_path}")


Saved 15-feature Parkinson's model to ../saved_models/parkinsons_model.pkl
