# 🧠 Parkinson's Disease Detection using Voice & Machine Learning
This notebook covers the complete ML pipeline for Parkinson's Disease detection using voice features.
We'll explore multiple models, perform evaluation, and export the best-performing one.

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
import joblib


In [None]:
# Read the raw dataset from disk and preview its first rows
DATA_PATH = "data/parkinsons.data"
df = pd.read_csv(DATA_PATH)
df.head(5)


In [None]:
# Drop unnecessary columns
df = df.drop(['name'], axis=1)

# Separate features and target
X = df.drop(['status'], axis=1)
y = df['status']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Handle imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.2, stratify=y_resampled, random_state=42)


In [None]:
# One fixed seed for every stochastic estimator so that a fresh
# Restart & Run All reproduces the same scores.
SEED = 42

# Candidate classifiers, keyed by the display name used in reports and plots.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    # probability=True is required for predict_proba; the probability
    # calibration shuffles data internally, hence the seed.
    "SVM": SVC(probability=True, random_state=SEED),
    "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                             random_state=SEED)
}

# Per-model evaluation artifacts: fitted estimator, ROC AUC, report, confusion matrix.
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Second column of predict_proba is used as the score for roc_auc_score.
    y_proba = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_proba)
    results[name] = {
        "Model": model,
        "ROC AUC": score,
        "Report": classification_report(y_test, y_pred, output_dict=True),
        "Confusion": confusion_matrix(y_test, y_pred)
    }

# Show a compact, sorted score table instead of dumping the whole nested dict
# (estimators, full reports and matrices remain available in `results`).
pd.Series(
    {name: entry["ROC AUC"] for name, entry in results.items()},
    name="ROC AUC",
).sort_values(ascending=False).to_frame()


In [None]:
# Draw one ROC curve per fitted model on a shared set of axes
fig, ax = plt.subplots()

for model_name, entry in results.items():
    positive_scores = entry["Model"].predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    ax.plot(fpr, tpr, label=f"{model_name} (AUC = {entry['ROC AUC']:.2f})")

# Dashed diagonal as a visual reference line
ax.plot([0, 1], [0, 1], "k--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves")
ax.legend()
ax.grid()
plt.show()


In [None]:
# Select the model with the highest ROC AUC rather than assuming a winner.
best_name = max(results, key=lambda model_name: results[model_name]["ROC AUC"])
best_model = results[best_name]["Model"]
print(f"Best model: {best_name} (ROC AUC = {results[best_name]['ROC AUC']:.3f})")

# joblib.dump does not create missing directories, so ensure the target exists.
model_dir = Path("models")
model_dir.mkdir(parents=True, exist_ok=True)

# Persist the model together with the scaler, so the saved artifacts include
# the same feature scaling that was applied before fitting.
joblib.dump(best_model, model_dir / "final_model.pkl")
joblib.dump(scaler, model_dir / "scaler.pkl")
