In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from scipy.stats import uniform, randint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
# Load the Parkinson's dataset from a CSV file expected in the notebook's
# working directory (the path is relative).
parkinsons_data = pd.read_csv(filepath_or_buffer='parkinsons.csv')

In [3]:
# Preview the first five rows to sanity-check the columns that were read.
parkinsons_data.head(n=5)

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [4]:
# Dimensions of the loaded table as a (rows, columns) tuple.
parkinsons_data.shape

(195, 24)

In [5]:
# Number of rows per `status` label; shows how balanced the two classes are.
parkinsons_data.loc[:, 'status'].value_counts()

1    147
0     48
Name: status, dtype: int64

In [6]:
# Per-column count of missing values (all zeros means nothing to impute).
parkinsons_data.isnull().sum()

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

##### Split data into features and labels

In [7]:
# Labels: the `status` column.
Y = parkinsons_data['status']
# Features: every remaining column except `name`, which holds string
# recording labels rather than a numeric measurement.
X = parkinsons_data.drop(['status', 'name'], axis=1)

##### Standardize the data

In [8]:
# Fit the scaler on the raw feature columns taken straight from
# `parkinsons_data` rather than on `X`. This cell rebinds `X` to the
# standardized array, so fitting on `X` would make a second run of the cell
# refit the scaler on already-standardized values and silently break any
# later `scaler.transform(...)` call on raw measurements.
raw_features = parkinsons_data.drop(columns=['status', 'name'])
scaler = StandardScaler()
standardized_data = scaler.fit_transform(raw_features)
X = standardized_data
Y = parkinsons_data['status']
# NOTE(review): the scaler is fitted on every row, including rows that are
# later placed in the test split, so test-set statistics leak into the
# features. Fitting on the training rows only needs the split to happen
# before this cell.
print(X,Y)

[[-0.82929965 -0.43616456 -0.95203729 ...  0.48047686 -0.21053082
   0.86888575]
 [-0.77097169 -0.53097409 -0.05772056 ...  1.31118546  0.27507712
   1.80360503]
 [-0.90947638 -0.7231683  -0.10987483 ...  1.01768236 -0.10362861
   1.40266141]
 ...
 [ 0.49557839  0.47010361 -0.96839309 ... -0.81807931  0.78033848
  -0.83241014]
 [ 1.07876114  2.19004398 -0.95417967 ... -0.22906571 -0.63700298
  -0.92610456]
 [ 1.45481664  0.69224632 -0.88348115 ... -0.43085284  0.45480231
  -0.64505466]] 0      1
1      1
2      1
3      1
4      1
      ..
190    0
191    0
192    0
193    0
194    0
Name: status, Length: 195, dtype: int64


##### Train test split

In [9]:
# Hold out 20% of the rows for testing. `stratify=Y` keeps the class
# proportions of `Y` in both parts, and the fixed `random_state` makes the
# split (and every metric computed from it) reproducible across kernel
# restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.2, stratify=Y, random_state=42
)

In [10]:
# Shapes of the full, training and test feature matrices, in that order.
print(*(matrix.shape for matrix in (X, X_train, X_test)))

(195, 22) (156, 22) (39, 22)


###### Hyperparameter tuned models

In [11]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix 
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate hyperparameter values for the coarse (randomized) search stage.

# Logistic regression: penalty type, inverse regularization strength, solver.
logistic_param_grid_coarse = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)

# XGBoost: three candidate values for each of six booster settings.
xgboost_param_grid_coarse = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    subsample=[0.5, 0.8, 1.0],
    colsample_bytree=[0.5, 0.8, 1.0],
)

# SVM: regularization strength, kernel coefficient and kernel type.
svm_param_grid_coarse = dict(
    C=[0.1, 1, 10],
    gamma=[0.01, 0.1, 1],
    kernel=['linear', 'rbf'],
)

# RandomizedSearchCV for each model (coarse search).
# Each search draws up to 30 candidate settings and scores them with 5-fold
# cross-validation on all available cores. `random_state` is fixed so the
# sampled candidates (notably the 30 drawn from the much larger XGBoost grid)
# are the same on every run; without it the chosen hyperparameters can change
# between kernel restarts.
logistic_random_search_coarse = RandomizedSearchCV(
    LogisticRegression(), logistic_param_grid_coarse,
    n_iter=30, cv=5, n_jobs=-1, random_state=42,
)
xgboost_random_search_coarse = RandomizedSearchCV(
    XGBClassifier(objective='binary:logistic'), xgboost_param_grid_coarse,
    n_iter=30, cv=5, n_jobs=-1, random_state=42,
)
svm_random_search_coarse = RandomizedSearchCV(
    svm.SVC(), svm_param_grid_coarse,
    n_iter=30, cv=5, n_jobs=-1, random_state=42,
)

# Fit models using RandomizedSearchCV (coarse search)
logistic_random_search_coarse.fit(X_train, Y_train)
xgboost_random_search_coarse.fit(X_train, Y_train)
svm_random_search_coarse.fit(X_train, Y_train)

# Best hyperparameters found by each coarse search; these seed the fine grids.
best_logistic_params_coarse = logistic_random_search_coarse.best_params_
best_xgboost_params_coarse = xgboost_random_search_coarse.best_params_
best_svm_params_coarse = svm_random_search_coarse.best_params_

# Define parameter grids for GridSearchCV (fine search), centred on the best
# coarse-search settings.
#
# The coarse grids for `C` and `gamma` are spaced a factor of 10 apart, so
# multiplying the best value by 0.1 and 10 mostly lands back on values the
# coarse stage already contained and refines nothing. These sub-decade
# multipliers probe the space between neighbouring coarse values instead.
_fine_factors = [0.25, 0.5, 1, 2, 4]

logistic_param_grid_fine = {
    'penalty': [best_logistic_params_coarse['penalty']],
    'C': [best_logistic_params_coarse['C'] * factor for factor in _fine_factors],
    'solver': [best_logistic_params_coarse['solver']]
}

# XGBoost: only the learning rate is varied (half, same, double); every other
# setting is pinned to its coarse-search winner.
xgboost_param_grid_fine = {
    'learning_rate': [best_xgboost_params_coarse['learning_rate'] * i for i in [0.5, 1, 2]],
    'n_estimators': [best_xgboost_params_coarse['n_estimators']],
    'max_depth': [best_xgboost_params_coarse['max_depth']],
    'min_child_weight': [best_xgboost_params_coarse['min_child_weight']],
    'subsample': [best_xgboost_params_coarse['subsample']],
    'colsample_bytree': [best_xgboost_params_coarse['colsample_bytree']]
}

svm_param_grid_fine = {
    'C': [best_svm_params_coarse['C'] * factor for factor in _fine_factors],
    'gamma': [best_svm_params_coarse['gamma'] * factor for factor in _fine_factors],
    'kernel': [best_svm_params_coarse['kernel']]
}

# Fine stage: exhaustive 5-fold grid search per model over its fine grid,
# fitted on the training split. `fit` returns the search object itself, so
# construction and fitting are chained.
logistic_grid_search_fine = GridSearchCV(
    LogisticRegression(), param_grid=logistic_param_grid_fine, cv=5, n_jobs=-1
).fit(X_train, Y_train)
xgboost_grid_search_fine = GridSearchCV(
    XGBClassifier(objective='binary:logistic'), param_grid=xgboost_param_grid_fine, cv=5, n_jobs=-1
).fit(X_train, Y_train)
svm_grid_search_fine = GridSearchCV(
    svm.SVC(), param_grid=svm_param_grid_fine, cv=5, n_jobs=-1
).fit(X_train, Y_train)

# Report the winning hyperparameters of each fine search.
for _label, _search in (
    ("Logistic Regression Best Parameters (Fine Search):", logistic_grid_search_fine),
    ("XGBoost Best Parameters (Fine Search):", xgboost_grid_search_fine),
    ("SVM Best Parameters (Fine Search):", svm_grid_search_fine),
):
    print(_label, _search.best_params_)

# Mean cross-validated score of each model's best candidate, kept for the
# model comparison that follows.
logistic_cv_score_fine, xgboost_cv_score_fine, svm_cv_score_fine = (
    logistic_grid_search_fine.best_score_,
    xgboost_grid_search_fine.best_score_,
    svm_grid_search_fine.best_score_,
)

# Pick the model whose fine search reached the highest cross-validated score.
# Ties resolve in the order logistic regression, then XGBoost, then SVM.
_logistic_wins = (
    logistic_cv_score_fine >= xgboost_cv_score_fine
    and logistic_cv_score_fine >= svm_cv_score_fine
)
_xgboost_wins = (
    xgboost_cv_score_fine >= logistic_cv_score_fine
    and xgboost_cv_score_fine >= svm_cv_score_fine
)

if _logistic_wins:
    _winning_search = logistic_grid_search_fine
elif _xgboost_wins:
    _winning_search = xgboost_grid_search_fine
else:
    _winning_search = svm_grid_search_fine

# `best_estimator_` is the winning search's best candidate after refitting.
best_model_fine = _winning_search.best_estimator_

# Accuracy of the selected model on the data it was trained on and on the
# held-out test split; a large gap between the two suggests overfitting.
train_accuracy_fine = best_model_fine.score(X_train, Y_train)
test_accuracy_fine = best_model_fine.score(X_test, Y_test)

print("Best Model Train Accuracy (Fine Search):", train_accuracy_fine)
print("Best Model Test Accuracy (Fine Search):", test_accuracy_fine)






Logistic Regression Best Parameters (Fine Search): {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
XGBoost Best Parameters (Fine Search): {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
SVM Best Parameters (Fine Search): {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best Model Train Accuracy (Fine Search): 1.0
Best Model Test Accuracy (Fine Search): 0.9743589743589743


In [12]:
# Predicted labels of the selected model for the held-out test split.
y_pred = best_model_fine.predict(X_test)

# Test-set classification metrics.
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
conf_matrix = confusion_matrix(Y_test, y_pred)

# Report the scalar metrics one per line, then the confusion matrix.
for _metric_label, _metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(_metric_label, _metric_value)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.9743589743589743
Precision: 1.0
Recall: 0.9655172413793104
F1 Score: 0.9824561403508771
Confusion Matrix:
[[10  0]
 [ 1 28]]


##### SVM

In [11]:
# Define the SVM classifier. `probability=True` enables `predict_proba`,
# which the ROC AUC computation later in the notebook relies on.
SVM_classifier = svm.SVC(probability=True)

# Candidate hyperparameter values for RandomizedSearchCV
random_param_dist = {'C': [0.1, 1, 10, 100],
                     'gamma': [0.01, 0.1, 1, 'scale', 'auto'],
                     'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
                    }

# Perform RandomizedSearchCV with 5-fold cross-validated accuracy.
# The grid above holds 4 * 5 * 4 = 80 combinations, fewer than `n_iter=100`.
# `random_state` is fixed so the candidate order, and therefore which of
# several equally scoring candidates is reported as best, is reproducible.
random_search = RandomizedSearchCV(
    SVM_classifier, random_param_dist, n_iter=100, scoring='accuracy', cv=5,
    random_state=42
)

random_search.fit(X_train, Y_train)

# Display the best parameters from RandomizedSearchCV
print("Best Parameters from RandomizedSearchCV:", random_search.best_params_)




Best Parameters from RandomizedSearchCV: {'kernel': 'rbf', 'gamma': 0.1, 'C': 10}


In [12]:
# Build a grid holding only the randomized search's best value for each
# hyperparameter. With one candidate per parameter the grid search evaluates
# a single combination: it cross-validates that setting and refits it.
param_grid = {
    param_name: [random_search.best_params_[param_name]]
    for param_name in ('C', 'gamma', 'kernel')
}

# 5-fold cross-validated grid search scored by accuracy.
grid_search = GridSearchCV(SVM_classifier, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, Y_train)

# Report the selected parameters and their mean cross-validated accuracy.
print("Best Parameters from GridSearchCV:", grid_search.best_params_)
print("Best Accuracy Score from GridSearchCV:", grid_search.best_score_)


Best Parameters from GridSearchCV: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best Accuracy Score from GridSearchCV: 0.9169354838709678


##### Model Eval

In [13]:
# The estimator refitted by the grid search is taken as the final SVM model.
final_model = grid_search.best_estimator_

# Accuracy on the training split (the data the model was fitted on).
y_pred_train = final_model.predict(X_train)
train_accuracy = accuracy_score(y_true=Y_train, y_pred=y_pred_train)
print("Final Model Accuracy on Train Set:", train_accuracy)

# Accuracy on the held-out test split.
y_pred_test = final_model.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred_test)
print("Final Model Accuracy on Test Set:", accuracy)

Final Model Accuracy on Train Set: 1.0
Final Model Accuracy on Test Set: 0.9487179487179487


In [14]:
# Test-set metrics for the final SVM model, computed from its predicted labels.
accuracy = accuracy_score(Y_test, y_pred_test)
precision = precision_score(Y_test, y_pred_test)
recall = recall_score(Y_test, y_pred_test)
f1 = f1_score(Y_test, y_pred_test)

# ROC AUC uses the second column of `predict_proba` as the score
# (assumes binary classification, where that column is the class-1 probability).
_class_one_proba = final_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(Y_test, _class_one_proba)

# Confusion matrix for the same test predictions.
conf_matrix = confusion_matrix(Y_test, y_pred_test)

# Report the scalar metrics one per line.
for _metric_label, _metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("ROC AUC Score:", roc_auc),
):
    print(_metric_label, _metric_value)

# Then the confusion matrix, preceded by a blank line.
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.9487179487179487
Precision: 1.0
Recall: 0.9310344827586207
F1 Score: 0.9642857142857143
ROC AUC Score: 0.993103448275862

Confusion Matrix:
[[10  0]
 [ 2 27]]


##### Predictive System

In [13]:
# One raw (un-standardized) observation of 22 values.
# NOTE(review): assumed to follow the same column order as the features the
# scaler was fitted on — confirm against the training columns.
input_data = (139.224,586.567,66.157,0.03011,0.00022,0.01854,0.01628,0.05563,0.09419,0.93,0.05551,0.05005,0.06023,0.16654,0.2593,10.489,0.596362,0.641418,-3.269487,0.270641,2.690917,0.444774)
# Convert input data to a NumPy array
input_data_as_np_array = np.asarray(input_data)

# Reshape to a single-row 2-D array (one sample, all features)
input_data_reshaped = input_data_as_np_array.reshape(1, -1)

# A scaler fitted on a DataFrame records the column names it saw. Label the
# row with those names so scikit-learn does not warn about missing feature
# names, and so a wrong number of input values fails here with a clear error
# instead of further downstream. Falls back to the bare array when the scaler
# carries no recorded names.
feature_names = getattr(scaler, 'feature_names_in_', None)
if feature_names is not None:
    input_data_reshaped = pd.DataFrame(input_data_reshaped, columns=feature_names)

# Standardize the input data using the same scaler
std_data = scaler.transform(input_data_reshaped)

# Make predictions
prediction = best_model_fine.predict(std_data)
# `predict` returns one label per input row; read the single label explicitly
# rather than relying on the truthiness of a one-element array comparison.
if prediction[0] == 0:
    print("Does not have Parkinson's Disease")
else:
    print("Has Parkinson's Disease")


Has Parkinson's Disease




##### dumping model to save it for later use

In [14]:
import joblib
# Persist the fitted scaler alongside the model: the model was trained on
# standardized features, so new raw inputs must go through the same scaler
# before `predict`. The model dump stays last so the cell output is unchanged.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(best_model_fine, 'best_model.pkl')
# To reuse the model later without retraining (joblib files are pickles, so
# only load files you trust):
#loaded_model = joblib.load('best_model.pkl')
#loaded_scaler = joblib.load('scaler.pkl')

['best_model.pkl']