In [1]:
# import the libraries 
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pickle
import warnings

#### Attribute Information:
##### Matrix column entries (attributes):
* **name** - ASCII subject name and recording number
* **MDVP:Fo(Hz)** - Average vocal fundamental frequency
* **MDVP:Fhi(Hz)** - Maximum vocal fundamental frequency
* **MDVP:Flo(Hz)** - Minimum vocal fundamental frequency
* **MDVP:Jitter(%), MDVP:Jitter(Abs), MDVP:RAP, MDVP:PPQ, Jitter:DDP** - Several measures of variation in fundamental frequency
* **MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA** - Several measures of variation in amplitude
* **NHR, HNR** - Two measures of the ratio of noise to tonal components in the voice
* **status** - The health status of the subject (one) - Parkinson's, (zero) - healthy
* **RPDE, D2** - Two nonlinear dynamical complexity measures
* **DFA** - Signal fractal scaling exponent
* **spread1,spread2,PPE** - Three nonlinear measures of fundamental frequency variation

In [2]:
parkinsons_data = pd.read_csv("parkinsons.csv")
parkinsons_data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [3]:
parkinsons_data.shape

(195, 24)

In [4]:
parkinsons_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

There is no missing value in any of the columns.

In [5]:
parkinsons_data.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,...,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,154.228641,197.104918,116.324631,0.00622,4.4e-05,0.003306,0.003446,0.00992,0.029709,0.282251,...,0.046993,0.024847,21.885974,0.753846,0.498536,0.718099,-5.684397,0.22651,2.381826,0.206552
std,41.390065,91.491548,43.521413,0.004848,3.5e-05,0.002968,0.002759,0.008903,0.018857,0.194877,...,0.030459,0.040418,4.425764,0.431878,0.103942,0.055336,1.090208,0.083406,0.382799,0.090119
min,88.333,102.145,65.476,0.00168,7e-06,0.00068,0.00092,0.00204,0.00954,0.085,...,0.01364,0.00065,8.441,0.0,0.25657,0.574282,-7.964984,0.006274,1.423287,0.044539
25%,117.572,134.8625,84.291,0.00346,2e-05,0.00166,0.00186,0.004985,0.016505,0.1485,...,0.024735,0.005925,19.198,1.0,0.421306,0.674758,-6.450096,0.174351,2.099125,0.137451
50%,148.79,175.829,104.315,0.00494,3e-05,0.0025,0.00269,0.00749,0.02297,0.221,...,0.03836,0.01166,22.085,1.0,0.495954,0.722254,-5.720868,0.218885,2.361532,0.194052
75%,182.769,224.2055,140.0185,0.007365,6e-05,0.003835,0.003955,0.011505,0.037885,0.35,...,0.060795,0.02564,25.0755,1.0,0.587562,0.761881,-5.046192,0.279234,2.636456,0.25298
max,260.105,592.03,239.17,0.03316,0.00026,0.02144,0.01958,0.06433,0.11908,1.302,...,0.16942,0.31482,33.047,1.0,0.685151,0.825288,-2.434031,0.450493,3.671155,0.527367


In [6]:
# distribution of target variable 
parkinsons_data['status'].value_counts()

status
1    147
0     48
Name: count, dtype: int64

**1** -> Parkinson's Positive

**0** -> Healthy

In [7]:
parkinsons_data['name'].duplicated().sum()

0

There is no duplicated name. Hence we can drop the **"name"** column

In [8]:
X = parkinsons_data.drop(columns=['name','status'], axis=1)
y = parkinsons_data['status']

In [9]:
# split the dataset into training and testing sets with 20% of testings
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2)

In [10]:
print(X_train.shape, X_test.shape)

(156, 22) (39, 22)


In [11]:
# Data Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Model Training

#### 1.Support Vector Machine 

In [12]:
model_svm = SVC()

param_grid = {
    'C': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.01,0.1,1] 
}

grid_search = GridSearchCV(model_svm, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

# the best SVM model
best_svm_model = grid_search.best_estimator_

# Predict using the best model
y_pred = best_svm_model.predict(X_test)

# Print the best params and evaluate the predictions
print("Best hyperparameter:", grid_search.best_params_)
accuracy_svm = accuracy_score(y_test, y_pred)
print("Accuracy score of the test data:", accuracy_svm)

Best hyperparameter: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Accuracy score of the test data: 0.9230769230769231


* **C: Regularization parameter**. A smaller value of C creates a wider margin but may lead to more misclassifications, while a larger value of C creates a narrower margin but may lead to overfitting¶
* **kernel:** Type of kernel function used to transform the data into a higher-dimensional space where it can be linearly separated. Common choices include linear, polynomial, and radial basis function (RBF) kernels
* **gamma:** Kernel coefficient for RBF, polynomial, and sigmoid kernels. A smaller value of gamma creates a larger influence radius for the support vectors, while a larger value of gamma creates a smaller influence radius and can lead to overfitting
* **degree:** Degree of the polynomial kernel function. Only used for polynomial kernel

#### 2. K-Nearest Neighbors (KNN)

In [13]:
model_knn = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [3, 5, 7, 9],
}

grid_search = GridSearchCV(model_knn, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

# the best SVM model
best_knn_model = grid_search.best_estimator_

# Predict using the best model
y_pred = best_knn_model.predict(X_test)

# Print the best params and evaluate the predictions
print("Best hyperparameter:", grid_search.best_params_)
accuracy_knn = accuracy_score(y_test, y_pred)
print("Accuracy score of the test data:", accuracy_knn)

Best hyperparameter: {'n_neighbors': 3}
Accuracy score of the test data: 0.8205128205128205


#### 3. Random Forest Classifier

In [14]:
model_rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(model_rf, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

# the best SVM model
best_rf_model = grid_search.best_estimator_

# Predict using the best model
y_pred = best_rf_model.predict(X_test)

# Print the best params and evaluate the predictions
print("Best hyperparameter:", grid_search.best_params_)
accuracy_rf = accuracy_score(y_test, y_pred)
print("Accuracy score of the test data:", accuracy_rf)

Best hyperparameter: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 50}
Accuracy score of the test data: 0.7948717948717948


#### 4. Decision Tree Classifier

In [15]:
model_dt = DecisionTreeClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3]
}

grid_search = GridSearchCV(model_dt, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

# the best SVM model
best_dt_model = grid_search.best_estimator_

# Predict using the best model
y_pred = best_dt_model.predict(X_test)

# Print the best params and evaluate the predictions
print("Best hyperparameter:", grid_search.best_params_)
accuracy_dt = accuracy_score(y_test, y_pred)
print("Accuracy score of the test data:", accuracy_dt)

Best hyperparameter: {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 4}
Accuracy score of the test data: 0.7435897435897436


#### 5. XGBoost classifier

In [16]:
model_xgb = XGBClassifier()

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(model_xgb, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

# the best SVM model
best_xgb_model = grid_search.best_estimator_

# Predict using the best model
y_pred = best_xgb_model.predict(X_test)

# Print the best params and evaluate the predictions
print("Best hyperparameter:", grid_search.best_params_)
accuracy_xgb = accuracy_score(y_test, y_pred)
print("Accuracy score of the test data:", accuracy_xgb)

  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


Best hyperparameter: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
Accuracy score of the test data: 0.8717948717948718


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


* **n_estimators:** This parameter controls the number of boosting rounds (trees) to build. A higher value generally improves the model's performance, but it also increases the risk of overfitting.

* **learning_rate:** Also known as the "shrinkage" factor, this parameter scales the contribution of each tree in the ensemble. A smaller learning rate means slower convergence but may lead to better generalization. It is often used in conjunction with a higher number of n_estimators.

* **max_depth:** The maximum depth of a tree in the ensemble. Increasing this value allows the model to learn more complex interactions but also increases the risk of overfitting. You should set this value based on the complexity of your data.

In [17]:
accuracy_dict = {'Support Vector Machine': accuracy_svm,
                 'K-Nearest Neighbors': accuracy_knn,
                 'Decision Tree Classifier': accuracy_dt, 
                 'Random Forest Classifier': accuracy_rf, 
                 'XGBoost classifier': accuracy_xgb}

df = pd.DataFrame(list(accuracy_dict.items()), columns=['Classifier', 'Accuracy'])
print(df)

                 Classifier  Accuracy
0    Support Vector Machine  0.923077
1       K-Nearest Neighbors  0.820513
2  Decision Tree Classifier  0.743590
3  Random Forest Classifier  0.794872
4        XGBoost classifier  0.871795


In [18]:
# Find the classifier with the highest accuracy
best_classifier = df[df['Accuracy'] == df['Accuracy'].max()]['Classifier'].values[0]
best_accuracy = df['Accuracy'].max()

print("Classifier with the highest accuracy:")
print(f"Classifier: {best_classifier}")
print(f"Accuracy: {best_accuracy}")

Classifier with the highest accuracy:
Classifier: Support Vector Machine
Accuracy: 0.9230769230769231


### Building a predictive system

In [19]:
input_data =(198.76400,396.96100,74.90400,0.00740,0.00004,0.00370,0.00390,0.01109,0.02296,0.24100,0.01265,0.01321,0.01588,0.03794,0.07223,19.02000,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306)

# changing the input data into numpy array
input_date_as_np_array = np.array(input_data)

#  reshape the array 
input_data_reshape = input_date_as_np_array.reshape(1,-1)

# standardize the data
std_data = scaler.transform(input_data_reshape)

prediction = best_svm_model.predict(std_data)
print(prediction)

if (prediction[0] == 0):
    print("The person does not have Parkinson's disease")
else:
    print("The person has Parkinson's disease")

[0]
The person does not have Parkinson's disease




### Saving the trained model

In [20]:
filename = 'ParkinsonsDiseasemodel.sav'
scaler_parkinsons_filename = "ParkinsonsDiseaseScaler.sav"
pickle.dump(best_svm_model, open(filename, 'wb'))
pickle.dump(scaler, open(scaler_parkinsons_filename, 'wb'))

In [21]:
# loading the saved model
loaded_model = pickle.load(open('ParkinsonsDiseasemodel.sav','rb'))
loaded_scaler = pickle.load(open("ParkinsonsDiseaseScaler.sav",'rb'))

In [22]:
input_data =(198.76400,396.96100,74.90400,0.00740,0.00004,0.00370,0.00390,0.01109,0.02296,0.24100,0.01265,0.01321,0.01588,0.03794,0.07223,19.02000,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306)

# changing the input data into numpy array
input_date_as_np_array = np.array(input_data)

#  reshape the array 
input_data_reshape = input_date_as_np_array.reshape(1,-1)

# standardize the data
std_data = loaded_scaler.transform(input_data_reshape)

prediction = loaded_model.predict(std_data)
print(prediction)

if (prediction[0] == 0):
    print("The person does not have Parkinson's disease")
else:
    print("The person has Parkinson's disease")

[0]
The person does not have Parkinson's disease




In [23]:
for x in X.columns:
    print(x)

MDVP:Fo(Hz)
MDVP:Fhi(Hz)
MDVP:Flo(Hz)
MDVP:Jitter(%)
MDVP:Jitter(Abs)
MDVP:RAP
MDVP:PPQ
Jitter:DDP
MDVP:Shimmer
MDVP:Shimmer(dB)
Shimmer:APQ3
Shimmer:APQ5
MDVP:APQ
Shimmer:DDA
NHR
HNR
RPDE
DFA
spread1
spread2
D2
PPE
