In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier ,GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn import decomposition, ensemble
import tensorflow 
import xgboost
import pandas as pd
import numpy as np

Test-Train

In [2]:
df = pd.read_csv("parkinsons.data")
df.head(-5) #This data set belongs to the University of California. When it is analysed, the status of having Parkinson's and not having Parkinson's is determined in the "status" column.

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,0.00007,0.00370,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.400,148.650,113.819,0.00968,0.00008,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.335590,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.01050,0.00009,0.00544,0.00781,0.01633,0.05233,...,0.08270,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,0.00009,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.10470,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.332180,0.410335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,phon_R01_S49_3,116.286,177.291,96.983,0.00314,0.00003,0.00134,0.00192,0.00403,0.01564,...,0.02001,0.00737,24.199,0,0.598515,0.654331,-5.592584,0.133917,2.058658,0.214346
186,phon_R01_S49_4,116.556,592.030,86.228,0.00496,0.00004,0.00254,0.00263,0.00762,0.01660,...,0.02460,0.01397,23.958,0,0.566424,0.667654,-6.431119,0.153310,2.161936,0.120605
187,phon_R01_S49_5,116.342,581.289,94.246,0.00267,0.00002,0.00115,0.00148,0.00345,0.01300,...,0.01892,0.00680,25.023,0,0.528485,0.663884,-6.359018,0.116636,2.152083,0.138868
188,phon_R01_S49_6,114.563,119.167,86.647,0.00327,0.00003,0.00146,0.00184,0.00439,0.01185,...,0.01672,0.00703,24.775,0,0.555303,0.659132,-6.710219,0.149694,1.913990,0.121777


In [3]:
df["status"].value_counts() #Parkinson's disease status "1", healthy status "0".

status
1    147
0     48
Name: count, dtype: int64

In [4]:
target = df["status"]
x_train, x_test, y_train, y_test = model_selection.train_test_split(df[["MDVP:Fo(Hz)","MDVP:Fhi(Hz)","MDVP:Flo(Hz)","MDVP:Jitter(%)",
                                                                       "MDVP:Jitter(Abs)","MDVP:RAP","MDVP:PPQ","Jitter:DDP","MDVP:Shimmer","MDVP:Shimmer(dB)","Shimmer:APQ3",
                                                                       "Shimmer:APQ5","MDVP:APQ","Shimmer:DDA","NHR","HNR","RPDE","DFA","spread1","spread2","D2","PPE"]],df["status"],
                                                                       test_size=0.2,random_state=42,stratify=target)
# I wanted to make the training set 80% and the test set 20%.

MinMaxScaler

In [5]:
scaler = MinMaxScaler((-1,1))
x_scaler_train = scaler.fit_transform(x_train)
x_scaler_test = scaler.fit_transform(x_test)
#I chose to use "MinMaxScaler" instead of "StandardScaler" because I wanted to preserve the original distribution of the data. 

Logistic Regression

In [34]:
loj = linear_model.LogisticRegression(solver="liblinear")
loj_model = loj.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(loj_model, x_scaler_test, y_test, cv = 10).mean()
print("Logistic Regression Accuracy Rate:", accuracy)

Logistic Regression Accuracy Rate: 0.9


In [35]:
loj_params = {"penalty":["l1","l2"],
              "tol":[0.0001,0.001,0.01,0.1,0.00001],
              "C":[1,2,3,4,5,0.5,0.1],
              "intercept_scaling":[1,2,3,4,5,0.5,0.1],
              "max_iter": [100,200,500,50,25],}
loj_cv_model = GridSearchCV(loj,loj_params,cv=10,n_jobs=-1,verbose=2)
loj_cv_model.fit(x_scaler_train,y_train)

Fitting 10 folds for each of 2450 candidates, totalling 24500 fits


In [36]:
loj_cv_model.best_params_

{'C': 1, 'intercept_scaling': 5, 'max_iter': 100, 'penalty': 'l1', 'tol': 0.01}

In [37]:
loj_tuned = linear_model.LogisticRegression(solver="liblinear",C=1,intercept_scaling=5,max_iter=100,penalty="l1",tol=0.01)
loj_tuned = loj_tuned.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(loj_tuned, x_scaler_test, y_test, cv = 10).mean()
print("Logistic Regression Accuracy Rate:", accuracy)

Logistic Regression Accuracy Rate: 0.925


Naive Bayes

In [39]:
nb = naive_bayes.GaussianNB()
nb_model = nb.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(nb_model, x_scaler_test, y_test, cv=10).mean()
print("Naive Bayes Accuracy Rate:", accuracy)
#I did'n want to tune the model because the accuracy of the "NaiveBayes" model is very low.

Naive Bayes Accuracy Rate: 0.725


Support Vector Machine

In [40]:
svc =SVC(kernel= "linear")
svc_model = svc.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(svc_model, x_scaler_test, y_test, cv = 10).mean()
print("Support Vector Machine Accuracy Rate:", accuracy)

Support Vector Machine Accuracy Rate: 0.9


In [42]:
svc_params = {"C": np.arange(1,10),"kernel" : ["rbf","linear"],"gamma" :["scale","auto"],"tol" : np.arange(0.001,0.01,0.001)}
svc_cv_model = GridSearchCV(svc,svc_params,cv= 10, n_jobs=-1,verbose=2)
svc_cv_model.fit(x_scaler_train,y_train)

Fitting 10 folds for each of 324 candidates, totalling 3240 fits


In [43]:
svc_cv_model.best_params_

{'C': 9, 'gamma': 'scale', 'kernel': 'rbf', 'tol': 0.001}

In [44]:
svc_tuned = SVC( kernel= "rbf",C= 9, gamma= "scale", tol= 0.001)
svc_tuned_model = svc_tuned.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(svc_tuned_model, x_scaler_test, y_test, cv=10).mean()
print("Support Vector Machine Accuracy Rate:", accuracy)

Support Vector Machine Accuracy Rate: 0.925


Artificial Neural Networks

In [45]:
mlp = MLPClassifier()
mlp_model = mlp.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(mlp_model, x_scaler_test, y_test, cv = 10).mean()
print("Artificial Neural Networks Accuracy Rate:", accuracy)



Artificial Neural Networks Accuracy Rate: 0.8666666666666666




In [52]:
mlp_params = {"alpha": [0.0001,0.001,0.01,0.1,1],
               "hidden_layer_sizes" : [(1,100),(10,10),(3,5),(5,3)],
               "learning_rate": ["constant", "invscaling", "adaptive"],
               "learning_rate_init": [0.0001,0.001,0.01,0.1],
               "max_iter" : [200,500,700,900,1000]}
mlp_cv_model = GridSearchCV(mlp,mlp_params,cv= 5, n_jobs=-1,verbose=2)
mlp_cv_model.fit(x_scaler_train,y_train)

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


In [53]:
mlp_cv_model.best_params_

{'alpha': 0.0001,
 'hidden_layer_sizes': (10, 10),
 'learning_rate': 'invscaling',
 'learning_rate_init': 0.01,
 'max_iter': 700}

In [71]:
mlp_tuned = MLPClassifier(alpha= 0.0001,hidden_layer_sizes=(10,10),learning_rate= "invscaling",learning_rate_init=0.01,max_iter=700, solver= "lbfgs",tol=0.005)
mlp_tuned_model = mlp_model.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(mlp_tuned_model, x_scaler_test, y_test, cv=10).mean()
print("Artificial Neural Networks Accuracy Rate:", accuracy)



Artificial Neural Networks Accuracy Rate: 0.8416666666666666




Classification and Regression Trees (CART)

In [6]:
cart = DecisionTreeClassifier()
cart_model = cart.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(cart_model, x_scaler_test, y_test, cv = 10).mean()
print("Classification and Regression Trees Accuracy Rate:", accuracy)

Classification and Regression Trees Accuracy Rate: 0.7416666666666667


In [27]:
cart_params = {"max_depth": list(range(1,20)),
               "min_samples_split":list(range(2,10)),
               "min_samples_leaf": list(range(1,10)),
               "max_leaf_nodes": list(range(2,20)),}
cart_cv_model = GridSearchCV(cart,cart_params,cv=10,n_jobs= -1, verbose= 1)
cart_cv_model.fit(x_scaler_train,y_train)

Fitting 10 folds for each of 24624 candidates, totalling 246240 fits


In [28]:
cart_cv_model.best_params_

{'max_depth': 15,
 'max_leaf_nodes': 18,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [29]:
cart_tuned = DecisionTreeClassifier(max_depth= 15,max_leaf_nodes= 18,min_samples_leaf= 1,min_samples_split= 2)
cart_tuned_model = cart_tuned.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(cart_tuned_model, x_scaler_test, y_test, cv=10).mean()
print("Classification and Regression Trees Accuracy Rate:", accuracy)

Classification and Regression Trees Accuracy Rate: 0.825


Random Forest

In [30]:
rf = RandomForestClassifier()
rf_model = rf.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(rf_model, x_scaler_test, y_test, cv=10).mean()
print("Random Forest Accuracy Rate:", accuracy)

Random Forest Accuracy Rate: 0.875


In [32]:
rf_params = {"n_estimators" : [1000,1500,1700,1800,2000],
             "max_depth": list(range(19,40)),}
rf_cv_model = GridSearchCV(rf,rf_params,cv=10,n_jobs= -1, verbose= 2)
rf_cv_model.fit(x_scaler_train,y_train)

Fitting 10 folds for each of 105 candidates, totalling 1050 fits


In [33]:
rf_cv_model.best_params_

{'max_depth': 20, 'n_estimators': 1000}

In [38]:
rf_tuned = RandomForestClassifier(max_depth= 1,n_estimators= 1000)
rf_tuned_model = rf_tuned.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(rf_tuned_model, x_scaler_test, y_test, cv=10).mean()
print("Random Forest Accuracy Rate:", accuracy)

Random Forest Accuracy Rate: 0.875


Gradient Boosting Machines (GBM)

In [60]:
gbm = GradientBoostingClassifier()
gbm_model = gbm.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(gbm_model, x_scaler_test, y_test, cv=10).mean()
print("GBM Accuracy Rate:", accuracy)

GBM Accuracy Rate: 0.8


In [40]:
gbm_params= {"learning_rate": [0.1,0.2,0.4,0.6,0.8,1],
             "max_depth": [3,5,8,25,50],
             "n_estimators" : [100,200,500,1000],
             "min_samples_split": list(range(2,10)),}
gbm_cv_model = GridSearchCV(gbm,gbm_params,cv=5,n_jobs= -1, verbose= 2)
gbm_cv_model.fit(x_scaler_train,y_train)

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


In [41]:
gbm_cv_model.best_params_

{'learning_rate': 0.6,
 'max_depth': 3,
 'min_samples_split': 9,
 'n_estimators': 100}

In [59]:
gbm_tuned = GradientBoostingClassifier(max_depth=3,min_samples_split=2,learning_rate=0.1,n_estimators= 1000)
gbm_tuned_model = gbm_tuned.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(gbm_tuned_model, x_scaler_test, y_test, cv=10).mean()
print("GBM Accuracy Rate:", accuracy)

GBM Accuracy Rate: 0.825


XGBoost

In [61]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(xgb_model, x_scaler_test, y_test, cv=10).mean()
print("XGBoost Accuracy Rate:", accuracy)

XGBoost Accuracy Rate: 0.85


In [63]:
xgb_params = {"n_estimators" : [100,200,400,500],
              "subsample": [0.5,0.6,0.8,1],
              "max_depth": [6,10,15,20,25,35,50],
              "learning_rate": [0.1,0.2,1,0.05,0.02]}
xgb_cv_model = GridSearchCV(xgb,xgb_params,cv=5,n_jobs= -1, verbose= 2)
xgb_cv_model.fit(x_scaler_train,y_train)

Fitting 5 folds for each of 560 candidates, totalling 2800 fits


In [64]:
xgb_cv_model.best_params_

{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 400, 'subsample': 0.5}

In [66]:
xgb_tuned = xgboost.XGBClassifier(learning_rate = 0.1, max_depth = 6,n_estimators= 400, subsample =0.5)
xgb_tuned_model = xgb_tuned.fit(x_scaler_train,y_train)
accuracy = model_selection.cross_val_score(xgb_tuned_model, x_scaler_test, y_test, cv=10).mean()
print("XGBoost Accuracy Rate:", accuracy)

XGBoost Accuracy Rate: 0.9


Conclusion : "Support Vector Machine" gave the best result in this project and the accuracy score was 92.5%. This rate is satisfactory enough. Better results can be achieved by changing the parameters. 