In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('Parkinsson disease.csv')

In [3]:
data = data.drop(['name'], axis=1)
df_corr = data.corr()
df_corr.head()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
MDVP:Fo(Hz),1.0,0.400985,0.596546,-0.118003,-0.382027,-0.076194,-0.112165,-0.076213,-0.098374,-0.073742,...,-0.094732,-0.021981,0.059144,-0.383535,-0.383894,-0.446013,-0.413738,-0.24945,0.17798,-0.372356
MDVP:Fhi(Hz),0.400985,1.0,0.084951,0.102086,-0.029198,0.097177,0.091126,0.09715,0.002281,0.043465,...,-0.003733,0.163766,-0.024893,-0.166136,-0.112404,-0.343097,-0.076658,-0.002954,0.176323,-0.069543
MDVP:Flo(Hz),0.596546,0.084951,1.0,-0.139919,-0.277815,-0.100519,-0.095828,-0.100488,-0.144543,-0.119089,...,-0.150737,-0.10867,0.210851,-0.3802,-0.400143,-0.050406,-0.394857,-0.243829,-0.100629,-0.340071
MDVP:Jitter(%),-0.118003,0.102086,-0.139919,1.0,0.935714,0.990276,0.974256,0.990276,0.769063,0.804289,...,0.746635,0.906959,-0.728165,0.27822,0.360673,0.098572,0.693577,0.385123,0.433434,0.721543
MDVP:Jitter(Abs),-0.382027,-0.029198,-0.277815,0.935714,1.0,0.922911,0.897778,0.922913,0.703322,0.716601,...,0.69717,0.834972,-0.65681,0.338653,0.441839,0.175036,0.735779,0.388543,0.310694,0.748162


In [4]:
# find highly correlated features and drop them
higly_correlated_features = set()

for feature_column in range(0,len(df_corr.columns)):
    if feature_column == 'status':
        continue
    feature_column_name = df_corr.columns[feature_column]
    for feature_row in range(0,len(df_corr.index)):
        feature_row_name = df_corr.index[feature_row]
        if feature_row_name == feature_column_name:
            continue
        corr_value = df_corr.iloc[feature_column][feature_row]
        if corr_value > 0.67:
            higly_correlated_features.add(feature_row_name)
print(higly_correlated_features)
data = data.drop(higly_correlated_features, axis=1)


{'MDVP:APQ', 'Jitter:DDP', 'PPE', 'Shimmer:APQ5', 'spread1', 'Shimmer:APQ3', 'MDVP:Jitter(%)', 'MDVP:RAP', 'MDVP:Shimmer(dB)', 'Shimmer:DDA', 'MDVP:Jitter(Abs)', 'NHR', 'MDVP:PPQ', 'MDVP:Shimmer'}


In [5]:
x = data.drop(['status'],axis = 1).values
y = data['status'].values

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [7]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, shuffle= True, random_state = 0)

### Random Forest Classifier

In [10]:
for est in range(5,60,5):
    for depth in range (5,20,2):
        classifier = RandomForestClassifier(n_estimators = est, max_depth = depth)
        classifier.fit(x_train,y_train)
        y_train_pred = classifier.predict(x_train)
        y_test_pred = classifier.predict(x_test)
        
        accuracy_train = accuracy_score(y_train,y_train_pred)
        accuracy_test = accuracy_score(y_test,y_test_pred)
        # check for tuning parameters (looking for accuracy > 0.98 and difference between train and 
        #test < 0.5 to avoid most overfitting configurations)
        if accuracy_test > 0.95 and accuracy_train - accuracy_test < 0.5:
            print('est: ' + str(est) + ', depth: ' + str(depth))
            print('Accuracy\t\ttrain: %.4f , test: %.4f' %(accuracy_train,accuracy_test))
        

est: 10, depth: 9
Accuracy		train: 0.9936 , test: 0.9744
est: 10, depth: 17
Accuracy		train: 0.9808 , test: 0.9744
est: 10, depth: 19
Accuracy		train: 1.0000 , test: 0.9744
est: 15, depth: 11
Accuracy		train: 1.0000 , test: 0.9744
est: 15, depth: 17
Accuracy		train: 1.0000 , test: 0.9744
est: 20, depth: 11
Accuracy		train: 1.0000 , test: 0.9744
est: 20, depth: 15
Accuracy		train: 1.0000 , test: 1.0000
est: 25, depth: 5
Accuracy		train: 0.9872 , test: 0.9744
est: 25, depth: 9
Accuracy		train: 1.0000 , test: 0.9744
est: 25, depth: 11
Accuracy		train: 1.0000 , test: 0.9744
est: 30, depth: 13
Accuracy		train: 1.0000 , test: 0.9744
est: 30, depth: 15
Accuracy		train: 1.0000 , test: 0.9744
est: 30, depth: 17
Accuracy		train: 1.0000 , test: 0.9744
est: 35, depth: 9
Accuracy		train: 1.0000 , test: 0.9744
est: 35, depth: 17
Accuracy		train: 1.0000 , test: 0.9744
est: 35, depth: 19
Accuracy		train: 1.0000 , test: 0.9744
est: 40, depth: 11
Accuracy		train: 1.0000 , test: 0.9744
est: 40, depth: 13