In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
import os

In [None]:
# Read the Parkinson's voice-measurement dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='parkinsons.csv')
# A bare trailing expression is rendered with the notebook's rich display.
df

Attribute Information: Target column - status

Matrix column entries (attributes):
name - ASCII subject name and recording number
MDVP:Fo(Hz) - Average vocal fundamental frequency
MDVP:Fhi(Hz) - Maximum vocal fundamental frequency
MDVP:Flo(Hz) - Minimum vocal fundamental frequency
MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several measures of variation in fundamental frequency
MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude
NHR,HNR - Two measures of ratio of noise to tonal components in the voice

status - Health status of the subject: one (1) - Parkinson's, zero (0) - healthy

RPDE,D2 - Two nonlinear dynamical complexity measures
DFA - Signal fractal scaling exponent
spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation


In [None]:
# Pandas Profiling report: automated per-column overview of the raw frame.
# NOTE(review): newer releases of this package are published as `ydata_profiling`;
# confirm which name the target environment provides.

import pandas_profiling as pf

profile_report = pf.ProfileReport(df)
display(profile_report)


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
n_rows, n_columns = df.shape
display((n_rows, n_columns))

# Number of rows on its own.
print(n_rows)


In [None]:
# Data type of every column.
column_types = df.dtypes
display(column_types)


In [None]:
# Concise summary of the frame: index range, columns, non-null counts, dtypes, memory.
# DataFrame.info() writes the summary to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.

df.info()


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = df.describe()
display(summary_stats)


In [None]:
# Count of missing values in each column.
missing_per_column = df.isnull().sum()
display(missing_per_column)


In [None]:

# Column labels of the frame.
column_index = df.columns
print(column_index)



In [None]:
# The dependent (target) variable.
# status - health status of the subject: (one) - Parkinson's, (zero) - healthy
target = df['status']
print(target)


In [None]:
# Histogram of the target column `status`.
# The dataset has a high number of patients affected by Parkinson's disease.

plt.figure(figsize=(10, 6))
df['status'].hist()
plt.xlabel('Status')
plt.ylabel('Frequencies')
# No extra plt.plot() call: without arguments it draws nothing.
plt.show()


In [None]:

# Bar graph: x-axis = status, y-axis = NHR.
# Observation recorded with this plot: the patients affected with Parkinson's disease
# have high NHR, which is the measure of the ratio of noise to tonal components in the voice.
nhr_fig, nhr_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x="status", y="NHR", ax=nhr_ax);
plt.show()


In [None]:
# Bar graph: x-axis = status, y-axis = HNR.
# Observation recorded with this plot: the patients affected with Parkinson's disease
# have high HNR, that is the measure of the ratio of noise to tonal components in the voice.
# NOTE(review): HNR conventionally stands for harmonics-to-noise ratio, where a noisier
# voice gives a *lower* value - confirm the direction of this claim against the plot.
hnr_fig, hnr_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x="status", y="HNR", ax=hnr_ax);
plt.show()


In [None]:

# Bar graph: x-axis = status, y-axis = RPDE.
# Observation recorded with this plot: the nonlinear dynamical complexity measure RPDE
# is high in the patients affected with Parkinson's disease.
rpde_fig, rpde_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x="status", y="RPDE", ax=rpde_ax);
plt.show()


In [None]:
# Distribution plots - used to check skewness in every numeric column.
#
# The grid is sized from the data instead of a fixed 3 x 7 walk over column positions
# 1..21, so no numeric column is left out and a narrower frame cannot raise IndexError.
# Selecting by dtype (rather than skipping position 0) also keeps the cell working
# whether or not the text `name` column is still present.
# histplot(kde=True, stat="density") is the replacement for the deprecated distplot,
# so no blanket warnings filter is installed here; later cells keep their warnings.

numeric_cols = df.select_dtypes(include="number").columns
cols = 7
rows = max(1, int(np.ceil(len(numeric_cols) / cols)))

fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 2.5 * rows), squeeze=False)
panels = ax.ravel()

for panel, column in zip(panels, numeric_cols):
    sns.histplot(df[column], kde=True, stat="density", ax=panel)

# Hide the unused cells of the last grid row.
for panel in panels[len(numeric_cols):]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Correlation matrix of the numeric features.
# The frame still holds the text `name` column at this point; selecting the numeric
# columns first keeps corr() from failing on non-numeric data (recent pandas versions
# no longer drop such columns silently).
corr = df.select_dtypes(include='number').corr()
display(corr)


In [None]:
# Annotated heat map of the correlation matrix.

# Enlarge the default figure size (a global matplotlib setting, so it also applies
# to figures created after this cell).
plt.rcParams['figure.figsize'] = (20, 10)

axis_labels = corr.columns
sns.heatmap(
    corr,
    annot=True,
    cmap='cubehelix',
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)
plt.show()


In [None]:

# Heat map of the correlation matrix with seaborn's default parameters.

# Same enlarged default figure size as a global matplotlib setting.
plt.rcParams.update({'figure.figsize': (20, 10)})
sns.heatmap(data=corr)
plt.show()



In [None]:
# Drop the `name` column: it holds the subject name and recording number, which is an
# identifier and not a feature for the machine learning algorithms.
# Rebinding the result (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['name'], errors='ignore')
display(df)


In [None]:
# Splitting the dataset into features (X) and target (Y).

target_column = 'status'

# Y: the target column.
Y = df[target_column]
# X: every remaining column.
X = df.drop(columns=[target_column])

display(X.head())
display(Y.head())



In [None]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle so the
# split is the same on every run.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=40)
X_train, X_test, Y_train, Y_test = split_parts

# Shapes of the full data and of each part of the split.
print(X.shape, Y.shape)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)



In [None]:
# Logistic Regression model

# The features are fed in unscaled, and the solver's default cap of 100 iterations is
# typically too low for it to converge on such data; raise the cap so the reported
# scores come from a converged fit.
# NOTE(review): if a ConvergenceWarning still appears, scale the features first.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, Y_train)

# Predict on train and report the accuracy.
train_preds = log_reg.predict(X_train)
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds))

# Predict on test and report the accuracy.
test_preds = log_reg.predict(X_test)
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds))
print('-'*50)

# Confusion matrices (rows = true class, columns = predicted class).
print("confusion_matrix train is:\n ", confusion_matrix(Y_train, train_preds))
print("confusion_matrix test is:\n ", confusion_matrix(Y_test, test_preds))

# Per-class precision / recall / F1.
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds))


In [None]:
# Random Forest model

# random_state pins the forest's bootstrap sampling and feature selection so that the
# scores below (and the kappa / pickle cells that reuse test_preds2) are reproducible;
# the value matches the seed used for the train/test split.
RF = RandomForestClassifier(random_state=40).fit(X_train, Y_train)

# Predict on train and report the accuracy.
train_preds2 = RF.predict(X_train)
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds2))

# Predict on test and report the accuracy.
test_preds2 = RF.predict(X_test)
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds2))

# Confusion matrices (rows = true class, columns = predicted class).
print("confusion_matrix train is:\n ", confusion_matrix(Y_train, train_preds2))
print("confusion_matrix test is:\n ", confusion_matrix(Y_test, test_preds2))

# Per-class precision / recall / F1.
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds2))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds2))


In [None]:

# Wrong predictions made by the Random Forest on the test set, as "<wrong> / <total>".

mismatches = (Y_test != test_preds2).sum()
matches = (Y_test == test_preds2).sum()
print(mismatches, '/', matches + mismatches)


In [None]:
# Cohen's kappa between the true test labels and the Random Forest predictions.
kappa_rf = metrics.cohen_kappa_score(Y_test, test_preds2)
print('KappaScore is: ', kappa_rf)


# What is the meaning of the kappa value?

The higher the kappa value, the stronger the degree of agreement between the predicted and the actual labels:
- Kappa = 1: perfect agreement exists.
- Kappa < 0: agreement is weaker than expected by chance; this rarely happens.
- Kappa close to 0: the degree of agreement is the same as would be expected by chance.


In [None]:
# Predicted and actual test labels, one labelled row each.
# Y_test is converted to a plain array so both rows are lined up by position (the
# predict() output carries no index), and the row labels say which row is which
# instead of a bare 0 / 1.

ddf = pd.DataFrame(
    data=[test_preds2, Y_test.to_numpy()],
    index=['predicted', 'actual'],
)
display(ddf)




In [None]:
 #Transpose and display

# Flip the frame so that each test sample becomes one row.
transposed = ddf.transpose()
display(transposed)


In [None]:
# Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# Fit on the training split only. Fitting on the full X, Y lets the tree see the test
# rows during training, which makes the "test" scores below meaningless.
# random_state makes the fitted tree repeatable across runs; the value matches the
# seed used for the train/test split.
DT = DecisionTreeClassifier(random_state=40).fit(X_train, Y_train)

# Predict on train and report the accuracy.
train_preds3 = DT.predict(X_train)
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds3))

# Predict on test and report the accuracy.
test_preds3 = DT.predict(X_test)
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds3))
print('-'*50)

# Confusion matrices (rows = true class, columns = predicted class).
print("confusion_matrix train is:\n ", confusion_matrix(Y_train, train_preds3))
print("confusion_matrix test is: \n", confusion_matrix(Y_test, test_preds3))

# The label is now followed by the count it announces.
print('Wrong predictions out of total:', (Y_test != test_preds3).sum(), '/', len(Y_test))
print('-'*50)

# Per-class precision / recall / F1.
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds3))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds3))


In [None]:
# Wrong predictions and kappa score for the Decision Tree.

# Misclassified test rows, printed as "<wrong> / <total>".
wrong_mask = Y_test != test_preds3
n_wrong = wrong_mask.sum()
n_right = (~wrong_mask).sum()
print(n_wrong, '/', n_right + n_wrong)
print('-' * 50)

# Cohen's kappa between the true and the predicted test labels.
print('KappaScore is: ', metrics.cohen_kappa_score(Y_test, test_preds3))


In [None]:
# Naive Bayes algorithm (Gaussian)

from sklearn.naive_bayes import GaussianNB

# Fit the model on the training split.
NB = GaussianNB()
NB.fit(X_train, Y_train)

# Predict on train and report the accuracy.
train_preds4 = NB.predict(X_train)
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds4))

# Predict on test and report the accuracy.
test_preds4 = NB.predict(X_test)
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds4))
print('-'*50)

# Confusion matrices (rows = true class, columns = predicted class).
print("confusion_matrix train is: \n", confusion_matrix(Y_train, train_preds4))
print("confusion_matrix test is:\n ", confusion_matrix(Y_test, test_preds4))

# The label is now followed by the count it announces.
print('Wrong predictions out of total:', (Y_test != test_preds4).sum(), '/', len(Y_test))
print('-'*50)

# Per-class precision / recall / F1.
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds4))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds4))


In [None]:
# Wrong predictions and kappa score for Naive Bayes.

# Misclassified test rows, printed as "<wrong> / <total>".
nb_wrong = (Y_test != test_preds4).sum()
nb_right = (Y_test == test_preds4).sum()
print(nb_wrong, '/', nb_right + nb_wrong)
print('-' * 50)

# Cohen's kappa between the true and the predicted test labels.
nb_kappa = metrics.cohen_kappa_score(Y_test, test_preds4)
print('KappaScore is: ', nb_kappa)


In [None]:
# K Neighbours Classifier

from sklearn.neighbors import KNeighborsClassifier

# Fit the model on the training split (default number of neighbours).
# NOTE(review): the features are used unscaled, so columns with large numeric ranges
# may dominate the distance - consider scaling before comparing models.
KNN = KNeighborsClassifier().fit(X_train, Y_train)

# Predict on train and report the accuracy.
train_preds5 = KNN.predict(X_train)
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds5))

# Predict on test and report the accuracy.
test_preds5 = KNN.predict(X_test)
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds5))
print('-'*50)

# Confusion matrices (rows = true class, columns = predicted class).
print("confusion_matrix train is:\n ", confusion_matrix(Y_train, train_preds5))
print("confusion_matrix test is:\n ", confusion_matrix(Y_test, test_preds5))

# The label is now followed by the count it announces.
print('Wrong predictions out of total:', (Y_test != test_preds5).sum(), '/', len(Y_test))
print('-'*50)

# Per-class precision / recall / F1.
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds5))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds5))



In [None]:
# Wrong predictions and kappa score for the K Neighbours classifier.

# Misclassified test rows, printed as "<wrong> / <total>".
knn_errors = Y_test != test_preds5
knn_hits = Y_test == test_preds5
print(knn_errors.sum(), '/', knn_hits.sum() + knn_errors.sum())

print('-' * 50)
# Cohen's kappa between the true and the predicted test labels.
print('KappaScore is: ', metrics.cohen_kappa_score(Y_test, test_preds5))


In [None]:
# Support Vector Machine

from sklearn.svm import SVC

# Fit a linear-kernel SVM on the training split.
SVM = SVC(kernel='linear')
SVM.fit(X_train, Y_train)

# Predict on train and report the accuracy.
train_preds6 = SVM.predict(X_train)
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds6))

# Predict on test and report the accuracy.
test_preds6 = SVM.predict(X_test)
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds6))
print('-'*50)

# Confusion matrices (rows = true class, columns = predicted class).
print("confusion_matrix train is: \n", confusion_matrix(Y_train, train_preds6))
print("confusion_matrix test is:\n ", confusion_matrix(Y_test, test_preds6))

# The label is now followed by the count it announces.
print('Wrong predictions out of total:', (Y_test != test_preds6).sum(), '/', len(Y_test))
print('-'*50)

# Recall on the test set for the positive class (status = 1).
print("recall", metrics.recall_score(Y_test, test_preds6))
print('-'*50)

# Per-class precision / recall / F1.
print('\nClassification Report Train is ')
print(classification_report(Y_train, train_preds6))
print('\nClassification Report Test is ')
print(classification_report(Y_test, test_preds6))


In [None]:
# Wrong predictions and kappa score for the Support Vector Machine.

# Misclassified test rows, printed as "<wrong> / <total>".
svm_wrong = (Y_test != test_preds6).sum()
svm_total = (Y_test == test_preds6).sum() + svm_wrong
print(svm_wrong, '/', svm_total)
print('-' * 50)
# Cohen's kappa between the true and the predicted test labels.
svm_kappa = metrics.cohen_kappa_score(Y_test, test_preds6)
print('KappaScore is: ', svm_kappa)


In [None]:
# Create pickle file

import pickle

# Saving model to disk. The context manager closes (and flushes) the file handle,
# which a bare open() call leaves to the garbage collector.
with open('deploy_SVM.pkl', 'wb') as model_file:
    pickle.dump(SVM, model_file)

# Open the pickle file.
# NOTE: unpickling executes code stored in the file - only load pickles from a trusted
# source, such as the file written just above.
with open('deploy_SVM.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Prediction with the reloaded model as a round-trip check.
print(model.predict(X_train))
