**PARKINSON'S DISEASE PREDICTOR**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# extracting the dataset
# Reads 'parkinsons.csv' from the current working directory into a DataFrame.
data = pd.read_csv('parkinsons.csv')

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# Heat-map to understand the correlation between the variables.
# Only numeric columns can be correlated. At this point the 'name' column is
# still in the frame (it is dropped later); if it holds text, DataFrame.corr()
# raises on pandas >= 2.0, so restrict the input to numeric dtypes explicitly.
plt.figure(figsize=(15,15))
correlation = data.select_dtypes(include='number').corr()
sns.heatmap(correlation, annot=True)
plt.title('HEAT-MAP', fontweight='bold')
plt.tight_layout()


In [None]:
# Name is not relevent when compared to other variables, so, let's remove Name from the dataframe
data.drop('name', axis=1, inplace=True)

In [None]:
data.head(10)

In [None]:
# let's rearrange the columns in the dataframe
data.columns

In [None]:
col = ['status','MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA',
       'spread1', 'spread2', 'D2', 'PPE']

In [None]:
re_arranged_data = data[col]

In [None]:
re_arranged_data.head()

In [None]:
re_arranged_data.info()

In [None]:
re_arranged_data.isnull().sum()

In [None]:
# Luckily none of the data is missing.

In [None]:
data['D2'].hist(bins=30, color='red')
plt.title('D2', fontweight='bold')
plt.tight_layout()

In [None]:
re_arranged_data.describe()

In [None]:
# Inspect the distribution of MDVP:Fo(Hz) to look for possible outliers.
# sns.distplot is deprecated; the axes-level replacement is histplot, and
# kde=True with stat='density' gives the equivalent density histogram with a
# KDE overlay on the figure created just above.
plt.figure(figsize=(8,8))
sns.set_theme(style='darkgrid')
sns.histplot(re_arranged_data['MDVP:Fo(Hz)'], kde=True, stat='density')
plt.tight_layout()

In [None]:
# MIN-88
# MAX-260

In [None]:
# Inspect the distribution of MDVP:Fhi(Hz) to look for possible outliers.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
plt.figure(figsize=(8,8))
sns.histplot(re_arranged_data['MDVP:Fhi(Hz)'], kde=True, stat='density')
plt.tight_layout()

In [None]:
# MIN-102
# MAX-592

In [None]:
# Inspect the distribution of MDVP:Flo(Hz) to look for possible outliers.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
plt.figure(figsize=(8,8))
sns.histplot(re_arranged_data['MDVP:Flo(Hz)'], kde=True, stat='density')
plt.tight_layout()

In [None]:
# MIN-65
# MAX-239

In [None]:
# Inspect the distribution of HNR to look for possible outliers.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
plt.figure(figsize=(8,8))
sns.set_theme(style='darkgrid')
sns.histplot(re_arranged_data['HNR'], kde=True, stat='density')
plt.tight_layout()

In [None]:
# MIN-8
# MAX-33

In [None]:
# There are no outliers

In [None]:
# Variance inflation factor (VIF) for each feature column
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Every column except the target is included in the VIF computation
vif_columns = [
    'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)',
    'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
    'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
    'MDVP:APQ', 'Shimmer:DDA',
    'NHR', 'HNR', 'RPDE', 'DFA',
    'spread1', 'spread2', 'D2', 'PPE',
]
variable = re_arranged_data[vif_columns]

# One VIF per column position of the underlying value matrix
design_matrix = variable.values
vif_scores = [
    variance_inflation_factor(design_matrix, position)
    for position in range(design_matrix.shape[1])
]

vif = pd.DataFrame({'VIF': vif_scores})
vif['features'] = variable.columns
vif

In [None]:
# All the variables are important and relevant

In [None]:
preprocessed_data = re_arranged_data

In [None]:
y = preprocessed_data['status']
X = preprocessed_data.drop('status',axis=1)

In [None]:
y

In [None]:
X

In [None]:
# let's scale the data into standard form

from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
X = scalar.fit_transform(X)

In [None]:
X

In [None]:
# let's split the data into train test and split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=365)

In [None]:
X_train.shape, X_test.shape

In [None]:
# let's instantiate one by one model
# Logistic regression is the first candidate; its random_state uses the same
# seed value (365) as the train/test split.
log_regression = LogisticRegression(random_state=365)
log_regression.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test split.
log_output = log_regression.predict(X_test)

In [None]:
# Show the raw predictions.
log_output

In [None]:
# Evaluate the logistic-regression predictions against the test labels:
# confusion matrix, overall accuracy and the per-class classification report.
cm = confusion_matrix(y_test, log_output)
acc = accuracy_score(y_test, log_output)
cl = classification_report(y_test, log_output)

print(f'''The confusion metrix is: 
{cm}''')
# Convert to a percentage before rounding: multiplying an already-rounded
# float by 100 can surface binary floating-point noise in the printed value.
print(f'''The accuracy score of the LOGISTIC REGRESSION IS: {round(acc * 100, 4)}%''')
print(f'''The classification report is: 
{cl}''')

In [None]:
# let's instantiate Randomforest

random_forest = RandomForestClassifier(n_estimators=100, criterion='gini',random_state=365)
random_forest.fit(X_train, y_train)

In [None]:
out_random_forest = random_forest.predict(X_test)

In [None]:
out_random_forest

In [None]:
# Evaluate the random-forest predictions against the test labels:
# confusion matrix, overall accuracy and the per-class classification report.
cm = confusion_matrix(y_test, out_random_forest)
acc = accuracy_score(y_test, out_random_forest)
cl = classification_report(y_test, out_random_forest)

print(f'''The confusion metrix is: 
{cm}''')
# Convert to a percentage before rounding: multiplying an already-rounded
# float by 100 can surface binary floating-point noise in the printed value.
print(f'''The accuracy score of the RANDOMFOREST CLASSIFIER IS: {round(acc * 100, 4)}%''')
print(f'''The classification report is: 
{cl}''')

In [None]:
# let's instantiate decision tree classifier

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

In [None]:
tree_output = tree.predict(X_test)

In [None]:
tree_output

In [None]:
# Evaluate the decision-tree predictions against the test labels:
# confusion matrix, overall accuracy and the per-class classification report.
cm = confusion_matrix(y_test, tree_output)
acc = accuracy_score(y_test, tree_output)
cl = classification_report(y_test, tree_output)

print(f'''The confusion metrix is: 
{cm}''')
# Convert to a percentage before rounding: multiplying an already-rounded
# float by 100 can surface binary floating-point noise in the printed value.
print(f'''The accuracy score of the DECISIONTREE CLASSIFIER IS: {round(acc * 100, 4)}%''')
print(f'''The classification report is: 
{cl}''')

In [None]:
# let's instantiate naive bayes
# GaussianNB is used with its default settings.

naive = GaussianNB()
naive.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test split.
naive_out = naive.predict(X_test)

In [None]:
# Show the raw predictions.
naive_out

In [None]:
# Evaluate the Gaussian naive-Bayes predictions against the test labels:
# confusion matrix, overall accuracy and the per-class classification report.
cm = confusion_matrix(y_test, naive_out)
acc = accuracy_score(y_test, naive_out)
cl = classification_report(y_test, naive_out)

print(f'''The confusion metrix is: 
{cm}''')
# Convert to a percentage before rounding: multiplying an already-rounded
# float by 100 can surface binary floating-point noise in the printed value.
print(f'''The accuracy score of the GAUSSIAN NB IS: {round(acc * 100, 4)}%''')
print(f'''The classification report is: 
{cl}''')

In [None]:
# let's instantiate KNeighbour classifier

k_neighbour = KNeighborsClassifier(n_neighbors=5, algorithm='auto',weights='uniform')
k_neighbour.fit(X_train, y_train)

In [None]:
k_neighbour_out = k_neighbour.predict(X_test)

In [None]:
k_neighbour_out

In [None]:
# Evaluate the k-nearest-neighbours predictions against the test labels:
# confusion matrix, overall accuracy and the per-class classification report.
cm = confusion_matrix(y_test, k_neighbour_out)
acc = accuracy_score(y_test, k_neighbour_out)
cl = classification_report(y_test, k_neighbour_out)

print(f'''The confusion metrix is: 
{cm}''')
# The label names the model actually being scored here (it previously said
# LOGISTIC REGRESSION). The percentage is computed before rounding so the
# printed value carries no floating-point noise.
print(f'''The accuracy score of the KNEIGHBOR CLASSIFIER IS: {round(acc * 100, 4)}%''')
print(f'''The classification report is: 
{cl}''')

In [None]:
all_features = [['LOGISTIC REGRESSION', '92.30%'], ['KNEIGHBOR CLASSIFIER','89.74%'], ['GUASSIAN NB', '71.79%'],
                ['DECISIONTREE CLASSIFIER', '89.74%'], ['RANDOMFOREST CLASSIFIER', '92.30%']]

In [None]:
all_features

In [None]:
data_frame = pd.DataFrame(all_features, columns=['ML MODEL','ACCURACY SCORE'])

In [None]:
# Rank the models best-first. The scores are stored as strings such as
# '92.30%', so sort on their numeric value; a plain string sort is
# lexicographic and would place '100.00%' below '90.00%'.
data_frame.sort_values(
    by='ACCURACY SCORE',
    ascending=False,
    key=lambda scores: scores.str.rstrip('%').astype(float),
)

In [None]:
def max_finder(frame, threshold=90.0):
    """Print every model whose accuracy is at least ``threshold`` percent.

    Args:
        frame: DataFrame whose first column holds the model name and whose
            second column holds the accuracy, either as a number or as a
            percentage string such as ``'92.30%'``.
        threshold: Minimum accuracy, in percent, for a row to be printed.
            Defaults to 90.0.

    Returns:
        None. Matching rows are printed as ``<model> <accuracy>``, one per line.
    """
    # Columns are read by position so the column labels do not matter.
    for model, score in zip(frame.iloc[:, 0], frame.iloc[:, 1]):
        # Compare numerically: comparing the raw strings is lexicographic,
        # so '100.00%' would wrongly rank below '90%'.
        if float(str(score).strip().rstrip('%')) >= threshold:
            print(model, score)

In [None]:
# Print the models in the summary table that pass the accuracy check inside max_finder
max_finder(data_frame)

In [None]:
# Persist the random-forest model chosen as the final predictor.

import joblib
joblib.dump(random_forest, 'parkinson_disease_prediction_model.joblib')

# The forest was trained on StandardScaler-transformed features, so the fitted
# scaler is saved alongside it; raw inputs must go through the same transform
# before being passed to the saved model.
joblib.dump(scalar, 'parkinson_disease_scaler.joblib')