# **PARKINSON'S CLASSIFIER USING VARIOUS MODELS**
The dataset used is [Parkinson Disease Detection](https://www.kaggle.com/datasets/debasisdotcom/parkinson-disease-detection), from Kaggle, uploaded by Debasis Samal.

In [60]:
# IMPORT LIBRARIES AND LOAD DATASET
import numpy as np  # conventional alias; not referenced directly in the cells visible here
import seaborn as sns  # statistical plots (distributions, boxplots, heatmaps, bar plot)
import matplotlib.pyplot as plt  # figures, subplots, histograms and the pie chart
import warnings # to ignore warnings
# NOTE: 'ignore' silences every warning category from here on, including
# deprecation warnings raised by the plotting and data libraries.
warnings.filterwarnings('ignore')
import pandas as pd # data loading and tabular processing
# Read the CSV into `df`, the frame every later cell works on.
# The relative path presumably follows the Kaggle "../input/<dataset>/" layout -- confirm when running elsewhere.
df = pd.read_csv("../input/parkinson-disease-detection/Parkinsson disease.csv")

In [61]:
# Preview the first 10 rows of the dataset
df.head(10)

In [62]:
# Report the dataset dimensions, then the dtype and non-null count of each attribute

print(f"Shape of the dataset:  {df.shape} \n\n")

df.info()

In [63]:
# Descriptive statistics, transposed so that each attribute is one row

df.describe().transpose()

For almost all of the columns, the mean is greater than the median (50%).
Because the mean exceeds the median, we can say that most of the columns are skewed to the right.




In [64]:
# Count the missing (null / NaN) values in every column

df.isnull().sum()

In [65]:
# Drop the 'name' column, which is treated as not significant for model building.
# `columns=` replaces the positional axis argument (rejected by pandas >= 2.0), and
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns='name', errors='ignore')

### Univariate analysis

In [66]:
# Histogram of every column, laid out on a 6 x 4 grid, to study the data distribution
plt.figure(figsize=(26,30))

# subplot positions are 1-based, hence enumerate(..., start=1)
for position, column in enumerate(df.columns, start=1):
    plt.subplot(6, 4, position)
    plt.hist(df[column], color='lightblue', edgecolor='black', alpha=0.5)
    plt.title(column)

### Bivariate analysis

In [67]:
# Overlaid distributions of spread1, one per value of the status column.
# sns.distplot is deprecated; histplot with kde=True and stat='density' (plus cut=3 for
# the KDE) is the replacement that keeps the same histogram-with-density-curve look.
# Each series carries a label so the legend tells the two groups apart.
spread1_groups = [
    (0, 'blue', 'status = 0 (normal)'),   # spread1 for who are normal
    (1, 'yellow', 'status = 1 (PD)'),     # spread1 for who have PD
]

for status_value, colour, legend_label in spread1_groups:
    sns.histplot(df.loc[df.status == status_value, 'spread1'],
                 kde=True, stat='density', kde_kws=dict(cut=3),
                 color=colour, label=legend_label)

plt.legend();

In [68]:
fig, ax = plt.subplots(1,2,figsize=(14,6))

# Side-by-side boxplots to compare NHR and HNR across the two status groups:
# left panel is status Vs NHR, right panel is status Vs HNR
for panel, measure in zip(ax, ['NHR', 'HNR']):
    sns.boxplot(x=df['status'], y=df[measure], ax=panel);

In [69]:
# Share of each class in the target column.
# value_counts() orders its result by frequency, not by status value, so a fixed label
# list would be attached to the wrong wedges whenever status 1 is the larger class.
# The counts are therefore sorted by status value, and each wedge's label and colour
# are looked up from its own status code.
status_counts = df.status.value_counts().sort_index()
wedge_style = {
    0: ('0(healthy)', 'lightgreen'),
    1: ("1(parkinson's)", 'yellow'),
}

plt.figure(figsize=(7,7))
plt.pie(status_counts,
        labels=[wedge_style[code][0] for code in status_counts.index],
        colors=[wedge_style[code][1] for code in status_counts.index],
        explode=[0, 0.02][:len(status_counts)],  # trimmed so a single-class frame still plots
        autopct='%1.0f%%');

**Ratio of Healthy Patients to Parkinson's Patients is 75:25**

In [70]:
# Annotated heatmap of the pairwise correlations between all attributes
fig, ax = plt.subplots(figsize=(20, 20))
correlation_matrix = df.corr()
ax = sns.heatmap(correlation_matrix, cmap="mako", square=True, annot=True, linewidth=0.2, ax=ax)

In [71]:
# Correlation of every attribute with the target column, largest value first
correlation_values = df.corr()['status']
correlation_values.sort_values(ascending=False).to_frame()

In [72]:
# Train-Test Split
from sklearn.model_selection import train_test_split

# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects
X = df.drop(columns='status')  # predictors
y = df.status                  # target attribute

# 70:30 split; the fixed random_state makes the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# print the shape of each of the four pieces as a sanity check
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

In [73]:
from sklearn.preprocessing import MinMaxScaler

rc = MinMaxScaler()  # rescales each feature using the min/max seen during fit

columns = list(X_train.columns)  # feature names, re-attached because scaling returns a bare array

# Learn the scaling parameters from the training data only ...
X_train_scaled = pd.DataFrame(rc.fit_transform(X_train), columns=columns)

# ... and apply those same parameters to the test data. Calling fit_transform here would
# re-fit the scaler on the test set, so train and test would be scaled differently and
# test-set statistics would leak into preprocessing.
X_test_scaled = pd.DataFrame(rc.transform(X_test), columns=columns)

In [74]:
# Logistic regression is a classification algorithm: it predicts a binary outcome
# (1 / 0, Yes / No, True / False) from a set of independent variables.

from sklearn.linear_model import LogisticRegression

# build the liblinear-solver model and train it on the scaled training data
# (fit returns the estimator itself, so construction and fitting are chained)
Logistic = LogisticRegression(solver="liblinear").fit(X_train_scaled, y_train)

# class predictions for the scaled test set
l_predict = Logistic.predict(X_test_scaled)

In [75]:
# test-set accuracy of the logistic regression model, as a percentage
acc_logistic_test = 100 * Logistic.score(X_test_scaled, y_test)

In [76]:
# Start the model-comparison table with the logistic regression accuracy
logistic_row = {'Model': ['Logistic Regression'], 'Accuracy': [acc_logistic_test]}
result_df = pd.DataFrame(logistic_row).drop_duplicates()
result_df

In [77]:
# Naive Bayes Classifier
# Naive Bayes assumes the predictors (input features) are independent of each other.

from sklearn.naive_bayes import GaussianNB # Gaussian variant of Naive Bayes, as all the columns are numerical

# build the Gaussian Naive Bayes model and train it on the scaled training data
n_model = GaussianNB().fit(X_train_scaled, y_train)

# class predictions for the scaled test set
n_predict = n_model.predict(X_test_scaled)

In [78]:
# test-set accuracy of the Naive Bayes model, as a percentage
acc_naive_test = 100 * n_model.score(X_test_scaled, y_test)

In [79]:
# Append the Naive Bayes accuracy to the comparison table (de-duplicated so re-runs add nothing)
naive_bayes_row = pd.DataFrame({'Model': ['Naive Bayes'], 'Accuracy': [acc_naive_test]})
result_df = pd.concat([result_df, naive_bayes_row]).drop_duplicates()
result_df

In [80]:
# KNN Classifier (K-Nearest Neighbours classifier)
from sklearn.neighbors import KNeighborsClassifier

# build the classifier with k = 5 neighbours and train it on the scaled training data
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# class predictions for the scaled test set
knn_predict = knn_model.predict(X_test_scaled)

In [81]:
# test-set accuracy of the KNN model, as a percentage
acc_knn_test = 100 * knn_model.score(X_test_scaled, y_test)

In [82]:
# Append the KNN accuracy to the comparison table (de-duplicated so re-runs add nothing)
knn_row = pd.DataFrame({'Model': ['KNN Scaled'], 'Accuracy': [acc_knn_test]})
result_df = pd.concat([result_df, knn_row]).drop_duplicates()
result_df



In [83]:
# A decision tree is a supervised learning algorithm.
# It represents the problem as a tree: each leaf node corresponds to a class label,
# and the attributes are tested at the internal nodes.

from sklearn.tree import DecisionTreeClassifier

# splits are chosen with the Gini criterion; depth is capped at 6 and the seed is fixed.
# The tree is trained on the scaled training data (fit returns the estimator itself).
decision_tree = DecisionTreeClassifier(criterion='gini', max_depth=6, random_state=100).fit(X_train_scaled, y_train)

# class predictions for the scaled test set
d_pred = decision_tree.predict(X_test_scaled)

In [84]:
# test-set accuracy of the decision tree, as a percentage
acc_DT_test = 100 * decision_tree.score(X_test_scaled, y_test)

In [85]:
# Append the decision tree accuracy to the comparison table.
# drop_duplicates() matches the other result cells: without it, re-running this cell
# adds a second identical 'Decision Tree' row. The accuracy is wrapped in a list so
# both columns are built the same way as in those cells.
tempResult_df = pd.DataFrame({'Model': ['Decision Tree'], 'Accuracy': [acc_DT_test]})
result_df = pd.concat([result_df, tempResult_df]).drop_duplicates()
result_df

In [86]:
# META CLASSIFIER PACKAGE
from mlxtend.classifier import StackingClassifier  # stacking classifier from the mlxtend package
# Stacking is a type of ensemble technique that combines multiple classifiers

In [87]:
from sklearn.svm import SVC  # importing SVM classifier

# Four individual (base) classification models.
# The tree is seeded so that repeated runs give the same cross-validation scores,
# in line with the seeded decision tree fitted earlier in the notebook.
model1 = DecisionTreeClassifier(criterion = 'entropy',max_depth = 6, random_state = 100)
model2 = KNeighborsClassifier(n_neighbors=5)
model3 = GaussianNB()
model4 = SVC(C = 10,gamma=0.01)

# logistic regression acts as the meta classifier/model on top of the base models
meta_model = LogisticRegression()

In [88]:
# Stack the four base models underneath the logistic-regression meta model
base_models = [model1, model2, model3, model4]
stcl = StackingClassifier(classifiers=base_models, meta_classifier=meta_model)

In [89]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# 10-fold cross-validated accuracy for each base model and for the stacked ensemble.
# Every estimator is wrapped in a pipeline with a MinMaxScaler so that, as in the rest of
# the notebook, the models see scaled features; fitting the scaler inside each training
# fold keeps the held-out fold out of the scaling parameters.
cv_scores = {}
for models, label in zip([model1, model2, model3, model4, stcl],
                         ['DecisionTreeClassifier', 'KNN', 'NaiveBayes', 'SVM', 'StackingClassifier']):
    cv_scores[label] = cross_val_score(make_pipeline(MinMaxScaler(), models), X, y, cv=10, scoring='accuracy')
    print(cv_scores[label], label)

# The results-table cell reads `scores`; bind the stacking result by name instead of
# relying on the stacking classifier happening to be the last loop iteration.
scores = cv_scores['StackingClassifier']

In [90]:
# Append the stacking classifier to the comparison table.
# `scores` is expected to hold the stacking classifier's cross-validation accuracies
# from the cross-validation cell; their mean is stored as a percentage.
# NOTE: this is a cross-validated mean, whereas the other rows are accuracies on the
# single held-out test split, so the figures are not measured the same way.
# drop_duplicates() matches the other result cells and keeps re-runs from adding
# a second identical row.
tempResult_df = pd.DataFrame({'Model': ['Stacking Classifier'], 'Accuracy': [scores.mean()*100]})
result_df = pd.concat([result_df, tempResult_df]).drop_duplicates()
result_df

In [91]:
# RANDOM FORESTS - ENSEMBLE TECHNIQUE
from sklearn.ensemble import RandomForestClassifier  # importing random forest classifier

# Random forest with scikit-learn's default hyperparameters (including the default
# number of trees). random_state is fixed so the forest, and therefore the accuracy
# reported below, is the same on every run.
rfcl = RandomForestClassifier(random_state=100)
rfcl = rfcl.fit(X_train_scaled, y_train)  # fitting the model

In [92]:
rfcl.score(X_test_scaled, y_test)  # accuracy on the test set only (a fraction, not a percentage)

In [93]:
# Importing classification report, confusion matrix and accuracy score from sklearn metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
# Class predictions of the default random forest for the scaled test set
rf_pred = rfcl.predict(X_test_scaled)

In [94]:
# Classification report of the default random forest on the test set
rf_report = classification_report(y_test, rf_pred)
print(rf_report)

In [95]:
# Accuracy of the random forest predictions against the actual values, as a percentage
acc_rf = 100 * accuracy_score(y_test, rf_pred)
print('Accuracy score of Random Forest Classifier: ',acc_rf,'%','\n')

# Confusion matrix wrapped in a frame that uses No/Yes for both the row and column labels
class_names = ['No', 'Yes']
cm = confusion_matrix(y_test, rf_pred)
df1 = pd.DataFrame(cm, index=class_names, columns=class_names)

print('\t\tConfusion matrix')
sns.heatmap(df1, annot=True, cbar=False);

In [96]:
# Show the random forest confusion-matrix counts as a table
df1

In [97]:
# Append the random forest accuracy to the comparison table (de-duplicated so re-runs add nothing)
random_forest_row = pd.DataFrame({'Model': ['Random Forest'], 'Accuracy': acc_rf})
result_df = pd.concat([result_df, random_forest_row]).drop_duplicates()
result_df

In [98]:
# GRID SEARCH CV FOR OPTIMAL HYPERPARAMETERS
from sklearn.model_selection import GridSearchCV

# Candidate values for each random forest hyperparameter (4 x 4 x 3 = 48 combinations)
param_grid = dict(
    max_depth=[2, 4, 8, 10],
    n_estimators=[50, 100, 200, 300],
    max_features=[5, 10, 15],
)

# Seeded base model that the search clones for every combination
rf = RandomForestClassifier(random_state=100)

# Exhaustive search scored by accuracy with 3-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy')

In [100]:
# Run the grid search on the scaled training data; the trailing ';' suppresses the estimator repr
grid_search.fit(X_train_scaled, y_train);

In [101]:
# Report the best cross-validated accuracy and the hyperparameters that produced it
best_cv_score = grid_search.best_score_
best_cv_params = grid_search.best_params_
print('Best Accuracy that can be achivied is', best_cv_score, 'using', best_cv_params)

In [102]:
# Build the tuned forest from whatever the grid search actually selected, rather than
# from values hard-coded after an earlier run (which go stale as soon as the data, grid
# or split changes). The seed matches the grid-search base model so the tuned forest is
# reproducible.
rf_tuned = RandomForestClassifier(random_state=100, **grid_search.best_params_)

In [103]:
# Train the tuned random forest on the scaled training data
rf_tuned.fit(X_train_scaled,y_train)

In [104]:
# Test-set accuracy of the tuned forest (a fraction), stored in the table as a percentage
rf_tuned_score = rf_tuned.score(X_test_scaled, y_test)
tuned_forest_row = pd.DataFrame({'Model': ['Random Forest Tuned'], 'Accuracy': rf_tuned_score*100})
result_df = pd.concat([result_df, tuned_forest_row]).drop_duplicates()
result_df

In [105]:
# Final comparison table of all the models evaluated so far
result_df

In [106]:
# Confusion matrix of the KNN test-set predictions.
# NOTE(review): the original heading calls KNN the "Best Performing ALgorithm";
# confirm against the accuracies in result_df before relying on that.
from sklearn import metrics
print('KNN')
knn_confusion = metrics.confusion_matrix(y_test, knn_predict)
pd.DataFrame(knn_confusion)

In [107]:
# PLOTTING MODEL ACCURACY
# One bar per model from the comparison table. x and y are passed by keyword together
# with data=, because seaborn >= 0.12 no longer accepts them as positional arguments.
fig = plt.figure(figsize=(12,5))
fig.suptitle('MODELS ACCURACY COMPARISION', size=12, style='italic')
sns.barplot(x='Model', y='Accuracy', data=result_df);