In [None]:
import os
from pprint import pprint

import cv2
import matplotlib.pyplot as plt
import numpy as np
import PIL
import torchvision.transforms as T

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC


# Define constants

In [None]:
# Spiral-drawing splits (the ones loaded by the cells below).
train_dir = "/kaggle/input/parkinson-dataset/spiral/training/"
test_dir = "/kaggle/input/parkinson-dataset/spiral/testing/"

# Wave-drawing splits.
train_dir_wave = "/kaggle/input/parkinson-dataset/wave/training/"
test_dir_wave = "/kaggle/input/parkinson-dataset/wave/testing/"

# Target size, as (width, height), that every image is resized to.
IMG_WIDTH = 200
IMG_HEIGHT = 200
dim = (IMG_WIDTH, IMG_HEIGHT)

# Two classes: "healthy" and "parkinson".
CLASSES_NO = 2
print(f"CLASSES_NO = {CLASSES_NO}")

# Load and preprocess the dataset

In [None]:
def data_preprocessing(img):
    """Resize, grayscale and binarise a single drawing.

    Parameters
    ----------
    img : numpy.ndarray
        Image as returned by ``cv2.imread``: either a BGR colour array or an
        already single-channel (2-D) array.

    Returns
    -------
    numpy.ndarray
        2-D array resized to ``dim`` whose pixels are 0 or 255, with the
        drawing in white on a black background.

    Raises
    ------
    ValueError
        If ``img`` is ``None`` (``cv2.imread`` returns ``None`` for a file it
        cannot decode).
    """
    # Fail early with a readable message instead of an opaque OpenCV error.
    if img is None:
        raise ValueError("data_preprocessing received None: the image could not be read")

    image = cv2.resize(img, dim)

    # convert to grayscale (skipped when the input is already single-channel)
    if image.ndim == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # threshold the image such that the drawing appears as white
    # on a black background; Otsu picks the threshold, so the 0 is ignored
    image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

    return image

In [None]:
def fetch_data(root):
    """Load and preprocess every valid drawing below ``root``.

    ``root`` must contain the sub-folders ``healthy`` and ``parkinson``.
    Only files whose names begin with "V" belong to the dataset; the folders
    also hold unrelated files that must be ignored, for example:

        essential-tremor-spiral-60684c046495c7a3.png
        12883_2018_1027_Fig1_HTML.png
        gr1.jpg
        1200px-Spiral_drawing_of_Essential_Tremor_patient.svg.png
        1643638_orig.jpg

    Parameters
    ----------
    root : str
        Directory of one split (with or without a trailing separator).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Preprocessed images and their string labels. All "healthy" samples
        come first, then all "parkinson" samples; within a class the files
        are taken in sorted name order so the result is reproducible.

    Raises
    ------
    ValueError
        If a file that passes the name filter cannot be decoded.
    """
    X = []
    y = []

    for label in ("healthy", "parkinson"):
        class_dir = os.path.join(root, label)

        # os.listdir order is arbitrary; sort so sample indices are stable.
        for file_name in sorted(os.listdir(class_dir)):
            # Filter on the name before decoding, so unrelated files are never read.
            if not file_name.startswith("V"):
                continue

            file_path = os.path.join(class_dir, file_name)
            image = cv2.imread(file_path, cv2.IMREAD_COLOR)
            if image is None:
                raise ValueError("Could not read image: {}".format(file_path))

            X.append(data_preprocessing(image))
            y.append(label)

    return np.array(X), np.array(y)

# Take a quick look at the data

In [None]:
# Read and preprocess the spiral training and testing splits.
X_train_full, y_train_full = fetch_data(train_dir)
X_test, y_test = fetch_data(test_dir)

# Report the shape of every array that was just loaded.
for caption, loaded in (
    ("Training set: ", X_train_full),
    ("Training Labels : ", y_train_full),
    ("Test set: ", X_test),
    ("Test Labels : ", y_test),
):
    print(caption, loaded.shape)

In [None]:
# First preprocessed training image: a 2-D array of 0/255 pixels after thresholding.
X_train_full[0]

In [None]:
# Label of the first sample; fetch_data appends the "healthy" folder first.
y_train_full[0]

In [None]:
# Presumably a "parkinson" sample — assumes fewer than 51 healthy training images; TODO confirm.
y_train_full[50]

In [None]:
# Healthy
# Show the first ten training samples together with their labels.
for sample_idx in range(10):
    label = y_train_full[sample_idx]
    picture = X_train_full[sample_idx]
    print("Label: ", label)
    plt.imshow(picture, cmap='gray')
    plt.show()

In [None]:
# Parkinsons
# Show training samples 40..49 together with their labels.
for sample_idx in range(40, 50):
    label = y_train_full[sample_idx]
    picture = X_train_full[sample_idx]
    print("Label: ", label)
    plt.imshow(picture, cmap='gray')
    plt.show()

In [None]:
# encode the labels
# The encoder learns the classes from the training labels only and is then
# applied unchanged to both label arrays.
le = LabelEncoder().fit(y_train_full)
y_train_full = le.transform(y_train_full)
y_test = le.transform(y_test)

In [None]:
# Encoded label of the first sample; LabelEncoder numbers classes in sorted order, so "healthy" -> 0.
y_train_full[0] # Healthy

In [None]:
# Encoded label at index 50; "parkinson" -> 1 (assumes index 50 is a Parkinson's sample — TODO confirm).
y_train_full[50] # Parkinsons

# Data Augmentation

In [None]:
def data_augmentation(X_train_full, index) :
    """Create six augmented variants of the image ``X_train_full[index]``.

    The variants are returned as numpy arrays in this fixed order:
    horizontal flip, vertical flip, colour jitter, rotation, affine
    transform, perspective distortion. The last four draw random
    parameters, so repeated calls give different results.
    """
    # Flips and perspective use p=1 so they are always applied.
    pipeline = [
        T.RandomHorizontalFlip(p=1),
        T.RandomVerticalFlip(p=1),
        T.ColorJitter(brightness=0.5, contrast=1, saturation=0.1, hue=0.5),
        # rotation angle drawn from (min degree, max degree) = (0, 360)
        T.RandomRotation(degrees=(0, 360)),
        T.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75)),
        T.RandomPerspective(distortion_scale=0.6, p=1.0),
    ]

    # The transforms are applied to a PIL image, so convert the array first.
    source = PIL.Image.fromarray(X_train_full[index])

    # Every transform starts from the same untouched source image.
    return [np.array(step(source)) for step in pipeline]

def data_augmentation_visualize(X_train_full,index) : 
    """Plot ``X_train_full[index]`` next to its six augmented variants.

    Uses a 4x2 grid filled row by row: the original first, then the variants
    in the order returned by ``data_augmentation``; the eighth cell is hidden.
    """
    captions = [
        "Original Image",
        "After Horizontal Flip",
        "After Vertical Flip",
        "After Change of Brightness",
        "After Rotation",
        "After Affine",
        "After Change of Perspective",
    ]
    panels = [X_train_full[index]] + data_augmentation(X_train_full, index)

    fig, axes = plt.subplots(4, 2, figsize=(20, 20))
    cells = axes.ravel()  # row-major: [0,0], [0,1], [1,0], ...
    for cell, picture, caption in zip(cells, panels, captions):
        cell.imshow(picture, cmap='gray')
        cell.set_title(caption, fontsize=12)

    # Seven pictures on an eight-cell grid: hide the unused last cell.
    cells[-1].set_visible(False)
    plt.show()

In [None]:
# Visual sanity check of the augmentations on the first training image.
image_index = 0
data_augmentation_visualize(X_train_full,image_index)

In [None]:
def data_aug_dataset(X_train_full, y_train_full) :
    """Extend a dataset with the augmented variants of every image.

    Parameters
    ----------
    X_train_full : numpy.ndarray
        Images, indexed along the first axis.
    y_train_full : numpy.ndarray
        One label per image.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Images and labels. The originals come first in their input order,
        followed by the variants returned by ``data_augmentation`` for image
        0, then image 1, and so on; each variant carries its source's label.
    """
    # list(array) keeps each image as an ndarray; .tolist() would expand every
    # pixel into a Python int, which is slow and widens the result to int64.
    images = list(X_train_full)
    labels = list(y_train_full)

    for index in range(len(X_train_full)):
        for img in data_augmentation(X_train_full, index):
            images.append(img)
            labels.append(y_train_full[index])

    return np.array(images), np.array(labels)

In [None]:
# Originals plus six augmented variants per image (data_augmentation returns six).
X_train_full_augmented, y_train_full_augmented = data_aug_dataset(X_train_full, y_train_full)

In [None]:
# The first dimension should be 7x the original training-set size (1 original + 6 variants each).
print("Augmented Set: ", X_train_full_augmented.shape)
print("Augmented Set Labels: ", y_train_full_augmented.shape)

In [None]:
# Draw entries 72..83 of the augmented set, one figure each, without axes.
for position in range(72, 84):
    plt.figure(figsize=(5, 5))
    plt.imshow(X_train_full_augmented[position], cmap='gray')
    plt.axis('off')

In [None]:
# Split the dataset into training and validation set and shuffle
# NOTE(review): the split runs on the already-augmented set, so variants of one
# source drawing can land in both the training and the validation part. The
# validation scores are therefore likely optimistic; splitting before
# augmenting would avoid this.

X_train_augmented, X_valid_augmented, y_train_augmented, y_valid_augmented = train_test_split(X_train_full_augmented,y_train_full_augmented,
                                   random_state=104, 
                                   test_size=0.25, 
                                   shuffle=True)

In [None]:
# Report the shapes produced by the train/validation split.
for caption, part in (
    ("Training set: ", X_train_augmented),
    ("Training Labels : ", y_train_augmented),
    ("Validation set: ", X_valid_augmented),
    ("Validation Labels : ", y_valid_augmented),
):
    print(caption, part.shape)

In [None]:
# Flatten the data: one row per image, one column per pixel.
# The row width comes from the image-size constants rather than a hard-coded
# 40000, and reshape alone suffices (no intermediate flatten() copy).
N_PIXELS = IMG_WIDTH * IMG_HEIGHT
X_train_augmented = X_train_augmented.reshape(-1, N_PIXELS)
X_valid_augmented = X_valid_augmented.reshape(-1, N_PIXELS)
X_test = X_test.reshape(-1, N_PIXELS)

In [None]:
# First training row after flattening (raw pixel values, before scaling).
X_train_augmented[0]

In [None]:
# Scale

# Min-max scale every pixel column to [0, 1].
# The scaler is fitted on the training rows ONLY and then reused for the
# validation and test rows. Re-fitting on each split would scale them with
# different statistics and leak information from held-out data.
# clip=True keeps held-out values inside [0, 1] even for a pixel column that
# was constant in the training data.
scaler = MinMaxScaler(clip=True)
X_train_augmented = scaler.fit_transform(X_train_augmented)
X_valid_augmented = scaler.transform(X_valid_augmented)
X_test = scaler.transform(X_test)

In [None]:
# Same row after MinMaxScaler.
X_train_augmented[0]

In [None]:
# Final 2-D feature matrices fed to the models (rows = images, columns = pixels).
print("Training set: ", X_train_augmented.shape)
print("Validation set: ", X_valid_augmented.shape)
print("Test set: ", X_test.shape)

# Training Models

## Random Forest 1

In [None]:
# Baseline: random forest with default hyper-parameters and a fixed seed.
rnd_clf = RandomForestClassifier(random_state=42)
rnd_clf.fit(X_train_augmented, y_train_augmented)

In [None]:
# Look at parameters used by our current forest
# (these defaults are the starting point for the searches below)
print('Parameters currently in use:\n')
pprint(rnd_clf.get_params())

In [None]:
# accuracy on validation set
# (kept in rf1_score for the comparison bar chart at the end of the notebook)
rf1_score=rnd_clf.score(X_valid_augmented, y_valid_augmented)

In [None]:
# Make predictions for the test set
y_pred_rnd = rnd_clf.predict(X_test)

# accuracy for the test data
# (kept in rf1_acc for the test-accuracy bar chart at the end of the notebook)
rf1_acc=accuracy_score(y_test, y_pred_rnd)

## Random Search

In [None]:
# Candidate values sampled by the randomized hyper-parameter search.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is deprecated and removed in recent scikit-learn releases; for
# classifiers it was an alias of 'sqrt', so listing both only duplicated one
# candidate. 'log2' gives the search a second, genuinely different option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rnd = RandomForestClassifier(random_state=42)

# Random search of parameters, using 3 fold cross validation, 
# search across 50 different combinations (n_iter = 50), and use all available cores
rnd_random = RandomizedSearchCV(estimator = rnd, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [None]:
# Fit the random search model
# (50 candidates x 3 folds = 150 forest fits on the training split)
rnd_random.fit(X_train_augmented, y_train_augmented)

In [None]:
# Mean cross-validated score of the best candidate. This is a CV score on the
# training split, not a score on the held-out validation set.
rnd_search_score=rnd_random.best_score_
rnd_search_score

In [None]:
# Hyper-parameter combination that achieved the best CV score.
rnd_random.best_params_

In [None]:
# The corresponding estimator selected by the search.
rnd_random.best_estimator_

## Random Forest 2

In [None]:
# NOTE(review): these hyper-parameters are hard-coded — presumably copied from
# an earlier randomized-search result. They will not follow a re-run of the
# search; confirm they still match rnd_random.best_params_.
rnd_best = RandomForestClassifier(bootstrap=False, max_depth=10, max_features='sqrt',
                       min_samples_split=10, n_estimators=2000,
                       random_state=42)

In [None]:
# Train the hard-coded "Random Forest 2" configuration on the training split.
rnd_best.fit(X_train_augmented, y_train_augmented)

In [None]:
# accuracy on validation set (used in the comparison bar chart)
rf2_score=rnd_best.score(X_valid_augmented, y_valid_augmented)


In [None]:
# Make predictions for the test set
y_pred_best = rnd_best.predict(X_test)

# accuracy for the test data
# (kept in rf2_acc for the test-accuracy bar chart)
rf2_acc=accuracy_score(y_test, y_pred_best)

In [None]:
# Create the grid
# Exhaustive grid: 3 * 2 * 4 * 3 * 1 * 1 = 72 combinations.
# NOTE(review): max_features of 2 or 3 is a tiny share of the 40000 pixel
# columns considered per split — confirm this is intended.
param_grid = {'n_estimators': [600,1200,2000],
               'max_features': [2,3],
               'max_depth': [70,80,90,100],
               'min_samples_split': [8, 10, 12],
               'min_samples_leaf': [1],
               'bootstrap': [False]}

In [None]:
# First create the base model to tune
rnd_grid = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid with 3-fold cross-validation.
# n_jobs=-1 runs the candidate fits on all cores, as the randomized search
# above does; the selected model is unaffected, only the wall-clock time.
grid_search = GridSearchCV(rnd_grid, param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train_augmented, y_train_augmented)

In [None]:
# Estimator selected by the grid search.
grid_search.best_estimator_

In [None]:
# Mean cross-validated score of the best grid candidate (a CV score on the
# training split, not a score on the held-out validation set).
grid_score=grid_search.best_score_

## Random Forest 3

In [None]:
# Rebinds `rnd_grid`, which previously held the untuned base model passed to GridSearchCV.
# NOTE(review): hyper-parameters are hard-coded — presumably copied from an
# earlier grid_search.best_estimator_; confirm they match the current search.
rnd_grid = RandomForestClassifier(bootstrap=False, max_depth=70, max_features=3,
                       min_samples_split=12, n_estimators=1200,
                       random_state=42)

In [None]:
# Train "Random Forest 3"; this fitted model is also reused in the voting ensemble.
rnd_grid.fit(X_train_augmented, y_train_augmented)

In [None]:
# accuracy on validation set (used in the comparison bar chart)
rf3_score=rnd_grid.score(X_valid_augmented, y_valid_augmented)

In [None]:
# Make predictions for the test set
# (y_pred_grid is reused by the classification report / confusion matrix below)
y_pred_grid = rnd_grid.predict(X_test)

# accuracy for the test data
rf3_acc=accuracy_score(y_test, y_pred_grid)

## Random Forest 4

In [None]:
# NOTE(review): this rebinds `rnd_random`, which earlier held the fitted
# RandomizedSearchCV. After this cell, re-running the cells that read
# rnd_random.best_score_ / best_params_ / best_estimator_ fails unless the
# search is re-run first.
rnd_random = RandomForestClassifier(random_state=42, n_estimators = 600, 
                                  max_depth=40, min_samples_split = 3 , 
                                  min_samples_leaf = 1, max_features = "sqrt", bootstrap = False)

In [None]:
# Train "Random Forest 4" on the training split.
rnd_random.fit(X_train_augmented, y_train_augmented)


In [None]:
# accuracy on validation set (used in the comparison bar chart)
rf4_score=rnd_random.score(X_valid_augmented, y_valid_augmented)


In [None]:
# Make predictions for the test set
y_pred_random = rnd_random.predict(X_test)

# accuracy for the test data
# (kept in rf4_acc for the test-accuracy bar chart)
rf4_acc=accuracy_score(y_test, y_pred_random)

## Extra Trees

In [None]:
# Extra-Trees ensemble with hand-picked hyper-parameters; also reused in the voting ensemble.
ext_clf = ExtraTreesClassifier(random_state=42, n_estimators = 1000, 
                                  max_depth=40, min_samples_split = 5 , 
                                  min_samples_leaf = 1, max_features = "sqrt", bootstrap = False)
ext_clf.fit(X_train_augmented, y_train_augmented)

In [None]:
# accuracy on validation set
# (kept in ext_score for the comparison bar chart)
ext_score=ext_clf.score(X_valid_augmented, y_valid_augmented)

In [None]:
# Make predictions for the test set
y_pred_ext = ext_clf.predict(X_test)

# accuracy for the test data
# (kept in ext_acc for the test-accuracy bar chart)
ext_acc=accuracy_score(y_test, y_pred_ext)

## SVM

In [None]:
# Linear SVM on the scaled pixel features.
# NOTE(review): tol = 20 is far looser than scikit-learn's documented default
# (1e-4), so the optimiser presumably stops very early — confirm this is
# intentional and not a workaround for convergence warnings.
svm_clf = LinearSVC(C=1, max_iter = 500, tol = 20, random_state=42) #  Large value of C can cause overfitting
svm_clf.fit(X_train_augmented, y_train_augmented)

In [None]:
# accuracy on validation set
# (kept in svm_score for the comparison bar chart)
svm_score=svm_clf.score(X_valid_augmented, y_valid_augmented)

In [None]:
# Make predictions for the test set
y_pred_svm = svm_clf.predict(X_test)

# accuracy for the test set
# (kept in svm_acc for the test-accuracy bar chart)
svm_acc=accuracy_score(y_test, y_pred_svm)

## MLPClassifier

In [None]:
# Multi-layer perceptron with default architecture, capped at 300 training iterations.
mlp_clf = MLPClassifier(random_state=42, max_iter = 300)
mlp_clf.fit(X_train_augmented, y_train_augmented)

In [None]:
# accuracy on validation set
# (kept in mlp_score for the comparison bar chart)
mlp_score=mlp_clf.score(X_valid_augmented, y_valid_augmented)

In [None]:
# Make predictions for the test set
y_pred_mlp = mlp_clf.predict(X_test)

# accuracy for the test set
# (kept in mlp_acc for the test-accuracy bar chart)
mlp_acc=accuracy_score(y_test, y_pred_mlp)

## Ensemble

In [None]:
# (name, estimator) pairs for the voting ensemble. The names are the keys used
# later to drop or look up a member (e.g. "svm_clf").
named_estimators = [
    ("random_forest_clf", rnd_grid),
    ("extra_trees_clf", ext_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

### Hard Voting

In [None]:
# Majority-vote ensemble over the four named models.
voting_clf = VotingClassifier(estimators=named_estimators, voting="hard")
# fit() clones every estimator and fits the clones
voting_clf.fit(X_train_augmented, y_train_augmented)

In [None]:
# Make predictions for the test set
# (taken while the ensemble is still in hard-voting mode; y_pred_vot_hard is
# reused by the hard-voting report / confusion matrix below)
y_pred_vot_hard = voting_clf.predict(X_test)

# accuracy for the test set
hard_acc=accuracy_score(y_test, y_pred_vot_hard)

In [None]:
# Validation accuracy of the hard-voting ensemble (computed before the switch to soft voting).
hard_score=voting_clf.score(X_valid_augmented, y_valid_augmented)
hard_score

### Soft Voting

In [None]:
# Mark the SVM member as dropped in the ensemble's configuration.
# Presumably needed because LinearSVC offers no predict_proba for soft voting — TODO confirm.
voting_clf.set_params(svm_clf="drop")

In [None]:
# set_params only changes the configuration; the ensemble is not refitted
# here. So the fitted SVM clone is also removed by hand from the
# fitted-estimator containers, so it no longer takes part in voting.
svm_clf_trained = voting_clf.named_estimators_.pop("svm_clf")
voting_clf.estimators_.remove(svm_clf_trained)

In [None]:
# Switch the already-fitted ensemble to soft voting without refitting.
# From here on `voting_clf` IS the soft-voting ensemble; any later cell that
# uses it no longer sees hard-voting behaviour.
voting_clf.voting = "soft"

In [None]:
# Validation accuracy of the soft-voting ensemble (SVM excluded).
soft_score=voting_clf.score(X_valid_augmented, y_valid_augmented)

In [None]:
# Make predictions for the test set
# (soft-voting predictions: voting_clf was switched to "soft" above)
y_pred_vot_soft = voting_clf.predict(X_test)

# accuracy for the test set
soft_acc=accuracy_score(y_test, y_pred_vot_soft)

## Plot Metrics

### rnd_grid

In [None]:
# View the classification report for test data and predictions
# (per-class precision / recall / F1 of rnd_grid, "Random Forest 3")
print(classification_report(y_test, y_pred_grid))

In [None]:
# Raw confusion matrix of rnd_grid on the test set (rows = true labels, columns = predicted labels).
cm = confusion_matrix(y_test, y_pred_grid)
cm

In [None]:
# Plot the test-set confusion matrix of rnd_grid.
# `plot_confusion_matrix` was never imported (and no longer exists in current
# scikit-learn); ConfusionMatrixDisplay.from_predictions replaces it and
# reuses the predictions computed above.
color = 'white'
matrix = ConfusionMatrixDisplay.from_predictions(y_test, y_pred_grid, cmap=plt.cm.Greens)
matrix.ax_.set_title('Confusion Matrix', color=color)
matrix.ax_.set_xlabel('Predicted Label', color=color)
matrix.ax_.set_ylabel('True Label', color=color)
# Colour the ticks of both the matrix axes and the colour bar.
matrix.ax_.tick_params(colors=color)
matrix.im_.colorbar.ax.tick_params(colors=color)
plt.show()

### Hard Voting

In [None]:
# View the classification report for test data and predictions
# (uses the hard-voting predictions stored before the switch to soft voting)
print(classification_report(y_test, y_pred_vot_hard))

In [None]:
# Raw confusion matrix of the hard-voting ensemble on the test set.
cm = confusion_matrix(y_test, y_pred_vot_hard)
cm

In [None]:
# Plot the test-set confusion matrix of the HARD-voting ensemble.
# It is built from the stored hard-voting predictions, not from `voting_clf`:
# that object has been switched to soft voting by now, so predicting with it
# here would not match the hard-voting report above.
# `plot_confusion_matrix` was also never imported;
# ConfusionMatrixDisplay.from_predictions replaces it.
color = 'white'
matrix = ConfusionMatrixDisplay.from_predictions(y_test, y_pred_vot_hard, cmap=plt.cm.Greens)
matrix.ax_.set_title('Confusion Matrix', color=color)
matrix.ax_.set_xlabel('Predicted Label', color=color)
matrix.ax_.set_ylabel('True Label', color=color)
# Colour the ticks of both the matrix axes and the colour bar.
matrix.ax_.tick_params(colors=color)
matrix.im_.colorbar.ax.tick_params(colors=color)
plt.show()

In [None]:
# Bar chart comparing the models' validation scores.
# The "Random Search" and "Grid Search" bars are mean cross-validation scores
# (best_score_), while all other bars are accuracies on the validation split.
models = [
    "Random \nForest 1", "Random \nForest 2", "Random\nSearch", "Grid\n Search",
    "Random\n Forest 3", "Random \nForest 4", "Extra\n Trees", "SVM", "MLP",
    "Ensemble\n(Hard \nVoting)", "Ensemble\n(Soft\n Voting)",
]
accuracies = [
    rf1_score, rf2_score, rnd_search_score, grid_score,
    rf3_score, rf4_score, ext_score, svm_score, mlp_score,
    hard_score, soft_score,
]
# Guard against the label and score lists drifting apart.
assert len(models) == len(accuracies), "every model label needs exactly one score"

fig, ax = plt.subplots(figsize=(10, 5))

# creating the bar plot
ax.bar(models, accuracies, color='darkgreen', width=0.4)
ax.set_xlabel("Models")
ax.set_ylabel("Validation Scores")
ax.set_title("Validation Scores of Models")

# Show plot (the original called plt.show() twice; once is enough)
plt.show()

In [None]:
# Bar chart comparing the models' accuracies on the test set.
models = [
    "Random \nForest 1", "Random \nForest 2", "Random\n Forest 3", "Random \nForest 4",
    "Extra\n Trees", "SVM", "MLP",
    "Ensemble\n(Hard \nVoting)", "Ensemble\n(Soft\n Voting)",
]
accuracies = [
    rf1_acc, rf2_acc, rf3_acc, rf4_acc,
    ext_acc, svm_acc, mlp_acc,
    hard_acc, soft_acc,
]
# Guard against the label and score lists drifting apart.
assert len(models) == len(accuracies), "every model label needs exactly one accuracy"

fig, ax = plt.subplots(figsize=(10, 5))

# creating the bar plot
ax.bar(models, accuracies, color='lightgreen', width=0.4)
ax.set_xlabel("Models")
ax.set_ylabel("Accuracies")
ax.set_title("Accuracies of Models")

# Show plot (the original called plt.show() twice; once is enough)
plt.show()