In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-success">
<b>Loading the datasets:</b> We load the datasets from the path provided
</div>

In [3]:
path_of_input_file = 'D:\\kaggle_trials\\parkinsons_dataset\\ReplicatedAcousticFeatures-ParkinsonDatabase.csv'
df                 = pd.read_csv(path_of_input_file,index_col='ID')
df.head(4)

Unnamed: 0_level_0,Recording,Status,Gender,Jitter_rel,Jitter_abs,Jitter_RAP,Jitter_PPQ,Shim_loc,Shim_dB,Shim_APQ3,...,Delta3,Delta4,Delta5,Delta6,Delta7,Delta8,Delta9,Delta10,Delta11,Delta12
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CONT-01,1,0,1,0.25546,1.5e-05,0.001467,0.001673,0.030256,0.26313,0.017463,...,1.407701,1.417218,1.380352,1.42067,1.45124,1.440295,1.403678,1.405495,1.416705,1.35461
CONT-01,2,0,1,0.36964,2.2e-05,0.001932,0.002245,0.023146,0.20217,0.01301,...,1.331232,1.227338,1.213377,1.352739,1.354242,1.365692,1.32287,1.314549,1.318999,1.323508
CONT-01,3,0,1,0.23514,1.3e-05,0.001353,0.001546,0.019338,0.1671,0.011049,...,1.412304,1.324674,1.276088,1.429634,1.455996,1.368882,1.438053,1.38891,1.305469,1.305402
CONT-02,1,0,0,0.2932,1.7e-05,0.001105,0.001444,0.024716,0.20892,0.014525,...,1.5012,1.53417,1.323993,1.496442,1.472926,1.643177,1.551286,1.638346,1.604008,1.621456


<div class="alert alert-block alert-success">
<b>Feature Extraction:</b> We extract the x matrix and Y predictors needed for classification
</div>

In [4]:
cols        = df.columns.tolist()
cols_needed = cols[2:len(cols)-1]
X           = df[cols_needed].values
y           = df[cols[1]].values

In [5]:
num_labels = df['Status'].unique()
print('The number of labels are ',len(num_labels))

The number of labels are  2


In [6]:
for i in range(len(num_labels)):
    print('The number of ', num_labels[i] ,' labels are :- ',len(df[df['Status']==num_labels[i]]))
print('We have a pretty balanced dataset and hence we wont need to perform any imbalanced dataset handling')

The number of  0  labels are :-  120
The number of  1  labels are :-  120
We have a pretty balanced dataset and hence we wont need to perform any imbalanced dataset handling


<div class="alert alert-block alert-success">
<b>Feature Scaling:</b> We will perform Min-Max Scaling on our numerical features and one hot encode the categorical features to get the X matrix.
</div>

In [7]:
columns_to_encode = ['Gender']
columns_to_scale  = list(set(cols_needed)-set(['Gender']))

scaler            = MinMaxScaler()
ohe               = OneHotEncoder(sparse=False)

scaled_columns    = scaler.fit_transform(df[columns_to_scale]) 
encoded_columns   =    ohe.fit_transform(df[columns_to_encode])

X_processed_data  = np.concatenate([scaled_columns, encoded_columns], axis=1)

<div class="alert alert-block alert-success">
<b>Train Test Split of data:</b> We will split the data to train and test data to fit the model
</div>

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_processed_data, y, test_size=0.33, random_state=42)

<div class="alert alert-block alert-success">
<b>Parameter grid creation and Tuning:</b> We will now create a grid for hyperparameter tuning. We needed to create 3 such grids because there are certain parameters that are dependent on other hyperparameters of the model
</div>

In [8]:
svc_grid_1  = {'C' : hp.uniform('C',0.0,5),
             'kernel' : hp.choice('kernel',['linear','poly','rbf','sigmoid']),
             'decision_function_shape':hp.choice('decision_function_shape',['ovo','ovr']),
               'shrinking' : hp.choice('shrinking',[True,False])
            }
svc_grid_non_linear = {'degree':hp.choice('degree',range(1,10))}
svc_poly_sigmoid    = {'coef0':hp.uniform('coef0',0.0,1.0)}


In [9]:
def hyperopt_train_test(params):
    clf = SVC(**params)
    return cross_val_score(clf, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, svc_grid_1, algo=tpe.suggest, max_evals=200, trials=trials)
best_parameters = space_eval(svc_grid_1, best)
final_best_params = best_parameters
if best_parameters['kernel']!='linear':
    trials          = Trials()
    svc_grid_non_linear.update(best_parameters)
    best            = fmin(function_to_minimise, svc_grid_non_linear, algo=tpe.suggest, max_evals=200, trials=trials)
    best_parameter_1 = space_eval(svc_grid_non_linear, best)
    final_best_params = best_parameter_1
if best_parameters['kernel']=='poly' or best_parameters['kernel']=='sigmoid':
    trials          = Trials()
    svc_poly_sigmoid.update(best_parameters)
    best            = fmin(function_to_minimise, svc_poly_sigmoid, algo=tpe.suggest, max_evals=200, trials=trials)
    best_parameters_2 = space_eval(svc_poly_sigmoid, best)
    final_best_params = best_parameters_2
    
print('The best parameter tuned on training set is given by :- ',final_best_params)

100%|███████████████████████████████████████████████| 200/200 [00:01<00:00, 122.50it/s, best loss: -0.8129154795821463]
100%|███████████████████████████████████████████████| 200/200 [00:01<00:00, 174.19it/s, best loss: -0.8129154795821463]
The best parameter tuned on training set is given by :-  {'C': 4.9272622013218434, 'decision_function_shape': 'ovr', 'degree': 7, 'kernel': 'rbf', 'shrinking': False}


<div class="alert alert-block alert-success">
<b>Model Fitting:</b> We will now fit the model with parameters obtained from the previous step 
</div>

In [10]:
svc_clf = SVC(**final_best_params)
svc_clf.fit(X_train,y_train)

SVC(C=4.9272622013218434, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=7, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=False, tol=0.001, verbose=False)

<div class="alert alert-block alert-success">
<b>Final results:</b> We obtained the following results from SVC classifier
</div>

In [11]:
y_pred = svc_clf.predict(X_test)
print('The classification report obtained is below:- \n')
print(classification_report(y_pred,y_test))

The classification report obtained is below:- 

              precision    recall  f1-score   support

           0       0.85      0.85      0.85        40
           1       0.85      0.85      0.85        40

    accuracy                           0.85        80
   macro avg       0.85      0.85      0.85        80
weighted avg       0.85      0.85      0.85        80

