In [1]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Resolve the dataset location relative to the current working directory.
this_dir = Path.cwd()
csv_file = this_dir.joinpath("data/pd_speech_features.csv")

In [3]:
# Physical line 0 of the CSV is skipped, so pandas takes the column names
# from the line that follows it.
header_rows_to_skip = [0]
df = pd.read_csv(csv_file, skiprows=header_rows_to_skip)
df.head(n=5)

Unnamed: 0,id,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,0,1,0.85247,0.71826,0.57227,240,239,0.008064,8.7e-05,0.00218,...,1.562,2.6445,3.8686,4.2105,5.1221,4.4625,2.6202,3.0004,18.9405,1
1,0,1,0.76686,0.69481,0.53966,234,233,0.008258,7.3e-05,0.00195,...,1.5589,3.6107,23.5155,14.1962,11.0261,9.5082,6.5245,6.3431,45.178,1
2,0,1,0.85083,0.67604,0.58982,232,231,0.00834,6e-05,0.00176,...,1.5643,2.3308,9.4959,10.7458,11.0177,4.8066,2.9199,3.1495,4.7666,1
3,1,0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,...,3.7805,3.5664,5.2558,14.0403,4.2235,4.6857,4.846,6.265,4.0603,1
4,1,0,0.3279,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,...,6.1727,5.8416,6.0805,5.7621,7.7817,11.6891,8.2103,5.0559,6.1164,1


In [4]:
# Drop the identifier column and z-score every column except `gender` and
# `class`, using the population standard deviation (ddof=0).
#
# `errors='ignore'` together with re-binding `df` (instead of an in-place drop)
# keeps this cell re-runnable: executing it a second time no longer raises a
# KeyError because 'id' is already gone.
#
# NOTE(review): the mean/std below are computed over all rows, i.e. including
# rows that the later K-fold loop uses as test folds.
df = df.drop(columns=['id'], errors='ignore')
skip_column = ['gender', 'class']
columns = [c for c in df.columns if c not in skip_column]

col_mean = df[columns].mean()
col_std = df[columns].std(ddof=0)
# A constant column has zero spread; dividing by 1 instead maps it to 0.0
# rather than filling it with NaN.
col_std = col_std.replace(0, 1)
df[columns] = (df[columns] - col_mean) / col_std

df.head()

Unnamed: 0,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,locAbsJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,1,0.627644,0.256144,0.605835,-0.846892,-0.842373,0.933328,-0.407251,-0.054993,0.037843,...,-0.584822,-0.619412,-0.576762,-0.482286,-0.399331,-0.484533,-0.775137,-0.814727,-0.366595,1
1,1,0.12162,-0.080433,0.368415,-0.907404,-0.902773,1.040014,-0.426092,-0.14257,-0.027698,...,-0.584895,-0.589778,0.193084,0.016183,-0.06712,-0.175566,-0.526647,-0.582972,0.400396,1
2,1,0.61795,-0.349839,0.733609,-0.927575,-0.922907,1.084576,-0.443557,-0.214916,-0.088871,...,-0.584767,-0.629033,-0.356261,-0.156055,-0.067593,-0.463462,-0.756063,-0.80439,-0.780935,1
3,0,-1.98056,1.382279,0.753631,-1.472186,-1.466513,2.464215,-0.275316,0.710353,1.256919,...,-0.532242,-0.591137,-0.522406,0.0084,-0.449894,-0.470865,-0.633475,-0.588387,-0.801583,1
4,0,-2.472989,1.398068,0.300123,-0.887233,-0.88264,0.987044,3.143597,1.152045,1.178269,...,-0.475545,-0.521356,-0.49009,-0.404833,-0.249678,-0.042021,-0.419354,-0.672216,-0.741477,1


In [5]:
# Materialise the whole frame (features and target) as one float32 matrix.
data = df.to_numpy(dtype="float32")
np.shape(data)

(756, 754)

In [6]:
# The last column of `data` is the target; everything before it is a feature.
features = data[:, :-1]
labels = data[:, -1]

In [7]:
# Sanity check: (rows, feature columns) of the feature matrix.
np.shape(features)

(756, 753)

In [8]:
# Ten shuffled folds with a fixed seed so the split is repeatable.
# NOTE(review): the `id` values shown in the loaded frame repeat across rows
# (0, 0, 0, 1, 1), so row-wise folds may put the same id in both train and
# test -- consider a group-aware splitter; TODO confirm.
kfold = KFold(10, shuffle=True, random_state=450)

In [12]:
# 10-fold evaluation of AdaBoost over an RBF-kernel SVC.
# Per-fold accuracy, precision and recall are collected in the three lists.
#
# NOTE(review): the recorded run of this experiment shows recall == 1.0 in
# every fold with precision equal to accuracy, which is consistent with the
# ensemble predicting the positive class for every test sample. Worth
# investigating before trusting these scores.
accuracy = []
precision = []
recall = []

for train_indexes, test_indexes in kfold.split(features):
    X_train, y_train = features[train_indexes], labels[train_indexes]
    X_test, y_test = features[test_indexes], labels[test_indexes]

    # Fit the scaler on the training fold only and apply it to both parts, so
    # that no statistic of the held-out rows influences the training features.
    # (Standardising is affine per column, so this also overrides any earlier
    # whole-dataset standardisation.)
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # probability=True makes SVC run an internal randomised calibration, and
    # AdaBoost is stochastic as well; seed both so reruns give the same scores
    # (same seed value as the KFold splitter).
    svc = svm.SVC(probability=True, kernel='rbf', random_state=450)
    # The estimator is passed positionally: the keyword was `base_estimator`
    # before scikit-learn 1.2 and is `estimator` afterwards (the old keyword
    # was removed in 1.4); the first positional slot is the same in both.
    clf = AdaBoostClassifier(svc, random_state=450)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy.append(metrics.accuracy_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))

In [13]:
# Show the raw per-fold scores, one metric per line.
for metric_label, fold_scores in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
):
    print(metric_label, fold_scores)

Accuracy:  [0.6710526315789473, 0.7236842105263158, 0.7105263157894737, 0.7368421052631579, 0.7105263157894737, 0.8421052631578947, 0.8, 0.76, 0.76, 0.7466666666666667]
Precision:  [0.6710526315789473, 0.7236842105263158, 0.7105263157894737, 0.7368421052631579, 0.7105263157894737, 0.8421052631578947, 0.8, 0.76, 0.76, 0.7466666666666667]
Recall:  [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [14]:
# Arithmetic mean of each metric across the folds.
for metric_label, fold_scores in (
    ("Avg accuracy:", accuracy),
    ("Avg precision:", precision),
    ("Avg recall:", recall),
):
    print(metric_label, sum(fold_scores) / len(fold_scores))

Avg accuracy: 0.746140350877193
Avg precision: 0.746140350877193
Avg recall: 1.0


## With Hyperparameter tuning

In [None]:
# Exhaustive search over SVC hyper-parameters, scored by 10-fold CV accuracy.
#
# The grid is a list of per-kernel dictionaries so that every kernel is only
# crossed with the parameters it actually reads: `degree` only affects 'poly',
# and 'linear' uses neither `gamma` nor `degree`. The search space is the same
# as the full Cartesian product of the four lists, but it shrinks from 480
# candidates to 164 distinct ones (20 rbf + 120 poly + 20 sigmoid + 4 linear).
c_values = [0.1, 1, 100, 1000]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]
degree_values = [1, 2, 3, 4, 5, 6]

param_grid = [
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
    {'kernel': ['poly'], 'C': c_values, 'gamma': gamma_values,
     'degree': degree_values},
    {'kernel': ['sigmoid'], 'C': c_values, 'gamma': gamma_values},
    {'kernel': ['linear'], 'C': c_values},
]

# Reset the per-fold result lists (the grid search in this cell does not
# append to them).
accuracy = []
precision = []
recall = []

clf = svm.SVC()
grid = GridSearchCV(clf, param_grid, n_jobs=12, cv=10, scoring='accuracy', verbose=1)
grid.fit(features, labels)

print(grid.best_params_)
# Deliberately not scoring on X_test / y_test here: those rows are a slice of
# `features`, which the search above was fitted on.

In [None]:
# Show which result fields the fitted search recorded.
grid.cv_results_.keys()

In [None]:
grid.