## Dysarthric Speech Classifier

In [2]:
import pandas as pd
import numpy as np

In [9]:
# Remove unnamed columns 
def remove_cols(df, cols=['unnamed']):
    for c in cols:
        df.drop(df.columns[df.columns.str.contains(c ,case = False)],axis = 1, inplace = True)


### Read all data and store them in appropriate dataframes

In [11]:
acoustic_df = pd.read_csv('Acoustic.csv', delimiter=';')
glottal_df = pd.read_csv('Glottal.csv')
pca_df = pd.read_csv('PCA.csv')

name_label = glottal_df[['filename', 'label']]

remove_cols(acoustic_df)
remove_cols(glottal_df)
remove_cols(pca_df)

acoustic_df = pd.merge(acoustic_df, name_label, on='filename')
pca_df = pd.merge(pca_df, name_label, on='filename')


pca_df.head()

Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,...,PCA22,PCA23,PCA24,PCA25,PCA26,PCA27,PCA28,PCA29,filename,label
0,6.072951,-8.379528,5.577261,-3.415463,-0.291024,-2.599755,2.666768,-0.311165,0.753887,7.794293,...,2.448468,-1.973694,0.418576,4.317791,0.54873,-0.908461,-0.378939,-2.327045,D17,1.0
1,9.121491,-5.841063,2.433022,-0.96044,1.74179,-5.793687,0.819953,-2.056339,-1.087733,1.881972,...,-0.294742,-0.906101,-0.342386,0.868768,-2.51279,2.699075,3.160857,-0.871171,D2,1.0
2,6.829803,-4.761638,1.156062,6.030174,-4.836396,1.171264,3.120006,0.11932,3.211237,1.07385,...,-0.566967,-1.356335,0.961374,-0.504994,-1.590274,-0.311877,-0.006198,0.434887,D16,1.0
3,7.105508,-5.262632,4.549543,-2.271064,-0.83939,-4.274856,-0.235649,0.942712,-0.233031,-0.243069,...,1.986139,-1.179298,1.185099,0.948335,-1.440725,1.456308,-0.048675,0.149327,D63,1.0
4,4.020848,-2.693716,-7.290392,-2.415865,-5.678421,0.218685,1.853003,-6.123078,-1.589755,-4.136185,...,2.039828,-0.797543,-0.510078,2.104387,1.501729,-0.061067,-1.635011,-0.783493,D88,1.0


## Building SVM using only Acoustic Features

In [13]:
df = acoustic_df

df.fillna(0, inplace=True)

df.head()

Unnamed: 0,frameTime,pcm_RMSenergy_sma_max,pcm_RMSenergy_sma_min,pcm_RMSenergy_sma_range,pcm_RMSenergy_sma_maxPos,pcm_RMSenergy_sma_minPos,pcm_RMSenergy_sma_amean,pcm_RMSenergy_sma_linregc1,pcm_RMSenergy_sma_linregc2,pcm_RMSenergy_sma_linregerrQ,...,F0_sma_de_minPos,F0_sma_de_amean,F0_sma_de_linregc1,F0_sma_de_linregc2,F0_sma_de_linregerrQ,F0_sma_de_stddev,F0_sma_de_skewness,F0_sma_de_kurtosis,filename,label
0,0.0,0.010374,0.000102,0.010271,82,149,0.002477,-3.260103e-06,0.002743,1.2e-05,...,108,-4.465805e-08,-0.040824,3.327116,216.6325,14.84479,-0.109238,18.60309,D17,1.0
1,0.0,0.008003,0.000108,0.007896,88,5,0.001813,5.445886e-06,0.001394,6e-06,...,54,7.152558e-08,-0.040462,3.115546,616.1741,24.88879,-0.001942,7.910336,D2,1.0
2,0.0,0.011478,0.0001,0.011378,83,194,0.003527,-2.217385e-06,0.003756,1.4e-05,...,52,2.177862e-08,-0.026768,2.770505,303.0591,17.48263,-0.062166,15.81088,D16,1.0
3,0.0,0.005419,0.0001,0.005319,109,206,0.001289,4.164633e-07,0.001246,3e-06,...,47,-3.250674e-08,-0.022291,2.307168,532.0298,23.10457,0.034254,12.49632,D63,1.0
4,0.0,0.01195,0.0001,0.01185,79,31,0.002273,9.087521e-06,0.001574,8e-06,...,86,-1.724449e-08,-0.006227,0.479484,155.4552,12.47128,-0.010072,21.19281,D88,1.0


In [32]:
columns = df.columns.tolist()

features = [f for f in filter(lambda x: x not in ['label', 'frameTime', 'filename'], columns)]
target = ['label']

feature_df = df[features]


from sklearn import preprocessing

x = feature_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
feature_df = pd.DataFrame(x_scaled)

X = np.asarray(feature_df)

y = np.asarray(df['label'])

print(X)

[[0.07914046 0.00549249 0.08052999 ... 0.5008237  0.20763629 0.1154856 ]
 [0.06037197 0.00773829 0.06137653 ... 0.83968152 0.2154339  0.04910635]
 [0.08788768 0.00455437 0.08945459 ... 0.5898174  0.2110572  0.09815192]
 ...
 [0.30587623 0.93119215 0.29396766 ... 0.54698785 0.22922141 0.04404861]
 [0.15888222 0.9641977  0.14366793 ... 0.29811483 0.22660843 0.19414331]
 [0.61571701 0.97210002 0.60869501 ... 0.54867674 0.20688717 0.05129345]]


In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X)

X = scaler.transform(X)

In [34]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)

X_train.shape


(181, 384)

Build and measure accuracy of SVM

In [36]:
from sklearn import svm
from sklearn.metrics import classification_report


classifier = svm.SVC(kernel='rbf', C=0.1, gamma=1)

classifier.fit(X_train, y_train)

y_predict = classifier.predict(X_test)

print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        13
         1.0       0.38      1.00      0.55         8

    accuracy                           0.38        21
   macro avg       0.19      0.50      0.28        21
weighted avg       0.15      0.38      0.21        21



  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)

0.38095238095238093

# SequentialFeatureSelector

<img src="sequentialModelDiagram.png">

In [13]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.svm import SVC 
classifier = SVC(gamma='auto', C=10)
# Sequential Forward Selection(sfs)
feature_selector = SFS(SVC(kernel='rbf', C=1000, gamma=0.1),
           k_features=20,
           forward=True,
           verbose=2,
           scoring='roc_auc',
           cv=4)

In [14]:
feature_selector.fit(X_train, y_train)
feature_selector.k_feature_idx_  

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 481 out of 481 | elapsed:   14.5s finished

[2020-10-29 19:08:30] Features: 1/20 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:    8.3s finished

[2020-10-29 19:08:38] Features: 2/20 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 479 out of 479 | elapsed:    7.5s finished

[2020-10-29 19:08:46] Features: 3/20 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:  

(0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 15, 18, 19, 20, 22, 23, 24, 30)

In [15]:
filtered_features= df[features].columns[list(feature_selector.k_feature_idx_)]
filtered_features

Index(['Unnamed: 0', 'pcm_RMSenergy_sma_max', 'pcm_RMSenergy_sma_min',
       'pcm_RMSenergy_sma_range', 'pcm_RMSenergy_sma_amean',
       'pcm_RMSenergy_sma_linregc1', 'pcm_RMSenergy_sma_linregc2',
       'pcm_RMSenergy_sma_linregerrQ', 'pcm_RMSenergy_sma_stddev',
       'pcm_RMSenergy_sma_skewness', 'pcm_RMSenergy_sma_kurtosis',
       'pcm_fftMag_mfcc_sma[1]_max', 'pcm_fftMag_mfcc_sma[1]_range',
       'pcm_fftMag_mfcc_sma[1]_amean', 'pcm_fftMag_mfcc_sma[1]_linregc1',
       'pcm_fftMag_mfcc_sma[1]_linregc2', 'pcm_fftMag_mfcc_sma[1]_stddev',
       'pcm_fftMag_mfcc_sma[1]_skewness', 'pcm_fftMag_mfcc_sma[1]_kurtosis',
       'pcm_fftMag_mfcc_sma[2]_amean'],
      dtype='object')

In [27]:
from sklearn import svm


classifier = svm.SVC(kernel='rbf', C=1000, gamma=0.1)

classifier.fit(X_train, y_train)

y_predict = classifier.predict(X_test)

In [28]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        13
         1.0       0.38      1.00      0.55         8

    accuracy                           0.38        21
   macro avg       0.19      0.50      0.28        21
weighted avg       0.15      0.38      0.21        21



  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)

0.38095238095238093

# core code for SFS

In [23]:
def forward_selection(data, target, significance_level=0.05):
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        new_pval = pd.Series(index=remaining_features)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features+[new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if(min_p_value<significance_level):
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features