
pip install -r requirements.txt
pip install pipwin
pipwin install -r requirements.txt

gender: 0 = women
gender: 1 = men
class: 0 = no PD (healthy)
class: 1 = PD (Parkinson's disease)

In [1]:
## Initial Imports

%matplotlib inline

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import LeaveOneOut, GridSearchCV, KFold, train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, classification_report

  import pandas.util.testing as tm


**`Read data and Display data dimension`**

In [2]:
# Read the raw CSV. The first data row carries the actual feature names, so it is
# promoted to the column header and removed from the data.
pd_speech_features = pd.read_csv('pd_speech_features.csv')
new_header = pd_speech_features.iloc[0] #grab the first row for the header
pd_speech_features = pd_speech_features[1:] #take the data less the header row
pd_speech_features.columns = new_header #set the header row as the df header
# NOTE: only the last expression of a cell is rendered, so this head() call shows nothing.
pd_speech_features.head()
print('The shape of the matrix is :', pd_speech_features.shape)

The shape of the matrix is : (756, 755)


#### Gather general metadata about the data

In [3]:
# Tag every recording with a unit counter so that summing per class counts rows.
pd_speech_features['patient/healthy count'] = 1
# Sum only the counter column: at this stage the feature columns still hold object
# values, and newer pandas versions no longer drop such columns silently when
# aggregating, so an unrestricted sum() is not reliable.
# Dividing by 3 converts recording counts into subject counts.
# NOTE(review): assumes three recordings per subject — confirm against the dataset description.
pd_speech_features.groupby('class')[['patient/healthy count']].sum()/3

Unnamed: 0_level_0,patient/healthy count
class,Unnamed: 1_level_1
0,64.0
1,188.0


In [4]:
# Number of rows (recordings) per gender code.
pd_speech_features['gender'].value_counts()

1    390
0    366
Name: gender, dtype: int64

In [5]:
# Cast the two label columns to integers so that 'class' can be summed numerically.
pd_speech_features[['gender','class']]=pd_speech_features[['gender','class']].astype('int')
# Sum of the class labels per gender code, divided by 3.
# NOTE(review): this appears to count class-1 subjects per gender, assuming three recordings per subject — confirm.
pd_speech_features.groupby(by='gender')['class'].sum()/3

gender
0     81.0
1    107.0
Name: class, dtype: float64

In [6]:
# Remove the helper counter column again; it was only needed for the counts above and is not a feature.
pd_speech_features = pd_speech_features.drop(['patient/healthy count'], axis = 1)  #756x755

All columns were imported with the `object` dtype, so we now need to assign proper dtypes to the columns of the dataframe.

In [7]:
# Every column becomes float first; the identifier/count columns are then narrowed
# to integers and the two label columns are converted to categoricals.
pd_speech_features = pd_speech_features.astype(float).astype({
    'id': int,
    'numPulses': int,
    'numPeriodsPulses': int,
    'gender': 'category',
    'class': 'category',
})
pd_speech_features.dtypes

0
id                              int32
gender                       category
PPE                           float64
DFA                           float64
RPDE                          float64
                               ...   
tqwt_kurtosisValue_dec_33     float64
tqwt_kurtosisValue_dec_34     float64
tqwt_kurtosisValue_dec_35     float64
tqwt_kurtosisValue_dec_36     float64
class                        category
Length: 755, dtype: object

In [8]:
# Keep the columns from position 1 up to (but excluding) the last 433, i.e. drop the
# first column and the trailing 433 columns.
# NOTE(review): judging by the variable name, the trailing 433 columns are presumably the
# tqwt_* features plus 'class' — confirm the count against the column list.
pd_speech_features_no_tqwt = pd_speech_features[pd_speech_features.columns[1: -433]]
pd_speech_features_no_tqwt.head()

Unnamed: 0,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,locAbsJitter,...,app_LT_TKEO_std_1_coef,app_LT_TKEO_std_2_coef,app_LT_TKEO_std_3_coef,app_LT_TKEO_std_4_coef,app_LT_TKEO_std_5_coef,app_LT_TKEO_std_6_coef,app_LT_TKEO_std_7_coef,app_LT_TKEO_std_8_coef,app_LT_TKEO_std_9_coef,app_LT_TKEO_std_10_coef
1,1.0,0.85247,0.71826,0.57227,240,239,0.008064,8.7e-05,0.00218,1.8e-05,...,6.299,16.7003,42.0762,101.0889,228.8489,493.8563,1015.7707,2091.946,4188.2456,8373.9278
2,1.0,0.76686,0.69481,0.53966,234,233,0.008258,7.3e-05,0.00195,1.6e-05,...,6.2381,16.5376,41.7306,100.0918,226.9019,489.9169,1006.3702,2074.4541,4148.9889,8298.1606
3,1.0,0.85083,0.67604,0.58982,232,231,0.00834,6e-05,0.00176,1.5e-05,...,6.2163,16.4817,41.4869,99.6154,225.7803,486.9865,1001.7348,2064.1067,4127.0967,8254.7868
4,0.0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,4.6e-05,...,6.7833,16.8216,41.3157,94.4579,211.1565,443.3447,955.8128,1890.1299,3910.7029,7698.9389
5,0.0,0.3279,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,4.4e-05,...,6.9366,18.3595,46.2704,108.6792,244.0607,541.2414,1057.2566,2242.546,4297.4639,8645.2845


In [9]:
# Short alias for the typed frame; this binds the same object, it does not copy it.
df=pd_speech_features

In [10]:
class Htest:
    """Two-sample hypothesis tests applied column-wise to dataframes of numerical features."""

    def __init__(self, population_mean_diff=0, p=0.05, cls_names='class', test_type='z_test'):
        '''
        Store the test configuration.

        population_mean_diff:
            H0 (null hypothesis): assumed difference of the population means of a
            feature between the two samples, default=0
        p:
            significance level, default=0.05. It is stored for reference only;
            z_test compares against its own threshold_val argument.
        cls_names:
            name of the class column of the samples, stored as self.cls
        test_type:
            name of the hypothesis test, stored as self.test_type
            (z_test is the only test implemented by this class)
        '''

        self.p=p
        self.cls=cls_names
        self.population_mean_diff=population_mean_diff
        # This argument used to be accepted and silently dropped; keep it so the
        # configuration of a tester can be inspected afterwards.
        self.test_type=test_type

    def z_test(self, df1, df2, threshold_val=1.96, show_top=1):
        '''
        Two-sample z-test computed independently for every column.

        Assumptions:
                    1. sample size=population size
                    2. normal distribution
        input:
                df1=dataframe of numerical features of sample1
                df2=dataframe of numerical features of sample2
                threshold_val= critical |z| value; 1.96 corresponds to 95% confidence (p<0.05)
                show_top= currently unused, kept for backward compatibility
        output:
                Series indexed by feature name holding |z| for every feature whose
                |z| exceeds threshold_val. Features whose z-score is NaN
                (e.g. 0/0 for a feature that is constant in both samples) are left out.
        side effect:
                the two input frames are kept on the instance as self.df1 / self.df2
        '''
        self.df1=df1
        self.df2=df2

        # Squared standard error of each feature mean: s^2 / n.
        sq_std_error1=self.df1.var()/self.df1.shape[0]
        sq_std_error2=self.df2.var()/self.df2.shape[0]

        # Standard error of the difference between the two sample means.
        denominator=(sq_std_error1+sq_std_error2).pow(1./2)

        # Observed mean difference minus the difference assumed under H0.
        numerator=(self.df1.mean()-self.df2.mean())-self.population_mean_diff

        z_scores=numerator/denominator
        abs_z=z_scores.abs()

        return abs_z[abs_z>threshold_val]
        

In [11]:
# Tester with the default configuration (null-hypothesis mean difference of 0).
htest=Htest()

In [12]:
# Class-1 rows only, split by gender code; the identifier and label columns are
# dropped so that only feature columns remain.
df1=df.loc[(df['gender']==0) & (df['class']==1)].drop(columns=['gender','id','class'])
df2=df.loc[(df['gender']==1) & (df['class']==1)].drop(columns=['gender','id','class'])

In [13]:
# Features whose |z| exceeds the default 1.96 threshold between the two samples built above.
fp=htest.z_test(df1,df2)
fp.shape

(332,)

In [32]:
# Keep the columns whose sample skewness lies in [-1, 1].
# - numeric_only=True skips the categorical 'gender'/'class' columns explicitly
#   instead of relying on pandas dropping them silently.
# - between() is inclusive on both ends by default; the boolean `inclusive=True`
#   spelling is deprecated in newer pandas releases.
# - skew() is computed once instead of twice, and the boolean mask replaces the manual loop.
feature_skew = df.skew(numeric_only=True)
normal_features = feature_skew[feature_skew.between(-1, 1)].index.tolist()

In [33]:
# Re-attach the grouping columns required by the later gender/class filters.
# The membership check keeps the list free of duplicates when this cell is executed
# more than once (duplicate column names would break the boolean filters downstream).
for label_col in ('gender', 'class'):
    if label_col not in normal_features:
        normal_features.append(label_col)

In [34]:
# Restrict the frame to the columns listed in normal_features (low-skew columns plus 'gender' and 'class').
dfn=df.loc[:,normal_features]

In [17]:
dfn.columns.to_list()

['id',
 'DFA',
 'RPDE',
 'numPulses',
 'numPeriodsPulses',
 'meanPeriodPulses',
 'meanHarmToNoiseHarmonicity',
 'f1',
 'f2',
 'f3',
 'f4',
 'GNE_SNR_SEO',
 'GNE_NSR_SEO',
 'VFER_NSR_TKEO',
 'VFER_NSR_SEO',
 'IMF_NSR_SEO',
 'IMF_NSR_entropy',
 'mean_MFCC_0th_coef',
 'mean_MFCC_1st_coef',
 'mean_MFCC_2nd_coef',
 'mean_MFCC_3rd_coef',
 'mean_MFCC_4th_coef',
 'mean_MFCC_5th_coef',
 'mean_MFCC_6th_coef',
 'mean_MFCC_7th_coef',
 'mean_MFCC_8th_coef',
 'mean_MFCC_9th_coef',
 'mean_MFCC_10th_coef',
 'mean_MFCC_11th_coef',
 'mean_MFCC_12th_coef',
 'mean_1st_delta',
 'mean_2nd_delta',
 'mean_3rd_delta',
 'mean_4th_delta',
 'mean_5th_delta',
 'mean_7th_delta',
 'mean_8th_delta',
 'mean_9th_delta',
 'mean_10th_delta',
 'mean_11th_delta',
 'mean_12th_delta',
 'mean_1st_delta_delta',
 'mean_2nd_delta_delta',
 'mean_3rd_delta_delta',
 'mean_4th_delta_delta',
 'mean_5th_delta_delta',
 'mean_6th_delta_delta',
 'mean_8th_delta_delta',
 'mean_9th_delta_delta',
 'mean_10th_delta_delta',
 'mean_11th_delta_

In [35]:
# Same gender split of the class-1 rows as before, now restricted to the low-skew
# columns of dfn; identifier and label columns are removed.
df1=dfn.loc[(dfn['gender']==0) & (dfn['class']==1)].drop(columns=['gender','id','class'])
df2=dfn.loc[(dfn['gender']==1) & (dfn['class']==1)].drop(columns=['gender','id','class'])

In [37]:
# |z| of the features that exceed the default threshold between the two gender samples built from dfn.
fg=htest.z_test(df1,df2)

In [38]:
fg.shape

(95,)

In [39]:
fg.sort_values(ascending=False).head(50)

0
app_LT_entropy_shannon_3_coef     16.999996
app_LT_entropy_shannon_2_coef     16.946003
app_LT_entropy_shannon_4_coef     16.943117
app_LT_entropy_shannon_1_coef     16.861116
app_LT_TKEO_std_9_coef            16.783765
app_entropy_log_6_coef            16.770024
app_LT_entropy_shannon_5_coef     16.751841
app_entropy_log_7_coef            16.714299
app_LT_TKEO_std_7_coef            16.644014
app_entropy_log_5_coef            16.588256
app_LT_TKEO_std_10_coef           16.579911
app_LT_entropy_shannon_6_coef     16.502967
app_LT_TKEO_mean_10_coef          16.479578
app_LT_TKEO_mean_9_coef           16.449697
app_LT_TKEO_mean_8_coef           16.435172
app_entropy_log_10_coef           16.417442
app_entropy_log_9_coef            16.395183
app_entropy_log_8_coef            16.385684
app_LT_entropy_shannon_7_coef     16.377921
app_LT_TKEO_std_8_coef            16.335716
app_LT_TKEO_std_4_coef            16.202648
app_LT_entropy_shannon_8_coef     16.167823
app_LT_entropy_shannon_9_coef 

In [22]:
# Split dfn by class label (0 vs 1) regardless of gender; identifier and label
# columns are removed so only feature columns are compared.
df1=dfn.loc[dfn['class']==0].drop(columns=['gender','id','class'])
df2=dfn.loc[dfn['class']==1].drop(columns=['gender','id','class'])

In [23]:
# Rebinds fp (used earlier for a gender comparison) to the class-0 vs class-1 comparison on dfn.
fp=htest.z_test(df1,df2)

In [24]:
fp

0
DFA                          8.891309
RPDE                         7.028783
numPulses                    7.070892
numPeriodsPulses             7.090547
meanPeriodPulses             5.505576
                               ...   
tqwt_maxValue_dec_16         7.571182
tqwt_maxValue_dec_17         5.462049
tqwt_maxValue_dec_18         2.830251
tqwt_skewnessValue_dec_28    2.121591
tqwt_kurtosisValue_dec_36    9.697527
Length: 128, dtype: float64

In [25]:
fp.shape

(128,)

In [26]:
fp.sort_values(ascending=False).head(50)

0
std_8th_delta_delta            13.809746
std_9th_delta_delta            13.725554
std_6th_delta_delta            13.718478
std_10th_delta_delta           12.576544
std_11th_delta_delta           12.426792
tqwt_entropy_log_dec_12        12.269921
std_8th_delta                  12.243678
std_9th_delta                  12.036063
std_10th_delta                 11.203688
std_11th_delta                 11.147667
tqwt_entropy_log_dec_13        11.130468
mean_MFCC_2nd_coef             10.782389
tqwt_entropy_log_dec_11        10.641208
std_12th_delta_delta           10.197717
tqwt_entropy_log_dec_14         9.989655
tqwt_entropy_log_dec_8          9.854034
tqwt_kurtosisValue_dec_36       9.697527
tqwt_entropy_log_dec_9          9.381163
tqwt_entropy_shannon_dec_17     9.331510
std_12th_delta                  9.222778
tqwt_entropy_shannon_dec_16     9.192519
tqwt_entropy_log_dec_10         9.106833
DFA                             8.891309
tqwt_stdValue_dec_16            8.839733
f1            

In [50]:
# Features flagged by both tests: present in fp.index and in fg.index.
# The original expression A - (A - B) is simply the intersection of the two index sets;
# walking fp.index in order (instead of iterating a set) keeps the resulting column
# order identical across kernel restarts, and dict.fromkeys removes any repeated name.
gender_flagged = set(fg.index)
ftrs=list(dict.fromkeys(name for name in fp.index if name in gender_flagged))
# The label goes last so it can be split off positionally afterwards.
ftrs.append('class')

In [53]:
# Materialise the selected columns as one float matrix. 'class' is categorical, so the
# explicit cast guarantees a numeric array instead of depending on how pandas
# combines float and categorical columns.
train_df=df[ftrs].astype(float).to_numpy()
# ftrs ends with 'class': every column before the last is a feature, the last one is the label.
x_train=train_df[:,:-1]
y_train=train_df[:,-1]

In [75]:
# Persist the selected feature columns together with 'class' for reuse outside the notebook.
df[ftrs].to_csv('Commonly_important_features.csv')

In [55]:
from sklearn.preprocessing import StandardScaler
# Standardise the feature matrix with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the full matrix before any cross-validation split,
# so the held-out folds influence the scaling statistics — consider fitting it inside a Pipeline.
sc = StandardScaler()
x_train= sc.fit_transform(x_train)
# 10-fold splitter with shuffling; the fixed random_state keeps the folds reproducible.
kfold=KFold(10, shuffle=True, random_state=10)

In [63]:
def svm_model(X,y, kfold, param_grid=None):
    """
    Grid-search an SVM classifier with cross-validation.

    X: feature matrix handed to GridSearchCV.fit
    y: target labels handed to GridSearchCV.fit
    kfold: value passed as GridSearchCV's cv argument (e.g. a KFold splitter)
    param_grid: optional hyper-parameter grid; when None the built-in RBF grid
                over C and gamma is used, which matches the previous behaviour

    Returns the fitted GridSearchCV object (created with refit=True).
    """
    if param_grid is None:
        param_grid={'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001], 'kernel':['rbf']}

    # The estimator is created with probability=True and a fixed random_state so that
    # repeated searches are configured identically.
    grid=GridSearchCV(svm.SVC(probability=True, random_state=10),param_grid,cv=kfold, refit=True, verbose=1)
    grid.fit(X,y)

    return grid

In [64]:
# Run the cross-validated grid search on the standardised features with the 10-fold splitter.
grid=svm_model(x_train, y_train,kfold)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


In [65]:
grid.best_params_

{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}

In [66]:
grid.best_score_

0.9141052631578948

In [67]:
grid.cv_results_

{'mean_fit_time': array([0.20830464, 0.20314698, 0.11563711, 0.10626116, 0.1031348 ,
        0.21095779, 0.1984555 , 0.09844625, 0.1015727 , 0.1078207 ,
        0.21896589, 0.25103323, 0.10922921, 0.11635201, 0.13787088,
        0.23181236, 0.2135752 , 0.12069716, 0.12507443, 0.11604738,
        0.2109273 , 0.20914862, 0.11962235, 0.22949555, 0.12501106]),
 'std_fit_time': array([9.84573367e-03, 1.72947332e-05, 7.65474322e-03, 6.25122841e-03,
        7.65670467e-03, 7.81462799e-03, 7.16347553e-03, 7.16172633e-03,
        7.81303784e-03, 4.68701091e-03, 9.25242322e-03, 5.18831685e-02,
        6.52593006e-03, 2.22776967e-02, 2.69631598e-02, 2.29811953e-02,
        1.48556684e-02, 9.85524804e-03, 1.22481404e-02, 1.44933759e-02,
        8.48630493e-03, 7.24717574e-03, 1.68464397e-03, 7.85288915e-03,
        6.23686442e-03]),
 'mean_score_time': array([0.00882745, 0.        , 0.        , 0.00468826, 0.00468888,
        0.0109405 , 0.00937915, 0.00937657, 0.00781391, 0.00468795,
        0.00

In [68]:
# Repeat the cross-validation with the tuned hyper-parameters, print a classification
# report for every fold and record which samples of the LAST fold were misclassified.
wrong_pred_indices=[]

# Take the tuned values from the grid search instead of copying them by hand, so this
# cell cannot drift out of sync with the search result.
svm_estimator=svm.SVC(**grid.best_params_, probability=True, random_state=10)
# Index of the final fold, derived from the splitter rather than hard-coded.
last_fold=kfold.get_n_splits()-1

# A dedicated fold counter is used: the previous version reused `i` for both the fold
# counter and the inner sample loop, which overwrote the counter.
for fold_no, (trids, tstids) in enumerate(kfold.split(x_train)):

    trainx, testx= x_train[trids,:], x_train[tstids,:]
    trainy, testy= y_train[trids], y_train[tstids]

    svm_estimator.fit(trainx, trainy)

    # Predict once and reuse the result for both the report and the error bookkeeping.
    preds=svm_estimator.predict(testx)

    print(classification_report(testy, preds))

    if fold_no==last_fold:
        # Row positions (within x_train) of the misclassified test samples.
        wrong_pred_indices.extend(tstids[testy!=preds].tolist())

              precision    recall  f1-score   support

         0.0       0.83      0.71      0.77        14
         1.0       0.94      0.97      0.95        62

    accuracy                           0.92        76
   macro avg       0.89      0.84      0.86        76
weighted avg       0.92      0.92      0.92        76

              precision    recall  f1-score   support

         0.0       0.81      0.65      0.72        20
         1.0       0.88      0.95      0.91        56

    accuracy                           0.87        76
   macro avg       0.85      0.80      0.82        76
weighted avg       0.86      0.87      0.86        76

              precision    recall  f1-score   support

         0.0       0.79      0.61      0.69        18
         1.0       0.89      0.95      0.92        58

    accuracy                           0.87        76
   macro avg       0.84      0.78      0.80        76
weighted avg       0.86      0.87      0.86        76

              preci

In [72]:
wrong_pred_indices

[8, 44, 73, 420, 545]

Average F1 score over the 10 folds: 0.88

In [71]:
# Libraries used for explaining the model's predictions.
import lime
import lime.lime_tabular
import eli5
# NOTE(review): 'shape' looks like a typo for the SHAP explainability package ('shap') — confirm.
import shape

In [None]:
# LIME tabular explainer built on the standardised training matrix.
# Feature names are the selected columns without the trailing 'class' label, i.e. in
# the same order as the columns of x_train. The previously referenced name `features`
# is not defined in the visible notebook cells and would raise a NameError.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(x_train, feature_names=ftrs[:-1],
                                                  class_names=[0,1], verbose=True, mode='classification')