### MDVR-KCL experiments using the segmented WAV files (13 MFCCs + acoustic features)

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import auc
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import seaborn as sns
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2, f_classif

In [2]:
# Read the feature table and discard the voiceID column in a single expression.
# Alternative input file: "MDVR_all_features_chunk_5000ms_v2.csv"
df = pd.read_csv("MDVR_split_on_silence_500ms_all_features_v2.csv").drop(columns=["voiceID"])

In [3]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

In [4]:
# The last column is used as the target; every preceding column is a predictor.
df_Y = df.iloc[:, -1]
df_X = df.iloc[:, :-1]
X = df.iloc[:, :-1]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_Y,
    test_size=0.3,
    random_state=0,
)

In [5]:
# Learn the min/max statistics from the training split only, then apply the
# same mapping to both splits so no test information reaches the scaler.
sc = MinMaxScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

#### K-Fold Evaluation

In [12]:
def kfold_eval(alg, n_splits=5, n_repeats=10, random_state=None):
    """Run repeated shuffled k-fold cross-validation for one classifier and print mean metrics.

    Reads the module-level ``df``: every column except the last is used as a
    feature and the last column as the class label, which must take exactly
    two distinct values. The larger label in sorted order is treated as the
    positive class.

    For each repetition the rows are shuffled into ``n_splits`` folds. Inside
    every fold a ``MinMaxScaler`` is fitted on the training part only, the
    selected model is trained, and accuracy, sensitivity, specificity,
    precision and F1 (all in percent) are derived from the confusion matrix.
    The fold scores are averaged per repetition, and the mean over all
    repetitions is printed.

    Parameters
    ----------
    alg : str
        Model key: one of "LR", "GB", "KNN", "SVM", "DT", "NB", "RF".
    n_splits : int, default 5
        Number of folds per repetition.
    n_repeats : int, default 10
        Number of times the whole k-fold procedure is repeated.
    random_state : int or None, default None
        Seed for the fold shuffling and for the randomised models (GB, DT, RF).
        ``None`` keeps the original unseeded behaviour.

    Raises
    ------
    ValueError
        If ``alg`` is not a known key, ``n_repeats`` is smaller than 1, or the
        label column does not contain exactly two classes.
    """
    # One shared generator: every split() call and every model draws from it in
    # sequence, so repetitions differ from each other yet the full run is
    # reproducible when a seed is supplied.
    rng = None if random_state is None else np.random.RandomState(random_state)

    model_factories = {
        "LR": lambda: LogisticRegression(max_iter=3000),
        "GB": lambda: GradientBoostingClassifier(
            learning_rate=0.5, max_depth=1, n_estimators=3, random_state=rng
        ),
        "KNN": lambda: KNeighborsClassifier(
            n_neighbors=6, p=1, weights="distance", leaf_size=1, algorithm="auto"
        ),
        "SVM": lambda: svm.SVC(C=100, gamma=1, kernel="rbf"),
        "DT": lambda: tree.DecisionTreeClassifier(random_state=rng),
        "NB": lambda: GaussianNB(),
        # max_features="auto" is rejected by scikit-learn >= 1.3; "sqrt" is what
        # "auto" meant for classifiers, so the model itself is unchanged.
        "RF": lambda: RandomForestClassifier(
            max_features="sqrt",
            min_samples_leaf=1,
            min_samples_split=2,
            n_estimators=100,
            random_state=rng,
        ),
    }
    if alg not in model_factories:
        raise ValueError(
            "Unknown algorithm %r; expected one of %s" % (alg, sorted(model_factories))
        )
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1, got %r" % (n_repeats,))

    features = df.iloc[:, :-1].values
    targets = df.iloc[:, -1].values

    # Fixing the label order guarantees a 2x2 confusion matrix even when a fold
    # happens to contain (or predict) only one of the classes.
    classes = np.unique(targets)
    if len(classes) != 2:
        raise ValueError(
            "kfold_eval needs exactly two classes in the label column, found %d"
            % len(classes)
        )

    def pct(numerator, denominator):
        # A zero denominator (e.g. no positive predictions in a fold) is scored
        # as 0 rather than NaN so a single degenerate fold cannot turn the
        # reported averages into NaN.
        return 100.0 * numerator / denominator if denominator else 0.0

    metric_names = ("accuracy", "sensitivity", "specificity", "precision", "f1")
    repeat_means = {name: [] for name in metric_names}

    kfold = KFold(n_splits, shuffle=True, random_state=rng)

    for _ in range(n_repeats):
        fold_scores = {name: [] for name in metric_names}

        for train_idx, test_idx in kfold.split(features):
            # Fit the scaler on the training fold only to avoid leaking
            # test-fold statistics into the model.
            scaler = MinMaxScaler()
            fold_train = scaler.fit_transform(features[train_idx])
            fold_test = scaler.transform(features[test_idx])

            model = model_factories[alg]()
            model.fit(fold_train, targets[train_idx])
            y_pred = model.predict(fold_test)

            TN, FP, FN, TP = confusion_matrix(
                targets[test_idx], y_pred, labels=classes
            ).ravel()

            sensitivity = pct(TP, TP + FN)  # recall
            precision = pct(TP, TP + FP)
            fold_scores["accuracy"].append(pct(TP + TN, TP + TN + FP + FN))
            fold_scores["sensitivity"].append(sensitivity)
            fold_scores["specificity"].append(pct(TN, TN + FP))
            fold_scores["precision"].append(precision)
            # Both inputs are already percentages, so the harmonic mean is one too.
            fold_scores["f1"].append(
                2 * sensitivity * precision / (sensitivity + precision)
                if (sensitivity + precision)
                else 0.0
            )

        # Mean over the folds of this repetition, computed once per repetition.
        for name in metric_names:
            repeat_means[name].append(np.mean(fold_scores[name]))

    print("\n==================================================\n")
    print(alg)
    # Each printed value is the mean, over the repetitions, of the per-repetition fold means.
    print("Average accuracy: ", np.mean(repeat_means["accuracy"]), "\n")
    print("Average sensitivity: ", np.mean(repeat_means["sensitivity"]), "\n")
    print("Average specificity: ", np.mean(repeat_means["specificity"]), "\n")
    print("Average precision: ", np.mean(repeat_means["precision"]), "\n")
    print("Average f1 score: ", np.mean(repeat_means["f1"]), "\n")
    
       
    
        
    

In [13]:

# Evaluate every model in turn, printing a blank separator between reports.
for run_index, alg_name in enumerate(("KNN", "DT", "SVM", "NB", "LR", "GB", "RF")):
    if run_index > 0:
        print("\n")
    kfold_eval(alg_name)








KNN
Average accuracy:  90.13552188552188 

Average sensitivity:  87.59613958614106 

Average specificity:  92.02908390546578 

Average precision:  88.99450768965309 

Average f1 score:  88.25490700148063 





DT
Average accuracy:  78.83858858858859 

Average sensitivity:  74.80866677777696 

Average specificity:  81.85356641689835 

Average precision:  75.29584417718159 

Average f1 score:  74.93129265270099 





SVM
Average accuracy:  91.2292064792065 

Average sensitivity:  89.57820607386451 

Average specificity:  92.47851094916084 

Average precision:  89.764576835457 

Average f1 score:  89.62870412497617 





NB
Average accuracy:  73.67890617890619 

Average sensitivity:  65.79865668049877 

Average specificity:  79.47283477129622 

Average precision:  70.21947051697559 

Average f1 score:  67.87412185025751 





LR
Average accuracy:  79.07502957502956 

Average sensitivity:  69.57808448589105 

Average specificity:  86.08859621601718 

Average precision:  78.59863492233748

In [None]:
# data = {'ML Alg':['KNN', 'SVM', 'Decision Tree', 'Naive Bayes', 'Random Forest'],
#         'Accuracy':[accuracy_knn, accuracy_svm, accuracy_dt, accuracy_nb, accuracy_rf]}
  
# # Create DataFrame
# results = pd.DataFrame(data)
# results