In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
from pycm import ConfusionMatrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the feature table from disk and preview its first five rows.
# NOTE(review): the cells below slice columns by position (first 13 columns,
# last column = label). If the CSV was written together with its index, an
# extra leading 'Unnamed: 0' column would shift those slices by one -- confirm
# against data.columns.
data = pd.read_csv(filepath_or_buffer='Standardized_Features.csv')
data.head(n=5)

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,chroma_stft,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,label
0,0.282728,-0.474545,-1.43162,-0.83859,-1.296439,-1.893429,-1.771372,-0.975644,-1.69305,-0.905051,-2.385454,-0.195656,-1.8979,-0.203286,0.483809,-0.393816,0.077136,0.846385,1
1,1.193171,-1.450861,0.000711,-0.790228,1.279287,0.75565,0.901923,1.356368,2.278842,1.350389,1.535524,1.345042,1.197492,1.985103,1.772678,1.661541,1.866492,0.415296,0
2,-1.034973,0.627078,0.19952,0.998984,-0.784622,1.528109,-0.577994,1.107375,-0.706535,0.982741,-0.397005,-0.912998,0.486108,-1.381268,-0.979375,-1.090882,-1.125755,-1.22606,0
3,1.384008,0.560417,0.667361,-0.254236,0.610491,0.620806,0.811242,0.099781,-0.245145,1.018141,-0.216337,2.221808,0.605855,0.687013,-0.664155,-0.353625,-0.733579,-0.766253,1
4,0.934085,2.136934,-2.251532,0.690264,-1.07694,-0.852905,0.00193,-0.603969,0.854139,0.59172,0.474665,-1.057779,0.405189,-1.209908,-1.625381,-1.950325,-1.546935,-1.381914,0


In [3]:
# Display names for the candidate classifiers, in the same order as `models`.
labels = [
    'Logistic Regression',
    'Decision Tree (Gini)',
    'Decision Tree (Entropy)',
    'SVM Classifier',
    'Random Forest',
    'Perceptron',
    'MLP Classifier',
]

# Fixed seed for the estimators whose fitting is stochastic, so that the
# cross-validation scores and the final saved model are reproducible across
# kernel restarts. Same value as the random_state used for train_test_split.
SEED = 42

clf1 = LogisticRegression()
clf2 = DecisionTreeClassifier(criterion="gini", random_state=SEED)
clf3 = DecisionTreeClassifier(criterion="entropy", random_state=SEED)
clf4 = SVC()
clf5 = RandomForestClassifier(random_state=SEED)
clf6 = Perceptron()
clf7 = MLPClassifier(random_state=SEED)

# Estimator instances, positionally aligned with `labels`.
models = [clf1, clf2, clf3, clf4, clf5, clf6, clf7]

def train(X, y=None, cv=10):
    """Cross-validate every classifier in `models` on X and print its accuracy.

    For each (estimator, label) pair from `models` / `labels`, runs
    `cross_val_score` with accuracy scoring and prints the mean and the
    standard deviation of the fold scores.

    Parameters
    ----------
    X : feature table passed straight to `cross_val_score`.
    y : target vector; when omitted, the current `data['label']` column is used.
    cv : cross-validation setting forwarded to `cross_val_score` (default 10).

    Returns
    -------
    None -- results are only printed.
    """
    if y is None:
        # Look the labels up at call time. A default of `data['label']` in the
        # signature would be evaluated once, when the function is defined, and
        # would go stale if `data` is reloaded without re-running this cell.
        y = data['label']

    for clf, label in zip(models, labels):
        scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]"
              % (scores.mean(), scores.std(), label))

In [4]:
# Evaluate every model on the first 13 columns only, which this notebook
# treats as the MFCC features.
train(data.iloc[:, 0:13])

Accuracy: 0.62 (+/- 0.01) [Logistic Regression]
Accuracy: 0.71 (+/- 0.01) [Decision Tree (Gini)]
Accuracy: 0.72 (+/- 0.01) [Decision Tree (Entropy)]
Accuracy: 0.78 (+/- 0.01) [SVM Classifier]
Accuracy: 0.79 (+/- 0.01) [Random Forest]
Accuracy: 0.53 (+/- 0.03) [Perceptron]
Accuracy: 0.78 (+/- 0.01) [MLP Classifier]


In [5]:
# Evaluate every model on the remaining feature columns: everything from
# position 13 up to, but not including, the last column.
train(data.iloc[:, 13:data.shape[1] - 1])

Accuracy: 0.57 (+/- 0.01) [Logistic Regression]
Accuracy: 0.57 (+/- 0.01) [Decision Tree (Gini)]
Accuracy: 0.58 (+/- 0.01) [Decision Tree (Entropy)]
Accuracy: 0.59 (+/- 0.01) [SVM Classifier]
Accuracy: 0.61 (+/- 0.01) [Random Forest]
Accuracy: 0.51 (+/- 0.02) [Perceptron]
Accuracy: 0.61 (+/- 0.01) [MLP Classifier]


In [6]:
# Evaluate every model on all columns except the last one.
train(data.iloc[:, 0:data.shape[1] - 1])

Accuracy: 0.62 (+/- 0.01) [Logistic Regression]
Accuracy: 0.72 (+/- 0.01) [Decision Tree (Gini)]
Accuracy: 0.73 (+/- 0.01) [Decision Tree (Entropy)]
Accuracy: 0.78 (+/- 0.01) [SVM Classifier]
Accuracy: 0.79 (+/- 0.01) [Random Forest]
Accuracy: 0.54 (+/- 0.02) [Perceptron]
Accuracy: 0.80 (+/- 0.00) [MLP Classifier]


In [7]:
# From the results above, SVC, Random Forest and the MLP Classifier give the best accuracy.
# Each of them scores highest (or ties its best score) when trained on all features.

In [8]:
def cal_accuracy(y_test, y_pred):
    """Print evaluation metrics for a set of predictions.

    Prints, in order: the accuracy score scaled to a percentage, the
    scikit-learn classification report, and the pycm ConfusionMatrix built
    from the true and predicted labels.

    Parameters
    ----------
    y_test : iterable of true labels.
    y_pred : iterable of predicted labels, aligned with y_test.
    """
    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    print("Accuracy : ", accuracy_pct)

    report = classification_report(y_test, y_pred)
    print("Report : ", report)

    # pycm takes plain Python lists for the two label vectors.
    matrix = ConfusionMatrix(actual_vector=list(y_test),
                             predict_vector=list(y_pred))
    print(matrix)

In [14]:
# MLP
# Hold out 10% of the rows as a test set (fixed random_state), fit the MLP on
# the remaining rows and report its metrics on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1],
    data['label'],
    test_size=0.10,
    random_state=42,
)
y_pred = clf7.fit(X_train, y_train).predict(X_test)
cal_accuracy(y_test, y_pred)

Accuracy :  79.33836331979106
Report :                precision    recall  f1-score   support

           0       0.79      0.85      0.82       938
           1       0.80      0.73      0.76       785

    accuracy                           0.79      1723
   macro avg       0.79      0.79      0.79      1723
weighted avg       0.79      0.79      0.79      1723

Predict   0         1         
Actual
0         794       144       

1         212       573       





Overall Statistics : 

95% CI                                                            (0.77427,0.8125)
ACC Macro                                                         0.79338
ARI                                                               0.34378
AUNP                                                              0.78821
AUNU                                                              0.78821
Bennett S                                                         0.58677
CBA                                                

In [13]:
# Random Forest
# Fit on the train split created in the MLP cell and report metrics on the
# matching held-out split.
y_pred = clf5.fit(X_train, y_train).predict(X_test)
cal_accuracy(y_test, y_pred)

Accuracy :  78.11955890887987
Report :                precision    recall  f1-score   support

           0       0.76      0.87      0.81       938
           1       0.82      0.67      0.74       785

    accuracy                           0.78      1723
   macro avg       0.79      0.77      0.77      1723
weighted avg       0.79      0.78      0.78      1723

Predict   0         1         
Actual
0         819       119       

1         258       527       





Overall Statistics : 

95% CI                                                            (0.76167,0.80072)
ACC Macro                                                         0.7812
ARI                                                               0.31558
AUNP                                                              0.77224
AUNU                                                              0.77224
Bennett S                                                         0.56239
CBA                                                

In [11]:
# SVC
# Fit on the train split created in the MLP cell and report metrics on the
# matching held-out split.
y_pred = clf4.fit(X_train, y_train).predict(X_test)
cal_accuracy(y_test, y_pred)

Accuracy :  77.77132907719094
Report :                precision    recall  f1-score   support

           0       0.77      0.84      0.80       938
           1       0.79      0.70      0.74       785

    accuracy                           0.78      1723
   macro avg       0.78      0.77      0.77      1723
weighted avg       0.78      0.78      0.78      1723

Predict   0         1         
Actual
0         788       150       

1         233       552       





Overall Statistics : 

95% CI                                                            (0.75808,0.79735)
ACC Macro                                                         0.77771
ARI                                                               0.30793
AUNP                                                              0.77164
AUNU                                                              0.77164
Bennett S                                                         0.55543
CBA                                               

In [15]:
# save the model to disk
filename = 'final_AD_Model.sav'
# Use a context manager so the file is flushed and closed before it is read
# back below.
with open(filename, 'wb') as model_file:
    pickle.dump(clf7, model_file)

# some time later...

# load the model from disk
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# model files that you created yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Sanity check: score the reloaded model on the held-out split.
result = loaded_model.score(X_test, y_test)
print(result)

0.7933836331979106


# Observations

* It is known that Feature Scaling improves the performance of:
    1. Gradient Descent Based Algorithms (such as linear regression, logistic regression, and neural networks)
    2. Distance-Based Algorithms (such as KNN, K-means, and SVM)
    
* It is also known that Tree-Based Algorithms (such as Decision Tree and Random Forest) are fairly insensitive/invariant to the scale of the features.
    
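* Let's see whether these two properties hold in our case. The tables below list the accuracy (in %) of each model on the original, normalized and standardized features.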

## Only MFCCs

|              | Logistic Regression | SVM Classifier | Perceptron | MLP Classifier |
|--------------|---------------------|----------------|------------|----------------|
| Original     | 62                  | 64             | 56         | 69             |
| Normalized   | 55                  | 55             | 51         | 55             |
| Standardized | 62                  | 78             | 53         | 78             |

|              | Decision Tree (Gini) | Decision Tree (Entropy) | Random Forest |
|--------------|----------------------|-------------------------|---------------|
| Original     | 72                   | 72                      | 79            |
| Normalized   | 51                   | 51                      | 52            |
| Standardized | 71                   | 72                      | 79            |

## Other 5 Features

|              | Logistic Regression | SVM Classifier | Perceptron | MLP Classifier |
|--------------|---------------------|----------------|------------|----------------|
| Original     | 55                  | 55             | 52         | 54             |
| Normalized   | 55                  | 55             | 52         | 55             |
| Standardized | 57                  | 59             | 51         | 61             |

|              | Decision Tree (Gini) | Decision Tree (Entropy) | Random Forest |
|--------------|----------------------|-------------------------|---------------|
| Original     | 58                   | 58                      | 61            |
| Normalized   | 50                   | 51                      | 52            |
| Standardized | 57                   | 58                      | 61            |

## All Features

|              | Logistic Regression | SVM Classifier | Perceptron | MLP Classifier |
|--------------|---------------------|----------------|------------|----------------|
| Original     | 61                  | 55             | 53         | 55             |
| Normalized   | 55                  | 55             | 52         | 55             |
| Standardized | 62                  | 78             | 54         | 80             |

|              | Decision Tree (Gini) | Decision Tree (Entropy) | Random Forest |
|--------------|----------------------|-------------------------|---------------|
| Original     | 72                   | 73                      | 79            |
| Normalized   | 51                   | 51                      | 52            |
| Standardized | 72                   | 73                      | 79            |

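* It is evident from the tables above that ML models, when trained on:
1. scaled (specifically, standardized) features, generally perform better than when trained on unscaled features (for Gradient Descent Based and Distance-Based algorithms). The gain is large for the SVM and MLP classifiers (e.g. 55 → 78 and 55 → 80 on all features), small for Logistic Regression, and absent for the Perceptron. Normalized features, in contrast, never beat the original ones.
2. unscaled features, perform equally (within one percentage point) to when trained on standardized features, and much better than when trained on normalized features (for Tree-Based Algorithms)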