# Data Preprocessing and Modelling

In this notebook, we will focus on extracting MFCC features and modelling.

In [26]:
#Imports

import librosa #library to extract MFCC features
import pandas as pd
import numpy as np
import os
import pickle

#Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [27]:
def features_extractor(file_name):
    audio, sample_rate = librosa.load(file_name)
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=16)
    
    return np.mean(mfccs_features.T,axis=0) #reshape it for the models

In [28]:
features = []
labels = []

Get features for non-snore data.

In [29]:
for file in os.listdir('data/0'):
    features.append(features_extractor(os.path.join('data','0',file)))
    labels.append(0)

Get features for snore data.

In [30]:
for file in os.listdir('data/1'):
    features.append(features_extractor(os.path.join('data','1',file)))
    labels.append(1)

In [31]:
X = np.array(features)
y = np.array(labels)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

# Logistic Regression

In [33]:
lg = LogisticRegression(max_iter=10_000)

In [34]:
grid = GridSearchCV(lg, scoring = 'accuracy', cv=10,param_grid={})

In [35]:
grid.fit(X_train,y_train)

In [36]:
grid.best_score_

0.7613333333333334

In [37]:
grid.score(X_test,y_test)

0.712

We can see that there is a slight overfit.

We will attempt to improve the accuracy first by increasing the number of features by increasing the number of MFCCs.

In [38]:
def mfccs_extractor(n_mfccs):
    def features_extractor(file_name,n_mfccs):
        audio, sample_rate = librosa.load(file_name)
        mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfccs)

        return np.mean(mfccs_features.T,axis=0)
    features = []
    labels = []
    for file in os.listdir('data/0'):
        features.append(features_extractor(os.path.join('data','0',file),n_mfccs))
        labels.append(0)
    for file in os.listdir('data/1'):
        features.append(features_extractor(os.path.join('data','1',file),n_mfccs))
        labels.append(1)
    X = np.array(features)
    y = np.array(labels)
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)
    return X,y,X_train, X_test, y_train, y_test

In [39]:
X, y, X_train, X_test, y_train, y_test = mfccs_extractor(32)

In [40]:
grid.fit(X_train,y_train)

In [41]:
grid.best_score_

0.884

In [42]:
grid.score(X_test,y_test)

0.904

We see that by increasing the number of MFCCs, we can improve our score. Let's see if we can improve it somemore without overfitting.

In [43]:
X, y, X_train, X_test, y_train, y_test = mfccs_extractor(64)

In [44]:
grid.fit(X_train,y_train)

In [45]:
grid.best_score_

0.9453333333333334

In [46]:
grid.score(X_test,y_test)

0.936

In [47]:
X ,y ,X_train, X_test, y_train, y_test = mfccs_extractor(128)

In [48]:
grid.fit(X_train,y_train)
grid.best_score_

0.9480000000000001

In [49]:
grid.score(X_test,y_test)

0.936

Increasing for 64 features to 128 features barely improved the model. So we will stick with 64 features.

We will save our best linear regression model for further testing later.

In [50]:
X, y, X_train, X_test, y_train, y_test = mfccs_extractor(64)

In [51]:
grid.fit(X_train,y_train)

In [52]:
best_lg = grid.best_estimator_

In [57]:
with open('./models/logistic_regression.pkl','wb') as f:
    pickle.dump(best_lg, f)

# Testing out other classification models

## Setting up 64 features

In [58]:
X, y, X_train, X_test, y_train, y_test = mfccs_extractor(64)

## Naive Bayes

In [59]:
nb = GaussianNB()

In [60]:
grid = GridSearchCV(nb, scoring = 'accuracy', cv=10,param_grid={})

In [61]:
grid.fit(X_train,y_train)
grid.best_score_

0.7906666666666667

In [62]:
grid.score(X_test,y_test)

0.772

In [63]:
best_nb = grid.best_estimator_

In [64]:
with open('./models/naive_bayes.pkl','wb') as f:
    pickle.dump(best_nb, f)

## K-Nearest Neighbours

In [65]:
knn = KNeighborsClassifier()

In [66]:
grid = GridSearchCV(knn, scoring = 'accuracy', cv=10,param_grid={'n_neighbors':[5,10,15]})

In [67]:
grid.get_params()

{'cv': 10,
 'error_score': nan,
 'estimator__algorithm': 'auto',
 'estimator__leaf_size': 30,
 'estimator__metric': 'minkowski',
 'estimator__metric_params': None,
 'estimator__n_jobs': None,
 'estimator__n_neighbors': 5,
 'estimator__p': 2,
 'estimator__weights': 'uniform',
 'estimator': KNeighborsClassifier(),
 'n_jobs': None,
 'param_grid': {'n_neighbors': [5, 10, 15]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': 'accuracy',
 'verbose': 0}

In [68]:
grid.fit(X_train,y_train)
grid.best_score_

0.9560000000000002

In [69]:
grid.score(X_test,y_test)

0.94

In [70]:
grid.best_params_

{'n_neighbors': 5}

In [71]:
best_knn = grid.best_estimator_

In [72]:
with open('./models/knn.pkl','wb') as f:
    pickle.dump(best_knn, f)

## Random Forest

In [73]:
rfm = RandomForestClassifier()

In [74]:
grid = GridSearchCV(rfm, scoring = 'accuracy', cv=10,param_grid={'n_estimators':[1,10,20,50,70,100,1000,1500,2000],'n_jobs':[-1]})

In [75]:
grid.fit(X_train,y_train)
grid.best_score_

0.984

In [76]:
grid.score(X_test,y_test)

0.988

In [77]:
grid.best_params_

{'n_estimators': 1500, 'n_jobs': -1}

In [78]:
best_rfm = grid.best_estimator_

In [79]:
with open('./models/rfm.pkl','wb') as f:
    pickle.dump(best_rfm, f)

## Support Vector Machine

In [80]:
svm = SVC()

In [81]:
grid = GridSearchCV(svm, scoring = 'accuracy', cv=10,param_grid={'kernel':['rbf','linear'],'C':[1,0.1,0.025]})

In [82]:
grid.fit(X_train,y_train)
grid.best_score_

0.9640000000000001

In [83]:
grid.score(X_test,y_test)

0.944

In [84]:
grid.best_params_

{'C': 0.025, 'kernel': 'linear'}

In [85]:
best_svm = grid.best_estimator_

In [86]:
with open('./models/svm.pkl','wb') as f:
    pickle.dump(best_svm, f)