In [1]:
import math

import librosa
import numpy as np

def get_mfcc(file_path):
    y, sr = librosa.load(file_path)  # read .wav file
    hop_length = math.floor(sr * 0.010)  # 10ms hop
    win_length = math.floor(sr * 0.025)  # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # subtract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1, 1))
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0)  # O^r
    # return T x 36 (transpose of X)
    return X.T  # hmmlearn use T x N matrix

In [2]:
import os
import pickle

import hmmlearn.hmm as hmm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [96]:
class_names = ['xemnhietdo', 'xemgio']
states = [8,8]
dataset_path = 'audio'

In [97]:
X = {'train': {}, 'test': {}}
y = {'train': {}, 'test': {}}

model = {}
model_path = 'models_train'

In [98]:
length = 0
for cname in class_names:
    length += len(os.listdir(f"{dataset_path}/{cname}"))
print('Total samples:', length)

all_data = {}
all_labels = {}
for cname in class_names:
    file_paths = [os.path.join(dataset_path, cname, i) for i in os.listdir(
        os.path.join(dataset_path, cname)) if i.endswith('.wav')]
    data = [get_mfcc(file_path) for file_path in file_paths]
    all_data[cname] = data
    all_labels[cname] = [class_names.index(cname) for _ in range(len(file_paths))]

for cname in class_names:
    x_train, x_test, y_train, y_test = train_test_split(
        all_data[cname], all_labels[cname],
        test_size=0.33,
        random_state=42
    )

    X['train'][cname] = x_train
    X['test'][cname] = x_test
    y['test'][cname] = y_test

Total samples: 110


In [99]:
total_train = 0
total_test = 0
for cname in class_names:
    train_count = len(X['train'][cname])
    test_count = len(X['test'][cname])
    print(cname, 'train:', train_count, '| test:', test_count)
    total_train += train_count
    total_test += test_count
print('train samples:', total_train)
print('test samples', total_test)

xemnhietdo train: 33 | test: 17
xemgio train: 40 | test: 20
train samples: 73
test samples 37


In [100]:
for idx, cname in enumerate(class_names):
    start_prob = np.full(states[idx], 0.0)
    start_prob[0] = 1.0
    trans_matrix = np.full((states[idx], states[idx]), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0

    # trans matrix
    print(cname)
    print(trans_matrix)

    model[cname] = hmm.GaussianHMM(
        n_components=states[idx],
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )
    model[cname].fit(X=np.vstack(X['train'][cname]),
                            lengths=[x.shape[0] for x in X['train'][cname]])

xemnhietdo
[[0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -431041.1212             +nan
         2     -402852.8035      +28188.3176
         3     -395594.3568       +7258.4467
         4     -393559.0430       +2035.3138
         5     -392443.5909       +1115.4520
         6     -391713.4670        +730.1239
         7     -391270.9743        +442.4927
         8     -390914.0476        +356.9267
         9     -390623.3564        +290.6912
        10     -390380.5085        +242.8478
        11     -390168.1895        +212.3191
        12     -389968.6352        +199.5543
        13     -389833.3445        +135.2906
        14     -389715.9419        +117.4026
        15     -389664.0349         +51.9071
        16     -389637.2492         +26.7857
        17     -389622.0577         +15.1915
        18     -389609.6840         +12.3737
        19     -389597.8145         +11.8695
        20     -389588.9248          +8.8897
        21     -389585.2458          +3.6790
        22     -389583.9224          +1.3234
        23

xemgio
[[0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -314000.8162             +nan
         2     -292342.3005      +21658.5157
         3     -289176.3832       +3165.9173
         4     -288369.2198        +807.1634
         5     -287952.3035        +416.9164
         6     -287537.2432        +415.0602
         7     -287081.2172        +456.0261
         8     -286729.9543        +351.2629
         9     -286667.1348         +62.8195
        10     -286638.0752         +29.0596
        11     -286620.4595         +17.6157
        12     -286611.7289          +8.7306
        13     -286606.6926          +5.0362
        14     -286603.1139          +3.5788
        15     -286600.5057          +2.6082
        16     -286598.5590          +1.9467
        17     -286597.1130          +1.4459
        18     -286596.1295          +0.9836
        19     -286595.5085          +0.6209
        20     -286595.1561          +0.3525
        21     -286594.9659          +0.1902
        22     -286594.8560          +0.1099
        23

In [101]:
for cname in class_names:
    name = f'{model_path}/model_{cname}.pkl'
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

In [102]:
print('====== Evaluation ======')
y_true = []
y_pred = []
for cname in class_names:
    for mfcc, target in zip(X['test'][cname], y['test'][cname]):
        scores = [model[cname].score(mfcc) for cname in class_names]
        pred = np.argmax(scores)
        y_pred.append(pred)
        y_true.append(target)
    print(f'{cname}:', (np.array(y_true) == np.array(y_pred)).sum() / len(y_true))
print('======')
print('Confusion matrix:')
print(confusion_matrix(y_true, y_pred))

xemnhietdo: 1.0
xemgio: 1.0
Confusion matrix:
[[17  0]
 [ 0 20]]


Train with full dataset

In [103]:
finalX = {}
for cname in class_names:
    finalX[cname] = X['test'][cname] + X['train'][cname]

In [104]:
for idx, cname in enumerate(class_names):
    start_prob = np.full(states[idx], 0.0)
    trans_matrix = np.full((states[idx], states[idx]), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0

    # trans matrix
    print(cname)
    # print(trans_matrix)

    model[cname] = hmm.GaussianHMM(
        n_components=states[idx],
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )
    model[cname].fit(X=np.vstack(finalX[cname]),
        lengths=[x.shape[0] for x in finalX[cname]])

xemnhietdo


         1     -689212.9467             +nan
         2     -637646.8511      +51566.0957
         3     -628314.4702       +9332.3809
         4     -625918.5078       +2395.9624
         5     -625109.7997        +808.7081
         6     -624498.7137        +611.0860
         7     -624011.1387        +487.5750
         8     -623489.2556        +521.8831
         9     -623035.0760        +454.1796
        10     -622625.5724        +409.5037
        11     -622356.9389        +268.6334
        12     -622186.0970        +170.8419
        13     -622067.2227        +118.8743
        14     -621966.7295        +100.4932
        15     -621861.2896        +105.4399
        16     -621766.8620         +94.4276
        17     -621677.9995         +88.8625
        18     -621592.7955         +85.2040
        19     -621537.5156         +55.2800
        20     -621494.9460         +42.5696
        21     -621445.7298         +49.2162
        22     -621400.1593         +45.5705
        23

xemgio


         1     -470349.0823             +nan
         2     -437744.6421      +32604.4401
         3     -432900.7473       +4843.8949
         4     -431824.9965       +1075.7508
         5     -431221.7521        +603.2444
         6     -430915.7754        +305.9767
         7     -430767.4832        +148.2922
         8     -430667.8906         +99.5926
         9     -430581.0706         +86.8200
        10     -430515.4203         +65.6503
        11     -430483.0501         +32.3702
        12     -430462.4525         +20.5976
        13     -430443.3886         +19.0639
        14     -430415.6243         +27.7643
        15     -430375.8305         +39.7938
        16     -430323.0057         +52.8248
        17     -430211.3684        +111.6373
        18     -430101.5262        +109.8422
        19     -430081.6752         +19.8510
        20     -430079.4864          +2.1888
        21     -430078.9283          +0.5581
        22     -430078.5121          +0.4162
        23

In [105]:
for cname in class_names:
    name = f'{model_path}/model_{cname}.pkl'
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

Test with new wav file

In [113]:
test_file_path = 'xemnd.wav'
test_mfcc = get_mfcc(test_file_path)
scores = [model[cname].score(test_mfcc) for cname in class_names]
print(scores)
print("Output:",class_names[np.argmax(scores)])

[-11209.51934401379, -12102.927778346906]
Output: xemnhietdo
