In [1]:
import math

import librosa
import numpy as np

def get_mfcc(file_path):
    y, sr = librosa.load(file_path)  # read .wav file
    hop_length = math.floor(sr * 0.010)  # 10ms hop
    win_length = math.floor(sr * 0.025)  # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # subtract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1, 1))
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0)  # O^r
    # return T x 36 (transpose of X)
    return X.T  # hmmlearn use T x N matrix

In [2]:
import os
import pickle

import hmmlearn.hmm as hmm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [4]:
class_names = ['xemnhietdo', 'xemgio']
states = [8,8]
dataset_path = 'datasets/train_audio'

In [6]:
X = {'train': {}, 'test': {}}
y = {'train': {}, 'test': {}}

model = {}
model_path = 'model'

In [7]:
length = 0
for cname in class_names:
    length += len(os.listdir(f"{dataset_path}/{cname}"))
print('Total samples:', length)

all_data = {}
all_labels = {}
for cname in class_names:
    file_paths = [os.path.join(dataset_path, cname, i) for i in os.listdir(
        os.path.join(dataset_path, cname)) if i.endswith('.wav')]
    data = [get_mfcc(file_path) for file_path in file_paths]
    all_data[cname] = data
    all_labels[cname] = [class_names.index(cname) for _ in range(len(file_paths))]

for cname in class_names:
    x_train, x_test, y_train, y_test = train_test_split(
        all_data[cname], all_labels[cname],
        test_size=0.33,
        random_state=42
    )

    X['train'][cname] = x_train
    X['test'][cname] = x_test
    y['test'][cname] = y_test

Total samples: 110


In [8]:
total_train = 0
total_test = 0
for cname in class_names:
    train_count = len(X['train'][cname])
    test_count = len(X['test'][cname])
    print(cname, 'train:', train_count, '| test:', test_count)
    total_train += train_count
    total_test += test_count
print('train samples:', total_train)
print('test samples', total_test)

xemnhietdo train: 33 | test: 17
xemgio train: 40 | test: 20
train samples: 73
test samples 37


In [9]:
for idx, cname in enumerate(class_names):
    start_prob = np.full(states[idx], 0.0)
    start_prob[0] = 1.0
    trans_matrix = np.full((states[idx], states[idx]), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0

    # trans matrix
    print(cname)
    print(trans_matrix)

    model[cname] = hmm.GaussianHMM(
        n_components=states[idx],
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )
    model[cname].fit(X=np.vstack(X['train'][cname]),
                            lengths=[x.shape[0] for x in X['train'][cname]])

xemnhietdo
[[0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -431041.1211             +nan
         2     -402852.8037      +28188.3174
         3     -395594.3568       +7258.4470
         4     -393559.0424       +2035.3143
         5     -392443.5900       +1115.4524
         6     -391713.4664        +730.1236
         7     -391270.9739        +442.4925
         8     -390914.0473        +356.9267
         9     -390623.3559        +290.6913
        10     -390380.5081        +242.8478
        11     -390168.1890        +212.3191
        12     -389968.6348        +199.5542
        13     -389833.3443        +135.2905
        14     -389715.9417        +117.4025
        15     -389664.0348         +51.9069
        16     -389637.2492         +26.7856
        17     -389622.0577         +15.1914
        18     -389609.6840         +12.3737
        19     -389597.8145         +11.8695
        20     -389588.9248          +8.8897
        21     -389585.2458          +3.6789
        22     -389583.9224          +1.3234
        23

xemgio
[[0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -314000.8162             +nan
         2     -292342.3005      +21658.5157
         3     -289176.3832       +3165.9173
         4     -288369.2198        +807.1633
         5     -287952.3035        +416.9163
         6     -287537.2422        +415.0613
         7     -287081.2172        +456.0250
         8     -286729.9543        +351.2629
         9     -286667.1348         +62.8195
        10     -286638.0752         +29.0596
        11     -286620.4595         +17.6157
        12     -286611.7289          +8.7306
        13     -286606.6927          +5.0362
        14     -286603.1139          +3.5788
        15     -286600.5057          +2.6082
        16     -286598.5590          +1.9467
        17     -286597.1130          +1.4459
        18     -286596.1295          +0.9836
        19     -286595.5085          +0.6209
        20     -286595.1561          +0.3525
        21     -286594.9659          +0.1902
        22     -286594.8560          +0.1099
        23

In [10]:
for cname in class_names:
    name = f'{model_path}/model_{cname}.pkl'
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

In [11]:
print('====== Evaluation ======')
y_true = []
y_pred = []
for cname in class_names:
    for mfcc, target in zip(X['test'][cname], y['test'][cname]):
        scores = [model[cname].score(mfcc) for cname in class_names]
        pred = np.argmax(scores)
        y_pred.append(pred)
        y_true.append(target)
    print(f'{cname}:', (np.array(y_true) == np.array(y_pred)).sum() / len(y_true))
print('======')
print('Confusion matrix:')
print(confusion_matrix(y_true, y_pred))

xemnhietdo: 1.0
xemgio: 1.0
Confusion matrix:
[[17  0]
 [ 0 20]]


Train with full dataset

In [15]:
finalX = {}
for cname in class_names:
    finalX[cname] = X['test'][cname] + X['train'][cname]
    print(cname,len(finalX[cname]))

xemnhietdo 50
xemgio 60


In [16]:
for idx, cname in enumerate(class_names):
    start_prob = np.full(states[idx], 0.0)
    trans_matrix = np.full((states[idx], states[idx]), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0

    # trans matrix
    print(cname)
    # print(trans_matrix)

    model[cname] = hmm.GaussianHMM(
        n_components=states[idx],
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )
    model[cname].fit(X=np.vstack(finalX[cname]),
        lengths=[x.shape[0] for x in finalX[cname]])

xemnhietdo


         1     -689212.9468             +nan
         2     -637646.8510      +51566.0958
         3     -628314.4702       +9332.3808
         4     -625918.5077       +2395.9625
         5     -625109.7996        +808.7081
         6     -624498.7135        +611.0861
         7     -624011.1385        +487.5751
         8     -623489.2553        +521.8832
         9     -623035.0758        +454.1796
        10     -622625.5721        +409.5037
        11     -622356.9387        +268.6334
        12     -622186.0968        +170.8419
        13     -622067.2226        +118.8742
        14     -621966.7293        +100.4933
        15     -621861.2894        +105.4399
        16     -621766.8618         +94.4276
        17     -621677.9993         +88.8625
        18     -621592.7954         +85.2039
        19     -621537.5155         +55.2799
        20     -621494.9459         +42.5696
        21     -621445.7296         +49.2162
        22     -621400.1592         +45.5705
        23

xemgio


         1     -470349.0823             +nan
         2     -437744.6423      +32604.4400
         3     -432900.7474       +4843.8949
         4     -431824.9967       +1075.7507
         5     -431221.7523        +603.2444
         6     -430915.7755        +305.9767
         7     -430767.4832        +148.2923
         8     -430667.8906         +99.5926
         9     -430581.0706         +86.8200
        10     -430515.4203         +65.6503
        11     -430483.0501         +32.3702
        12     -430462.4525         +20.5976
        13     -430443.3886         +19.0639
        14     -430415.6243         +27.7643
        15     -430375.8305         +39.7938
        16     -430323.0058         +52.8248
        17     -430211.3685        +111.6373
        18     -430101.5263        +109.8422
        19     -430081.6752         +19.8511
        20     -430079.4864          +2.1888
        21     -430078.9283          +0.5581
        22     -430078.5121          +0.4162
        23

In [17]:
for cname in class_names:
    name = f'{model_path}/model_{cname}.pkl'
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

Test with new wav file

In [20]:
test_file_path = 'datasets/random_test_audio/xemgio.wav'
test_mfcc = get_mfcc(test_file_path)
scores = [model[cname].score(test_mfcc) for cname in class_names]
print(scores)
print("Output:",class_names[np.argmax(scores)])

[-10130.022710554333, -10065.765174124675]
Output: xemgio
