In [56]:
import math

import librosa
import numpy as np

def get_mfcc(file_path):
    y, sr = librosa.load(file_path)  # read .wav file
    hop_length = math.floor(sr * 0.010)  # 10ms hop
    win_length = math.floor(sr * 0.025)  # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # subtract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1, 1))
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0) 
    # return T x 36 (transpose of X)
    return X.T  # hmmlearn use T x N matrix

In [57]:
import os
import pickle

import hmmlearn.hmm as hmm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [71]:
class_names = ['xemnhietdo', 'xemgio','xemngay']
states = 8
dataset_path = 'datasets/train_audio'

In [72]:
X = {'train': {}, 'test': {}}
y = {'train': {}, 'test': {}}

model = {}
model_path = 'model'

In [73]:
length = 0
for cname in class_names:
    length += len(os.listdir(f"{dataset_path}/{cname}"))
print('Total samples:', length)

all_data = {}
all_labels = {}
for cname in class_names:
    file_paths = [os.path.join(dataset_path, cname, i) for i in os.listdir(
        os.path.join(dataset_path, cname)) if i.endswith('.wav')]
    data = [get_mfcc(file_path) for file_path in file_paths]
    all_data[cname] = data
    all_labels[cname] = [class_names.index(cname) for _ in range(len(file_paths))]

for cname in class_names:
    x_train, x_test, y_train, y_test = train_test_split(
        all_data[cname], all_labels[cname],
        test_size=0.33,
        random_state=42
    )

    X['train'][cname] = x_train
    X['test'][cname] = x_test
    y['test'][cname] = y_test

Total samples: 150


In [74]:
total_train = 0
total_test = 0
for cname in class_names:
    train_count = len(X['train'][cname])
    test_count = len(X['test'][cname])
    print(cname, 'train:', train_count, '| test:', test_count)
    total_train += train_count
    total_test += test_count
print('train samples:', total_train)
print('test samples', total_test)

xemnhietdo train: 40 | test: 20
xemgio train: 40 | test: 20
xemngay train: 20 | test: 10
train samples: 100
test samples 50


In [78]:
for idx, cname in enumerate(class_names):
    start_prob = np.full(states, 0.0)
    start_prob[0] = 1.0
    trans_matrix = np.full((states, states), 0.0)

    # trans matrix
    print(cname)

    model[cname] = hmm.GaussianHMM(
        n_components=states,
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )
    model[cname].fit(X=np.vstack(X['train'][cname]),
                            lengths=[x.shape[0] for x in X['train'][cname]])

xemnhietdo


         1     -514888.8175             +nan
         2     -481085.6591      +33803.1584
         3     -470864.3558      +10221.3033
         4     -469045.9071       +1818.4487
         5     -468638.3892        +407.5180
         6     -468387.7209        +250.6683
         7     -468059.8229        +327.8980
         8     -467962.0818         +97.7411
         9     -467936.5281         +25.5537
        10     -467929.9186          +6.6095
        11     -467922.4564          +7.4622
        12     -467919.2390          +3.2174
        13     -467918.4115          +0.8275
        14     -467920.0556          -1.6441


xemgio


         1     -314000.8163             +nan
         2     -292341.7716      +21659.0446
         3     -289178.8879       +3162.8837
         4     -288374.1867        +804.7012
         5     -287960.5448        +413.6419
         6     -287543.3548        +417.1900
         7     -287083.5627        +459.7921
         8     -286737.6715        +345.8913
         9     -286678.8603         +58.8111
        10     -286655.5902         +23.2701
        11     -286649.2854          +6.3048
        12     -286635.0553         +14.2301
        13     -286636.9111          -1.8558


xemngay


         1     -100796.2038             +nan
         2      -92175.4206       +8620.7833
         3      -91812.9769        +362.4437
         4      -91726.8096         +86.1673
         5      -91622.7437        +104.0659
         6      -91542.4422         +80.3015
         7      -91508.3481         +34.0941
         8      -91493.8381         +14.5100
         9      -91484.4362          +9.4019
        10      -91480.3008          +4.1354
        11      -91476.3136          +3.9872
        12      -91468.5874          +7.7262
        13      -91455.2197         +13.3677
        14      -91443.0084         +12.2113
        15      -91427.2511         +15.7573
        16      -91414.9500         +12.3011
        17      -91401.7188         +13.2312
        18      -91392.3526          +9.3662
        19      -91385.7248          +6.6278
        20      -91380.7206          +5.0042
        21      -91377.9327          +2.7878
        22      -91376.2158          +1.7169
        23

In [79]:
for cname in class_names:
    name = f'{model_path}/model_{cname}.pkl'
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

In [80]:
print('====== Evaluation ======')
y_true = []
y_pred = []
for cname in class_names:
    for mfcc, target in zip(X['test'][cname], y['test'][cname]):
        scores = [model[cname].score(mfcc) for cname in class_names]
        pred = np.argmax(scores)
        y_pred.append(pred)
        y_true.append(target)
    print(f'{cname}:', (np.array(y_true) == np.array(y_pred)).sum() / len(y_true))
print('======')
print('Confusion matrix:')
print(confusion_matrix(y_true, y_pred))

xemnhietdo: 1.0
xemgio: 1.0
xemngay: 0.98
Confusion matrix:
[[20  0  0]
 [ 0 20  0]
 [ 0  1  9]]


Train with full dataset

In [33]:
finalX = {}
for cname in class_names:
    finalX[cname] = X['test'][cname] + X['train'][cname]
    print(cname,len(finalX[cname]))

xemnhietdo 50
xemgio 60
xemngay 30


In [34]:
for idx, cname in enumerate(class_names):
    start_prob = np.full(states, 0.0)
    trans_matrix = np.full((states, states), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0

    # trans matrix
    print(cname)
    # print(trans_matrix)

    model[cname] = hmm.GaussianHMM(
        n_components=states,
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )
    model[cname].fit(X=np.vstack(finalX[cname]),
        lengths=[x.shape[0] for x in finalX[cname]])

xemnhietdo


         1     -689212.9467             +nan
         2     -637646.8510      +51566.0957
         3     -628314.4702       +9332.3808
         4     -625918.5079       +2395.9623
         5     -625109.7998        +808.7081
         6     -624498.7140        +611.0858
         7     -624011.1391        +487.5749
         8     -623489.2560        +521.8831
         9     -623035.0764        +454.1796
        10     -622625.5727        +409.5037
        11     -622356.9391        +268.6336
        12     -622186.0971        +170.8420
        13     -622067.2228        +118.8743
        14     -621966.7296        +100.4932
        15     -621861.2897        +105.4399
        16     -621766.8622         +94.4276
        17     -621677.9996         +88.8625
        18     -621592.7956         +85.2040
        19     -621537.5157         +55.2800
        20     -621494.9461         +42.5696
        21     -621445.7300         +49.2161
        22     -621400.1595         +45.5705
        23

xemgio


         1     -470349.0823             +nan
         2     -437744.6422      +32604.4400
         3     -432900.7473       +4843.8949
         4     -431824.9966       +1075.7508
         5     -431221.7522        +603.2444
         6     -430915.7755        +305.9767
         7     -430767.4832        +148.2923
         8     -430667.8906         +99.5926
         9     -430581.0706         +86.8200
        10     -430515.4203         +65.6503
        11     -430483.0501         +32.3702
        12     -430462.4525         +20.5976
        13     -430443.3886         +19.0639
        14     -430415.6243         +27.7643
        15     -430375.8305         +39.7938
        16     -430323.0057         +52.8248
        17     -430211.3684        +111.6373
        18     -430101.5262        +109.8421
        19     -430081.6752         +19.8510
        20     -430079.4864          +2.1888
        21     -430078.9283          +0.5581
        22     -430078.5121          +0.4162
        23

xemngay


         1     -154471.6228             +nan
         2     -141353.1852      +13118.4375
         3     -140410.6432        +942.5420
         4     -140674.2050        -263.5618


In [35]:
for cname in class_names:
    name = f'{model_path}/model_{cname}.pkl'
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

Test with new wav file

In [70]:
test_file_path = 'datasets/random_test_audio/xemgio.wav'
test_mfcc = get_mfcc(test_file_path)
scores = [model[cname].score(test_mfcc) for cname in class_names]
print(scores)
print("Output:",class_names[np.argmax(scores)])

[-10126.85010977268, -10056.375667507526, -12731.50684615672]
Output: xemgio
