In [1]:
import math

import librosa
import numpy as np

In [2]:


def get_mfcc(file_path):
    y, sr = librosa.load(file_path)  # read .wav file
    hop_length = math.floor(sr * 0.010)  # 10ms hop
    win_length = math.floor(sr * 0.025)  # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # subtract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1, 1))
    # delta feature 1st order and 2nd order
    # delta1 = librosa.feature.delta(mfcc, order=1)
    # delta2 = librosa.feature.delta(mfcc, order=2)
    # # X is 36 x T
    # X = np.concatenate([mfcc, delta1, delta2], axis=0) 
    # # return T x 36 (transpose of X)
    return np.array(mfcc).T  # hmmlearn use T x N matrix

In [3]:
import os
import pickle

import hmmlearn.hmm as hmm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [4]:
class_names = ['xemnhietdo','xemgio','xemngay','xemthoitiet','tatden','batden','tatquat','batquat','tatnhac','batnhac']
states = [10, 7, 8, 11, 8, 8, 8, 8, 8, 8]
dataset_path = 'datasets/train_audio'

In [5]:
X = {'train': {}, 'test': {}}
y = {'train': {}, 'test': {}}

model = {}
model_path = 'model'

In [76]:
len(os.listdir(dataset_path+"/"+"batden"))

21

In [49]:
for cname in class_names:
    file_paths = [i for i in os.listdir(os.path.join(dataset_path, cname))]
vocal_train,vocal_test = train_test_split(file_paths,test_size=0.25,random_state=42)
print(vocal_test)
vocal_train.remove('Phuong')
print(vocal_train)



['An', 'Toan', 'Thinh', 'Anh', 'Nhat']
['Lan', 'Quang', 'Hoa', 'Vu', 'Tien', 'Tai', 'Hieu', 'Nhu', 'Yen', 'Hung', 'San', 'Nghi', 'Thai', 'May']


In [50]:
X = {'train':{},'test':{}}
Y = {'train':{},'test':{}}
for cname in class_names:
    train_data = []
    for v in vocal_train:
        paths = dataset_path+"/"+cname+"/"+v
        for i in os.listdir(paths):
            train_data.append(get_mfcc(paths+"/"+i))
    X['train'][cname] = train_data
    Y['train'][cname] = [class_names.index(cname) for _ in range(len(vocal_train)*5)]
    test_data = []
    for v in vocal_test:
        paths = dataset_path+"/"+cname+"/"+v
        for i in os.listdir(paths):
            test_data.append(get_mfcc(paths+"/"+i))
    X['test'][cname] = test_data        
    Y['test'][cname] = [class_names.index(cname) for _ in range(len(vocal_test)*5)]


In [None]:
all_data = {}
all_labels = {}
for cname in class_names:
    all_data[cname] = []
    all_labels[cname] = []
    for v in os.listdir(dataset_path+"/"+cname):
        if v == 'Phuong':
            continue
        for i in os.listdir(dataset_path+"/"+cname+"/"+v):
            file_pathsss = dataset_path+"/"+cname+"/"+v+"/"+i
            print(file_pathsss)
            all_data[cname].append(get_mfcc(file_pathsss))
            # print(dataset_path+"/"+cname+"/"+v+"/"+i)
    all_labels[cname] = [class_names.index(cname) for _ in range(100)]

In [78]:
len(all_labels['batden'])

100

In [64]:
# all_data = {}
# all_labels = {}
# for cname in class_names:
#     file_paths = [os.path.join(dataset_path, cname, i) for i in os.listdir(
#         os.path.join(dataset_path, cname)) if i.endswith('.wav')]
#     data = [get_mfcc(file_path) for file_path in file_paths]
#     all_data[cname] = data
#     all_labels[cname] = [class_names.index(cname) for _ in range(len(file_paths))]

for cname in class_names:
    x_train, x_test, y_train, y_test = train_test_split(
        all_data[cname], all_labels[cname],
        test_size=0.33,
        random_state=42
    )

    X['train'][cname] = x_train
    X['test'][cname] = x_test
    y['test'][cname] = y_test

Total samples: 150


In [52]:
total_train = 0
total_test = 0
for cname in class_names:
    train_count = len(X['train'][cname])
    test_count = len(X['test'][cname])
    print(cname, 'train:', train_count, '| test:', test_count)
    total_train += train_count
    total_test += test_count
print('train samples:', total_train)
print('test samples', total_test)

xemnhietdo train: 70 | test: 25
xemgio train: 70 | test: 25
xemngay train: 70 | test: 25
xemthoitiet train: 70 | test: 25
tatden train: 70 | test: 25
batden train: 70 | test: 25
tatquat train: 70 | test: 25
batquat train: 70 | test: 25
tatnhac train: 70 | test: 25
batnhac train: 70 | test: 25
train samples: 700
test samples 250


In [62]:
for idx, cname in enumerate(class_names):
    start_prob = np.full(states[idx], 0.0)
    start_prob[0] = 1.0
    trans_matrix = np.full((states[idx], states[idx]), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0
    # trans matrix
    print(cname)

    model[cname] = hmm.GaussianHMM(
        n_components=states[idx],
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )
    model[cname].fit(X=np.vstack(X['train'][cname]),
                            lengths=[x.shape[0] for x in X['train'][cname]])

xemnhietdo


         1     -525483.9663             +nan
         2     -494153.5987      +31330.3676
         3     -486817.2167       +7336.3820
         4     -484768.0558       +2049.1609
         5     -484170.5599        +597.4960
         6     -483966.1400        +204.4199
         7     -483875.4998         +90.6402
         8     -483834.2074         +41.2924
         9     -483804.0591         +30.1483
        10     -483786.2587         +17.8004
        11     -483776.0003         +10.2584
        12     -483765.9228         +10.0775
        13     -483758.0728          +7.8500
        14     -483751.6682          +6.4047
        15     -483744.6848          +6.9834
        16     -483735.9179          +8.7669
        17     -483724.9052         +11.0127
        18     -483715.2189          +9.6863
        19     -483702.1514         +13.0675
        20     -483688.7539         +13.3975
        21     -483677.5780         +11.1759
        22     -483666.8297         +10.7483
        23

xemgio


         1     -436618.1870             +nan
         2     -411617.9078      +25000.2792
         3     -407469.1279       +4148.7800
         4     -405989.6162       +1479.5117
         5     -405519.0648        +470.5514
         6     -405397.7056        +121.3593
         7     -405364.9752         +32.7304
         8     -405353.3265         +11.6487
         9     -405347.3758          +5.9507
        10     -405342.0558          +5.3200
        11     -405335.7554          +6.3004
        12     -405328.8771          +6.8783
        13     -405322.4384          +6.4387
        14     -405316.7896          +5.6488
        15     -405311.6997          +5.0899
        16     -405308.1466          +3.5531
        17     -405305.2352          +2.9114
        18     -405303.0012          +2.2340
        19     -405301.6119          +1.3893
        20     -405300.8229          +0.7890
        21     -405300.3788          +0.4441
        22     -405300.1240          +0.2547
        23

xemngay


         1     -480595.5104             +nan
         2     -454751.9617      +25843.5487
         3     -449189.9711       +5561.9906
         4     -446548.5040       +2641.4671
         5     -444926.3375       +1622.1665
         6     -443882.2910       +1044.0465
         7     -443607.4713        +274.8197
         8     -443530.5915         +76.8798
         9     -443506.7405         +23.8510
        10     -443486.2136         +20.5269
        11     -443453.6274         +32.5861
        12     -443440.2511         +13.3764
        13     -443434.8144          +5.4366
        14     -443429.7861          +5.0284
        15     -443424.9047          +4.8814
        16     -443420.2298          +4.6748
        17     -443416.0011          +4.2288
        18     -443413.8764          +2.1247
        19     -443412.5752          +1.3012
        20     -443411.7345          +0.8407
        21     -443411.1548          +0.5797
        22     -443410.6822          +0.4726
        23

xemthoitiet


         1     -506504.2714             +nan
         2     -473307.6864      +33196.5850
         3     -466567.8126       +6739.8738
         4     -464617.4966       +1950.3160
         5     -464204.3546        +413.1420
         6     -464105.0494         +99.3051
         7     -464065.6533         +39.3961
         8     -464049.7275         +15.9258
         9     -464029.2827         +20.4449
        10     -464013.8642         +15.4184
        11     -464000.1120         +13.7522
        12     -463987.9890         +12.1230
        13     -463976.2139         +11.7751
        14     -463958.2179         +17.9960
        15     -463932.1173         +26.1006
        16     -463899.4186         +32.6988
        17     -463884.0323         +15.3862
        18     -463878.2785          +5.7539
        19     -463873.0808          +5.1977
        20     -463870.3885          +2.6923
        21     -463869.3316          +1.0569
        22     -463868.9057          +0.4259
        23

tatden


         1     -456706.0775             +nan
         2     -427416.0528      +29290.0246
         3     -420746.2509       +6669.8019
         4     -418317.1237       +2429.1272
         5     -417510.3495        +806.7742
         6     -417392.5217        +117.8278
         7     -417357.3282         +35.1935
         8     -417331.0087         +26.3195
         9     -417305.2194         +25.7892
        10     -417293.8844         +11.3350
        11     -417284.5556          +9.3288
        12     -417270.6725         +13.8831
        13     -417256.4935         +14.1790
        14     -417248.9391          +7.5544
        15     -417243.3047          +5.6344
        16     -417239.4113          +3.8934
        17     -417238.9522          +0.4591
        18     -417237.0517          +1.9005
        19     -417233.5282          +3.5235
        20     -417231.9237          +1.6045
        21     -417231.7179          +0.2057
        22     -417231.7760          -0.0581


batden


         1     -461581.3192             +nan
         2     -432495.1010      +29086.2182
         3     -425773.6182       +6721.4829
         4     -423510.2371       +2263.3810
         5     -422895.3246        +614.9125
         6     -422740.9522        +154.3725
         7     -422687.0331         +53.9191
         8     -422661.4479         +25.5852
         9     -422651.9781          +9.4697
        10     -422646.4321          +5.5460
        11     -422636.9763          +9.4559
        12     -422609.8269         +27.1494
        13     -422594.7705         +15.0564
        14     -422587.8334          +6.9371
        15     -422582.1429          +5.6905
        16     -422579.4023          +2.7406
        17     -422578.5901          +0.8122
        18     -422578.3102          +0.2800
        19     -422578.1901          +0.1201
        20     -422578.1308          +0.0593
        21     -422578.0980          +0.0328
        22     -422578.0782          +0.0199
        23

tatquat


         1     -435568.4812             +nan
         2     -408222.8049      +27345.6763
         3     -401952.2182       +6270.5866
         4     -400244.3511       +1707.8671
         5     -399858.0406        +386.3105
         6     -399735.6438        +122.3969
         7     -399672.0805         +63.5633
         8     -399605.1871         +66.8934
         9     -399530.3864         +74.8007
        10     -399464.6498         +65.7367
        11     -399417.6708         +46.9789
        12     -399381.4556         +36.2153
        13     -399359.6443         +21.8113
        14     -399349.3625         +10.2818
        15     -399343.5171          +5.8454
        16     -399337.5123          +6.0048
        17     -399323.1980         +14.3144
        18     -399304.0369         +19.1611
        19     -399295.9469          +8.0900
        20     -399292.5107          +3.4362
        21     -399290.4398          +2.0709
        22     -399289.4888          +0.9510
        23

batquat


         1     -452466.2939             +nan
         2     -428298.5126      +24167.7814
         3     -421233.7401       +7064.7725
         4     -418374.6826       +2859.0575
         5     -417545.7999        +828.8826
         6     -417323.5154        +222.2845
         7     -417231.3267         +92.1887
         8     -417187.2877         +44.0390
         9     -417162.6027         +24.6850
        10     -417144.3997         +18.2030
        11     -417128.7368         +15.6629
        12     -417115.4567         +13.2801
        13     -417097.4208         +18.0359
        14     -417072.4999         +24.9210
        15     -417045.5857         +26.9142
        16     -417024.5320         +21.0537
        17     -417008.0885         +16.4436
        18     -416995.6092         +12.4793
        19     -416987.1776          +8.4316
        20     -416981.1031          +6.0745
        21     -416976.5319          +4.5712
        22     -416973.1385          +3.3934
        23

tatnhac


         1     -449856.8902             +nan
         2     -420689.8480      +29167.0422
         3     -413554.2476       +7135.6005
         4     -411400.0314       +2154.2162
         5     -410934.8837        +465.1477
         6     -410765.2867        +169.5970
         7     -410694.1699         +71.1168
         8     -410660.8275         +33.3424
         9     -410644.1580         +16.6695
        10     -410636.0743          +8.0837
        11     -410631.2024          +4.8719
        12     -410628.2649          +2.9376
        13     -410626.4231          +1.8418
        14     -410625.2275          +1.1955
        15     -410624.6981          +0.5295
        16     -410624.4551          +0.2430
        17     -410624.3566          +0.0985
        18     -410624.3360          +0.0205
        19     -410624.3640          -0.0280


batnhac


         1     -456862.5132             +nan
         2     -429542.2592      +27320.2539
         3     -423505.7469       +6036.5123
         4     -421506.5924       +1999.1544
         5     -420623.0802        +883.5123
         6     -420100.0930        +522.9871
         7     -419754.8396        +345.2535
         8     -419637.9652        +116.8743
         9     -419592.0988         +45.8664
        10     -419562.5027         +29.5961
        11     -419527.8331         +34.6696
        12     -419491.1267         +36.7064
        13     -419440.8424         +50.2843
        14     -419363.8900         +76.9524
        15     -419305.9809         +57.9091
        16     -419280.1674         +25.8135
        17     -419269.5760         +10.5914
        18     -419263.1265          +6.4494
        19     -419258.3219          +4.8047
        20     -419254.2144          +4.1075
        21     -419249.5538          +4.6605
        22     -419242.9313          +6.6225
        23

In [63]:
for cname in class_names:
    name = f'{model_path}/model_{cname}.pkl'
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

In [64]:
y_true = []
y_pred = []
for cname in class_names:
    for mfcc, target in zip(X['test'][cname], Y['test'][cname]):
        scores = [model[cname].score(mfcc) for cname in class_names]
        pred = np.argmax(scores)
        y_pred.append(pred)
        y_true.append(target)
    print(f'{cname}:', (np.array(y_true) == np.array(y_pred)).sum() / len(y_true))
print('======')
print('Confusion matrix:')
print(confusion_matrix(y_true, y_pred))

xemnhietdo: 0.64
xemgio: 0.48
xemngay: 0.5333333333333333
xemthoitiet: 0.61
tatden: 0.576
batden: 0.56
tatquat: 0.52
batquat: 0.505
tatnhac: 0.5111111111111111
batnhac: 0.532
Confusion matrix:
[[16  4  3  2  0  0  0  0  0  0]
 [ 2  8  0 11  2  1  0  0  0  1]
 [ 0  1 16  6  0  1  0  0  0  1]
 [ 0  2  2 21  0  0  0  0  0  0]
 [ 0  0  1  0 11  2  0  6  0  5]
 [ 0  5  0  2  1 12  0  1  0  4]
 [ 0  0  0  0  0  0  7  8 10  0]
 [ 0  0  0  0  1  0  1 10  9  4]
 [ 0  0  1  0  0  0  0  5 14  5]
 [ 0  1  0  1  1  0  0  3  1 18]]


Train with full dataset

In [22]:
finalX = {}
for cname in class_names:
    finalX[cname] = X['test'][cname] + X['train'][cname]
    print(cname,len(finalX[cname]))

xemnhietdo 60
xemgio 60
xemngay 60
xemgio 60
tat 60
bat 60
quat 60
nhac 60
den 60


In [None]:
for idx, cname in enumerate(class_names):
    start_prob = np.full(states, 0.0)
    trans_matrix = np.full((states, states), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0

    # trans matrix
    print(cname)
    # print(trans_matrix)

    model[cname] = hmm.GaussianHMM(
        n_components=states,
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )
    model[cname].fit(X=np.vstack(finalX[cname]),
        lengths=[x.shape[0] for x in finalX[cname]])

In [24]:
for cname in class_names:
    name = f'{model_path}/model_{cname}.pkl'
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

Test with new random wav file

In [77]:
test_file_name = 'xemngay.wav'
test_file_path = 'datasets/random_test_audio/'
test_mfcc = get_mfcc(test_file_path+test_file_name)
scores = [model[cname].score(test_mfcc) for cname in class_names]

print("Input:",test_file_name)
print("score:",scores)
print('Class_name:',class_names)
print("Output:",class_names[np.argmax(scores)])

Input: xemngay.wav
score: [-4854.188538714205, -4822.473777114951, -4923.192492845547, -4804.489042371499, -5130.463648381897, -5073.942404860141, -5413.492933637798, -5516.447391855644, -5132.017922592878, -5155.8324796599545]
Class_name: ['xemnhietdo', 'xemgio', 'xemngay', 'xemthoitiet', 'tatden', 'batden', 'tatquat', 'batquat', 'tatnhac', 'batnhac']
Output: xemthoitiet


In [92]:
model['xemgio'].transmat_

array([[0.91020378, 0.        , 0.        , 0.        , 0.08500456,
        0.        , 0.00479167],
       [0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.034725  , 0.89493378, 0.        , 0.        ,
        0.        , 0.07034123],
       [0.        , 0.        , 0.        , 0.98654232, 0.        ,
        0.01345768, 0.        ],
       [0.04146354, 0.        , 0.15466062, 0.        , 0.80387584,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.16248435,
        0.83751565, 0.        ],
       [0.03870414, 0.        , 0.        , 0.03763044, 0.        ,
        0.        , 0.92366542]])

In [91]:
model['xemgio'].startprob_

array([4.19839370e-117, 0.00000000e+000, 0.00000000e+000, 7.89717484e-002,
       0.00000000e+000, 9.21028252e-001, 0.00000000e+000])

In [16]:
import librosa
y, sr = librosa.load('datasets/train_audio/bat/Open1.wav')  # read .wav file
print(sr)

22050


In [24]:
# with open('model/model_bat.pkl','rb') as file:
#     p = pickle.load(file)

-3806.9000670368246