In [1]:
import math

import librosa
import numpy as np

In [2]:
def get_mfcc(file_path):
    y, sr = librosa.load(file_path)  # read .wav file
    #chuan hoa tin hieu vao
    hop_length = math.floor(sr * 0.010)  # 10ms hop
    win_length = math.floor(sr * 0.025)  # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # subtract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1, 1))
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0) 
    # # return T x 36 (transpose of X)
    return X.T  # hmmlearn use T x N matrix

In [3]:
import os
import pickle

import hmmlearn.hmm as hmm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [4]:
class_names = ['xemnhietdo','xemgio','xemngay','xemthoitiet','tatden','batden','tatquat','batquat','tatnhac','batnhac']
states = [10, 7, 8, 11, 8, 8, 8, 8, 8, 8]
dataset_path = 'datasets/train_audio'

In [5]:
X = {'train': {}, 'test': {}}
y = {'train': {}, 'test': {}}

model = {}
model_path = 'model'

In [6]:
len(os.listdir(dataset_path+"/"+"batden"))

8

In [7]:
for cname in class_names:
    file_paths = [i for i in os.listdir(os.path.join(dataset_path, cname))]
vocal_train,vocal_test = train_test_split(file_paths,test_size=0.25,random_state=42)
print(vocal_test)
print(vocal_train)
#train phai phong phu
#test phai co trong giong train

['Duong', 'Quang']
['Anh', 'Toan', 'Hieu', 'Hung', 'Hoa', 'San']


In [8]:
X = {'train':{},'test':{}}
Y = {'train':{},'test':{}}
for cname in class_names:
    train_data = []
    for v in vocal_train:
        paths = dataset_path+"/"+cname+"/"+v
        for i in os.listdir(paths):
            train_data.append(get_mfcc(paths+"/"+i))
    X['train'][cname] = train_data
    Y['train'][cname] = [class_names.index(cname) for _ in range(len(vocal_train)*5)]
    test_data = []
    for v in vocal_test:
        paths = dataset_path+"/"+cname+"/"+v
        for i in os.listdir(paths):
            test_data.append(get_mfcc(paths+"/"+i))
    X['test'][cname] = test_data        
    Y['test'][cname] = [class_names.index(cname) for _ in range(len(vocal_test)*5)]


In [9]:
# all_data = {}
# all_labels = {}
# for cname in class_names:
#     all_data[cname] = []
#     all_labels[cname] = []
#     for v in os.listdir(dataset_path+"/"+cname):
#         if v == 'Phuong':
#             continue
#         for i in os.listdir(dataset_path+"/"+cname+"/"+v):
#             file_pathsss = dataset_path+"/"+cname+"/"+v+"/"+i
#             print(file_pathsss)
#             all_data[cname].append(get_mfcc(file_pathsss))
#             # print(dataset_path+"/"+cname+"/"+v+"/"+i)
#     all_labels[cname] = [class_names.index(cname) for _ in range(100)]

In [10]:
# len(all_labels['batden'])

In [11]:
# all_data = {}
# all_labels = {}
# for cname in class_names:
#     file_paths = [os.path.join(dataset_path, cname, i) for i in os.listdir(
#         os.path.join(dataset_path, cname)) if i.endswith('.wav')]
#     data = [get_mfcc(file_path) for file_path in file_paths]
#     all_data[cname] = data
#     all_labels[cname] = [class_names.index(cname) for _ in range(len(file_paths))]

# for cname in class_names:
#     x_train, x_test, y_train, y_test = train_test_split(
#         all_data[cname], all_labels[cname],
#         test_size=0.33,
#         random_state=42
#     )

#     X['train'][cname] = x_train
#     X['test'][cname] = x_test
#     y['test'][cname] = y_test

In [12]:
total_train = 0
total_test = 0
for cname in class_names:
    train_count = len(X['train'][cname])
    test_count = len(X['test'][cname])
    print(cname, 'train:', train_count, '| test:', test_count)
    total_train += train_count
    total_test += test_count
print('train samples:', total_train)
print('test samples', total_test)

xemnhietdo train: 30 | test: 10
xemgio train: 30 | test: 10
xemngay train: 30 | test: 10
xemthoitiet train: 30 | test: 10
tatden train: 30 | test: 10
batden train: 30 | test: 10
tatquat train: 30 | test: 10
batquat train: 30 | test: 10
tatnhac train: 30 | test: 10
batnhac train: 30 | test: 10
train samples: 300
test samples 100


In [13]:
for idx, cname in enumerate(class_names):
    start_prob = np.full(states[idx], 0.0)
    start_prob[0] = 1.0
    trans_matrix = np.full((states[idx], states[idx]), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0
    # trans matrix
    print(cname)

    model[cname] = hmm.GaussianHMM(
        n_components=states[idx],
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )
    model[cname].fit(X=np.vstack(X['train'][cname]),
                            lengths=[x.shape[0] for x in X['train'][cname]])

xemnhietdo


         1     -692544.4112             +nan
         2     -640297.0284      +52247.3828
         3     -635025.6969       +5271.3315
         4     -633388.9377       +1636.7592
         5     -632731.5423        +657.3954
         6     -632400.4025        +331.1398
         7     -632204.4757        +195.9267
         8     -632062.0745        +142.4012
         9     -631952.6511        +109.4235
        10     -631858.3960         +94.2551
        11     -631781.2398         +77.1562
        12     -631682.3545         +98.8853
        13     -631539.4672        +142.8873
        14     -631355.6360        +183.8313
        15     -631138.2048        +217.4311
        16     -630969.3765        +168.8283
        17     -630854.7360        +114.6405
        18     -630777.7004         +77.0356
        19     -630668.4624        +109.2379
        20     -630610.3203         +58.1421
        21     -630557.5510         +52.7694
        22     -630517.7667         +39.7843
        23

xemgio


         1     -602583.1862             +nan
         2     -560317.4213      +42265.7649
         3     -555979.5924       +4337.8289
         4     -554397.1087       +1582.4837
         5     -553454.7227        +942.3859
         6     -552837.7904        +616.9324
         7     -552537.5397        +300.2506
         8     -552418.0472        +119.4925
         9     -552354.9175         +63.1297
        10     -552316.5619         +38.3556
        11     -552292.7521         +23.8097
        12     -552269.0719         +23.6802
        13     -552250.3882         +18.6838
        14     -552225.0236         +25.3646
        15     -552196.0829         +28.9406
        16     -552134.5764         +61.5065
        17     -552070.4270         +64.1494
        18     -552039.0242         +31.4028
        19     -552018.3112         +20.7131
        20     -552000.3190         +17.9922
        21     -551984.6507         +15.6683
        22     -551975.3833          +9.2674
        23

xemngay


         1     -677832.6186             +nan
         2     -630756.7759      +47075.8427
         3     -625803.4543       +4953.3217
         4     -623847.6213       +1955.8330
         5     -623044.4516        +803.1696
         6     -622498.9501        +545.5016
         7     -622189.6666        +309.2834
         8     -622047.1178        +142.5488
         9     -621943.0001        +104.1177
        10     -621841.7117        +101.2884
        11     -621746.8649         +94.8468
        12     -621674.5727         +72.2922
        13     -621631.1414         +43.4313
        14     -621605.1031         +26.0383
        15     -621589.0728         +16.0303
        16     -621580.7815          +8.2913
        17     -621576.5149          +4.2667
        18     -621574.0160          +2.4989
        19     -621572.2349          +1.7810
        20     -621570.0722          +2.1627
        21     -621565.7040          +4.3682
        22     -621559.0506          +6.6534
        23

xemthoitiet


         1     -696983.7826             +nan
         2     -644052.1152      +52931.6674
         3     -638514.9146       +5537.2006
         4     -637203.5539       +1311.3606
         5     -636672.7115        +530.8425
         6     -636390.1833        +282.5282
         7     -636229.2512        +160.9321
         8     -636143.0394         +86.2118
         9     -636092.5997         +50.4397
        10     -636046.1435         +46.4563
        11     -636009.7911         +36.3524
        12     -635980.0128         +29.7783
        13     -635951.2803         +28.7325
        14     -635922.4757         +28.8047
        15     -635893.1396         +29.3361
        16     -635866.3465         +26.7931
        17     -635847.5847         +18.7618
        18     -635838.0211          +9.5636
        19     -635831.0273          +6.9937
        20     -635824.6947          +6.3327
        21     -635819.4437          +5.2510
        22     -635814.6571          +4.7866
        23

tatden


         1     -701287.6332             +nan
         2     -652496.5965      +48791.0366
         3     -640383.5837      +12113.0129
         4     -638258.3645       +2125.2191
         5     -637502.4968        +755.8677
         6     -637128.7253        +373.7716
         7     -636858.6242        +270.1011
         8     -636688.4720        +170.1522
         9     -636568.5963        +119.8756
        10     -636460.4324        +108.1639
        11     -636277.9143        +182.5181
        12     -636111.2110        +166.7032
        13     -636026.8958         +84.3152
        14     -635980.5284         +46.3674
        15     -635939.3943         +41.1342
        16     -635917.5393         +21.8549
        17     -635903.7413         +13.7980
        18     -635893.2165         +10.5248
        19     -635886.0725          +7.1440
        20     -635878.5935          +7.4790
        21     -635867.6709         +10.9226
        22     -635855.9988         +11.6721
        23

batden


         1     -718098.3605             +nan
         2     -663434.3601      +54664.0004
         3     -655368.2489       +8066.1113
         4     -653362.6135       +2005.6353
         5     -651678.7293       +1683.8843
         6     -650844.6909        +834.0384
         7     -650478.9505        +365.7404
         8     -650203.8697        +275.0808
         9     -650029.4895        +174.3802
        10     -649902.5870        +126.9025
        11     -649806.4598         +96.1271
        12     -649725.1571         +81.3028
        13     -649655.2209         +69.9361
        14     -649586.8102         +68.4107
        15     -649510.2092         +76.6010
        16     -649408.4900        +101.7192
        17     -649274.0855        +134.4045
        18     -649078.0886        +195.9970
        19     -648787.6879        +290.4007
        20     -648500.5329        +287.1550
        21     -648370.5220        +130.0109
        22     -648295.4201         +75.1019
        23

tatquat


         1     -686479.8200             +nan
         2     -640166.8914      +46312.9286
         3     -630110.0318      +10056.8596
         4     -628686.9207       +1423.1111
         5     -628397.9517        +288.9689
         6     -628265.1586        +132.7931
         7     -628162.0490        +103.1097
         8     -628073.7313         +88.3177
         9     -628002.8026         +70.9287
        10     -627936.4015         +66.4011
        11     -627874.8436         +61.5579
        12     -627823.2262         +51.6173
        13     -627775.9772         +47.2490
        14     -627726.8327         +49.1445
        15     -627680.5491         +46.2836
        16     -627633.0857         +47.4633
        17     -627590.9975         +42.0883
        18     -627559.2655         +31.7320
        19     -627540.3007         +18.9648
        20     -627531.1980          +9.1027
        21     -627525.3556          +5.8424
        22     -627521.5184          +3.8373
        23

batquat


         1     -690910.9391             +nan
         2     -638142.0033      +52768.9358
         3     -633374.0264       +4767.9769
         4     -632244.7085       +1129.3179
         5     -631782.3825        +462.3260
         6     -631536.5612        +245.8213
         7     -631384.6062        +151.9550
         8     -631304.7006         +79.9056
         9     -631267.0517         +37.6489
        10     -631247.9263         +19.1255
        11     -631234.3661         +13.5601
        12     -631225.0816          +9.2846
        13     -631218.4224          +6.6592
        14     -631215.3633          +3.0591
        15     -631213.5101          +1.8532
        16     -631212.1426          +1.3675
        17     -631211.1674          +0.9751
        18     -631210.6022          +0.5652
        19     -631210.2952          +0.3070
        20     -631210.1165          +0.1787
        21     -631210.0104          +0.1060
        22     -631209.9485          +0.0619
        23

tatnhac


         1     -709080.7762             +nan
         2     -655820.1574      +53260.6188
         3     -647585.9402       +8234.2172
         4     -645654.0943       +1931.8459
         5     -645066.1273        +587.9670
         6     -644731.0401        +335.0872
         7     -644494.3894        +236.6507
         8     -644331.4752        +162.9142
         9     -644181.7196        +149.7556
        10     -644035.7007        +146.0189
        11     -643909.1761        +126.5246
        12     -643822.3638         +86.8124
        13     -643748.3676         +73.9962
        14     -643682.9558         +65.4118
        15     -643590.4759         +92.4799
        16     -643503.8893         +86.5865
        17     -643427.8963         +75.9930
        18     -643362.9780         +64.9183
        19     -643302.3947         +60.5834
        20     -643232.8977         +69.4970
        21     -643157.8734         +75.0242
        22     -643085.3264         +72.5470
        23

batnhac


         1     -701839.2262             +nan
         2     -650986.8978      +50852.3284
         3     -646355.5438       +4631.3540
         4     -645130.4366       +1225.1072
         5     -644604.6763        +525.7603
         6     -644294.0346        +310.6417
         7     -643941.2546        +352.7799
         8     -643495.0965        +446.1581
         9     -643110.2896        +384.8069
        10     -642843.6043        +266.6853
        11     -642668.9369        +174.6675
        12     -642565.4195        +103.5174
        13     -642497.4998         +67.9197
        14     -642456.7562         +40.7437
        15     -642436.0594         +20.6967
        16     -642425.6453         +10.4142
        17     -642419.3883          +6.2570
        18     -642414.6836          +4.7047
        19     -642410.8222          +3.8614
        20     -642407.7550          +3.0672
        21     -642404.8921          +2.8629
        22     -642401.7641          +3.1280
        23

In [15]:
for cname in class_names:
    name = f'{model_path}/model_{cname}.pkl'
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

In [16]:
for cname in class_names:
    print(Y['test'][cname])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
[5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
[7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
[9, 9, 9, 9, 9, 9, 9, 9, 9, 9]


In [17]:
y_true = []
y_pred = []
for cname in class_names:
    temp_y_true = []
    temp_y_pred = []
    for mfcc, target in zip(X['test'][cname], Y['test'][cname]):
        scores = [model[cname].score(mfcc) for cname in class_names]
        pred = np.argmax(scores)
        temp_y_pred.append(pred)
        temp_y_true.append(target)
    # print((np.array(temp_y_true) == np.array(temp_y_pred)))
    print(f'{cname}:', (np.array(temp_y_true) == np.array(temp_y_pred)).sum() / len(temp_y_true))
    for i in range(len(temp_y_pred)):
        y_pred.append(temp_y_pred[i])
        y_true.append(temp_y_true[i])
print('======')
print((np.array(y_true) == np.array(y_pred)).sum() / len(y_true))
print('Confusion matrix:')
print(confusion_matrix(y_true, y_pred))

xemnhietdo: 1.0
xemgio: 0.5
xemngay: 0.4
xemthoitiet: 0.5
tatden: 0.2
batden: 0.9
tatquat: 0.9
batquat: 0.6
tatnhac: 0.5
batnhac: 0.7
0.62
Confusion matrix:
[[10  0  0  0  0  0  0  0  0  0]
 [ 4  5  0  1  0  0  0  0  0  0]
 [ 2  4  4  0  0  0  0  0  0  0]
 [ 5  0  0  5  0  0  0  0  0  0]
 [ 0  0  0  0  2  2  2  0  4  0]
 [ 0  0  0  0  0  9  0  0  0  1]
 [ 0  0  0  0  0  0  9  1  0  0]
 [ 0  0  0  0  0  0  4  6  0  0]
 [ 0  0  0  0  0  0  4  0  5  1]
 [ 0  0  0  0  0  1  0  0  2  7]]


Train with full dataset

In [22]:
finalX = {}
for cname in class_names:
    finalX[cname] = X['test'][cname] + X['train'][cname]
    print(cname,len(finalX[cname]))

xemnhietdo 60
xemgio 60
xemngay 60
xemgio 60
tat 60
bat 60
quat 60
nhac 60
den 60


In [None]:
for idx, cname in enumerate(class_names):
    start_prob = np.full(states, 0.0)
    trans_matrix = np.full((states, states), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0

    # trans matrix
    print(cname)
    # print(trans_matrix)

    model[cname] = hmm.GaussianHMM(
        n_components=states,
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )
    model[cname].fit(X=np.vstack(finalX[cname]),
        lengths=[x.shape[0] for x in finalX[cname]])

In [24]:
for cname in class_names:
    name = f'{model_path}/model_{cname}.pkl'
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

Test with new random wav file

In [77]:
test_file_name = 'xemngay.wav'
test_file_path = 'datasets/random_test_audio/'
test_mfcc = get_mfcc(test_file_path+test_file_name)
scores = [model[cname].score(test_mfcc) for cname in class_names]

print("Input:",test_file_name)
print("score:",scores)
print('Class_name:',class_names)
print("Output:",class_names[np.argmax(scores)])

Input: xemngay.wav
score: [-4854.188538714205, -4822.473777114951, -4923.192492845547, -4804.489042371499, -5130.463648381897, -5073.942404860141, -5413.492933637798, -5516.447391855644, -5132.017922592878, -5155.8324796599545]
Class_name: ['xemnhietdo', 'xemgio', 'xemngay', 'xemthoitiet', 'tatden', 'batden', 'tatquat', 'batquat', 'tatnhac', 'batnhac']
Output: xemthoitiet


In [92]:
model['xemgio'].transmat_

array([[0.91020378, 0.        , 0.        , 0.        , 0.08500456,
        0.        , 0.00479167],
       [0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.034725  , 0.89493378, 0.        , 0.        ,
        0.        , 0.07034123],
       [0.        , 0.        , 0.        , 0.98654232, 0.        ,
        0.01345768, 0.        ],
       [0.04146354, 0.        , 0.15466062, 0.        , 0.80387584,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.16248435,
        0.83751565, 0.        ],
       [0.03870414, 0.        , 0.        , 0.03763044, 0.        ,
        0.        , 0.92366542]])

In [91]:
model['xemgio'].startprob_

array([4.19839370e-117, 0.00000000e+000, 0.00000000e+000, 7.89717484e-002,
       0.00000000e+000, 9.21028252e-001, 0.00000000e+000])

In [16]:
import librosa
y, sr = librosa.load('datasets/train_audio/bat/Open1.wav')  # read .wav file
print(sr)

22050


In [24]:
# with open('model/model_bat.pkl','rb') as file:
#     p = pickle.load(file)

-3806.9000670368246