In [1]:
import librosa
import numpy as np
import os
import math
from sklearn.cluster import KMeans
import hmmlearn.hmm
from itertools import islice

def get_mfcc(file_path):
    y, sr = librosa.load(file_path) # read .wav file
    hop_length = math.floor(sr*0.010) # 10ms hop
    win_length = math.floor(sr*0.025) # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y, sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # substract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1,1)) 
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
#     energy = librosa.feature.rmse(y=y)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0) # O^r
    # return T x 36 (transpose of X)
    return X.T # hmmlearn use T x N matrix

def get_class_data(data_dir):
    files = os.listdir(data_dir)
    mfcc = [get_mfcc(os.path.join(data_dir,f)) for f in files if f.endswith(".wav")]
    print('data_dir: ', data_dir)
    print(f'mfcc.shape: {np.array(mfcc).shape}')
    return mfcc

def clustering(X, n_clusters=14):
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0, verbose=0)
    kmeans.fit(X)
    print("centers", kmeans.cluster_centers_.shape)
    return kmeans  

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

In [2]:

class_names = ["khong","tôi","cachly", "cothe" , 'nguoi',"test_cothe" ,"test_cachly", "test_tôi" , "test_khong" ,"test_nguoi" ]
dataset = {}
for cname in class_names:
    print(f"Load {cname} dataset")
    dataset[cname] = get_class_data(os.path.join("data", cname))


Load khong dataset
data_dir:  data/khong
mfcc.shape: (69,)
Load tôi dataset
data_dir:  data/tôi
mfcc.shape: (68,)
Load cachly dataset
data_dir:  data/cachly
mfcc.shape: (91,)
Load cothe dataset
data_dir:  data/cothe
mfcc.shape: (92,)
Load nguoi dataset
data_dir:  data/nguoi
mfcc.shape: (70,)
Load test_cothe dataset
data_dir:  data/test_cothe
mfcc.shape: (7,)
Load test_cachly dataset
data_dir:  data/test_cachly
mfcc.shape: (9,)
Load test_tôi dataset
data_dir:  data/test_tôi
mfcc.shape: (10,)
Load test_khong dataset
data_dir:  data/test_khong
mfcc.shape: (31,)
Load test_nguoi dataset
data_dir:  data/test_nguoi
mfcc.shape: (28,)


In [3]:
# Get all vectors in the datasets
all_vectors = np.concatenate([np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
print("vectors", all_vectors.shape)

vectors (23339, 36)


In [4]:
# Run K-Means algorithm to get clusters
kmeans = clustering(all_vectors, n_clusters=8)
print(kmeans.cluster_centers_.shape)

centers (8, 36)
(8, 36)


In [5]:
for key, val in dataset.items():
    print(key, np.array(val).shape)

khong (69,)
tôi (68,)
cachly (91,)
cothe (92,)
nguoi (70,)
test_cothe (7,)
test_cachly (9,)
test_tôi (10,)
test_khong (31,)
test_nguoi (28,)


In [6]:
models = {}

original_dataset = {}

In [7]:
original_dataset['tôi'] = dataset['tôi'].copy()
cname = 'tôi'
class_vectors = dataset[cname]
# convert all vectors to the cluster index
# dataset['one'] = [O^1, ... O^R] , O^r: the r-th recorded wav file 
# O^r = (c1, c2, ... ct, ... cT) , c_i: the i-th frame in the r-th observation ( or the r-th wav file )
# O^r size T x 1
dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in original_dataset[cname]])


hmm = hmmlearn.hmm.MultinomialHMM(
    n_components= 9, random_state=0, n_iter=1100, verbose=True,
    params='te',
    init_params='e'
)
hmm.startprob_ = np.array([0.6,0.3,0.1,0.0,0.0,0.0,0.0,0.0,0.0])
hmm.transmat_ = np.array([
        [0.5,0.3,0.2,0.0,0.0,0.0,0.0,0.0,0.0],
        [0.0,0.5,0.3,0.2,0.0,0.0,0.0,0.0,0.0],
        [0.0,0.0,0.5,0.3,0.2,0.0,0.0,0.0,0.0],
        [0.0,0.0,0.0,0.5,0.3,0.2,0.0,0.0,0.0],
        [0.0,0.0,0.0,0.0,0.5,0.3,0.2,0.0,0.0],
        [0.0,0.0,0.0,0.0,0.0,0.5,0.3,0.2,0.0],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.3,0.2],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.4],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0],
    ])
if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

training class tôi
(1267, 1) [11, 27, 42, 14, 15, 15, 17, 30, 17, 30, 17, 24, 17, 19, 15, 14, 28, 21, 21, 15, 15, 14, 27, 14, 14, 17, 16, 17, 11, 18, 20, 17, 14, 12, 18, 36, 15, 12, 16, 17, 12, 25, 10, 12, 13, 18, 13, 9, 22, 30, 12, 37, 33, 13, 18, 13, 19, 22, 11, 15, 24, 14, 9, 16, 28, 35, 19, 16] 68


         1       -2765.8717             +nan
         2       -1824.6792        +941.1925
         3       -1579.6009        +245.0783
         4       -1442.1964        +137.4044
         5       -1378.7731         +63.4234
         6       -1334.9340         +43.8391
         7       -1293.1751         +41.7589
         8       -1264.3531         +28.8220
         9       -1253.2080         +11.1450
        10       -1249.8123          +3.3958
        11       -1247.6171          +2.1951
        12       -1244.4097          +3.2074
        13       -1241.7838          +2.6260
        14       -1239.9961          +1.7877
        15       -1238.6881          +1.3080
        16       -1237.8518          +0.8363
        17       -1237.2724          +0.5793
        18       -1236.8356          +0.4369
        19       -1236.5073          +0.3282
        20       -1236.2644          +0.2430
        21       -1236.0829          +0.1814
        22       -1235.9438          +0.1391
        23

In [8]:
np.set_printoptions(precision=2, suppress=True)
print(models['tôi'].transmat_)

[[0.63 0.02 0.35 0.   0.   0.   0.   0.   0.  ]
 [0.   0.7  0.3  0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.36 0.13 0.51 0.   0.   0.   0.  ]
 [0.   0.   0.   0.85 0.03 0.13 0.   0.   0.  ]
 [0.   0.   0.   0.   0.37 0.57 0.06 0.   0.  ]
 [0.   0.   0.   0.   0.   0.82 0.1  0.08 0.  ]
 [0.   0.   0.   0.   0.   0.   0.66 0.34 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.91 0.09]
 [0.   0.   0.   0.   0.   0.   0.   0.   1.  ]]


In [9]:
original_dataset['khong'] = dataset['khong'].copy()
cname = 'khong'
class_vectors = dataset[cname]
# convert all vectors to the cluster index
# dataset['one'] = [O^1, ... O^R] , O^r: the r-th recorded wav file 
# O^r = (c1, c2, ... ct, ... cT) , c_i: the i-th frame in the r-th observation ( or the r-th wav file )
# O^r size T x 1
dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in original_dataset[cname]])


hmm = hmmlearn.hmm.MultinomialHMM(n_components=9, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')
hmm.startprob_ = np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ])
hmm.transmat_ =np.array([
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ],])

if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

training class khong
(4017, 1) [46, 101, 38, 101, 101, 24, 28, 31, 27, 23, 101, 101, 34, 32, 101, 27, 27, 25, 101, 101, 31, 28, 25, 101, 101, 33, 101, 19, 101, 101, 28, 101, 101, 101, 101, 25, 23, 31, 101, 23, 18, 101, 101, 101, 26, 30, 22, 30, 101, 101, 25, 101, 101, 23, 101, 31, 25, 15, 45, 22, 19, 101, 22, 101, 101, 29, 26, 26, 26] 69


         1       -7892.8756             +nan
         2       -3374.0112       +4518.8644
         3       -2030.7280       +1343.2832
         4       -1755.6993        +275.0287
         5       -1698.2217         +57.4775
         6       -1681.3017         +16.9201
         7       -1674.7662          +6.5354
         8       -1671.3643          +3.4020
         9       -1669.3659          +1.9984
        10       -1667.5854          +1.7806
        11       -1665.9040          +1.6814
        12       -1663.8707          +2.0333
        13       -1660.8263          +3.0443
        14       -1657.7885          +3.0378
        15       -1654.8260          +2.9625
        16       -1652.5758          +2.2502
        17       -1651.5701          +1.0057
        18       -1651.1257          +0.4444
        19       -1650.8491          +0.2766
        20       -1650.6230          +0.2261
        21       -1650.4121          +0.2109
        22       -1650.2033          +0.2088
        23

In [10]:
models['khong'].startprob_

array([0.14, 0.31, 0.54, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

In [11]:
np.set_printoptions(precision=2, suppress=True)
print(models['khong'].transmat_)

[[0.78 0.16 0.06 0.   0.   0.   0.   0.   0.  ]
 [0.   0.95 0.   0.05 0.   0.   0.   0.   0.  ]
 [0.   0.   0.76 0.   0.24 0.   0.   0.   0.  ]
 [0.   0.   0.   0.72 0.03 0.26 0.   0.   0.  ]
 [0.   0.   0.   0.   0.83 0.   0.17 0.   0.  ]
 [0.   0.   0.   0.   0.   1.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.86 0.14 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.98 0.02]
 [0.   0.   0.   0.   0.   0.   0.   0.   1.  ]]


In [12]:
original_dataset['nguoi'] = dataset['nguoi'].copy()
cname = 'nguoi'
class_vectors = dataset[cname]
# convert all vectors to the cluster index
# dataset['one'] = [O^1, ... O^R] , O^r: the r-th recorded wav file 
# O^r = (c1, c2, ... ct, ... cT) , c_i: the i-th frame in the r-th observation ( or the r-th wav file )
# O^r size T x 1
dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in original_dataset[cname]])


hmm = hmmlearn.hmm.MultinomialHMM(n_components=9, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')
hmm.startprob_ = np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ])
hmm.transmat_ =np.array([
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ],])

if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

training class nguoi
(1887, 1) [22, 44, 29, 35, 16, 15, 39, 26, 53, 14, 18, 33, 25, 15, 21, 51, 23, 22, 18, 35, 24, 34, 44, 34, 16, 25, 30, 32, 47, 18, 27, 20, 17, 35, 18, 27, 36, 22, 22, 31, 10, 20, 26, 48, 15, 35, 13, 21, 36, 37, 27, 22, 17, 24, 38, 29, 24, 19, 24, 24, 22, 23, 22, 33, 25, 55, 32, 21, 18, 14] 70


         1       -3761.9674             +nan
         2       -2204.3113       +1557.6561
         3       -1888.2902        +316.0210
         4       -1776.3877        +111.9025
         5       -1733.4811         +42.9066
         6       -1712.3696         +21.1115
         7       -1690.5596         +21.8100
         8       -1669.8845         +20.6750
         9       -1658.1936         +11.6909
        10       -1653.1412          +5.0525
        11       -1651.2284          +1.9127
        12       -1650.4492          +0.7793
        13       -1650.0477          +0.4015
        14       -1649.7874          +0.2603
        15       -1649.5907          +0.1967
        16       -1649.4252          +0.1655
        17       -1649.2725          +0.1527
        18       -1649.1194          +0.1532
        19       -1648.9545          +0.1648
        20       -1648.7689          +0.1856
        21       -1648.5593          +0.2096
        22       -1648.3374          +0.2220
        23

In [13]:
original_dataset['cachly'] = dataset['cachly'].copy()
cname = 'cachly'
class_vectors = dataset[cname]
# convert all vectors to the cluster index
# dataset['one'] = [O^1, ... O^R] , O^r: the r-th recorded wav file 
# O^r = (c1, c2, ... ct, ... cT) , c_i: the i-th frame in the r-th observation ( or the r-th wav file )
# O^r size T x 1

dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in original_dataset[cname]])


hmm = hmmlearn.hmm.MultinomialHMM(
    n_components= 15, random_state=0, n_iter=1100, verbose=True,
    params='te',
    init_params='e'
)
hmm.startprob_ = np.array([0.7,0.2,0.1,0.0,0.0,0.0, 0.0,0.0,0.0,0.0,0.0,0.0,0.0, 0.0,0.0])
hmm.transmat_ = np.array([     
        [0.5,0.3,0.2,0.0,0.0,0.0, 0.0, 0.0, 0.0,0.0, 0.0, 0.0,0.0, 0.0,0.0],
        [0.0,0.5,0.3,0.2,0.0,0.0, 0.0, 0.0, 0.0,0.0, 0.0, 0.0,0.0, 0.0,0.0],
        [0.0,0.0,0.5,0.3,0.2,0.0, 0.0, 0.0, 0.0,0.0, 0.0, 0.0,0.0, 0.0,0.0],
        [0.0,0.0,0.0,0.5,0.3,0.2,0.0, 0.0, 0.0,0.0, 0.0, 0.0,0.0, 0.0,0.0],
        [0.0,0.0,0.0,0.0,0.5,0.3,0.2,0.0,0.0,0.0, 0.0, 0.0,0.0, 0.0,0.0],
        [0.0,0.0,0.0,0.0,0.0,0.5,0.3,0.2,0.0,0.0, 0.0, 0.0,0.0, 0.0,0.0],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.3,0.2,0.0,0.0, 0.0,0.0, 0.0,0.0],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.3,0.2,0.0,0.0,0.0, 0.0,0.0],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.3,0.2,0.0,0.0, 0.0,0.0],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.3,0.2,0.0, 0.0,0.0],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.3,0.2,0.0,0.0],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.3,0.2,0.0],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.3,0.2],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5],
        [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0],
    ])

if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

training class cachly
(7818, 1) [29, 65, 32, 31, 28, 39, 49, 89, 64, 51, 37, 39, 38, 71, 55, 49, 56, 50, 89, 60, 64, 56, 54, 89, 58, 61, 29, 61, 89, 49, 2957, 51, 54, 37, 45, 61, 57, 50, 63, 62, 57, 33, 59, 34, 51, 60, 51, 36, 56, 65, 82, 43, 67, 59, 50, 33, 61, 56, 64, 35, 30, 61, 33, 55, 57, 60, 34, 86, 70, 60, 46, 35, 61, 43, 58, 58, 35, 28, 59, 83, 59, 59, 41, 42, 68, 58, 60, 51, 69, 63, 76] 91


         1      -16682.3508             +nan
         2      -11586.9046       +5095.4462
         3      -10598.4674        +988.4372
         4      -10233.2254        +365.2420
         5      -10117.7164        +115.5089
         6      -10008.6565        +109.0599
         7       -9924.8657         +83.7909
         8       -9890.6892         +34.1765
         9       -9875.5629         +15.1263
        10       -9860.2095         +15.3534
        11       -9834.3483         +25.8612
        12       -9796.4412         +37.9071
        13       -9765.4325         +31.0087
        14       -9733.5039         +31.9286
        15       -9640.4709         +93.0330
        16       -9591.4901         +48.9808
        17       -9584.7654          +6.7247
        18       -9580.4997          +4.2657
        19       -9576.8171          +3.6826
        20       -9573.6043          +3.2129
        21       -9570.9076          +2.6967
        22       -9568.7004          +2.2072
        23

In [14]:
original_dataset['cothe'] = dataset['cothe'].copy()
cname = 'cothe'
class_vectors = dataset[cname]
# convert all vectors to the cluster index
# dataset['one'] = [O^1, ... O^R] , O^r: the r-th recorded wav file 
# O^r = (c1, c2, ... ct, ... cT) , c_i: the i-th frame in the r-th observation ( or the r-th wav file )
# O^r size T x 1
dataset[cname] = list([kmeans.predict(v).reshape(-1,1) for v in original_dataset[cname]])


hmm = hmmlearn.hmm.MultinomialHMM(
    n_components=12,random_state=0, n_iter=1000, verbose=True,
    params='te',
    init_params='e'
)
hmm.startprob_ = np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ])
hmm.transmat_ =np.array([
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ],])

if cname[:4] != 'test':
    X = np.concatenate(dataset[cname])
    lengths = list([len(x) for x in dataset[cname]])
    print("training class", cname)
    print(X.shape, lengths, len(lengths))
    hmm.fit(X, lengths=lengths)
    models[cname] = hmm

         1       -7117.2530             +nan


training class cothe
(3564, 1) [30, 57, 52, 34, 27, 41, 53, 37, 22, 28, 26, 27, 40, 35, 40, 35, 20, 33, 36, 43, 38, 34, 46, 37, 36, 35, 35, 30, 37, 39, 35, 25, 30, 42, 35, 35, 49, 40, 44, 36, 44, 39, 59, 26, 52, 30, 34, 56, 32, 47, 25, 43, 45, 56, 16, 44, 23, 36, 39, 35, 25, 48, 44, 34, 37, 40, 51, 28, 47, 43, 47, 39, 44, 36, 102, 52, 36, 41, 44, 56, 42, 45, 26, 31, 39, 38, 27, 36, 53, 37, 31, 30] 92


         2       -5327.9325       +1789.3205
         3       -4405.5107        +922.4218
         4       -4038.0519        +367.4588
         5       -3896.3358        +141.7160
         6       -3848.0198         +48.3160
         7       -3829.2345         +18.7853
         8       -3820.8671          +8.3674
         9       -3816.5145          +4.3526
        10       -3813.5242          +2.9903
        11       -3810.9615          +2.5626
        12       -3808.5328          +2.4287
        13       -3806.6221          +1.9107
        14       -3805.2752          +1.3469
        15       -3804.1757          +1.0995
        16       -3803.1904          +0.9853
        17       -3802.2425          +0.9479
        18       -3801.2446          +0.9979
        19       -3799.9764          +1.2681
        20       -3796.3017          +3.6747
        21       -3789.4783          +6.8234
        22       -3787.2633          +2.2150
        23       -3785.6009          +1.6624
        24

In [15]:
import operator
print("Testing")
class_names = ["test_tôi", "test_khong", "test_nguoi", "test_cachly","test_cothe"]

dataset["test_khong"] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_khong']])
dataset['test_nguoi'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_nguoi']])
dataset['test_cachly'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_cachly']])
dataset['test_tôi'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_tôi']])
dataset['test_cothe'] = list([kmeans.predict(v).reshape(-1,1) for v in dataset['test_cothe']])
for true_cname in class_names:
    print("-------------------")
    index = 0
    total = 0
    for O in dataset[true_cname]:
        total += 1
        score = {cname : model.score(O, [len(O)]) for cname, model in models.items() if cname[:4] != 'test' }
        pred = max(score.items(), key=operator.itemgetter(1))[0]
        if(true_cname[5:] == pred): index+=1
        print(true_cname, pred)
    
    print(index / total * 100, "%") 
    

Testing
-------------------
test_tôi tôi
test_tôi tôi
test_tôi tôi
test_tôi cothe
test_tôi tôi
test_tôi tôi
test_tôi cachly
test_tôi tôi
test_tôi tôi
test_tôi tôi
80.0 %
-------------------
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong cachly
test_khong cachly
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong tôi
test_khong tôi
test_khong khong
test_khong cachly
test_khong khong
test_khong cothe
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
test_khong khong
80.64516129032258 %
-------------------
test_nguoi nguoi
test_nguoi nguoi
test_nguoi nguoi
test_nguoi nguoi
test_nguoi nguoi
test_nguoi nguoi
test_nguoi nguoi
test_nguoi nguoi
test_nguoi tôi
test_nguoi nguoi
test_nguoi nguoi
test_nguoi nguoi
test_nguoi nguoi
test_nguoi tôi
test_nguoi

In [16]:

print("Testing")
class_names = ["test_tôi", "test_khong", "test_nguoi", "test_cachly","test_va"]
for true_cname in class_names:
    for O in dataset[true_cname]:
        score = {cname : model.score(O, [len(O)]) for cname, model in models.items() if cname[:5] != 'test' }
        print(true_cname, score)
    


Testing
test_tôi {'tôi': -18.858952778902054, 'khong': -22.594452114188893, 'nguoi': -26.70324175624899, 'cachly': -23.66281411086912, 'cothe': -24.163457745687744}
test_tôi {'tôi': -13.434635078346988, 'khong': -20.880558317149255, 'nguoi': -13.574730959537872, 'cachly': -15.08063947322289, 'cothe': -17.941039411261954}
test_tôi {'tôi': -23.420207222610802, 'khong': -42.9672363234074, 'nguoi': -32.93818230571511, 'cachly': -37.35502038123481, 'cothe': -38.16267845196381}
test_tôi {'tôi': -539.4888130851969, 'khong': -197.4781482083232, 'nguoi': -188.62229613609458, 'cachly': -171.464971188894, 'cothe': -158.73883407315705}
test_tôi {'tôi': -25.016825491031593, 'khong': -30.819788886282023, 'nguoi': -29.023493346991145, 'cachly': -30.40336551281277, 'cothe': -34.26495363739536}
test_tôi {'tôi': -27.588246515460263, 'khong': -36.60882808657881, 'nguoi': -32.53959296216564, 'cachly': -40.38789222765791, 'cothe': -29.687028420035162}
test_tôi {'tôi': -28.063821467741228, 'khong': -52.0427

test_nguoi {'tôi': -32.14683616863378, 'khong': -45.68373759205032, 'nguoi': -28.689520176200414, 'cachly': -27.235644945524463, 'cothe': -26.036472280784437}
test_nguoi {'tôi': -23.524477655869628, 'khong': -40.49862060572944, 'nguoi': -22.023293926344465, 'cachly': -27.47777086686809, 'cothe': -36.75609944981189}
test_nguoi {'tôi': -42.60119747052347, 'khong': -55.769022872281276, 'nguoi': -21.84861510752805, 'cachly': -32.8159482564668, 'cothe': -39.9232232308722}
test_nguoi {'tôi': -19.852650601055373, 'khong': -36.55069603003084, 'nguoi': -15.541650266136045, 'cachly': -24.526926362837916, 'cothe': -26.014090456381204}
test_nguoi {'tôi': -24.186969956725186, 'khong': -41.21833114575696, 'nguoi': -12.751281205025665, 'cachly': -21.514012100930785, 'cothe': -29.2813142576343}
test_nguoi {'tôi': -16.606506542915433, 'khong': -28.282998599918646, 'nguoi': -11.895388667960843, 'cachly': -16.01005735962067, 'cothe': -22.261066765725047}
test_nguoi {'tôi': -544.4514649371795, 'khong': -2

KeyError: 'test_va'