In [None]:
# Install librosa quietly; the cells below import it for audio loading and
# feature extraction.
!pip -q install librosa
# Mount Google Drive (Colab only) so the recordings under /content/drive/
# referenced later in the notebook are readable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os

import IPython.display as idsp
import librosa, librosa.display
import matplotlib.pyplot as plt
import numpy as np
import scipy.spatial.distance as dist
import tqdm.notebook
from matplotlib.patches import ConnectionPatch
from numpy.lib.function_base import extract

# Load labelled audio segments and extract MFCC features

In [None]:
def get_sound(path):
  """Load a recording and cut it into labelled segments.

  Reads ``path + '.txt'``, where every non-blank line must be
  ``start<TAB>end<TAB>command``, and ``path + '.wav'``. ``start`` and ``end``
  are multiplied by the sample rate and truncated to integer sample indices.

  Args:
    path: Path of the recording without its extension.

  Returns:
    dict mapping each command label to a list of ``[samples, sr]`` pairs, in
    the order the rows appear in the label file.

  Raises:
    ValueError: if a non-blank label row does not have exactly three
      tab-separated fields, or a time field is not a number.
  """
  command_dict = {}
  # The context manager closes the label file; iterating line by line also
  # keeps the last row when the file has no trailing newline.
  with open(path+'.txt') as label_file:
    for line in label_file:
      row = line.rstrip('\n')
      if not row.strip():
        continue  # tolerate blank lines
      start, end, command = row.split('\t')
      # Always append: the previous if/else only appended in the else branch,
      # so the first occurrence of every command was silently dropped.
      command_dict.setdefault(command, []).append([float(start), float(end)])

  # `duration` is a length in seconds, not a sample rate, so the former
  # duration=22050 never selected a rate. Request 22050 Hz explicitly; this is
  # the rate the playback cells in this notebook pass to Audio.
  sound, sr = librosa.load(path+'.wav', sr=22050)

  sound_dict = {}
  for command, intervals in command_dict.items():
    sound_dict[command] = [
        [sound[int(sr*start):int(sr*end)], sr] for start, end in intervals
    ]
  return sound_dict

In [None]:
def get_mfcc(path):
  """Extract MFCC, delta and delta-delta features for every labelled segment.

  Args:
    path: Recording path without extension, forwarded to ``get_sound``.

  Returns:
    dict mapping each command label to a list of arrays, one per segment.
    Each array is the 13 MFCCs, their first-order deltas and their
    second-order deltas concatenated along the first axis.
  """
  mfccs_features = {}
  for command, clips in get_sound(path).items():
    stacked = []
    for clip, rate in clips:
      base = librosa.feature.mfcc(y=clip, sr=rate, n_mfcc=13)
      first_order = librosa.feature.delta(base)
      second_order = librosa.feature.delta(base, order=2)
      stacked.append(np.concatenate((base, first_order, second_order)))
    mfccs_features[command] = stacked
  return mfccs_features

# Extract features for one recording so the next cell can display them.
# The result is a dict: command label -> list of per-segment feature arrays.
mfcc = get_mfcc('/content/drive/MyDrive/180200981_CaoCamNhung/56_60')

In [None]:
# Show the extracted feature dict as the cell output.
mfcc

{'A': [array([[-5.3426550e+02, -5.2805676e+02, -5.2983893e+02, ...,
          -5.0635309e+02, -5.0676953e+02, -5.0458966e+02],
         [ 5.1596291e+01,  5.5420631e+01,  5.3375580e+01, ...,
           5.4990566e+01,  5.4785866e+01,  5.4530388e+01],
         [ 2.5965847e+01,  2.7892651e+01,  2.5934736e+01, ...,
           1.0025503e+01,  7.6062326e+00,  5.2171102e+00],
         ...,
         [-3.5238069e-01, -3.5238069e-01, -3.5238069e-01, ...,
          -6.0637832e-01, -6.0637832e-01, -6.0637832e-01],
         [-1.5149381e+00, -1.5149381e+00, -1.5149381e+00, ...,
           8.0382258e-02,  8.0382258e-02,  8.0382258e-02],
         [-3.2876513e-03, -3.2876513e-03, -3.2876513e-03, ...,
           2.2142512e-01,  2.2142512e-01,  2.2142512e-01]], dtype=float32),
  array([[-5.4746552e+02, -5.3789673e+02, -5.3384729e+02, ...,
          -5.3040900e+02, -5.3122192e+02, -5.3653186e+02],
         [ 3.9325619e+01,  4.0619789e+01,  4.4160332e+01, ...,
           4.7491821e+01,  4.5210819e+01,  4.22

# Nhận diện từ đơn bằng DTW

In [None]:
def speed_recognition_dtw(sound, sr, templates=None,
                          template_path='/content/drive/MyDrive/Ex1-2022/02/18020758_HoangPhuongLinh/56_60'):
  """Recognise a single spoken command by DTW against reference templates.

  The query is described by 13 MFCCs plus first- and second-order deltas and
  compared with every reference template; the command whose closest template
  has the lowest accumulated DTW cost wins.

  Args:
    sound: Audio samples of the query segment.
    sr: Sample rate of ``sound``.
    templates: Optional dict mapping command label to a list of reference
      feature arrays (the structure ``get_mfcc`` returns). Pass it when
      calling in a loop so the references are not reloaded on every call.
    template_path: Recording (without extension) the references are
      extracted from when ``templates`` is None.

  Returns:
    The command label with the lowest DTW cost.

  Raises:
    ValueError: if no command has any reference template.
  """
  mfcc = librosa.feature.mfcc(y=sound, sr=sr, n_mfcc=13)
  mfcc_delta = librosa.feature.delta(mfcc)
  mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
  feature = np.concatenate((mfcc, mfcc_delta, mfcc_delta2))

  if templates is None:
    templates = get_mfcc(template_path)

  best_cost = {}
  for command, references in templates.items():
    # NOTE(review): with subseq=True librosa documents the matching cost as
    # the minimum over D[-1, :]; this keeps the original D[-1, -1] read-out.
    # Confirm whether whole-word matching (subseq=False) was intended.
    # backtrack=False returns only D; the warping path was never used.
    costs = [
        librosa.sequence.dtw(reference, feature, subseq=True, backtrack=False)[-1, -1]
        for reference in references
    ]
    # A label without templates used to make np.min([]) raise, or not,
    # depending on its position in the dict; skip it instead.
    if costs:
      best_cost[command] = np.min(costs)

  if not best_cost:
    raise ValueError('no reference templates available for DTW matching')
  return min(best_cost, key=best_cost.get)

In [None]:
# Run the DTW recogniser over the held-out speaker's recordings and collect
# (ground truth, prediction) pairs for the accuracy report in the next cell.
# `os` and `tqdm.notebook` come from the notebook's import cell; this cell
# previously used `os` before any cell had imported it.
test_dir = '/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/'
subfolder = [os.path.join(test_dir, name) for name in os.listdir(test_dir)]

predict = []
target = []
# Only the first 100 directory entries (in os.listdir order) are evaluated.
# tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
for path in tqdm.notebook.tqdm(subfolder[:100]):
  stem, ext = os.path.splitext(path)
  if ext != '.wav':
    continue  # label files and anything else are skipped
  sound = get_sound(stem)
  for label, clips in sound.items():
    for clip, clip_sr in clips:
      predict.append(speed_recognition_dtw(clip, clip_sr))
      target.append(label)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Bucket every prediction under its ground-truth label.
ta = {}
for idx in range(len(target)):
  ta.setdefault(target[idx], []).append(predict[idx])

# Per-label accuracy: share of predictions equal to the label itself.
print('Accuracy:')
for label, guesses in ta.items():
  hits = (np.array([label]*len(guesses)) == np.array(guesses)).sum()
  print(f'{label}: {hits/len(guesses)}')

# Overall accuracy across all evaluated segments.
print(f'Tất cả: {(np.array(target) == np.array(predict)).sum()/len(target)}')

Accuracy:
sil: 0.1
len: 0.0
xuong: 0.8181818181818182
phai: 0.6428571428571429
trai: 0.7096774193548387
A: 0.0
ban: 1.0
nhay: 0.8260869565217391
B: 0.2692307692307692
Tất cả: 0.5219512195121951


In [None]:
# Recognise and play back one 'xuong' segment from recording c1.
tsound = get_sound('/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/c1')
# Read from `tsound`, the recording loaded just above. The previous code
# indexed `sound`, a variable left over from the evaluation loop, so it only
# worked by accident of kernel state.
s = tsound['xuong'][0][0]
sr = tsound['xuong'][0][1]
print(speed_recognition_dtw(s, sr))
# Play back at the rate the segment was loaded with rather than a hard-coded value.
idsp.Audio(data=s, rate=sr)

xuong


# Nhận diện từ đơn bằng HMM

In [None]:
# Install hmmlearn quietly; it is imported in the following cells for the HMM models.
!pip -q install hmmlearn


In [None]:
from hmmlearn import hmm
import os

In [None]:
# Collect training features per command label from the training speakers.
# 18021371_NguyenManhTuan is deliberately not listed here: that speaker's
# recordings are used for evaluation further down.
train_dirs = [
    '/content/drive/MyDrive/Ex1-2022/02/18020909_Trần Công Minh/',
    '/content/drive/MyDrive/Ex1-2022/02/18020758_HoangPhuongLinh/',
]
subfolder = []
for train_dir in train_dirs:
  subfolder += [os.path.join(train_dir, name) for name in os.listdir(train_dir)]

mfcc_all = {}
# tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
for path in tqdm.notebook.tqdm(subfolder):
  stem, ext = os.path.splitext(path)
  if ext != '.wav':
    continue
  try:
    file_features = get_mfcc(stem)
  except Exception as err:
    # Still best-effort, but report what was skipped instead of hiding it
    # behind a bare `except:`.
    print(f'Skipping {path}: {err!r}')
    continue
  for raw_label, features in file_features.items():
    label = raw_label.strip()
    if label == 'sl':
      label = 'sil'  # 'sl' is treated as a misspelling of 'sil'
    # setdefault creates the bucket on first use. The old 'sl' branch did
    # mfcc_all['sil'] += ... and raised KeyError (silently swallowed, dropping
    # the rest of the file) whenever 'sil' had not been seen yet.
    mfcc_all.setdefault(label, []).extend(features)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/234 [00:00<?, ?it/s]

  n_fft, y.shape[-1]
  n_fft, y.shape[-1]


In [None]:
mfcc_all.keys()

dict_keys(['sil', 'len', 'xuong', 'phai', 'B', 'A', 'trai', 'nhay', 'ban'])

In [None]:
import hmmlearn.hmm as hmm

# Train one Gaussian HMM per command label.
model = {}
for cname, clips in mfcc_all.items():
    # Choose the state count by label instead of by dict position. The old
    # positional list [1,3,3,...] gave one state to whichever key came first
    # ('sil' in the recorded run) and raised IndexError beyond nine labels.
    n_states = 1 if cname == 'sil' else 3
    # A fixed random_state makes the fit repeatable across runs.
    model[cname] = hmm.GaussianHMM(n_components=n_states, n_iter=300, random_state=0)
    # NOTE(review): clip[:, :9] keeps the first nine columns of each feature
    # array and the row count is passed as the sequence length. The arrays
    # appear to be (coefficients, frames), so this looks transposed relative
    # to the (n_samples, n_features) layout hmmlearn expects. The scoring
    # cells slice the same way, so change all of them together.
    X = [clip[:, :9] for clip in clips]

    model[cname].fit(X=np.vstack(X), lengths=[x.shape[0] for x in X])

In [None]:
# Collect evaluation features per command label from the held-out speaker.
# NOTE: this rebinds `mfcc_all`, replacing the training features with the test
# features; the already trained `model` dict is not affected.
test_dir = '/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/'
subfolder = [os.path.join(test_dir, name) for name in os.listdir(test_dir)]

mfcc_all = {}
# Only the first 100 directory entries (in os.listdir order) are used.
# tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
for path in tqdm.notebook.tqdm(subfolder[:100]):
  stem, ext = os.path.splitext(path)
  if ext != '.wav':
    continue
  try:
    file_features = get_mfcc(stem)
  except Exception as err:
    # Still best-effort, but report what was skipped instead of hiding it
    # behind a bare `except:`.
    print(f'Skipping {path}: {err!r}')
    continue
  for raw_label, features in file_features.items():
    label = raw_label.strip()
    if label == 'sl':
      label = 'sil'  # 'sl' is treated as a misspelling of 'sil'
    # setdefault creates the bucket on first use, so the 'sil' alias can no
    # longer raise a (previously swallowed) KeyError.
    mfcc_all.setdefault(label, []).extend(features)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Score every test segment against every trained model. Both the true class
# and the predicted class are stored as indices into the key order of mfcc_all.
target = []
predict = []
labels = list(mfcc_all.keys())
for true_idx, true_label in enumerate(labels):
    for segment in mfcc_all[true_label]:
        # Same [:, :9] slice as used when the models were fitted.
        window = segment[:, :9]
        scores = [model[candidate].score(window) for candidate in labels]
        predict.append(np.argmax(scores))
        target.append(true_idx)

In [None]:
# Bucket every predicted class index under its ground-truth class index.
ta = {}
for idx in range(len(target)):
  ta.setdefault(target[idx], []).append(predict[idx])

# Class indices refer to positions in the key order of mfcc_all.
label_names = list(mfcc_all.keys())
print('Accuracy:')
for class_idx, guesses in ta.items():
  hits = (np.array([class_idx]*len(guesses)) == np.array(guesses)).sum()
  print(f'{label_names[class_idx]}: {hits/len(guesses)}')

# Overall accuracy across all evaluated segments.
print(f'Tất cả: {(np.array(target) == np.array(predict)).sum()/len(target)}')

Accuracy:
sil: 0.0
len: 0.3076923076923077
xuong: 0.0
phai: 0.07142857142857142
B: 0.4230769230769231
A: 0.9411764705882353
trai: 0.0967741935483871
nhay: 0.043478260869565216
ban: 0.0
Tất cả: 0.2


In [None]:
# Classify and play back one 'len' segment from recording c1 with the HMMs.
demo_path = '/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/c1'
mfcc = get_mfcc(demo_path)['len'][0]
tsound = get_sound(demo_path)
# Read from `tsound`, the recording loaded just above. The previous code
# indexed `sound`, a variable left over from an earlier evaluation loop, so
# the audio played could come from a different recording than the one scored.
s = tsound['len'][0][0]
sr = tsound['len'][0][1]
labels = list(mfcc_all.keys())
scores = [model[cname].score(mfcc[:,:9]) for cname in labels]
print(labels[np.argmax(scores)])
# Play back at the rate the segment was loaded with rather than a hard-coded value.
idsp.Audio(data=s, rate=sr)

len
