In [1]:
import os
import pickle
from glob import iglob
from shutil import rmtree

import librosa
import numpy as np
import speech_recognition as speech_r

from constants import *

# Root folder of the raw recordings; the path handling in this notebook expects
# files laid out as <DATA_AUDIO_DIR>/<sub-folder>/<name>.wav.
DATA_AUDIO_DIR = '../vad_train_audio'
# Folder where resampling_audio() writes one pickle per recording.
DATA_RESAMPLED_AUDIO_DIR = '../resampled_train_data'

# Entries of DATA_AUDIO_DIR, sorted so that index-based class ids derived from
# this list are stable between runs.
list_dir = os.listdir(DATA_AUDIO_DIR)
list_dir.sort()

# NOTE(review): `r` is not referenced in the visible cells - confirm it is still needed.
r = speech_r.Recognizer() 

# Output locations for the pickles produced by the conversion step.
OUTPUT_DIR = '../output'
OUTPUT_DIR_TRAIN = os.path.join(OUTPUT_DIR, 'train')
OUTPUT_DIR_TEST = os.path.join(OUTPUT_DIR, 'test')
OUTPUT_DIR_RESAMPLED_TRAIN = os.path.join(OUTPUT_DIR, 'resampled_train')
OUTPUT_DIR_RESAMPLED_TEST = os.path.join(OUTPUT_DIR, 'resampled_test')

# Number of leading samples convert_data() skips before cutting its segment.
PAD_SIZE = 0
# Number of samples every clip has after resampling_audio().
TARGET_LENGTH = 200000
# Sample rate passed to librosa.load when reading the wav files.
TARGET_SR = 8000
# Passed as `duration` to librosa.load, i.e. an upper bound on how much of each file is read.
MAX_AUDIO_LENGTH = 50
# Segment length (in samples) that convert_data() extracts from each clip.
AUDIO_LENGTH = TARGET_LENGTH
# NOTE(review): SEGMENT_NUM is not referenced in the visible cells - confirm usage.
SEGMENT_NUM = 1

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [2]:
def mkdir_p(path):
    """Create ``path`` and any missing parent directories (like ``mkdir -p``).

    Calling it on a directory that already exists is a no-op.

    Raises:
        OSError: if ``path`` exists but is not a directory, or the directory
            cannot be created for another reason (e.g. permissions).
    """
    # exist_ok=True only tolerates an existing *directory*; an existing regular
    # file at ``path`` still raises, matching the previous errno-based check.
    os.makedirs(path, exist_ok=True)


def del_folder(path):
    """Best-effort recursive removal of the directory ``path``.

    A missing directory is silently accepted. Any other filesystem error is
    reported on stdout instead of being raised, so the notebook keeps running,
    but the problem is no longer hidden. Non-filesystem exceptions such as
    KeyboardInterrupt are no longer swallowed.
    """
    try:
        rmtree(path)
    except FileNotFoundError:
        # Nothing to delete - this is the expected case on a first run.
        pass
    except OSError as exc:
        # Stale files may remain; make that visible rather than failing silently.
        print("Could not fully remove {}: {}".format(path, exc))

### Delete pickle file & Make directory

In [31]:
# Destructive: removes the train/test output folders from any previous run,
# then recreates them empty.
del_folder(OUTPUT_DIR_TRAIN)
del_folder(OUTPUT_DIR_TEST)
mkdir_p(OUTPUT_DIR_TRAIN)
mkdir_p(OUTPUT_DIR_TEST)

In [32]:
# Destructive: removes the resampled train/test output folders from any
# previous run, then recreates them empty.
del_folder(OUTPUT_DIR_RESAMPLED_TRAIN)
del_folder(OUTPUT_DIR_RESAMPLED_TEST)
mkdir_p(OUTPUT_DIR_RESAMPLED_TRAIN)
mkdir_p(OUTPUT_DIR_RESAMPLED_TEST)

In [33]:
# Destructive: wipes the folder that holds the resampled-audio pickles and
# recreates it empty.
del_folder(DATA_RESAMPLED_AUDIO_DIR)
mkdir_p(DATA_RESAMPLED_AUDIO_DIR)

## Audio Resampling 

음성 길이를 TARGET_LENGTH로 맞추기 위한 전처리 진행   
Default Samplerate는 TARGET_SR 이며, TARGET_SR로 Audio를 load 하여 Audio Duration을 측정한다.  

**Audio Duration × Sample Rate = Result Audio Length**   

Audio Duration과 Result Audio Length(TARGET_LENGTH)를 통해 Resampling Sample Rate를 구한다.  
즉, **Resampling SR = TARGET_LENGTH / Audio Duration** 이다.  


In [3]:
def resampling_audio():
    """Resample every wav under DATA_AUDIO_DIR to exactly TARGET_LENGTH samples.

    Each file is loaded at TARGET_SR (at most MAX_AUDIO_LENGTH is read), then
    resampled to the rate ``TARGET_LENGTH / duration`` so that the result has
    roughly TARGET_LENGTH samples. It is then zero-padded or truncated to
    exactly TARGET_LENGTH. The result is pickled to
    ``DATA_RESAMPLED_AUDIO_DIR/<sub-folder>/<file stem>.pkl`` as a dict with
    keys ``'resampled_audio'`` and ``'resample_sr'``.

    Files that are empty, or that do not sit inside a sub-folder of
    DATA_AUDIO_DIR, are reported and skipped.
    """
    pattern = os.path.join(DATA_AUDIO_DIR, '**/**.wav')
    for i, wav_filename in enumerate(iglob(pattern, recursive=True)):
        y, sr = librosa.load(wav_filename, sr=TARGET_SR, mono=True, duration=MAX_AUDIO_LENGTH)
        audio_duration = len(y) / sr
        print(i, wav_filename)

        if audio_duration == 0:
            # An empty recording would make the rate below a division by zero.
            print("Skip empty audio file: {}\n".format(wav_filename))
            continue

        # Derive folder and file stem relative to the input root instead of
        # relying on fixed '/'-split positions in the full path.
        rel_parts = os.path.relpath(wav_filename, DATA_AUDIO_DIR).split(os.sep)
        if len(rel_parts) < 2:
            print("Skip file outside a sub-folder: {}\n".format(wav_filename))
            continue
        speaker_dir = rel_parts[0]
        # Use the whole stem; keeping only its first character would make
        # e.g. '10.wav' overwrite the output of '1.wav'.
        file_stem = os.path.splitext(rel_parts[-1])[0]

        resample_sr = TARGET_LENGTH / audio_duration
        # Keyword arguments work on both old and new librosa releases.
        resample = librosa.resample(y, orig_sr=sr, target_sr=resample_sr)
        print("Default SR : {}, Audio Length(Default SR) : {}, Audio Duration : {}".format(TARGET_SR, len(y), audio_duration))
        print("Audio Duration : {}, Resampling SR : {}, Result Audio Length : {}".format(audio_duration, resample_sr, len(resample)))

        if len(resample) < TARGET_LENGTH:
            # Padding must be 1-D like the signal; a (n, 1) array cannot be
            # concatenated with it.
            padding = np.zeros(TARGET_LENGTH - len(resample), dtype=resample.dtype)
            resample = np.concatenate((resample, padding))
        resample = resample[:TARGET_LENGTH]

        print("Final Audio Length : {}".format(len(resample)))

        # mkdir_p is a no-op for an existing directory, so no existence check is needed.
        speaker_out_dir = os.path.join(DATA_RESAMPLED_AUDIO_DIR, speaker_dir)
        mkdir_p(speaker_out_dir)
        output_filename = os.path.join(speaker_out_dir, file_stem + ".pkl")
        print("Output File Name: {}\n".format(output_filename))

        result = {'resampled_audio': resample,
                  'resample_sr': resample_sr}

        with open(output_filename, 'wb') as w:
            pickle.dump(result, w)


In [35]:
# Run the resampling pass over DATA_AUDIO_DIR; writes pickles under
# DATA_RESAMPLED_AUDIO_DIR and prints per-file progress.
resampling_audio()

0 ../vad_train_audio/2017019770032_kimdayeong/5.wav
Default SR : 8000, Audio Length(Default SR) : 275691, Audio Duration : 34.461375
Audio Duration : 34.461375, Resampling SR : 5803.598956803088, Result Audio Length : 200001
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770032_kimdayeong/5.pkl

1 ../vad_train_audio/2017019770032_kimdayeong/1.wav
Default SR : 8000, Audio Length(Default SR) : 276598, Audio Duration : 34.57475
Audio Duration : 34.57475, Resampling SR : 5784.568218136067, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770032_kimdayeong/1.pkl

2 ../vad_train_audio/2017019770032_kimdayeong/3.wav
Default SR : 8000, Audio Length(Default SR) : 263392, Audio Duration : 32.924
Audio Duration : 32.924, Resampling SR : 6074.596039363382, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770032_kimdayeong/3.pkl

3 ../vad_train_audio/201701

26 ../vad_train_audio/2017019880039_seojuyeon/1.wav
Default SR : 8000, Audio Length(Default SR) : 222816, Audio Duration : 27.852
Audio Duration : 27.852, Resampling SR : 7180.812868016659, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880039_seojuyeon/1.pkl

27 ../vad_train_audio/2017019880039_seojuyeon/3.wav
Default SR : 8000, Audio Length(Default SR) : 226827, Audio Duration : 28.353375
Audio Duration : 28.353375, Resampling SR : 7053.833979200007, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880039_seojuyeon/3.pkl

28 ../vad_train_audio/2017019880039_seojuyeon/2.wav
Default SR : 8000, Audio Length(Default SR) : 224928, Audio Duration : 28.116
Audio Duration : 28.116, Resampling SR : 7113.387395077536, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880039_seojuyeon/2.pkl

29 ../vad_train_audio/201701988003

52 ../vad_train_audio/2017019880034_leesujung/3.wav
Default SR : 8000, Audio Length(Default SR) : 240800, Audio Duration : 30.1
Audio Duration : 30.1, Resampling SR : 6644.518272425249, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880034_leesujung/3.pkl

53 ../vad_train_audio/2017019880034_leesujung/2.wav
Default SR : 8000, Audio Length(Default SR) : 232000, Audio Duration : 29.0
Audio Duration : 29.0, Resampling SR : 6896.551724137931, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880034_leesujung/2.pkl

54 ../vad_train_audio/2017019880034_leesujung/4.wav
Default SR : 8000, Audio Length(Default SR) : 241448, Audio Duration : 30.181
Audio Duration : 30.181, Resampling SR : 6626.685663165567, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880034_leesujung/4.pkl

55 ../vad_train_audio/2017019880032_seoyeonjoo/5

78 ../vad_train_audio/2017019880027_leesol/2.wav
Default SR : 8000, Audio Length(Default SR) : 203193, Audio Duration : 25.399125
Audio Duration : 25.399125, Resampling SR : 7874.28700791858, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880027_leesol/2.pkl

79 ../vad_train_audio/2017019880027_leesol/4.wav
Default SR : 8000, Audio Length(Default SR) : 198472, Audio Duration : 24.809
Audio Duration : 24.809, Resampling SR : 8061.590551815873, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880027_leesol/4.pkl

80 ../vad_train_audio/2017019880037_jusunghyun/5.wav
Default SR : 8000, Audio Length(Default SR) : 177543, Audio Duration : 22.192875
Audio Duration : 22.192875, Resampling SR : 9011.901342210056, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880037_jusunghyun/5.pkl

81 ../vad_train_audio/2017019880037_jus

104 ../vad_train_audio/2017019770030_leejeongju/4.wav
Default SR : 8000, Audio Length(Default SR) : 215308, Audio Duration : 26.9135
Audio Duration : 26.9135, Resampling SR : 7431.214817842347, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770030_leejeongju/4.pkl

105 ../vad_train_audio/2017019880022_kimjaein/5.wav
Default SR : 8000, Audio Length(Default SR) : 211638, Audio Duration : 26.45475
Audio Duration : 26.45475, Resampling SR : 7560.07900282558, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880022_kimjaein/5.pkl

106 ../vad_train_audio/2017019880022_kimjaein/1.wav
Default SR : 8000, Audio Length(Default SR) : 213760, Audio Duration : 26.72
Audio Duration : 26.72, Resampling SR : 7485.029940119761, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880022_kimjaein/1.pkl

107 ../vad_train_audio/2017019880022

131 ../vad_train_audio/2017019770035_leehyojin/1.wav
Default SR : 8000, Audio Length(Default SR) : 267968, Audio Duration : 33.496
Audio Duration : 33.496, Resampling SR : 5970.862192500596, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770035_leehyojin/1.pkl

132 ../vad_train_audio/2017019770035_leehyojin/3.wav
Default SR : 8000, Audio Length(Default SR) : 258656, Audio Duration : 32.332
Audio Duration : 32.332, Resampling SR : 6185.822095756526, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770035_leehyojin/3.pkl

133 ../vad_train_audio/2017019770035_leehyojin/2.wav
Default SR : 8000, Audio Length(Default SR) : 256352, Audio Duration : 32.044
Audio Duration : 32.044, Resampling SR : 6241.4180501810015, Result Audio Length : 200001
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770035_leehyojin/2.pkl

134 ../vad_train_audio/2017019770035

157 ../vad_train_audio/2017019880001_kimsubin/3.wav
Default SR : 8000, Audio Length(Default SR) : 171575, Audio Duration : 21.446875
Audio Duration : 21.446875, Resampling SR : 9325.367914906019, Result Audio Length : 200001
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880001_kimsubin/3.pkl

158 ../vad_train_audio/2017019880001_kimsubin/2.wav
Default SR : 8000, Audio Length(Default SR) : 177225, Audio Duration : 22.153125
Audio Duration : 22.153125, Resampling SR : 9028.071660318805, Result Audio Length : 200001
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880001_kimsubin/2.pkl

159 ../vad_train_audio/2017019880001_kimsubin/4.wav
Default SR : 8000, Audio Length(Default SR) : 175387, Audio Duration : 21.923375
Audio Duration : 21.923375, Resampling SR : 9122.68298106473, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880001_kimsubin/4.pkl

160 ../vad_train_audio/201

183 ../vad_train_audio/2017019740017_kwakmihyang/2.wav
Default SR : 8000, Audio Length(Default SR) : 213255, Audio Duration : 26.656875
Audio Duration : 26.656875, Resampling SR : 7502.754917821388, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740017_kwakmihyang/2.pkl

184 ../vad_train_audio/2017019740017_kwakmihyang/4.wav
Default SR : 8000, Audio Length(Default SR) : 204079, Audio Duration : 25.509875
Audio Duration : 25.509875, Resampling SR : 7840.101137304671, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740017_kwakmihyang/4.pkl

185 ../vad_train_audio/2017019880018_wonjoonho/5.wav
Default SR : 8000, Audio Length(Default SR) : 222080, Audio Duration : 27.76
Audio Duration : 27.76, Resampling SR : 7204.610951008645, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880018_wonjoonho/5.pkl

186 ../vad_train_au

209 ../vad_train_audio/2017019740021_kwakbokyeong/4.wav
Default SR : 8000, Audio Length(Default SR) : 310443, Audio Duration : 38.805375
Audio Duration : 38.805375, Resampling SR : 5153.925197218169, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740021_kwakbokyeong/4.pkl

210 ../vad_train_audio/2017019770017_hansohee/5.wav
Default SR : 8000, Audio Length(Default SR) : 244524, Audio Duration : 30.5655
Audio Duration : 30.5655, Resampling SR : 6543.32499059397, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770017_hansohee/5.pkl

211 ../vad_train_audio/2017019770017_hansohee/1.wav
Default SR : 8000, Audio Length(Default SR) : 263169, Audio Duration : 32.896125
Audio Duration : 32.896125, Resampling SR : 6079.743434827051, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770017_hansohee/1.pkl

212 ../vad_train_audio

235 ../vad_train_audio/2017019880031_ahnjiwoo/5.wav
Default SR : 8000, Audio Length(Default SR) : 237590, Audio Duration : 29.69875
Audio Duration : 29.69875, Resampling SR : 6734.29016372743, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880031_ahnjiwoo/5.pkl

236 ../vad_train_audio/2017019880031_ahnjiwoo/1.wav
Default SR : 8000, Audio Length(Default SR) : 247030, Audio Duration : 30.87875
Audio Duration : 30.87875, Resampling SR : 6476.946119904465, Result Audio Length : 200001
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880031_ahnjiwoo/1.pkl

237 ../vad_train_audio/2017019880031_ahnjiwoo/3.wav
Default SR : 8000, Audio Length(Default SR) : 232790, Audio Duration : 29.09875
Audio Duration : 29.09875, Resampling SR : 6873.147471970446, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880031_ahnjiwoo/3.pkl

238 ../vad_train_audio/201701988

261 ../vad_train_audio/2017019740007_leekyeongeun/1.wav
Default SR : 8000, Audio Length(Default SR) : 257846, Audio Duration : 32.23075
Audio Duration : 32.23075, Resampling SR : 6205.254299077744, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740007_leekyeongeun/1.pkl

262 ../vad_train_audio/2017019740007_leekyeongeun/3.wav
Default SR : 8000, Audio Length(Default SR) : 249120, Audio Duration : 31.14
Audio Duration : 31.14, Resampling SR : 6422.6075786769425, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740007_leekyeongeun/3.pkl

263 ../vad_train_audio/2017019740007_leekyeongeun/2.wav
Default SR : 8000, Audio Length(Default SR) : 260352, Audio Duration : 32.544
Audio Duration : 32.544, Resampling SR : 6145.526057030483, Result Audio Length : 200001
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740007_leekyeongeun/2.pkl

264 ../vad_train

287 ../vad_train_audio/2017019880019_wonsonghee/3.wav
Default SR : 8000, Audio Length(Default SR) : 256320, Audio Duration : 32.04
Audio Duration : 32.04, Resampling SR : 6242.197253433209, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880019_wonsonghee/3.pkl

288 ../vad_train_audio/2017019880019_wonsonghee/2.wav
Default SR : 8000, Audio Length(Default SR) : 248141, Audio Duration : 31.017625
Audio Duration : 31.017625, Resampling SR : 6447.946933396738, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880019_wonsonghee/2.pkl

289 ../vad_train_audio/2017019880019_wonsonghee/4.wav
Default SR : 8000, Audio Length(Default SR) : 242435, Audio Duration : 30.304375
Audio Duration : 30.304375, Resampling SR : 6599.7071379957515, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880019_wonsonghee/4.pkl

290 ../vad_train_aud

313 ../vad_train_audio/2017019770021_heoyoonjung/2.wav
Default SR : 8000, Audio Length(Default SR) : 173280, Audio Duration : 21.66
Audio Duration : 21.66, Resampling SR : 9233.610341643582, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770021_heoyoonjung/2.pkl

314 ../vad_train_audio/2017019770021_heoyoonjung/4.wav
Default SR : 8000, Audio Length(Default SR) : 169840, Audio Duration : 21.23
Audio Duration : 21.23, Resampling SR : 9420.631182289213, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770021_heoyoonjung/4.pkl

315 ../vad_train_audio/2017019740022_ahnhyojin/5.wav
Default SR : 8000, Audio Length(Default SR) : 249920, Audio Duration : 31.24
Audio Duration : 31.24, Resampling SR : 6402.048655569783, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740022_ahnhyojin/5.pkl

316 ../vad_train_audio/201701974002

339 ../vad_train_audio/2017019740040_kimjihyeon/4.wav
Default SR : 8000, Audio Length(Default SR) : 245446, Audio Duration : 30.68075
Audio Duration : 30.68075, Resampling SR : 6518.745467434792, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740040_kimjihyeon/4.pkl

340 ../vad_train_audio/2017019880020_yoonhyeeun/5.wav
Default SR : 8000, Audio Length(Default SR) : 208608, Audio Duration : 26.076
Audio Duration : 26.076, Resampling SR : 7669.888019634913, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880020_yoonhyeeun/5.pkl

341 ../vad_train_audio/2017019880020_yoonhyeeun/1.wav
Default SR : 8000, Audio Length(Default SR) : 207616, Audio Duration : 25.952
Audio Duration : 25.952, Resampling SR : 7706.5351418002465, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880020_yoonhyeeun/1.pkl

342 ../vad_train_audio/201

365 ../vad_train_audio/2017019770005_hyeonsanghyeok/5.wav
Default SR : 8000, Audio Length(Default SR) : 185718, Audio Duration : 23.21475
Audio Duration : 23.21475, Resampling SR : 8615.212311138393, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770005_hyeonsanghyeok/5.pkl

366 ../vad_train_audio/2017019770005_hyeonsanghyeok/1.wav
Default SR : 8000, Audio Length(Default SR) : 191904, Audio Duration : 23.988
Audio Duration : 23.988, Resampling SR : 8337.502084375521, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770005_hyeonsanghyeok/1.pkl

367 ../vad_train_audio/2017019770005_hyeonsanghyeok/3.wav
Default SR : 8000, Audio Length(Default SR) : 190251, Audio Duration : 23.781375
Audio Duration : 23.781375, Resampling SR : 8409.942654703524, Result Audio Length : 200001
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770005_hyeonsanghyeok/3.pk

391 ../vad_train_audio/2017019880012_kimsongyi/1.wav
Default SR : 8000, Audio Length(Default SR) : 225558, Audio Duration : 28.19475
Audio Duration : 28.19475, Resampling SR : 7093.519183535942, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880012_kimsongyi/1.pkl

392 ../vad_train_audio/2017019880012_kimsongyi/3.wav
Default SR : 8000, Audio Length(Default SR) : 211200, Audio Duration : 26.4
Audio Duration : 26.4, Resampling SR : 7575.757575757576, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880012_kimsongyi/3.pkl

393 ../vad_train_audio/2017019880012_kimsongyi/2.wav
Default SR : 8000, Audio Length(Default SR) : 226560, Audio Duration : 28.32
Audio Duration : 28.32, Resampling SR : 7062.146892655367, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880012_kimsongyi/2.pkl

394 ../vad_train_audio/2017019880012_ki

417 ../vad_train_audio/2017019740043_kimtaeri/3.wav
Default SR : 8000, Audio Length(Default SR) : 263339, Audio Duration : 32.917375
Audio Duration : 32.917375, Resampling SR : 6075.818621624598, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740043_kimtaeri/3.pkl

418 ../vad_train_audio/2017019740043_kimtaeri/2.wav
Default SR : 8000, Audio Length(Default SR) : 272480, Audio Duration : 34.06
Audio Duration : 34.06, Resampling SR : 5871.9906048150315, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740043_kimtaeri/2.pkl

419 ../vad_train_audio/2017019740043_kimtaeri/4.wav
Default SR : 8000, Audio Length(Default SR) : 266379, Audio Duration : 33.297375
Audio Duration : 33.297375, Resampling SR : 6006.479489749567, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740043_kimtaeri/4.pkl

420 ../vad_train_audio/201701974

443 ../vad_train_audio/2017019740012_chaeminjoon/2.wav
Default SR : 8000, Audio Length(Default SR) : 192939, Audio Duration : 24.117375
Audio Duration : 24.117375, Resampling SR : 8292.776473393145, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740012_chaeminjoon/2.pkl

444 ../vad_train_audio/2017019740012_chaeminjoon/4.wav
Default SR : 8000, Audio Length(Default SR) : 219296, Audio Duration : 27.412
Audio Duration : 27.412, Resampling SR : 7296.074711805049, Result Audio Length : 200001
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740012_chaeminjoon/4.pkl

445 ../vad_train_audio/2017019770009_yuminji/5.wav
Default SR : 8000, Audio Length(Default SR) : 230240, Audio Duration : 28.78
Audio Duration : 28.78, Resampling SR : 6949.270326615705, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770009_yuminji/5.pkl

446 ../vad_train_audio/201701

469 ../vad_train_audio/2017019740023_heosehun/4.wav
Default SR : 8000, Audio Length(Default SR) : 219904, Audio Duration : 27.488
Audio Duration : 27.488, Resampling SR : 7275.902211874272, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740023_heosehun/4.pkl

470 ../vad_train_audio/2017019770003_johaesu/5.wav
Default SR : 8000, Audio Length(Default SR) : 211648, Audio Duration : 26.456
Audio Duration : 26.456, Resampling SR : 7559.721802237677, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770003_johaesu/5.pkl

471 ../vad_train_audio/2017019770003_johaesu/1.wav
Default SR : 8000, Audio Length(Default SR) : 225568, Audio Duration : 28.196
Audio Duration : 28.196, Resampling SR : 7093.204709887927, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770003_johaesu/1.pkl

472 ../vad_train_audio/2017019770003_johaesu/3.

495 ../vad_train_audio/2017019880029_kimminji/5.wav
Default SR : 8000, Audio Length(Default SR) : 234231, Audio Duration : 29.278875
Audio Duration : 29.278875, Resampling SR : 6830.863549231314, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880029_kimminji/5.pkl

496 ../vad_train_audio/2017019880029_kimminji/1.wav
Default SR : 8000, Audio Length(Default SR) : 236800, Audio Duration : 29.6
Audio Duration : 29.6, Resampling SR : 6756.756756756757, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880029_kimminji/1.pkl

497 ../vad_train_audio/2017019880029_kimminji/3.wav
Default SR : 8000, Audio Length(Default SR) : 220485, Audio Duration : 27.560625
Audio Duration : 27.560625, Resampling SR : 7256.729482731251, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880029_kimminji/3.pkl

498 ../vad_train_audio/201701988002

521 ../vad_train_audio/2017019740036_leeyongseok/1.wav
Default SR : 8000, Audio Length(Default SR) : 226656, Audio Duration : 28.332
Audio Duration : 28.332, Resampling SR : 7059.155724975293, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740036_leeyongseok/1.pkl

522 ../vad_train_audio/2017019740036_leeyongseok/3.wav
Default SR : 8000, Audio Length(Default SR) : 204704, Audio Duration : 25.588
Audio Duration : 25.588, Resampling SR : 7816.16382679381, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740036_leeyongseok/3.pkl

523 ../vad_train_audio/2017019740036_leeyongseok/2.wav
Default SR : 8000, Audio Length(Default SR) : 194720, Audio Duration : 24.34
Audio Duration : 24.34, Resampling SR : 8216.926869350862, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740036_leeyongseok/2.pkl

524 ../vad_train_audio/20170

547 ../vad_train_audio/2017019770024_chuminha/3.wav
Default SR : 8000, Audio Length(Default SR) : 210394, Audio Duration : 26.29925
Audio Duration : 26.29925, Resampling SR : 7604.779603981102, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770024_chuminha/3.pkl

548 ../vad_train_audio/2017019770024_chuminha/2.wav
Default SR : 8000, Audio Length(Default SR) : 219890, Audio Duration : 27.48625
Audio Duration : 27.48625, Resampling SR : 7276.365455455, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770024_chuminha/2.pkl

549 ../vad_train_audio/2017019770024_chuminha/4.wav
Default SR : 8000, Audio Length(Default SR) : 197466, Audio Duration : 24.68325
Audio Duration : 24.68325, Resampling SR : 8102.660711211043, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770024_chuminha/4.pkl

550 ../vad_train_audio/20170197400

573 ../vad_train_audio/2017019880042_namyeji/2.wav
Default SR : 8000, Audio Length(Default SR) : 249248, Audio Duration : 31.156
Audio Duration : 31.156, Resampling SR : 6419.309282321223, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880042_namyeji/2.pkl

574 ../vad_train_audio/2017019880042_namyeji/4.wav
Default SR : 8000, Audio Length(Default SR) : 231392, Audio Duration : 28.924
Audio Duration : 28.924, Resampling SR : 6914.6729359701285, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019880042_namyeji/4.pkl

575 ../vad_train_audio/2017019740020_kimseongje/5.wav
Default SR : 8000, Audio Length(Default SR) : 199712, Audio Duration : 24.964
Audio Duration : 24.964, Resampling SR : 8011.536612722321, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740020_kimseongje/5.pkl

576 ../vad_train_audio/2017019740020_kimse

599 ../vad_train_audio/2017019770038_kanghyeyun/4.wav
Default SR : 8000, Audio Length(Default SR) : 212627, Audio Duration : 26.578375
Audio Duration : 26.578375, Resampling SR : 7524.914521674105, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019770038_kanghyeyun/4.pkl

600 ../vad_train_audio/2017019740016_kangyeseo/5.wav
Default SR : 8000, Audio Length(Default SR) : 208790, Audio Duration : 26.09875
Audio Duration : 26.09875, Resampling SR : 7663.202260644667, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740016_kangyeseo/5.pkl

601 ../vad_train_audio/2017019740016_kangyeseo/1.wav
Default SR : 8000, Audio Length(Default SR) : 217931, Audio Duration : 27.241375
Audio Duration : 27.241375, Resampling SR : 7341.773313571726, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740016_kangyeseo/1.pkl

602 ../vad_train_au

625 ../vad_train_audio/2017019740014_leesumin/5.wav
Default SR : 8000, Audio Length(Default SR) : 239403, Audio Duration : 29.925375
Audio Duration : 29.925375, Resampling SR : 6683.291353909517, Result Audio Length : 200001
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740014_leesumin/5.pkl

626 ../vad_train_audio/2017019740014_leesumin/1.wav
Default SR : 8000, Audio Length(Default SR) : 208502, Audio Duration : 26.06275
Audio Duration : 26.06275, Resampling SR : 7673.787301800462, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740014_leesumin/1.pkl

627 ../vad_train_audio/2017019740014_leesumin/3.wav
Default SR : 8000, Audio Length(Default SR) : 239875, Audio Duration : 29.984375
Audio Duration : 29.984375, Resampling SR : 6670.140698280355, Result Audio Length : 200000
Final Audio Length : 200000
Output File Name: ../resampled_train_data/2017019740014_leesumin/3.pkl

628 ../vad_train_audio/2017

In [4]:
order = 0
# Sort again here so the mapping does not depend on which cell ran last.
list_dir.sort()
# Map each entry of the raw-audio folder to its position in the sorted listing.
class_ids = {entry: position for position, entry in enumerate(list_dir)}

def extract_class_id(wav_filename):
    """Return the class id for a file path, or None if its folder is unknown.

    The third '/'-separated component of ``wav_filename`` is used as the key
    into ``class_ids`` (for the paths used in this notebook, ``'../<root>/<folder>/<file>'``,
    that is the ``<folder>`` part).
    """
    path_components = wav_filename.split('/')
    folder_key = path_components[2]
    return class_ids.get(folder_key)

def read_audio_from_filename(filename, target_sr):
    """Load an audio file as mono at ``target_sr`` and return it as a column vector.

    Args:
        filename: path of the audio file to read.
        target_sr: sample rate to load the audio at. Previously this argument
            was ignored and the global TARGET_SR was always used.

    Returns:
        Array of shape (n_samples, 1).
    """
    audio, _ = librosa.load(filename, sr=target_sr, mono=True)
    return audio.reshape(-1, 1)

def get_audio_buf(filename):
    """Read a pickle written by the resampling step.

    Returns:
        Tuple ``(audio, sample_rate)`` taken from the pickled dict's
        ``'resampled_audio'`` and ``'resample_sr'`` entries.

    Note: ``pickle.load`` can execute arbitrary code, so only use this on
    files produced by this notebook.
    """
    with open(filename, 'rb') as handle:
        payload = pickle.load(handle)
    return payload['resampled_audio'], payload['resample_sr']

def convert_data():
    """Normalise every resampled training pickle and write model-ready samples.

    For each ``*.pkl`` found below ``DATA_RESAMPLED_AUDIO_DIR`` the audio is
    standardised to zero mean / unit variance, cut to ``AUDIO_LENGTH`` samples
    (skipping the first ``PAD_SIZE``) and pickled as a dict with the keys
    ``'class_id'``, ``'audio'`` and ``'sr'``. Files whose path ends in
    ``5.pkl`` go to ``OUTPUT_DIR_RESAMPLED_TEST``; all others go to
    ``OUTPUT_DIR_RESAMPLED_TRAIN``. Recordings shorter than
    ``AUDIO_LENGTH + PAD_SIZE`` are reported and not written.

    The output file name is the input path relative to the data root with
    path separators replaced by ``_`` (``<speaker>/4.pkl`` becomes
    ``<speaker>_4.pkl``).
    """
    pattern = os.path.join(DATA_RESAMPLED_AUDIO_DIR, '**/**.pkl')
    for i, wav_filename in enumerate(iglob(pattern, recursive=True)):
        class_id = extract_class_id(wav_filename)
        audio_buf, _ = get_audio_buf(wav_filename)
        print(type(audio_buf))

        # normalize mean 0, variance 1; a constant signal has zero variance and
        # is only centred so the result does not become NaN/inf.
        std = np.std(audio_buf)
        centred = audio_buf - np.mean(audio_buf)
        audio_buf = centred / std if std > 0 else centred
        original_length = len(audio_buf)
        print(i, wav_filename, original_length, np.round(np.mean(audio_buf), 4), np.std(audio_buf))

        if original_length >= AUDIO_LENGTH + PAD_SIZE:
            audio_seg = audio_buf[PAD_SIZE : AUDIO_LENGTH + PAD_SIZE]
            print("Audio Segment Length : {}".format(len(audio_seg)))

            # Every speaker's recording number 5 is held out as test data.
            if wav_filename.endswith('5.pkl'):
                output_folder = OUTPUT_DIR_RESAMPLED_TEST
            else:
                output_folder = OUTPUT_DIR_RESAMPLED_TRAIN

            # Derive the name from the path relative to the data root instead
            # of a hard-coded character offset tied to the root's length.
            rel_stem = os.path.splitext(os.path.relpath(wav_filename, DATA_RESAMPLED_AUDIO_DIR))[0]
            flat_name = rel_stem.replace(os.sep, '_').replace('/', '_')
            output_filename = os.path.join(output_folder, flat_name + '.pkl')

            out = {'class_id': class_id,
                   'audio': audio_seg,
                   'sr': TARGET_SR}

            with open(output_filename, 'wb') as w:
                pickle.dump(out, w)

            print("Output File Name : "+output_filename)
        else:
            print('Drop Audio Segment, Audio length={}'.format(len(audio_buf)))

        print("-----------------")
      

### Convert Data

In [5]:
# Write the normalised train/test pickles built from the resampled audio.
convert_data()

<class 'numpy.ndarray'>
0 ../resampled_train_data/2017019770032_kimdayeong/4.pkl 200000 0.0 0.99999994
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019770032_kimdayeong_4.pkl
-----------------
<class 'numpy.ndarray'>
1 ../resampled_train_data/2017019770032_kimdayeong/3.pkl 200000 0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019770032_kimdayeong_3.pkl
-----------------
<class 'numpy.ndarray'>
2 ../resampled_train_data/2017019770032_kimdayeong/1.pkl 200000 0.0 0.99999994
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019770032_kimdayeong_1.pkl
-----------------
<class 'numpy.ndarray'>
3 ../resampled_train_data/2017019770032_kimdayeong/5.pkl 200000 0.0 0.99999994
Audio Segment Length : 200000
Output File Name : ../output/resampled_test/2017019770032_kimdayeong_5.pkl
-----------------
<class 'numpy.ndarray'>
4 ../resampled_train_data/2017019770032_kimdayeong/2.pkl 200000 -0.0 1.0
Aud

Output File Name : ../output/resampled_train/2017019880022_kimjaein_4.pkl
-----------------
<class 'numpy.ndarray'>
106 ../resampled_train_data/2017019880022_kimjaein/3.pkl 200000 -0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019880022_kimjaein_3.pkl
-----------------
<class 'numpy.ndarray'>
107 ../resampled_train_data/2017019880022_kimjaein/1.pkl 200000 -0.0 0.99999994
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019880022_kimjaein_1.pkl
-----------------
<class 'numpy.ndarray'>
108 ../resampled_train_data/2017019880022_kimjaein/5.pkl 200000 -0.0 0.99999994
Audio Segment Length : 200000
Output File Name : ../output/resampled_test/2017019880022_kimjaein_5.pkl
-----------------
<class 'numpy.ndarray'>
109 ../resampled_train_data/2017019880022_kimjaein/2.pkl 200000 -0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019880022_kimjaein_2.pkl
-----------------
<class 'numpy.ndarr

Output File Name : ../output/resampled_train/2017019770004_leebyeongjin_3.pkl
-----------------
<class 'numpy.ndarray'>
217 ../resampled_train_data/2017019770004_leebyeongjin/1.pkl 200000 -0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019770004_leebyeongjin_1.pkl
-----------------
<class 'numpy.ndarray'>
218 ../resampled_train_data/2017019770004_leebyeongjin/5.pkl 200000 0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_test/2017019770004_leebyeongjin_5.pkl
-----------------
<class 'numpy.ndarray'>
219 ../resampled_train_data/2017019770004_leebyeongjin/2.pkl 200000 -0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019770004_leebyeongjin_2.pkl
-----------------
<class 'numpy.ndarray'>
220 ../resampled_train_data/2017019740005_moonyeonwoo/4.pkl 200000 0.0 0.9999999
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019740005_moonyeonwoo_4.pkl
-------------

Output File Name : ../output/resampled_train/2017019880020_yoonhyeeun_3.pkl
-----------------
<class 'numpy.ndarray'>
342 ../resampled_train_data/2017019880020_yoonhyeeun/1.pkl 200000 -0.0 1.0000001
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019880020_yoonhyeeun_1.pkl
-----------------
<class 'numpy.ndarray'>
343 ../resampled_train_data/2017019880020_yoonhyeeun/5.pkl 200000 -0.0 0.9999999
Audio Segment Length : 200000
Output File Name : ../output/resampled_test/2017019880020_yoonhyeeun_5.pkl
-----------------
<class 'numpy.ndarray'>
344 ../resampled_train_data/2017019880020_yoonhyeeun/2.pkl 200000 0.0 0.99999994
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019880020_yoonhyeeun_2.pkl
-----------------
<class 'numpy.ndarray'>
345 ../resampled_train_data/2017019880003_kimminji/4.pkl 200000 0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019880003_kimminji_4.pkl
-----------------
<c

Output File Name : ../output/resampled_train/2017019740023_heosehun_1.pkl
-----------------
<class 'numpy.ndarray'>
468 ../resampled_train_data/2017019740023_heosehun/5.pkl 200000 0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_test/2017019740023_heosehun_5.pkl
-----------------
<class 'numpy.ndarray'>
469 ../resampled_train_data/2017019740023_heosehun/2.pkl 200000 -0.0 0.9999999
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019740023_heosehun_2.pkl
-----------------
<class 'numpy.ndarray'>
470 ../resampled_train_data/2017019770003_johaesu/4.pkl 200000 -0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019770003_johaesu_4.pkl
-----------------
<class 'numpy.ndarray'>
471 ../resampled_train_data/2017019770003_johaesu/3.pkl 200000 0.0 0.9999999
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019770003_johaesu_3.pkl
-----------------
<class 'numpy.ndarray'>
472

Output File Name : ../output/resampled_train/2017019770006_jennie_1.pkl
-----------------
<class 'numpy.ndarray'>
593 ../resampled_train_data/2017019770006_jennie/5.pkl 200000 0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_test/2017019770006_jennie_5.pkl
-----------------
<class 'numpy.ndarray'>
594 ../resampled_train_data/2017019770006_jennie/2.pkl 200000 -0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019770006_jennie_2.pkl
-----------------
<class 'numpy.ndarray'>
595 ../resampled_train_data/2017019770038_kanghyeyun/4.pkl 200000 0.0 1.0
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019770038_kanghyeyun_4.pkl
-----------------
<class 'numpy.ndarray'>
596 ../resampled_train_data/2017019770038_kanghyeyun/3.pkl 200000 -0.0 0.9999999
Audio Segment Length : 200000
Output File Name : ../output/resampled_train/2017019770038_kanghyeyun_3.pkl
-----------------
<class 'numpy.ndarray'>
597 ../

In [6]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Lambda, Bidirectional, LSTM
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow import convert_to_tensor
from tensorflow import expand_dims
import numpy as np
import pickle
import os
from glob import glob
import tensorflow

# Show which TensorFlow version this run uses.
print(tensorflow.__version__)

2.4.0-dev20200717


In [7]:
# Rebuild the label mapping from the resampled-audio folders: one class per
# directory entry, numbered in sorted order.
list_dir = sorted(os.listdir(DATA_RESAMPLED_AUDIO_DIR))

class_ids = {speaker_dir: label for label, speaker_dir in enumerate(list_dir)}

In [8]:
def m5(num_classes):
    """Build the (uncompiled) M5 1-D convolutional classifier.

    The network stacks four Conv1D -> BatchNormalization -> ReLU ->
    MaxPooling1D(4) stages (128, 128, 256 and 512 filters), averages the
    result over axis 1 and ends with a softmax ``Dense`` layer of
    ``num_classes`` units. The first convolution declares an input shape of
    ``[AUDIO_LENGTH, 1]``.

    Args:
        num_classes: Number of units of the final softmax layer.

    Returns:
        The assembled ``Sequential`` model.
    """
    print("Class Num", num_classes)
    print('Using Model M5')

    # (filters, kernel_size, strides, extra Conv1D keyword arguments) for each
    # stage; only the first stage declares the input shape.
    stages = [
        (128, 80, 4, {'input_shape': [AUDIO_LENGTH, 1]}),
        (128, 3, 1, {}),
        (256, 3, 1, {}),
        (512, 3, 1, {}),
    ]

    m = Sequential()
    for filters, kernel_size, strides, extra in stages:
        m.add(Conv1D(filters,
                     kernel_size=kernel_size,
                     strides=strides,
                     padding='same',
                     kernel_initializer='glorot_uniform',
                     kernel_regularizer=regularizers.l2(l=0.0001),
                     **extra))
        m.add(BatchNormalization())
        m.add(Activation('relu'))
        m.add(MaxPooling1D(pool_size=4, strides=None))

    m.add(Lambda(lambda x: K.mean(x, axis=1)))  # Same as GAP for 1D Conv Layer
    m.add(Dense(num_classes, activation='softmax'))
    return m

In [9]:
def get_data(file_list):
    """Load pickled samples and return ``(audio_array, label_array)``.

    Each path in ``file_list`` is unpickled; its ``'audio'`` entry is collected
    as a feature and its ``'class_id'`` entry (cast to ``int``) as a label.
    Both collections are converted with ``np.array``, keeping input order.
    """
    features, labels = [], []
    for path in file_list:
        with open(path, 'rb') as handle:
            sample = pickle.load(handle)
        features.append(sample['audio'])
        labels.append(int(sample['class_id']))
    return np.array(features), np.array(labels)

In [10]:
# One output class per speaker folder.
num_classes = len(list_dir)
model = m5(num_classes)

# Sanity check kept from the original cell; raise so the failure shows up as a
# normal cell error instead of calling the interactive-shell exit() helper.
if model is None:
    raise RuntimeError('Something went wrong!!')

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

Class Num 128
Using Model M5
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 50000, 128)        10368     
_________________________________________________________________
batch_normalization (BatchNo (None, 50000, 128)        512       
_________________________________________________________________
activation (Activation)      (None, 50000, 128)        0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 12500, 128)        0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 12500, 128)        49280     
_________________________________________________________________
batch_normalization_1 (Batch (None, 12500, 128)        512       
_________________________________________________________________
activation_1 (Activation)  

In [11]:
# Training split: every pickle in the resampled-train output folder.
train_pattern = os.path.join(OUTPUT_DIR_RESAMPLED_TRAIN, '**.pkl')
train_files = glob(train_pattern)
print(train_pattern)
x_tr, y_tr = get_data(train_files)

# One-hot labels and a trailing channel axis: (samples, length, 1).
y_tr = to_categorical(y_tr, num_classes=num_classes)
x_tr = x_tr.reshape(-1, x_tr.shape[1], 1)


# Held-out split: every pickle in the resampled-test output folder.
test_pattern = os.path.join(OUTPUT_DIR_RESAMPLED_TEST, '**.pkl')
test_files = glob(test_pattern)
x_te, y_te = get_data(test_files)

y_te = to_categorical(y_te, num_classes=num_classes)
x_te = x_te.reshape(-1, x_te.shape[1], 1)


for shape_label, shaped_array in (('x_tr.shape =', x_tr),
                                  ('y_tr.shape =', y_tr),
                                  ('x_te.shape =', x_te),
                                  ('y_te.shape =', y_te)):
    print(shape_label, shaped_array.shape)

../output/resampled_train/**.pkl
x_tr.shape = (512, 200000, 1)
y_tr.shape = (512, 128)
x_te.shape = (128, 200000, 1)
y_te.shape = (128, 128)


In [None]:
# if the accuracy does not increase over 10 epochs, reduce the learning rate by half.
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=10, min_lr=0.00005, verbose=1)
# Single source of truth for the batch size. fit() was hard-coded to 16 (the
# log shows 32 steps per epoch for 512 samples) while this variable said 128;
# the value that was really used is kept so training is unchanged.
batch_size = 16
history = model.fit(x=x_tr, y=y_tr, batch_size=batch_size, epochs=100, verbose=2, shuffle=True, validation_data=(x_te, y_te), callbacks=[reduce_lr])


Epoch 1/100
32/32 - 65s - loss: 5.2071 - accuracy: 0.0293 - val_loss: 4.6971 - val_accuracy: 0.0469
Epoch 2/100
32/32 - 65s - loss: 3.5816 - accuracy: 0.1621 - val_loss: 4.5836 - val_accuracy: 0.0469
Epoch 3/100
32/32 - 65s - loss: 2.6928 - accuracy: 0.3828 - val_loss: 4.1882 - val_accuracy: 0.0938
Epoch 4/100
32/32 - 65s - loss: 2.0521 - accuracy: 0.5332 - val_loss: 3.7528 - val_accuracy: 0.2344
Epoch 5/100
32/32 - 65s - loss: 1.6523 - accuracy: 0.6484 - val_loss: 3.1778 - val_accuracy: 0.3203
Epoch 6/100
32/32 - 65s - loss: 1.4243 - accuracy: 0.7129 - val_loss: 2.3812 - val_accuracy: 0.5469
Epoch 7/100
32/32 - 66s - loss: 1.1687 - accuracy: 0.7812 - val_loss: 2.2153 - val_accuracy: 0.5156
Epoch 8/100
32/32 - 66s - loss: 1.0033 - accuracy: 0.7871 - val_loss: 2.1958 - val_accuracy: 0.4688
Epoch 9/100
32/32 - 66s - loss: 0.8690 - accuracy: 0.8457 - val_loss: 1.6257 - val_accuracy: 0.6484
Epoch 10/100
32/32 - 74s - loss: 0.7324 - accuracy: 0.8848 - val_loss: 1.3913 - val_accuracy: 0.6953

In [None]:
import matplotlib.pyplot as plt
# Training vs. validation accuracy per epoch.
plt.figure(figsize=(14, 6))
plt.rc('font', size=18)

for metric_key, curve_label in (("accuracy", "train_accuracy"),
                                ("val_accuracy", "val_accuracy")):
    plt.plot(history.history[metric_key], label=curve_label)
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.show()

In [None]:
import pickle
from glob import iglob
import numpy as np
import librosa
from shutil import rmtree
from constants import *


# Root folder and per-split sub-folders for validation outputs.
OUTPUT_VAL_DIR = '../output_val'
OUTPUT_DIR_VAL_TRAIN = os.path.join(OUTPUT_VAL_DIR, 'train')
OUTPUT_DIR_VAL_TEST = os.path.join(OUTPUT_VAL_DIR, 'test')

# Source folder that is scanned for validation WAV files.
DATA_AUDIO_VAL_DIR = '../vad_test_audio'

# Per-split folders that receive the samples converted from resampled validation audio.
OUTPUT_DIR_RESAMPLED_VAL_TRAIN = os.path.join(OUTPUT_VAL_DIR, 'resampled_train')
OUTPUT_DIR_RESAMPLED_VAL_TEST = os.path.join(OUTPUT_VAL_DIR, 'resampled_test')

# Folder that receives the length-normalised (resampled) validation pickles.
DATA_RESAMPLED_AUDIO_VAL_DIR = '../resampled_val_data'

In [None]:
# Recreate the resampled-validation folder from scratch.
del_folder(DATA_RESAMPLED_AUDIO_VAL_DIR)
mkdir_p(DATA_RESAMPLED_AUDIO_VAL_DIR)

In [None]:
# Start from empty validation output folders: remove both, then recreate both.
_val_output_dirs = (OUTPUT_DIR_VAL_TRAIN, OUTPUT_DIR_VAL_TEST)
for _folder in _val_output_dirs:
    del_folder(_folder)
for _folder in _val_output_dirs:
    mkdir_p(_folder)

In [None]:
# Start from empty resampled-validation output folders: remove both, then
# recreate both.
_resampled_val_dirs = (OUTPUT_DIR_RESAMPLED_VAL_TRAIN, OUTPUT_DIR_RESAMPLED_VAL_TEST)
for _folder in _resampled_val_dirs:
    del_folder(_folder)
for _folder in _resampled_val_dirs:
    mkdir_p(_folder)

In [None]:
def resampling_val_audio():
    """Resample every validation WAV to exactly ``TARGET_LENGTH`` samples.

    Each ``*.wav`` found below ``DATA_AUDIO_VAL_DIR`` is loaded as mono at
    ``TARGET_SR`` (at most ``MAX_AUDIO_LENGTH`` seconds), resampled to the rate
    at which its duration corresponds to ``TARGET_LENGTH`` samples, zero-padded
    or truncated to that exact length, and pickled as a dict with the keys
    ``'resampled_audio'`` and ``'resample_sr'`` to
    ``DATA_RESAMPLED_AUDIO_VAL_DIR/<speaker>/<c>.pkl``, where ``<speaker>`` is
    the WAV's parent folder name and ``<c>`` the first character of its file
    name. Files that decode to zero samples are reported and skipped.
    """
    pattern = os.path.join(DATA_AUDIO_VAL_DIR, '**/**.wav')
    for i, wav_filename in enumerate(iglob(pattern, recursive=True)):
        y, sr = librosa.load(wav_filename, sr=TARGET_SR, mono=True, duration=MAX_AUDIO_LENGTH)
        if len(y) == 0:
            # Zero samples means zero duration, which would divide by zero below.
            print(i, wav_filename)
            print("Skip empty audio file\n")
            continue
        audio_duration = len(y) / sr

        resample_sr = TARGET_LENGTH / audio_duration
        # Keyword form: newer librosa releases reject positional sample rates.
        resample = librosa.resample(y, orig_sr=sr, target_sr=resample_sr)
        print(i, wav_filename)
        print("Default SR : {}, Audio Length(Default SR) : {}, Audio Duration : {}".format(TARGET_SR, len(y), audio_duration))
        print("Audio Duration : {}, Resampling SR : {}, Result Audio Length : {}".format(audio_duration, resample_sr, len(resample)))

        missing = TARGET_LENGTH - len(resample)
        if missing > 0:
            # The padding must have the same rank as the audio; the former
            # (n, 1)-shaped zeros could not be concatenated onto 1-D samples.
            padding = np.zeros((missing,) + resample.shape[1:], dtype=resample.dtype)
            resample = np.concatenate((resample, padding))
        resample = resample[:TARGET_LENGTH]

        print("Final Audio Length : {}".format(len(resample)))

        # Mirror the <speaker>/<file> layout of the input folder.
        speaker = os.path.basename(os.path.dirname(wav_filename))
        speaker_dir = os.path.join(DATA_RESAMPLED_AUDIO_VAL_DIR, speaker)
        os.makedirs(speaker_dir, exist_ok=True)
        output_filename = os.path.join(speaker_dir, os.path.basename(wav_filename)[0] + ".pkl")
        print("Output File Name: {}\n".format(output_filename))

        result = {'resampled_audio': resample,
                  'resample_sr': resample_sr}

        with open(output_filename, 'wb') as w:
            pickle.dump(result, w)

In [None]:
# Write the fixed-length validation pickles into DATA_RESAMPLED_AUDIO_VAL_DIR.
resampling_val_audio()

In [None]:
# Validation preprocessing must match the training configuration: the model
# is built with input_shape=[AUDIO_LENGTH, 1] for AUDIO_LENGTH = 200000, so
# validation clips need the same length (100000 here would cut each clip in
# half and no longer match the model's declared input).
PAD_SIZE = 0
TARGET_LENGTH = 200000
TARGET_SR = 8000
MAX_AUDIO_LENGTH = 50
AUDIO_LENGTH = TARGET_LENGTH
SEGMENT_NUM = 1

In [None]:
def convert_val_data():
    """Normalise every resampled validation pickle and write model-ready samples.

    For each ``*.pkl`` found below ``DATA_RESAMPLED_AUDIO_VAL_DIR`` the audio
    is standardised to zero mean / unit variance, cut to ``AUDIO_LENGTH``
    samples (skipping the first ``PAD_SIZE``) and pickled as a dict with the
    keys ``'class_id'``, ``'audio'`` and ``'sr'``. Files whose path ends in
    ``5.pkl`` go to ``OUTPUT_DIR_RESAMPLED_VAL_TEST``; all others go to
    ``OUTPUT_DIR_RESAMPLED_VAL_TRAIN``. Recordings shorter than
    ``AUDIO_LENGTH + PAD_SIZE`` are reported and not written.

    The output file name is the input path relative to the data root with
    path separators replaced by ``_`` (``<speaker>/4.pkl`` becomes
    ``<speaker>_4.pkl``).
    """
    pattern = os.path.join(DATA_RESAMPLED_AUDIO_VAL_DIR, '**/**.pkl')
    for i, wav_filename in enumerate(iglob(pattern, recursive=True)):
        class_id = extract_class_id(wav_filename)
        audio_buf, _ = get_audio_buf(wav_filename)
        print(type(audio_buf))

        # normalize mean 0, variance 1; a constant signal has zero variance and
        # is only centred so the result does not become NaN/inf.
        std = np.std(audio_buf)
        centred = audio_buf - np.mean(audio_buf)
        audio_buf = centred / std if std > 0 else centred
        original_length = len(audio_buf)
        print(i, wav_filename, original_length, np.round(np.mean(audio_buf), 4), np.std(audio_buf))

        if original_length >= AUDIO_LENGTH + PAD_SIZE:
            audio_seg = audio_buf[PAD_SIZE : AUDIO_LENGTH + PAD_SIZE]
            print("Audio Segment Length : {}".format(len(audio_seg)))

            # Every speaker's recording number 5 goes to the separate test folder.
            if wav_filename.endswith('5.pkl'):
                output_folder = OUTPUT_DIR_RESAMPLED_VAL_TEST
            else:
                output_folder = OUTPUT_DIR_RESAMPLED_VAL_TRAIN

            # Derive the name from the path relative to the validation root.
            # The former fixed offset of 24 characters matched the training
            # root and cut the first characters off the speaker name here.
            rel_stem = os.path.splitext(os.path.relpath(wav_filename, DATA_RESAMPLED_AUDIO_VAL_DIR))[0]
            flat_name = rel_stem.replace(os.sep, '_').replace('/', '_')
            output_filename = os.path.join(output_folder, flat_name + '.pkl')

            out = {'class_id': class_id,
                   'audio': audio_seg,
                   'sr': TARGET_SR}

            with open(output_filename, 'wb') as w:
                pickle.dump(out, w)

            print("Output File Name : "+output_filename)
        else:
            print('Drop Audio Segment, Audio length={}'.format(len(audio_buf)))

        print("-----------------")

# Run the validation conversion when this code is executed as the main module.
if __name__ == '__main__':
    convert_val_data()

In [None]:
from sklearn.metrics import accuracy_score

# Collect the converted validation samples in a deterministic (sorted) order.
val_files = sorted(glob(os.path.join(OUTPUT_DIR_RESAMPLED_VAL_TRAIN, '**.pkl')))
x_val, y_val = get_data(val_files)
# Add a trailing channel axis: (samples, length, 1).
x_val = x_val.reshape(-1, x_val.shape[1], 1)

print('y_val : ', y_val)
print(x_val.shape, y_val.shape)


In [None]:
pred_out = model.predict(x_val)

# Class index -> speaker folder name, in the same sorted order used to build
# the labels.
list_dir = os.listdir(DATA_RESAMPLED_AUDIO_DIR)
list_dir.sort()

# Predicted class per sample (arg-max over the model scores) and the matching
# ground-truth labels, both as plain ints. The labels already are class
# indices, so no name -> index round trip through list_dir is needed.
pred_out_idex = [int(np.argmax(pred)) for pred in pred_out]
real = [int(label) for label in y_val]

for pred, pred_idx, real_idx in zip(pred_out, pred_out_idex, real):
    # Red text for a wrong prediction, black text for a correct one.
    color = '\033[30m' if pred_idx == real_idx else '\033[31m'
    print(color+"Predict :"+str(list_dir[pred_idx])+" , Real :"+str(list_dir[real_idx])+", Likelihood :"+str(np.max(pred)))

# get the accuracy (the printed number is the accuracy in percent)
print ('\n\033[30m'+"Final Predict Likelihood : "+format(accuracy_score(real, pred_out_idex)*100, ".4f"))