# Import libraries & define functions


In [3]:
import os
import pickle
from scipy import signal
from scipy.io import wavfile
import numpy as np


# Dataset locations, resolved against the current working directory.
# Both strings end with '/' because later code appends names to them directly.
DATA_ROOT = '{}/dataset/'.format(os.getcwd())
train_audio_path = '{}train/audio/'.format(DATA_ROOT)


def print_bar(ratio):
    """Print a fixed-width text progress bar for ``ratio`` on the current line.

    The bar is 20 characters wide between square brackets. Completed cells
    are drawn with '=', the leading cell with '>', and the remainder with
    spaces. The output ends with a carriage return and no newline, so the
    next call overwrites the same line.

    Args:
        ratio: Fraction of work done. Values outside [0, 1] are clamped so
            the bar always keeps its fixed width (previously a ratio above 1
            made the bar grow past 20 cells and a negative ratio printed
            extra padding).
    """
    width = 20
    # Clamp first so the bar can neither overflow nor underflow its width.
    ratio = min(max(ratio, 0.0), 1.0)
    progress_count = int(width * ratio)

    if progress_count == width:
        # Finished: a solid bar without the '>' head.
        bar = '=' * width
    elif progress_count > 0:
        # In progress: the last filled cell is the '>' head.
        bar = '=' * (progress_count - 1) + '>'
    else:
        bar = ''

    print('[' + bar.ljust(width) + ']' + '\r', end='')


def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    """Compute a log-scaled spectrogram of ``audio``.

    Adapted from
    https://www.kaggle.com/davids1992/data-visualization-and-investigation

    Args:
        audio: Samples passed straight to ``scipy.signal.spectrogram``.
        sample_rate: Sampling frequency of ``audio`` in Hz.
        window_size: Analysis window length in milliseconds.
        step_size: Length in milliseconds that is converted to samples and
            passed as ``noverlap``. NOTE: because it is used as the overlap,
            it equals the hop between frames only when it is half of
            ``window_size`` (as with the defaults).
        eps: Small constant added before the logarithm to avoid ``log(0)``.

    Returns:
        A tuple ``(freqs, log_spec)`` where ``freqs`` is the frequency axis
        returned by SciPy and ``log_spec`` is the natural log of the
        transposed spectrogram, cast to float32 before ``eps`` is added.
    """
    # Convert the millisecond durations into sample counts.
    samples_per_window = int(round(window_size * sample_rate / 1e3))
    samples_overlap = int(round(step_size * sample_rate / 1e3))

    freqs, _times, power = signal.spectrogram(
        audio,
        fs=sample_rate,
        window='hann',
        nperseg=samples_per_window,
        noverlap=samples_overlap,
        detrend=False,
    )

    # Transpose the SciPy output so its last axis comes first.
    frames = power.T.astype(np.float32)
    return freqs, np.log(frames + eps)

# preprocessing

In [4]:
# Collect the command names: every sub-directory of the training audio folder
# is treated as one command (class).
# NOTE: os.listdir returns entries in arbitrary order, so the label index
# given to each command can differ between machines; the lookup tables saved
# at the end record the mapping that was actually used.
commands = [
    x for x in os.listdir(train_audio_path)
    if os.path.isdir(os.path.join(train_audio_path, x))
]

uni_cmd = len(commands)

# Create lookup tables for one-hot encoding:
#   lookup_cmd2vec: command name  -> one-hot vector of length uni_cmd
#   lookup_cmd2num: command name  -> integer label
#   lookup_vec2cmd: integer label -> command name
lookup_cmd2vec = {}
lookup_vec2cmd = {}
lookup_cmd2num = {}
get_label = {}  # kept for compatibility; not populated in this block
for (idx, cmd) in enumerate(commands):
    lookup_cmd2vec[cmd] = np.zeros(uni_cmd)
    lookup_cmd2vec[cmd][idx] = 1
    lookup_cmd2num[cmd] = idx
    lookup_vec2cmd[idx] = cmd


pieces = 10000             # number of samples stored in each pickle chunk
spec_shape = (99, 161)     # spectrogram shape a sample must have to be kept
max_files_per_cmd = 1000   # at most this many wav files are read per command
pcs_idx = 0                # next free slot in the current chunk
pkl_num = 0                # index of the chunk file being filled
# Initialize the chunk buffers for spectrograms and labels.
# The buffers use np.zeros' default dtype although log_specgram returns float32.
data = np.zeros((pieces,) + spec_shape)
lbls = np.zeros(pieces)

pickle_path = DATA_ROOT + 'data_pre/'
os.makedirs(pickle_path, exist_ok=True)

for i, cmd in enumerate(commands):
    print('\n', i, ':', cmd)
    # Take the wave files of this command's folder, capped per command.
    all_files = [
        y for y in os.listdir(train_audio_path + cmd) if y.endswith('.wav')
    ]
    selected_files = all_files[:max_files_per_cmd]
    len_fls = len(selected_files)

    for wav_idx, file in enumerate(selected_files, 1):
        # show progress
        print_bar(wav_idx / len_fls)

        # Read the wave file with scipy.io.wavfile and compute its
        # log-spectrogram.
        wav_path = train_audio_path + cmd + '/' + file
        samplerate, test_sound = wavfile.read(wav_path)
        _, spectrogram = log_specgram(test_sound, samplerate)

        # Drop the sample unless it has exactly the expected shape. Checking
        # both dimensions avoids a broadcasting error on assignment below. A
        # command whose clips are all dropped still keeps its label index.
        if spectrogram.shape != spec_shape:
            continue

        data[pcs_idx] = spectrogram
        lbls[pcs_idx] = lookup_cmd2num[cmd]
        pcs_idx += 1
        if pcs_idx >= pieces:
            # The chunk is full: write it out and start a fresh one.
            with open(pickle_path + str(pkl_num) + '.pickle', 'wb') as f:
                pickle.dump([data, lbls], f)
            pcs_idx = 0
            pkl_num += 1
            # Reset the buffers (the original rebound an unused name `spec`
            # here, leaving `data` untouched and wasting a large allocation).
            data = np.zeros((pieces,) + spec_shape)
            lbls = np.zeros(pieces)

# Save the last, partially filled chunk (only the slots actually used).
with open(pickle_path + str(pkl_num) + '.pickle', 'wb') as f:
    pickle.dump([data[:pcs_idx], lbls[:pcs_idx]], f)
# Save the look-up tables so labels can be decoded later.
with open(pickle_path + 'lookup.pkl', 'wb') as f:
    pickle.dump([lookup_cmd2vec, lookup_cmd2num, lookup_vec2cmd], f)
print('\nData pre-processing complete.')
        


 0 : go
 1 : nine
 2 : dog
 3 : up
 4 : two
 5 : zero
 6 : _background_noise_
[=====>              ]



 7 : right
 8 : stop
 9 : no
 10 : three
 11 : bed
 12 : sheila
 13 : left
 14 : happy
 15 : marvin
 16 : tree
 17 : six
 18 : down
 19 : seven
 20 : on
 21 : one
 22 : bird
 23 : five
 24 : wow
 25 : cat
 26 : eight
 27 : yes
 28 : house
 29 : four
 30 : off
Data pre-processing complete.
