# Isolated Word Recognition using Neural Nets

### Load libs

In [1]:
from scipy.io.wavfile import read
import numpy as np
import sys
import tflearn

# extract audio features
from python_speech_features import mfcc
from python_speech_features import logfbank

### Inspect the input WAV format

In [2]:
#
# Inspect the format of one reference recording.
#
# scipy.io.wavfile.read returns (sample rate, sample array); the dtype of the
# array tells us how many bits each sample uses.
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.read.html
[samplerate, x] = read("./datasets/fruits/apple/apple01.wav")

print('samplerate : {}'.format(samplerate))

# Integer sample dtypes this notebook knows how to scale -> bits per sample.
bits_by_dtype = {'int16': 16, 'int32': 32, 'uint8': 8}

bits_per_sample = bits_by_dtype.get(str(x.dtype))
if bits_per_sample is None:
    # Name the offending dtype so the failure is actionable.
    sys.exit("unknown wav datatype: {}".format(x.dtype))

# Scale factor applied to every recording when the dataset is loaded.
# NOTE(review): uint8 samples are unsigned, so dividing by this value alone
# gives a range of [0, 2) rather than a zero-centred signal -- confirm before
# using 8-bit recordings.
maxAmptitude = float(2 ** (bits_per_sample - 1))

print('Bits per sample: {}, Max Amptitude : {}'.format(bits_per_sample, maxAmptitude))

samplerate : 8000
Bits per sample: 16, Max Amptitude : 32768.0


### prepare dataset

In [3]:
# Vocabulary of spoken words; a word's position in this list is its class label.
word_list = ['apple', 'banana', 'kiwi', 'lime', 'orange', 'peach', 'pineapple']
totalwords = len(word_list)

# word -> integer label lookup
word2vec_dd = {name: idx for idx, name in enumerate(word_list)}

print(word2vec_dd)

{'kiwi': 2, 'apple': 0, 'peach': 5, 'pineapple': 6, 'orange': 4, 'banana': 1, 'lime': 3}


In [4]:
samples_per_word = 14
samples_total = len(word_list) * samples_per_word
print("There are total {} speech samples.\n".format(samples_total))

targetFolder = './datasets/fruits'

# Maps "<word><two-digit id>" (e.g. "apple01") to its scaled waveform.
inputaudio_dd = {}
for word in word_list:
    # recordings are numbered from 1, zero-padded to two digits
    for take in range(1, samples_per_word + 1):
        wordtag = '{}{:02d}'.format(word, take)
        wav_path = '{}/{}/{}.wav'.format(targetFolder, word, wordtag)
        wav_rate, samples = read(wav_path)  # the per-file sample rate is not used
        # cast to float, then scale the input using maxAmptitude
        inputaudio_dd[wordtag] = np.array(samples, dtype=float) / maxAmptitude

There are total 98 speech samples.



### truncate the input signals and extract MFCC features

In [5]:
#
# Every signal is cut to the length of the shortest recording, so that all
# of them yield feature vectors of the same size (the fixed-width array built
# afterwards relies on that).
#
truncate_len = min(len(sig) for sig in inputaudio_dd.values())

print('Truncated signal length : {}'.format(truncate_len))

#
# truncate with min length of the input signal
#
# list(...) takes a snapshot so the dict can be updated safely while looping.
# NOTE: the raw waveforms are replaced by their features in place, so this
# cell must not be re-run without reloading the audio first.
for name, sig in list(inputaudio_dd.items()):
    trunc_sig = sig[:truncate_len]
    #
    # extract mfcc
    #
    mfcc_feat = mfcc(trunc_sig, samplerate)
    # flatten the 2D MFCC matrix row by row into a 1D feature vector
    mfcc_feat = mfcc_feat.flatten('C')
    inputaudio_dd[name] = mfcc_feat

## check
# All vectors share one length, so any entry gives the feature dimension
# (no dependency on a particular word/sample key being present).
featDim = len(next(iter(inputaudio_dd.values())))
print('Each signal is represented by {} features.'.format(featDim))

Truncated signal length : 2694
Each signal is represented by 429 features.


In [6]:
#
# convert the feature dict into numpy arrays:
#   data  -> one row of features per recording
#   label -> the integer class of each row
#
data = np.zeros((samples_total, featDim))
# Use the builtin int: the np.int alias was deprecated and later removed from
# NumPy, where it now raises AttributeError. The resulting dtype is the same.
label = np.zeros(samples_total, dtype=int)

for row, (key, value) in enumerate(inputaudio_dd.items()):
    # keys look like "<word><two-digit id>"; drop the id to recover the word
    fruitname = str(key)[:-2]
    data[row, :] = value
    label[row] = word2vec_dd[fruitname]

In [7]:
# Sanity check: shapes of both arrays, then the first ten labels.
for summary in (data.shape, label.shape, label[:10]):
    print(summary)

(98, 429)
(98,)
[2 2 2 2 2 3 3 3 3 3]


### split data into training and testing set

In [8]:
#
# shuffle the samples and split them into a training and a test set
# note: apply K-fold
#
SPLIT_SEED = 0  # fixed seed so the same split is produced on every run
total_samples = data.shape[0]
# A dedicated RandomState keeps the shuffle reproducible without touching
# numpy's global random state.
indices = np.random.RandomState(SPLIT_SEED).permutation(total_samples)
frac = 0.8  # fraction of the samples used for training
train_samples = int(round(total_samples * frac))
print('Use {} out of {} for training'.format(train_samples, total_samples))

# first `train_samples` shuffled indices train, the remainder test
training_idx, test_idx = indices[:train_samples], indices[train_samples:]

# notes: add validation set
x_train, x_test = data[training_idx, :], data[test_idx, :]
y_train, y_test = label[training_idx], label[test_idx]

Use 78 out of 98 for training


In [9]:
# Peek at the held-out labels and the type of a single label.
print(y_test)
# Function-call form of print: works on Python 2 and 3 alike, whereas the
# bare `print type(...)` statement is a SyntaxError on Python 3.
print(type(y_test[0]))

[5 5 1 3 1 0 2 4 0 4 1 5 6 5 0 4 4 4 2 2]
<type 'numpy.int64'>


### one hot encoding on the labels

In [10]:
def one_hot_encode(x, n_classes):
    """
    Turn integer class labels into one-hot rows.

    : x: Integer sample labels (list or integer array); each label is used as a row index
    : n_classes: Number of classes, i.e. the width of every one-hot row
    : return: Float numpy array with one one-hot encoded row per label
    """
    # Row i of the identity matrix is the one-hot vector for class i, so
    # indexing the matrix with the labels picks out the encodings.
    identity = np.identity(n_classes)
    return identity[x]

# Replace the integer labels of both splits with their one-hot encodings.
# NOTE: the label arrays are overwritten, so re-running this cell would feed
# already-encoded arrays back into one_hot_encode.
y_train, y_test = [one_hot_encode(split, totalwords) for split in (y_train, y_test)]

### Build Neural Nets

In [11]:
# Reference:
# [2] https://github.com/tflearn/tflearn/blob/master/examples/images/convnet_mnist.py
net = tflearn.input_data(shape=[None, featDim])

# Two identical hidden stages: a 64-unit ReLU dense layer with L2
# regularization, each followed by a dropout layer.
for hidden_stage in range(2):
    net = tflearn.fully_connected(net, 64,
                                  activation='relu',
                                  regularizer='L2',
                                  weight_decay=0.001)
    net = tflearn.dropout(net, 0.8)

# Output layer: one softmax unit per word in the vocabulary.
net = tflearn.fully_connected(net, totalwords, activation='softmax')

# SGD optimizer with learning-rate decay; top-1 is the reported metric.
sgd = tflearn.SGD(learning_rate=0.1, lr_decay=0.96, decay_step=1000)
top_k = tflearn.metrics.Top_k(k=1)
net = tflearn.regression(net,
                         loss='categorical_crossentropy',
                         optimizer=sgd,
                         metric=top_k)

### training

In [12]:
# Wrap the network in a DNN model and fit it for 50 epochs; the held-out
# test split is passed as the validation set.
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(
    x_train,
    y_train,
    validation_set=(x_test, y_test),
    n_epoch=50,
    show_metric=True,
    run_id="dense_model"
)

Training Step: 99  | total loss: [1m[32m1.24264[0m[0m | time: 0.001s
| SGD | epoch: 050 | loss: 1.24264 - top1: 0.8658 -- iter: 64/78
Training Step: 100  | total loss: [1m[32m1.67711[0m[0m | time: 1.004s
| SGD | epoch: 050 | loss: 1.67711 - top1: 0.8149 | val_loss: 0.08835 - val_acc: 1.0000 -- iter: 78/78
--
