import keras


def build_tiny_conv(input_frames, input_bins, n_classes=12, dropout=0.5):
    """
    Ported from the TensorFlow speech_commands examples: create_tiny_conv_model
    """
    from keras.layers import Conv2D, Dense, Dropout, Flatten

    # Note: unlike the builders below, this model expects (bins, frames, 1)
    # input, so the (8, 10) kernel spans 8 frequency bins by 10 time frames.
    input_shape = (input_bins, input_frames, 1)
    model = keras.Sequential([
        Conv2D(8, (8, 10), strides=(2, 2),
               padding='same', activation='relu', use_bias=True,
               input_shape=input_shape),
        Dropout(dropout),
        Flatten(),
        Dense(n_classes, activation='softmax', use_bias=True),
    ])
    return model
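
# Usage sketch (hypothetical helper, not part of the original file): run a
# dummy batch through the tiny model to confirm the transposed (bins, frames)
# input convention and the (batch, n_classes) output.
def demo_tiny_conv():
    import numpy as np
    model = build_tiny_conv(input_frames=32, input_bins=40)
    x = np.zeros((1, 40, 32, 1))   # (batch, bins, frames, channels)
    probs = model.predict(x)       # predict() works without compile()
    assert probs.shape == (1, 12)  # softmax over the 12 default classes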

def build_one(frames=64, bands=40, n_classes=10, dropout=0.0, tstride=1, fstride=4):
    """
    Ported from the TensorFlow speech_commands examples: create_low_latency_conv

    This is roughly the network labeled 'cnn-one-fstride4' in the paper
    'Convolutional Neural Networks for Small-footprint Keyword Spotting':
    http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
    """
    from keras.layers import Conv2D, Dense, Dropout, Flatten

    # In the paper there are some differences:
    # - uses log-mel as input instead of MFCC
    # - uses a stride of 4 in frequency
    # - has a linear bottleneck as the second layer to reduce multiplications,
    #   instead of doing a single full-frequency convolution
    # - probably uses ReLU for the DNN layers?
    # - probably does not use ReLU for the conv layer?
    # Note: in the keyword-spotting task, tstride=2,4,8 also performed well.
    conv_f = 8
    conv_t = 32
    kernels = 90
    bottleneck = 32

    input_shape = (frames, bands, 1)
    model = keras.Sequential([
        Conv2D(kernels, (conv_t, conv_f), strides=(tstride, fstride),
               padding='valid', activation='relu', use_bias=True,
               input_shape=input_shape),
        # Flatten before the linear bottleneck; without this the Dense layers
        # apply independently per time/frequency position and the model never
        # reduces to a single (batch, n_classes) prediction.
        Flatten(),
        Dense(bottleneck, activation=None, use_bias=True),
        Dropout(dropout),
        Dense(128, activation='relu', use_bias=True),
        Dropout(dropout),
        Dense(128, activation='relu', use_bias=True),
        Dropout(dropout),
        Dense(n_classes, activation='softmax', use_bias=True),
    ])
    return model
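
# Shape sanity check (sketch, assuming the defaults frames=64, bands=40): the
# valid-padded convolution yields (64-32)/1+1 = 33 time steps and
# (40-8)/4+1 = 9 frequency steps with 90 kernels, which Flatten turns into a
# 33*9*90 = 26730-dim vector before the 32-unit linear bottleneck. The helper
# below is hypothetical, added only to illustrate this.
def demo_one():
    model = build_one()
    assert model.output_shape == (None, 10)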

def build_low_latency_conv(input_frames, input_bins, n_classes=12, dropout=0.5):
    """
    Ported from the TensorFlow speech_commands examples: create_low_latency_conv

    This is roughly the network labeled 'cnn-one-fstride4' in the paper
    'Convolutional Neural Networks for Small-footprint Keyword Spotting':
    http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
    """
    from keras.layers import Conv2D, Dense, Dropout, Flatten

    input_shape = (input_frames, input_bins, 1)
    # In the paper there are some differences:
    # - uses log-mel as input instead of MFCC
    # - uses a stride of 4 in frequency
    # - has a linear bottleneck as the second layer to reduce multiplications,
    #   instead of doing a single full-frequency convolution
    # - probably uses ReLU for the DNN layers?
    # - probably does not use ReLU for the conv layer?
    # Note: in the keyword-spotting task, tstride=2,4,8 also performed well.
    model = keras.Sequential([
        # The kernel spans the full time axis, so the conv output has a
        # single time step.
        Conv2D(186, (input_frames, 8), strides=(1, 1),
               padding='valid', activation='relu', use_bias=True,
               input_shape=input_shape),
        Dropout(dropout),
        Flatten(),
        Dense(128, activation=None, use_bias=True),
        Dropout(dropout),
        Dense(128, activation=None, use_bias=True),
        Dropout(dropout),
        Dense(n_classes, activation='softmax', use_bias=True),
    ])
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
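
# Smoke-test sketch (hypothetical helper, assumed workflow): this builder
# already compiles the model, so a random batch can be fitted directly.
# Inputs follow the (frames, bins, 1) convention.
def demo_low_latency_conv():
    import numpy as np
    model = build_low_latency_conv(41, 40)
    x = np.random.rand(8, 41, 40, 1)
    y = keras.utils.to_categorical(np.random.randint(0, 12, size=8), num_classes=12)
    model.fit(x, y, epochs=1, verbose=0)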

def build_aclnet_lowlevel(input_samples, c1=32, s1=8, s2=4, input_tensor=None):
    """
    Low-level feature extractor operating on raw audio samples.

    The following values were tested in the paper:
    c1 = 8, 16, 32
    s1 = 2, 4, 8
    s2 = 2, 4
    """
    from keras.layers import Conv1D, MaxPooling1D, InputLayer, Flatten, Dense

    input_shape = (input_samples, 1)
    model = keras.Sequential([
        InputLayer(input_shape=input_shape, input_tensor=input_tensor),
        Conv1D(filters=c1, kernel_size=9, strides=s1,
               padding='valid', activation=None, use_bias=False),
        Conv1D(filters=64, kernel_size=5, strides=s2,
               padding='valid', activation=None, use_bias=False),
        MaxPooling1D(pool_size=160 // (s1 * s2),
                     padding='valid', data_format='channels_last'),
        Flatten(),
        Dense(1, activation=None),
    ])
    return model
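
# Downsampling sketch (hypothetical helper; assumes 16 kHz input audio): the
# two strided convolutions reduce the rate by s1*s2 = 32x by default, so the
# pool size 160/(s1*s2) = 5 makes each pooled step advance over 160 raw
# samples, i.e. 10 ms at 16 kHz.
def demo_aclnet_lowlevel():
    model = build_aclnet_lowlevel(20480)
    model.summary()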

def main():
    m = build_low_latency_conv(41, 40)
    m.summary()

    m = build_tiny_conv(32, 40)
    m.summary()

    m = build_one()
    m.summary()

    #m = build_aclnet_lowlevel(20480)
    #m.summary()


if __name__ == '__main__':
    main()