In [28]:
import tensorflow_io as tfio
import tensorflow as tf 
import os 
import matplotlib.pyplot as plt 
from tensorflow.keras.layers import Conv2D , Flatten ,Dense , Dropout , MaxPooling2D
from tensorflow.keras.models import Sequential 

In [2]:
# Switch to the project root: later cells build their data and export paths
# from os.getcwd(). Set the TRIGGER_WORD_DIR environment variable to run the
# notebook from another checkout; without it the original location is used.
os.chdir(os.environ.get('TRIGGER_WORD_DIR', 'F:/trigger-word-detection'))

# Load the WAV file and decode it

In [3]:
def load_wav_16k_mono(filename):
    """Read a WAV file and return its waveform as a mono signal at 16 kHz.

    Args:
        filename: Path of the WAV file, as accepted by ``tf.io.read_file``.

    Returns:
        A 1-D tensor holding the decoded single-channel audio, resampled
        from the file's own sample rate to 16000 Hz.
    """
    raw_bytes = tf.io.read_file(filename)
    # Ask the decoder for exactly one channel.
    audio, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    # The channel axis is the last one; drop it to get a flat sample vector.
    mono = tf.squeeze(audio, axis=-1)
    # The rate is cast to int64 before being handed to the resampler.
    native_rate = tf.cast(native_rate, dtype=tf.int64)
    return tfio.audio.resample(mono, rate_in=native_rate, rate_out=16000)

# create the dataset 

In [35]:
# One file-path dataset per class, globbing every .wav under
# <cwd>/data/recorded-data/<class folder>.
pos_ds, neg_ds = (
    tf.data.Dataset.list_files(
        os.path.join(os.getcwd(), 'data', 'recorded-data', class_dir, '*.wav')
    )
    for class_dir in ('positive', 'negative')
)


In [36]:
# Attach a label to every file path: 1.0 for positive clips, 0.0 for negative
# ones. Each label dataset has exactly as many entries as its file dataset.
labeled_pos_ds, labeled_neg_ds = (
    tf.data.Dataset.zip(
        (files, tf.data.Dataset.from_tensor_slices(fill(len(files))))
    )
    for files, fill in ((pos_ds, tf.ones), (neg_ds, tf.zeros))
)

In [37]:
# Join both labelled sets into a single dataset: all positive examples come
# first, followed by all negative ones (shuffling is applied in a later cell).
ds = labeled_pos_ds.concatenate(labeled_neg_ds)

In [38]:
def preprocess(file_path, label):
    """Turn one labelled WAV file into a fixed-size magnitude spectrogram.

    The clip is loaded as 16 kHz mono audio, forced to exactly 48000 samples
    (3 seconds) and converted with a short-time Fourier transform.

    Args:
        file_path: Path of the WAV file to load.
        label: Label belonging to the file; passed through unchanged.

    Returns:
        A ``(spectrogram, label)`` pair. The spectrogram holds the STFT
        magnitudes with a trailing channel axis added; for the 48000-sample
        input this is the (1491, 257, 1) shape the model expects.
    """
    wav = load_wav_16k_mono(file_path)
    # Clip recordings longer than 48000 samples. Without this the padding
    # length computed below would be negative and tf.zeros would fail.
    wav = wav[:48000]
    # Zero pad the inputs to make them all the same length. The zeros are
    # placed in front of the audio, so the clip ends at the end of the window.
    zero_padding = tf.zeros([48000] - tf.shape(wav), dtype=tf.float32)
    wav = tf.concat([zero_padding, wav], 0)
    spectrogram = tf.signal.stft(wav, frame_length=320, frame_step=32)
    # Keep only the magnitude of the complex STFT output.
    spectrogram = tf.abs(spectrogram)
    # Add a channel axis so the result can be fed to Conv2D layers.
    spectrogram = tf.expand_dims(spectrogram, axis=2)
    return spectrogram, label

In [39]:
# Convert every file to a spectrogram, cache the converted examples, shuffle
# them and group them into batches of 8.
# The shuffle order is pinned (fixed seed, reshuffle_each_iteration=False)
# because `ds` is split with take()/skip() further down. With the default
# per-iteration reshuffle, every epoch would deal the examples into the two
# splits differently, so validation batches would contain training examples.
ds = (
    ds.map(preprocess)
    .cache()
    .shuffle(1000, seed=42, reshuffle_each_iteration=False)
    .batch(8)
    .prefetch(4)
)





In [40]:
# Split by batch: the first 16 batches (at most 16 * 8 = 128 examples) are
# used for training and every remaining batch for validation.
# NOTE(review): take()/skip() only give disjoint subsets if `ds` yields its
# batches in the same order on every iteration - confirm that the upstream
# shuffle does not reshuffle between iterations.
train_ds = ds.take(16) 
val_ds = ds.skip(16)

In [41]:
# Pull a single batch out of the training split to inspect it.
# The built-in next() is used rather than the iterator's Python-2-style
# .next() method, which is not part of the iterator protocol.
samples, labels = next(train_ds.as_numpy_iterator())

In [42]:
# Shape of one batch of spectrograms; its last three dimensions are what the
# model's input_shape is set to.
samples.shape

(8, 1491, 257, 1)

# the model 

In [43]:
# Small CNN over the (1491, 257, 1) spectrograms: two 3x3 convolutions with a
# single 2x2 max-pool between them, followed by a dense head that ends in one
# sigmoid unit for the positive/negative decision.
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=(3, 3), padding='same', activation='relu',
                 input_shape=(1491, 257, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(filters=16, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(Flatten())
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(rate=.2))
model.add(Dense(units=1, activation='sigmoid'))

In [44]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 1491, 257, 16)     160       
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 745, 128, 16)     0         
 2D)                                                             
                                                                 
 conv2d_3 (Conv2D)           (None, 745, 128, 16)      2320      
                                                                 
 flatten_1 (Flatten)         (None, 1525760)           0         
                                                                 
 dense_2 (Dense)             (None, 128)               195297408 
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                      

In [45]:
# Binary-classification set-up: Adam optimiser, binary cross-entropy loss,
# with recall and precision tracked as metrics.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
    metrics=[tf.keras.metrics.Recall(), tf.keras.metrics.Precision()],
)

In [46]:
# Train for 4 epochs on the training batches, evaluating on the validation
# batches after each epoch; the returned history is kept in `hist`.
hist = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=4,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [47]:
# Write the trained model to the 'trigger_word_0' directory under the current
# working directory.
model.save(os.path.join(os.getcwd() , 'trigger_word_0'))



INFO:tensorflow:Assets written to: F:\trigger-word-detection\trigger_word_0\assets


INFO:tensorflow:Assets written to: F:\trigger-word-detection\trigger_word_0\assets
