# Model Architecture Tests

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from bicycle_bell_sed_models.models.crnn import crnn 
from bicycle_bell_sed_models.models.yamnet_base import yamnet_base
from bicycle_bell_sed_models.models.yamnet_lstm_fc import yamnet_lstm_fc

import tensorflow as tf
import tensorflow.keras as keras
import numpy

## CRNN Test

In [3]:
audioLength = 44100 * 3 # sr*sec
rdmAudio = numpy.array([numpy.random.random(audioLength) for _ in range(3)])
rdmLabel = numpy.array([numpy.random.randint(0, 2) for _ in range(3)])

In [4]:
model_crnn = crnn()
model_crnn.summary()

Model: "crnn"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 wav_44100_mono_input (InputLay  [(None, None)]      0           []                               
 er)                                                                                              
                                                                                                  
 padded_wave (PadWaveformLayer)  (None, None)        0           ['wav_44100_mono_input[0][0]']   
                                                                                                  
 log_mel_spectrogram_transform   ((None, None, 128),  0          ['padded_wave[0][0]']            
 (LogMelSpectrogramTransformLay   (None, None, 96, 1                                              
 er)                            28))                                                           

In [5]:
ds = tf.data.Dataset.from_tensor_slices(({'wav_44100_mono_input': rdmAudio}, {'class_output': rdmLabel}))
ds = ds.batch(3)

print(ds)
for xbatch, ybatch in ds:
  print(xbatch)
  print(ybatch)
  for x, y in zip(xbatch, ybatch):
    print(x)
    print(y)
    break
  break

<BatchDataset shapes: ({wav_44100_mono_input: (None, 132300)}, {class_output: (None,)}), types: ({wav_44100_mono_input: tf.float64}, {class_output: tf.int32})>
{'wav_44100_mono_input': <tf.Tensor: shape=(3, 132300), dtype=float64, numpy=
array([[0.89141616, 0.34423255, 0.62442053, ..., 0.90638314, 0.56695136,
        0.5429709 ],
       [0.6035598 , 0.58982039, 0.8125937 , ..., 0.22250388, 0.4883958 ,
        0.02650325],
       [0.76169059, 0.4786687 , 0.3311342 , ..., 0.13176616, 0.10555902,
        0.58250765]])>}
{'class_output': <tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 0, 1])>}
wav_44100_mono_input
class_output


In [6]:
model_crnn = crnn()
model_crnn.compile(optimizer='adam', 
    loss=['binary_crossentropy', None],
    metrics=[('accuracy',), (None,)],
    loss_weights=[1.0, 0.0], 
)
model_crnn.fit(ds, batch_size=3)



<keras.callbacks.History at 0x21f244b59d0>

In [7]:
model_crnn = crnn()
model_crnn.compile(optimizer='adam', 
    loss={
        "class_output": 'binary_crossentropy', # last layer name
        "log_mel_spectrogram_output": None,
    },
    metrics={
        "class_output": [
            'accuracy',
        ],
        "log_mel_spectrogram_output": [
          None,
        ],
    },
    loss_weights={
        "class_output": 1.0, 
        "log_mel_spectrogram_output": 0.0,
    }, 
)
model_crnn.fit(ds, batch_size=3)



<keras.callbacks.History at 0x21fefd4c550>

## YAMNet Base Test

In [8]:
audioLength = 16000 * 3 # sr*sec
rdmAudio = numpy.array([numpy.random.random(audioLength) for _ in range(3)])
rdmLabel = numpy.array([numpy.random.randint(0, 2) for _ in range(3)])

In [9]:
model_yn_base = yamnet_base()
model_yn_base.summary()

Model: "yamnet_base"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 wav_16000_mono_input (InputLay  [(None, None)]      0           []                               
 er)                                                                                              
                                                                                                  
 yamnet_wrapper (YAMNetWrapper)  {'scores': (None, N  0          ['wav_16000_mono_input[0][0]']   
                                one, 521),                                                        
                                 'spectrogram': (No                                               
                                ne, None, 64)}                                                    
                                                                                        

In [10]:
ds = tf.data.Dataset.from_tensor_slices(({'wav_16000_mono_input': rdmAudio}, {'class_output': rdmLabel}))
ds = ds.batch(3)

In [11]:
model_yn_base = yamnet_base()
model_yn_base.compile(optimizer='adam', 
    loss=['binary_crossentropy', None],
    metrics=[('accuracy',), (None,)],
    loss_weights=[1.0, 0.0], 
)
model_yn_base.fit(ds, batch_size=3)



<keras.callbacks.History at 0x22046f89f70>

In [12]:
model_yn_base = yamnet_base()
model_yn_base.compile(optimizer='adam', 
    loss={
        "class_output": 'binary_crossentropy', # last layer name
        "log_mel_spectrogram_output": None,
    },
    metrics={
        "class_output": [
            'accuracy',
        ],
        "log_mel_spectrogram_output": [
          None,
        ],
    },
    loss_weights={
        "class_output": 1.0, 
        "log_mel_spectrogram_output": 0.0,
    }, 
)
model_yn_base.fit(ds, batch_size=3)



<keras.callbacks.History at 0x2205f2a2d60>

## YAMNet Extended

In [13]:
audioLength = 16000 * 3 # sr*sec
rdmAudio = numpy.array([numpy.random.random(audioLength) for _ in range(3)])
rdmLabel = numpy.array([numpy.random.randint(0, 2) for _ in range(3)])

In [14]:
model_yn_extended = yamnet_lstm_fc()
model_yn_extended.summary()

Model: "yamnet_lstm_fc"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 wav_16000_mono_input (InputLay  [(None, None)]      0           []                               
 er)                                                                                              
                                                                                                  
 yamnet_wrapper (YAMNetWrapper)  {'embeddings': (Non  0          ['wav_16000_mono_input[0][0]']   
                                e, None, 1024),                                                   
                                 'spectrogram': (No                                               
                                ne, None, 64)}                                                    
                                                                                     

In [15]:
ds = tf.data.Dataset.from_tensor_slices(({'wav_16000_mono_input': rdmAudio}, {'class_output': rdmLabel}))
ds = ds.batch(3)

In [16]:
model_yn_extended = yamnet_lstm_fc()
model_yn_extended.compile(optimizer='adam', 
    loss=['binary_crossentropy', None],
    metrics=[('accuracy',), (None,)],
    loss_weights=[1.0, 0.0], 
)
model_yn_extended.fit(ds, batch_size=3)







<keras.callbacks.History at 0x22092e8aac0>

In [17]:
model_yn_extended = yamnet_lstm_fc()
model_yn_extended.compile(optimizer='adam', 
    loss={
        "class_output": 'binary_crossentropy', # last layer name
        "log_mel_spectrogram_output": None,
    },
    metrics={
        "class_output": [
            'accuracy',
        ],
        "log_mel_spectrogram_output": [
          None,
        ],
    },
    loss_weights={
        "class_output": 1.0, 
        "log_mel_spectrogram_output": 0.0,
    }, 
)
model_yn_extended.fit(ds, batch_size=3)







<keras.callbacks.History at 0x220825e7a00>