In [1]:
import csv
import datetime
import os
import random
import sys
import time

import h5py
import keras
import numpy as np
import pandas as pd
import pescador
import tensorflow as tf

import sys
sys.path.append("../src")
import localmodule

# Project-level settings, all obtained from the local helper module.
dataset_name = localmodule.get_dataset_name()
folds = localmodule.fold_units()
models_dir = localmodule.get_models_dir()

# Architecture hyperparameters used by the model definition below.
n_input_hops = 104
n_filters = [24, 48, 48]
kernel_size = [5, 5]
pool_size = [2, 4]
n_hidden_units = 64
n_context_classes = 4

# Batch and epoch sizes (presumably consumed by the training loop, which is
# not part of this cell).
batch_size = 32
steps_per_epoch = 256
validation_steps = 256
epochs = 32


# Stand-in for the command-line arguments: in this notebook they are
# hard-coded as [background duration, test unit] instead of being read
# from sys.argv.
args = ["3600", "unit01"]
aug_kind_str = "none"
bg_duration, unit_str = int(args[0]), args[1]


# Pick the first fold whose test set (entry 0) contains unit_str, then unpack
# it: entry 0 is the test set, entry 1 the training set and entry 2 the
# validation set.
fold = list(filter(lambda candidate: unit_str in candidate[0], folds))[0]
test_units, training_units, validation_units = fold[0], fold[1], fold[2]


# Remember when the run started (whole seconds since the epoch) and announce
# the experiment.
start_time = int(time.time())
print(str(datetime.datetime.now()) + " Start.")
print("Training Salamon's ICASSP 2017 convnet on " + dataset_name + ". ")

# Report the data split, one line per subset, followed by a blank line.
print("\n".join([
    "Training set: " + ", ".join(training_units) + ".",
    "Validation set: " + ", ".join(validation_units) + ".",
    "Test set: " + ", ".join(test_units) + ".",
    "",
]))

# Report the versions of the main dependencies, followed by a blank line,
# so that the run can be reproduced later.
print("\n".join([
    'h5py version: {:s}'.format(h5py.__version__),
    'keras version: {:s}'.format(keras.__version__),
    'numpy version: {:s}'.format(np.__version__),
    'pandas version: {:s}'.format(pd.__version__),
    'pescador version: {:s}'.format(pescador.__version__),
    'tensorflow version: {:s}'.format(tf.__version__),
    "",
]))


# Define and compile Keras model.
# NB: the original implementation of Justin Salamon in ICASSP 2017 relies on
# glorot_uniform initialization for all layers, and the optimizer is a
# stochastic gradient descent (SGD) with a fixed learning rate of 0.1.
# Instead, we use a he_normal initialization for the layers followed
# by rectified linear units (see He ICCV 2015), and replace the SGD by
# the Adam adaptive stochastic optimizer (see Kingma ICLR 2014).
# Moreover, we disable dropout because we found that it consistently prevented
# the model from training at all.
#
# The network has two branches:
#   * a main (spectrogram) channel: batch norm, three 2-D convolutions and a
#     dense layer of n_hidden_units units, reshaped into rows of
#     n_context_classes values;
#   * a side (background) channel that ends in a softmax over
#     n_context_classes values.
# The softmax output multiplies every row of the reshaped main channel, and
# the product is flattened and fed to a single sigmoid unit.

# Main channel.
# Input: single-channel patches of size 128 x n_input_hops
# (presumably 128 mel-frequency bins by n_input_hops time hops -- TODO confirm).
spec_input = keras.layers.Input(
    shape=(128, n_input_hops, 1), name="spec_input")

# Layer 1
# NOTE(review): spec_conv1 previously had no activation although it uses the
# he_normal initializer, which the header above reserves for layers followed
# by rectified linear units, and although layers 2 to 4 all use "relu".
# The activation is added here for consistency; models trained before this
# change used a linear first convolution.
spec_bn = keras.layers.BatchNormalization(
    name="spec_bn")(spec_input)
spec_conv1 = keras.layers.Conv2D(n_filters[0], kernel_size,
    padding="same", kernel_initializer="he_normal",
    activation="relu", name="spec_conv1")(spec_bn)
spec_pool1 = keras.layers.MaxPooling2D(
    pool_size=pool_size, name="spec_pool1")(spec_conv1)

# Layer 2
spec_conv2 = keras.layers.Conv2D(n_filters[1], kernel_size,
    padding="same", kernel_initializer="he_normal",
    activation="relu", name="spec_conv2")(spec_pool1)
spec_pool2 = keras.layers.MaxPooling2D(
    pool_size=pool_size, name="spec_pool2")(spec_conv2)

# Layer 3
spec_conv3 = keras.layers.Conv2D(n_filters[2], kernel_size,
    padding="same", kernel_initializer="he_normal",
    activation="relu", name="spec_conv3")(spec_pool2)

# Layer 4
spec_flatten = keras.layers.Flatten(
    name="spec_flatten")(spec_conv3)
spec_dense = keras.layers.Dense(n_hidden_units,
    kernel_initializer="he_normal", activation="relu",
    kernel_regularizer=keras.regularizers.l2(0.001),
    name="spec_dense1")(spec_flatten)

# Reshape the n_hidden_units activations into rows of n_context_classes
# values, so that each row can be weighted by the side channel.
# n_hidden_units must be a multiple of n_context_classes.
spec_reshape = keras.layers.Reshape((-1, n_context_classes),
    name="spec_reshape")(spec_dense)


# Side channel.
# Input: a 128 x 5 background summary per example
# (presumably 128 frequency bins by 5 summary statistics -- TODO confirm).
bg_input = keras.layers.Input(
    shape=(128, 5), name="bg_input")

# Pool: average groups of 4 consecutive entries along the length-128 axis.
bg_pool = keras.layers.AveragePooling1D(
    pool_size=4, name="bg_pool")(bg_input)

# Permute: swap the two non-batch axes so that the 1x1 convolution below
# mixes the pooled length-128 axis.
bg_permute = keras.layers.Permute(
    (2, 1), name="bg_permute")(bg_pool)

# Conv
bg_conv = keras.layers.Conv1D(
    8, 1, kernel_initializer="he_normal",
    activation="relu", name="bg_conv")(bg_permute)

# Flatten
bg_flatten = keras.layers.Flatten(
    name="bg_flatten")(bg_conv)

# Dense 1
bg_dense1 = keras.layers.Dense(16,
    kernel_initializer="he_normal",
    activation="relu", name="bg_dense1")(bg_flatten)

# Dense 2: softmax weights over the n_context_classes context classes.
bg_dense2 = keras.layers.Dense(n_context_classes,
    kernel_initializer="he_normal",
    activation="softmax", name="bg_dense2")(bg_dense1)

# Reshape to a single row so that it broadcasts against every row of
# spec_reshape in the multiplication below.
bg_reshape = keras.layers.Reshape((1, n_context_classes),
    name="bg_reshape")(bg_dense2)


# Element-wise multiplication
multiply = keras.layers.Multiply(
    name="multiply")([spec_reshape, bg_reshape])

# Flatten
flatten = keras.layers.Flatten(
    name="flatten")(multiply)


# Layer 5
# We put a single output instead of 43 in the original paper, because this
# is binary classification instead of multilabel classification.
# Furthermore, this layer contains 43 times fewer connections than in the
# original paper, so we divide the l2 weight penalization by 50, which is
# of the same order of magnitude as 43.
# 0.001 / 50 = 0.00002
dense = keras.layers.Dense(1,
    kernel_initializer="normal", activation="sigmoid",
    kernel_regularizer=keras.regularizers.l2(0.00002),
    name="dense")(flatten)


# Compile model, print model summary.
# The order of `inputs` fixes the order in which data generators must supply
# the arrays: spectrogram first, background summary second.
inputs = [spec_input, bg_input]
model = keras.models.Model(inputs=inputs, outputs=dense)
model.compile(loss="binary_crossentropy",
    optimizer="adam", metrics=["accuracy"])
model.summary()

Using TensorFlow backend.


2017-11-26 17:11:58.611296 Start.
Training Salamon's ICASSP 2017 convnet on BirdVox-70k. 
Training set: unit02, unit03, unit05.
Validation set: unit07, unit10.
Test set: unit01.

h5py version: 2.6.0
keras version: 2.0.6
numpy version: 1.13.1
pandas version: 0.20.3
pescador version: 1.0.0
tensorflow version: 1.2.1

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
spec_input (InputLayer)          (None, 128, 104, 1)   0                                            
____________________________________________________________________________________________________
spec_bn (BatchNormalization)     (None, 128, 104, 1)   4           spec_input[0][0]                 
____________________________________________________________________________________________________
spec_conv1 (Conv2D)              (None, 128, 104, 24)  624         spec_bn[0][

In [1]:
# Define function for multiplexing streamers.
def multiplex_lms_with_background(
        augs, fold_units, n_hops, batch_size, percentile_ids):
    """Build an endless generator of Keras-ready mini-batches.

    One pescador streamer running ``yield_lms_and_background`` is created for
    every (augmentation instance, unit) pair. The streamers are multiplexed,
    buffered into batches of ``batch_size`` samples and cycled forever.

    Parameters
    ----------
    augs : list of str
        Augmentation names. ``"original"`` has a single instance; any other
        name is looked up in ``localmodule.get_augmentations()`` to obtain its
        number of instances, which are named ``"<aug>-<i>"``.
    fold_units : iterable of str
        Unit identifiers to draw clips from.
    n_hops : int
        Number of time hops, forwarded to ``yield_lms_and_background``.
    batch_size : int
        Number of samples per buffered batch.
    percentile_ids
        Forwarded unchanged to ``yield_lms_and_background``.

    Returns
    -------
    generator
        Yields ``([X_spec, X_bg], y)`` tuples, i.e. the two inputs in the
        order expected by a model built with
        ``inputs=[spec_input, bg_input]``, followed by the labels.

    Raises
    ------
    ValueError
        If ``augs`` and ``fold_units`` produce no streamer at all.

    Notes
    -----
    Reads the module-level variable ``bg_duration`` to build the background
    file names.
    """

    # Define constants.
    aug_dict = localmodule.get_augmentations()
    data_dir = localmodule.get_data_dir()
    dataset_name = localmodule.get_dataset_name()
    tfr_name = "_".join([dataset_name, "clip-logmelspec"])
    tfr_dir = os.path.join(data_dir, tfr_name)
    bg_dir_name = "_".join(
        [dataset_name, "clip-logmelspec-backgrounds"])
    bg_dir = os.path.join(data_dir, bg_dir_name)
    T_str = "T-" + str(bg_duration).zfill(4)
    # NOTE(review): an earlier revision also computed
    # os.path.join(bg_dir, T_str) but never used it. Background files are
    # looked up directly inside bg_dir, as before -- confirm that they are
    # not stored in a per-duration subdirectory.

    # Loop over augmentations.
    streams = []
    for aug_str in augs:

        # Define instances.
        aug_dir = os.path.join(tfr_dir, aug_str)
        if aug_str == "original":
            instances = [aug_str]
        else:
            n_instances = aug_dict[aug_str]
            instances = ["-".join([aug_str, str(instance_id)])
                for instance_id in range(n_instances)]

        # Define bias: a constant added to every spectrogram of this
        # augmentation by yield_lms_and_background.
        if aug_str.startswith("noise"):
            bias = np.float32(-17.0)
        else:
            bias = np.float32(0.0)

        # Loop over instances.
        for instanced_aug_str in instances:

            # Loop over units.
            for unit_str in fold_units:

                # Define path to time-frequency representation.
                lms_name = "_".join(
                    [dataset_name, instanced_aug_str, unit_str])
                lms_path = os.path.join(aug_dir, lms_name + ".hdf5")

                # Define path to background.
                bg_file_name = "_".join(
                    [dataset_name, "background-summaries",
                     unit_str, T_str + ".hdf5"])
                bg_path = os.path.join(bg_dir, bg_file_name)

                # Define pescador streamer.
                stream = pescador.Streamer(yield_lms_and_background,
                    lms_path, n_hops, bias, bg_path, percentile_ids)
                streams.append(stream)

    # Fail early with a clear message instead of handing an empty pool
    # of streamers to pescador.
    if not streams:
        raise ValueError(
            "No streamer to multiplex: augs and fold_units must both be "
            "non-empty.")

    # Multiplex streamers together.
    mux = pescador.Mux(streams,
        k=len(streams), lam=None, with_replacement=True, revive=True)

    # Create buffered streamer with specified batch size.
    buffered_streamer = pescador.BufferedStreamer(mux, batch_size)

    # The samples are dictionaries with keys "X_spec", "X_bg" and "y"; there
    # is no "X" key, so tuples("X", "y") cannot be used here. Keras needs the
    # two inputs grouped in a list, followed by the target.
    def _keras_batches():
        for batch in buffered_streamer.cycle():
            yield [batch["X_spec"], batch["X_bg"]], batch["y"]

    return _keras_batches()

In [14]:
def yield_lms_and_background(tfr_path, n_hops, bias, bg_path, percentile_ids):
    """Endlessly yield random (spectrogram, background, label) samples.

    Parameters
    ----------
    tfr_path : str
        Path to the HDF5 container holding the time-frequency
        representations (TFR) of the clips.
    n_hops : int
        Number of columns kept from each TFR; the TFR is trimmed around
        its center.
    bias : float
        Constant added to every trimmed TFR.
    bg_path : str
        Path to the HDF5 container holding one background entry per clip,
        indexed by the same keys as the TFR group.
    percentile_ids
        Currently unused (see NOTE below).

    Yields
    ------
    dict
        ``X_spec``: trimmed TFR with a trailing singleton axis;
        ``X_bg``: background entry of the same clip;
        ``y``: one-element float32 array holding the label parsed from the
        fourth underscore-separated field of the clip key.

    Notes
    -----
    Relies on a module-level ``tfr_str`` naming the HDF5 group to read.
    NOTE(review): ``tfr_str`` is not defined in any visible cell -- make sure
    it is set before this generator is started.
    """

    # Open both HDF5 containers; they stay open for as long as the
    # generator is alive.
    with h5py.File(tfr_path, "r") as tfr_container, \
            h5py.File(bg_path, "r") as bg_container:
        # Open HDF5 group corresponding to time-freq representation (TFR).
        tfr_group = tfr_container[tfr_str]

        # bg_group used to be referenced without ever being defined, and
        # bg_path was never opened.
        # NOTE(review): the backgrounds are assumed to be stored at the root
        # of the container, under the same keys as the clips -- if they live
        # in a named sub-group, select it here.
        bg_group = bg_container

        # The naming convention of a key is
        # [unit]_[time]_[freq]_[y]_[aug]_[instance]
        # where y=1 if the key corresponds to a positive clip and 0 otherwise.
        keys = list(tfr_group.keys())

        # Infinite "yield" loop.
        while True:
            # Pick a key uniformly at random.
            key = random.choice(keys)

            # Load time-frequency spectrogram (TFR).
            X_spec = tfr_group[key]

            # Trim TFR in time to required number of hops.
            X_width = X_spec.shape[1]
            first_col = int((X_width-n_hops) / 2)
            last_col = int((X_width+n_hops) / 2)
            X_spec = X_spec[:, first_col:last_col]

            # Add trailing singleton dimension for Keras interoperability.
            X_spec = X_spec[:, :, np.newaxis]

            # Apply bias
            X_spec = X_spec + bias

            # Load background into memory, so that the sample does not hold
            # a handle on the open file.
            # NOTE(review): percentile_ids is accepted but not applied;
            # presumably it should select a subset of the background
            # columns -- confirm against the background file format.
            X_bg = bg_group[key][()]

            # Retrieve label y from key name.
            y = np.array([np.float32(key.split("_")[3])])

            # Yield data and label as dictionary.
            yield dict(X_spec=X_spec, X_bg=X_bg, y=y)

In [26]:
import numpy as np
import os
import sys
sys.path.append("../src")
import localmodule

# Define constants.
# NOTE(review): these values repeat the constants of the first cell of the
# notebook; keep the two cells in sync (or delete one of them).
dataset_name = localmodule.get_dataset_name()
folds = localmodule.fold_units()
models_dir = localmodule.get_models_dir()
n_input_hops = 104
n_filters = [24, 48, 48]
kernel_size = [5, 5]
pool_size = [2, 4]
n_hidden_units = 64
steps_per_epoch = 256
epochs = 32
validation_steps = 256
batch_size = 32
n_context_classes = 4


# Stand-in for the command-line arguments: [background duration, test unit].
args = ["3600", "unit01"]
# Augmentation kind; the parsing cell below recognizes "none", "pitch",
# "stretch", "noise" and "all". An earlier assignment to "all" at the top of
# this cell was overwritten by this line before being used, so it has been
# removed; the effective value is unchanged.
aug_kind_str = "none"
bg_duration = int(args[0])
unit_str = args[1]

# Retrieve fold such that unit_str is in the test set.
# Each fold is indexed as [test units, training units, validation units].
fold = [f for f in folds if unit_str in f[0]][0]
test_units = fold[0]
training_units = fold[1]
validation_units = fold[2]

# Units used to build the noise augmentation names below.
fold_units = training_units

BirdVox-70k_background-summaries_unit02_T-3600.hdf5
BirdVox-70k_background-summaries_unit03_T-3600.hdf5
BirdVox-70k_background-summaries_unit05_T-3600.hdf5


In [None]:
# Parse augmentation kind string (aug_kind_str) into the list of augmentation
# names (augs). "original" is always included; the "noise" and "all" kinds add
# one "noise-<unit>" augmentation per unit of fold_units.
if aug_kind_str == "none":
    augs = ["original"]
elif aug_kind_str == "pitch":
    augs = ["original", "pitch"]
elif aug_kind_str == "stretch":
    augs = ["original", "stretch"]
elif aug_kind_str in ("all", "noise"):
    # The loop variable is deliberately not called unit_str, to avoid any
    # confusion with the module-level test unit of the same name.
    noise_augs = ["noise-" + noise_unit for noise_unit in fold_units]
    if aug_kind_str == "all":
        augs = noise_augs + ["original", "pitch", "stretch"]
    else:
        augs = noise_augs + ["original"]
else:
    # An unknown kind used to leave `augs` undefined (or stale from a
    # previous run of the cell) without any warning.
    raise ValueError(
        "Unknown augmentation kind: {!r}. Expected one of 'none', 'pitch', "
        "'stretch', 'noise' or 'all'.".format(aug_kind_str))


