In [None]:
import tensorflow.compat.v1 as tf

In [None]:
import numpy as np
import os
import librosa
import librosa.display

In [None]:
from tensorflow.keras.callbacks import *
from tensorflow.keras.layers import (
    Activation,
    BatchNormalization,
    Conv2D,
    Dense,
    Dropout,
    Input,
    MaxPooling2D,
    Permute,
    Reshape,
)
from tensorflow.keras.models import Model

In [None]:
def readAudio(filename, sr=16000):
    """Load an audio file with ``librosa.load``.

    Parameters
    ----------
    filename : str
        Path of the audio file to read.
    sr : int, optional
        Sampling rate handed to ``librosa.load``. Defaults to 16000, the
        value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(x, sr)``: the loaded samples and the sampling rate, exactly as
        returned by ``librosa.load``.
    """
    x, sr = librosa.load(filename, sr=sr)
    return x, sr

#calculate spectrogram
def calc_spec(x, n_fft=1024, hop_length=512, win_length=1024):
    """Compute a log-power STFT spectrogram in dB, relative to its own peak.

    Parameters
    ----------
    x : numpy.ndarray
        Audio samples passed to ``librosa.stft``.
    n_fft : int, optional
        FFT size (default 1024, the previously hard-coded value).
    hop_length : int, optional
        Hop between successive frames in samples (default 512).
    win_length : int, optional
        Length of the Hann analysis window in samples (default 1024).

    Returns
    -------
    numpy.ndarray
        ``librosa.power_to_db`` of the squared STFT magnitude, with
        ``ref=np.max`` so the largest value maps to 0 dB.
    """
    # np.complex256 only exists on platforms with extended-precision floats;
    # fall back to complex128 there instead of failing with AttributeError.
    stft_dtype = getattr(np, 'complex256', np.complex128)
    X = np.abs(
        librosa.stft(
            x,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window='hann',
            dtype=stft_dtype,
        )
    )
    X = librosa.power_to_db(X**2, ref=np.max)
    return X

In [None]:
import librosa
def get_feature_matrix(audio_filename_path):
    """Extract a 40-band mel feature matrix for one audio file.

    The file is loaded with ``librosa.load`` at its default settings and fed
    to ``dcase_util.features.MelExtractor`` (40 ms window, 20 ms hop).
    Nothing is written to disk.

    Returns
    -------
    tuple
        ``(mel_data, hop_len)``: the extractor output and the hop length in
        seconds (0.02).
    """
    hop_seconds = 0.02
    waveform, sample_rate = librosa.load(audio_filename_path)
    extractor = dcase_util.features.MelExtractor(
        n_mels=40,
        win_length_seconds=0.04,
        hop_length_seconds=hop_seconds,
        fs=sample_rate,
    )
    return extractor.extract(y=waveform), hop_seconds
        

In [None]:
import librosa
def get_feature_matrix_script_features(audio_filename_path):
    """Load one audio file at 16 kHz and return its ``calc_spec`` spectrogram.

    Returns
    -------
    tuple
        ``(spectrogram, seconds_per_frame)`` where ``seconds_per_frame`` is
        ``10 / number_of_frames``.
    """
    waveform, _ = librosa.load(audio_filename_path, sr=16000)
    spectrogram = calc_spec(waveform)
    frame_count = spectrogram.shape[1]
    # NOTE(review): the constant 10 presumably is the clip duration in
    # seconds; confirm that every input clip really is 10 s long.
    return spectrogram, 10 / frame_count
        

In [None]:
def csv_to_meta_container(csv_file_path):
    """Read a CSV file into a ``dcase_util`` ``MetaDataContainer``.

    The rows are loaded through a ``ListDictContainer``, wrapped in a
    ``MetaDataContainer``, printed, and returned.
    """
    rows = dcase_util.containers.ListDictContainer(filename=csv_file_path)
    rows.load()
    meta = dcase_util.containers.MetaDataContainer(rows)
    print(meta)
    return meta

In [None]:
# Event annotations for the training clips (also printed by the helper).
train_meta = csv_to_meta_container('/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project/dataset_updated/labels_updated.csv')

In [None]:
X_train = []
Y_train = []
audio_folder_name = '/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project/dataset_updated/wav_updated'

# Sort the listing so the same files are picked on every run, and skip
# entries that are not WAV files (os.listdir also returns hidden files and
# sub-folders, which the audio loader cannot read).
wav_filenames = sorted(
    name for name in os.listdir(audio_folder_name)
    if name.lower().endswith('.wav')
)

# NOTE(review): only the first three files are used, which looks like a
# debugging limit; confirm before running a real training session.
for audio_filename in wav_filenames[:3]:
    # Extract the spectrogram features for this clip
    audio_path = os.path.join(audio_folder_name, audio_filename)
    features, hop_length_seconds = get_feature_matrix_script_features(audio_path)

    # Targets: clip-level multi-hot vector, index 0 = speech, index 1 = music.
    # splitext keeps dots that are part of the stem ("a.b.wav" -> "a.b").
    file_stem = os.path.splitext(audio_filename)[0]
    event_list = train_meta.filter(filename=file_stem)
    labels_ = np.zeros(2)
    for event in event_list:
        if event.event_label == 'music':
            labels_[1] = 1
        elif event.event_label == 'speech':
            labels_[0] = 1

    X_train.append(features)
    Y_train.append(labels_)

In [None]:
# Stack the per-file arrays along a new leading axis (one entry per file).
X_train, Y_train = (np.stack(_per_file) for _per_file in (X_train, Y_train))
print('----------------------')
for _title, _stacked in (('X_train shape', X_train), ('Y_train shape', Y_train)):
    print(_title, _stacked.shape)

In [None]:
import numpy as np
def load_features_and_labels(features_path, labels_path):
    """Load a feature array and a label array saved in NumPy format.

    Returns
    -------
    tuple
        ``(X, Y)``: the arrays read from ``features_path`` and
        ``labels_path`` respectively.
    """
    return np.load(features_path), np.load(labels_path)

In [None]:
# Event annotations used for the validation clips.
# NOTE(review): this is the same CSV path as the training metadata, while the
# validation audio is read from a different folder; confirm this is intended.
validation_meta = csv_to_meta_container('/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project/dataset_updated/labels_updated.csv')

In [None]:
X_validation = []
Y_validation = []
validation_data = {}
audio_folder_name = '/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project/validation_data/validation_wavs'

# Sort for a reproducible row order and skip entries that are not WAV files
# (os.listdir also returns hidden files and sub-folders).
wav_filenames = sorted(
    name for name in os.listdir(audio_folder_name)
    if name.lower().endswith('.wav')
)

for audio_filename in wav_filenames:
    # Extract the spectrogram features for this clip
    audio_path = os.path.join(audio_folder_name, audio_filename)
    features, hop_length_seconds = get_feature_matrix_script_features(audio_path)
    # Keep at most the first 500 frames
    features = features[:, :500]

    # Targets: clip-level multi-hot vector, index 0 = speech, index 1 = music.
    # splitext keeps dots that are part of the stem ("a.b.wav" -> "a.b").
    file_stem = os.path.splitext(audio_filename)[0]
    event_list = validation_meta.filter(filename=file_stem)
    labels_ = np.zeros(2)
    for event in event_list:
        if event.event_label == 'music':
            labels_[1] = 1
        elif event.event_label == 'speech':
            labels_[0] = 1

    X_validation.append(features)
    Y_validation.append(labels_)

    # Per-file lookup of the same features and targets
    validation_data[audio_filename] = {
        'features': features,
        'ground_truth': labels_
    }
    

In [None]:
# Stack the per-file arrays along a new leading axis (one entry per file).
X_validation, Y_validation = (
    np.stack(_per_file) for _per_file in (X_validation, Y_validation)
)
print('----------------------')
for _title, _stacked in (
    ('X_validation shape', X_validation),
    ('Y_validation shape', Y_validation),
):
    print(_title, _stacked.shape)

Next, we create a **CRNN**-style neural network structure layer by layer.


**Input** layer and **Reshaping** layer to add channel axis into input to match `channels_last` mode:

In [None]:
# Number of STFT frequency bins for n_fft = 1024 (1 + n_fft / 2); these are
# linear-frequency bins from calc_spec, not mel bands.
feature_vector_length = 1 + 1024 // 2
# Number of time frames per clip.
# NOTE(review): presumably 10 s of 16 kHz audio at a hop of 512 samples; confirm.
sequence_length = 1 + (10 * 16000) // 512

In [None]:
# Sanity check of the frequency dimension fed to the network.
print(feature_vector_length)

513


In [None]:
# The network consumes one spectrogram per example, laid out (frequency, time).
_spectrogram_shape = (feature_vector_length, sequence_length)
input_layer = Input(shape=_spectrogram_shape, name='Input')

# Append a trailing singleton channel axis so that the 2-D convolutions can
# run in `channels_last` mode.
x = Reshape(target_shape=_spectrogram_shape + (1,), name='Input_Reshape')(input_layer)


In [None]:
# Shape after adding the channel axis.
print('Output shape','(sequence, frequency, time, channel)', x.shape)

Output shape (sequence, frequency, time, channel) (None, 513, 313, 1)


**Two convolutional groups** are used to capture small shifts in time and frequency. 

The groups are similar to those in the sound classification example, except that max **pooling is done only along the frequency** axis, because the time axis must be retained for detection.


In [None]:
# Shape entering the first convolutional group.
print('Input shape','(sequence, frequency, time, channel)', x.shape)

Input shape (sequence, frequency, time, channel) (None, 513, 313, 1)


In [None]:
# Convolution
x = Conv2D(filters=64, kernel_size=(3, 3), activation='linear', kernel_initializer='random_normal',
           padding='same', data_format='channels_last', name='Conv1')(x)
# Batch normalization
x = BatchNormalization(axis=-1, name='Conv1_BatchNorm')(x)
# Activation
x = Activation(activation='relu', name='Conv1_Activation')(x)
# Max pooling along frequency axis
x = MaxPooling2D(pool_size=(5, 1), name='Conv1_Pooling')(x)
# Drop out
x = Dropout(rate=0.2, name='Conv1_DropOut')(x)


In [None]:
# Shape leaving the first convolutional group.
print('Output shape', '(sequence, frequency, time, feature)', x.shape)

Output shape (sequence, frequency, time, feature) (None, 102, 313, 64)


In [None]:
# Shape entering the second convolutional group; the last axis now holds the
# 64 feature maps produced by the first group, although the label says "channel".
print('Input shape','(sequence, frequency, time, channel)', x.shape)

Input shape (sequence, frequency, time, channel) (None, 102, 313, 64)


In [None]:
# Convolution
x = Conv2D(filters=64, kernel_size=(3, 3), activation='linear', kernel_initializer='random_normal',
           padding='same', data_format='channels_last', name='Conv2')(x)
# Batch normalization
x = BatchNormalization(axis=-1, name='Conv2_BatchNorm')(x)
# Activation
x = Activation(activation='relu', name='Conv2_Activation')(x)
# Max pooling along frequency axis
x = MaxPooling2D(pool_size=(4, 1), name='Conv2_Pooling')(x)
# Drop out
x = Dropout(rate=0.2, name='Conv2_DropOut')(x)


In [None]:
# Shape leaving the second convolutional group.
print('Output shape', '(sequence, frequency, time, feature)', x.shape)

Output shape (sequence, frequency, time, feature) (None, 25, 313, 64)


To **connect** convolutional layers and recurrent layers, output of the last convolutional group has to be  **Reordered** and **Reshaped**:

In [None]:
# Shape entering the Permute/Reshape step.
print('Input shape', '(sequence, frequency, time, feature)', x.shape)

Input shape (sequence, frequency, time, feature) (None, 25, 313, 64)


In [None]:
# Bring the time axis to the front:
# (frequency, time, feature) -> (time, frequency, feature).
# Permute indices are 1-based and exclude the batch axis. A pattern of
# (1, 3, 2) would instead give (frequency, feature, time), and the Reshape
# below would then mix values from different time frames into each row.
x = Permute(
    dims=(2, 1, 3),
    name='Permute'
)(x)

# Flatten the frequency and feature axes into one vector per time step.
x = Reshape(
    target_shape=(sequence_length, -1),
    name='Reshape'
)(x)


In [None]:
# Shape after collapsing frequency and feature into one axis per time step.
print('Output shape', '(sequence, time, feature)', x.shape)

Output shape (sequence, time, feature) (None, 313, 1600)


Two **bidirectional** **recurrent** layers (Gated Recurrent Units) are used to integrate information from large time window:

**Recognition** is done with two **fully-connected** layers using information extracted by the previous layers. 

Layers are wrapped with `TimeDistributed` class to apply layers independently to each time step.

The **output layer** (the last fully-connected layer) uses a sigmoid activation.

In [None]:
# Recognition head: 32-unit dense layer -> dropout -> 2-unit dense layer,
# followed by a sigmoid so each of the two outputs is an independent score.
# NOTE(review): these layers receive the 3-D (batch, time, feature) tensor
# from the Reshape step, yet the output recorded in the notebook is
# (None, 2); a layer that collapses the time axis seems to be missing from
# the visible cells. Confirm the intended architecture.
_head_layers = (
    Dense(units=32, kernel_initializer='random_normal', name='FC1'),
    Dropout(rate=0.2, name='FC_DropOut'),
    Dense(units=2, kernel_initializer='random_normal', name='Output'),
)
for _head_layer in _head_layers:
    x = _head_layer(x)
output_layer = Activation('sigmoid', name='Output_Activation')(x)

In [None]:
# Shape of the network output.
# NOTE(review): the label names three axes, but the output recorded in the
# notebook has only two; confirm which one is current.
print('Output shape', '(sequence, time, classes)', output_layer.shape)

Output shape (sequence, time, classes) (None, 2)


Create a model network:

In [None]:
# Tie the input tensor and the sigmoid output together into a trainable model.
model = Model(inputs=input_layer, outputs=output_layer)

In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 513, 313)]        0         
                                                                 
 Input_Reshape (Reshape)     (None, 513, 313, 1)       0         
                                                                 
 Conv1 (Conv2D)              (None, 513, 313, 64)      640       
                                                                 
 Conv1_BatchNorm (BatchNorma  (None, 513, 313, 64)     256       
 lization)                                                       
                                                                 
 Conv1_Activation (Activatio  (None, 513, 313, 64)     0         
 n)                                                              
                                                                 
 Conv1_Pooling (MaxPooling2D  (None, 102, 313, 64)     0     

## Training

Validation data should be evaluated with the **same metric** that is used in the actual system evaluation on the test set.

For sound event detection, `keras` does not provide any suitable metric (such as *segment-based error rate (ER)* or *f-score (F1)*)

Default `keras` training process needs to be modified by halting it after each epoch:
- Validation data is evaluated with current model **outside the training process**
- Metric values are stored and used to control the training process (e.g. model selection or early stopping)

## Callbacks

In [None]:
# Use %pip so the package is installed into the running kernel's environment,
# and pin the version recorded in the original install log for reproducibility.
%pip install colorama==0.4.4

Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.4


In [None]:
# Training configuration shared by the callbacks and model.compile().
epochs = 100
keras_metric = 'accuracy'
keras_loss = 'binary_crossentropy'
metric_to_monitor = 'ER'
# Display names for metrics computed outside Keras.
# NOTE(review): this mapping is not passed to any callback here; confirm
# whether it should be.
external_metrics = {
    'ER': 'Error rate',
    'F1': 'F-score'
}

# Console progress reporting; updated manually from the training loop.
_progress_logger = dcase_util.tfkeras.ProgressLoggerCallback(
    epochs=epochs,
    metric=keras_metric,
    loss=keras_loss,
    output_type='console',
    show_timing=False,
    manual_update=True,
)

# Stashes a model according to the monitored metric; updated manually.
_model_stasher = dcase_util.tfkeras.StasherCallback(
    epochs=epochs,
    monitor=metric_to_monitor,
    initial_delay=50,
    manual_update=True,
)

callback_list = [_progress_logger, _model_stasher]

In [None]:
from tensorflow.keras.optimizers import Adam
# Configure the loss, the reported metric and the optimizer.
model.compile(
    loss=keras_loss,
    metrics=[keras_metric],
    # `lr` is a deprecated alias that triggers a warning at construction;
    # `learning_rate` is the supported argument name.
    optimizer=Adam(learning_rate=0.001, decay=0.001)
)

  super(Adam, self).__init__(name, **kwargs)


In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, f1_score, classification_report

In [None]:
epochs = 40
# Training history, including metrics calculated outside Keras
history_over_epochs = {
    'loss': [],
    'val_loss': [],
    'val_acc': [],
    'val_f1': [],
}

# Train one epoch at a time so validation metrics can be computed in between.
# Iterating over range(epochs) keeps the loop bound and `epochs` in sync.
for epoch_start in range(epochs):
    epoch_end = epoch_start + 1

    # Train the model for exactly one epoch (initial_epoch -> epochs)
    history = model.fit(
        x=X_train,
        y=Y_train,
        validation_data=(X_validation, Y_validation),
        callbacks=callback_list,
        verbose=0,
        initial_epoch=epoch_start,
        epochs=epoch_end,
        batch_size=8,
        shuffle=True
    )

    # Validation predictions, one row of class scores per clip
    item_probabilities = model.predict(X_validation, batch_size=8)

    ground_truth = Y_validation
    # Binarize the scores: 1.0 where the score reaches 0.5, otherwise 0.0
    labels_ = (item_probabilities >= 0.5).astype(float)

    f1_score_ = f1_score(ground_truth, labels_, average='micro')
    acc = accuracy_score(ground_truth, labels_)
    print("EPOCH", epoch_start, "----------", "F1 score", f1_score_, "------------", "Accuracy", acc)

    # Store metrics (the Keras entries are one-element lists per epoch)
    history_over_epochs['loss'].append(history.history['loss'])
    history_over_epochs['val_loss'].append(history.history['val_loss'])
    history_over_epochs['val_acc'].append(acc)
    history_over_epochs['val_f1'].append(f1_score_)

    # Manually update callbacks
    for callback in callback_list:
        if hasattr(callback, 'update'):
            callback.update()

    # Stop as soon as any callback asks for it
    stop_training = any(
        hasattr(callback, 'stop') and callback.stop()
        for callback in callback_list
    )
    if stop_training:
        break

# Close callbacks once training has finished
for callback in callback_list:
    if hasattr(callback, 'close'):
        callback.close()

In [None]:
# model.save(os.path.join('/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project/crnn_gru_sed_weight', 'model_seds_direct_script_30_upto_40_epochs_new.h5'))

## Best performing model

Best performing model was stored during the training process in `StasherCallback`:

In [None]:
# for callback in callback_list:
#     if isinstance(callback, dcase_util.keras.StasherCallback):                
#         model.set_weights(callback.get_best()['weights'])       # Fetch the best performing model        
#         callback.show()                                         # Show information
#         break

Save model and training history:

In [None]:
# # Save training history
# dcase_util.files.Serializer().save_cpickle(
#     filename=os.path.join('/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project', 'model_cnn_rnn_sed_training_history_30_upto_40_epochs_new.cpickle'),
#     data=history_over_epochs
# )

## Training history

In [None]:
# Load a previously saved training history from its pickle file.
_history_path = os.path.join(
    '/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project',
    'model_cnn_rnn_sed_training_history_upto_10_epochs.cpickle',
)
hist = dcase_util.files.Serializer().load_cpickle(filename=_history_path)

In [None]:
# Dump the loaded history dictionary (per-epoch values for each metric).
print(hist)

{'loss': [[0.6190886497497559], [0.564346194267273], [0.5319881439208984], [0.5088754296302795], [0.4856886863708496], [0.4726979732513428], [0.47057390213012695], [0.5011380314826965], [0.4881892502307892], [0.47413286566734314]], 'val_loss': [[0.6866677403450012], [0.6893298625946045], [0.7021951079368591], [0.7254590392112732], [0.7537027597427368], [0.7206918001174927], [0.745861828327179], [0.6805738210678101], [0.7056502103805542], [0.682929515838623]], 'val_er': [1.1379310344827587, 1.103448275862069, 1.206896551724138, 1.2413793103448276, 0.9770114942528736, 0.9885057471264368, 0.9195402298850575, 1.2413793103448276, 1.2413793103448276, 1.0459770114942528], 'val_f1': [0.3421052631578948, 0.5142857142857142, 0.6137184115523466, 0.6120996441281138, 0.5361702127659573, 0.5485232067510549, 0.5374449339207049, 0.592057761732852, 0.6071428571428571, 0.5702811244979921], 'binary_accuracy': [[0.687228262424469], [0.7013139724731445], [0.7080639600753784], [0.7212162613868713], [0.74498

In [None]:
# Plot the loaded history in three stacked panels: loss, error rate, F-score.
# NOTE(review): `plt` is not imported in the cells visible here; it must
# already be bound to matplotlib.pyplot when this cell runs.
epochs = range(1, len(hist['loss']) + 1)
fig = plt.figure(figsize=(19,8))

# Panel 1: training and validation loss
plt.subplot(3,1,1)
plt.plot(epochs, hist['loss'], color='red', linewidth=3, label='Training loss')
plt.plot(epochs, hist['val_loss'], color='green', linewidth=3, label='Validation loss')
plt.ylabel('Loss', fontsize=18)
plt.legend(loc='best', fontsize=16)
panel = plt.gca()
panel.set_xlim([1,len(hist['loss']) + 1])
panel.get_xaxis().set_visible(True)

# Panel 2: validation error rate, with reference lines at ER = 1 and at the
# lowest value reached
plt.subplot(3,1,2)
plt.plot(epochs, hist['val_er'], color='blue', linewidth=3, label='Validation Error rate')
plt.axhline(y=1, color='red', linestyle='-', linewidth=5, alpha=0.2)
plt.ylabel('Error rate', fontsize=18)
# NumPy is imported as `np` in this notebook; the bare name `numpy` is unbound.
er_min_index = np.argmin(hist['val_er'])
plt.axhline(hist['val_er'][er_min_index], color='green', linestyle='-', linewidth=5, alpha=0.5)
plt.annotate('Minimum achieved ER value', xy=(len(hist['loss']),hist['val_er'][er_min_index]-0.15), fontsize=14, ha='right')
plt.annotate('ER=1.0', xy=(len(hist['loss']),1+0.05), fontsize=14, ha='right')
panel = plt.gca()
panel.set_xlim([1,len(hist['loss']) + 1])
panel.get_xaxis().set_visible(True)

# Panel 3: validation F-score
plt.subplot(3,1,3)
plt.plot(epochs, hist['val_f1'], color='black', linewidth=3, label='F-score')
plt.ylabel('F-score', fontsize=18)
plt.xlabel('Epochs', fontsize=18)
panel = plt.gca()
panel.set_xlim([1,len(hist['loss']) + 1])
plt.tight_layout()
plt.show()

# Testing stage

In [None]:
#model = keras.models.load_model('/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project/crnn_gru_sed_weight/model_seds_direct_script_upto_10_epochs_new.h5') # Load model

## Going through all test material

In [None]:
def binarization(self, probabilities, binarization_type='global_threshold', threshold=0.5, time_axis=1):
    """Binarize probabilities into 0/1 decisions.

    Parameters
    ----------
    self : object or None
        Only used for error reporting (class name and, if present, a
        ``logger`` attribute). ``None`` is accepted.
    probabilities : numpy.ndarray
        Probabilities to be binarized.
    binarization_type : str ('global_threshold', 'class_threshold', 'frame_max')
        'global_threshold' compares every value against ``threshold``;
        'class_threshold' compares each class against its own entry of
        ``threshold``; 'frame_max' marks the largest value of each frame.
    threshold : float or list of float
        Binarization threshold; values at or above it become 1, values below
        become 0. For 'class_threshold' give one value per class; a single
        number is applied to every class.
        Default value 0.5
    time_axis : int
        Axis index for the frames. The other axis is the class axis.
        Default value 1

    Raises
    ------
    AssertionError:
        Unknown binarization_type

    Returns
    -------
    numpy.ndarray
        Binarized data (integer array of zeros and ones).
    """
    if binarization_type not in ['global_threshold', 'class_threshold', 'frame_max']:
        message = '{name}: Unknown frame_binarization type [{type}].'.format(
            name=self.__class__.__name__,
            type=binarization_type
        )

        # This is a free function, so `self` may not carry a logger.
        logger = getattr(self, 'logger', None)
        if logger is not None:
            logger.exception(message)
        raise AssertionError(message)

    probabilities = np.asarray(probabilities)

    # The class (data) axis is whichever axis is not the time axis
    data_axis = 1 if time_axis == 0 else 0

    if binarization_type == 'global_threshold':
        return np.array(probabilities >= threshold, dtype=int)

    if binarization_type == 'class_threshold':
        if not isinstance(threshold, (list, tuple, np.ndarray)):
            # A single number means the same threshold for every class;
            # without this branch the function would fall through and return None.
            return np.array(probabilities >= threshold, dtype=int)

        data = []
        for class_id, class_threshold in enumerate(threshold):
            if data_axis == 0:
                data.append(np.array(probabilities[class_id, :] >= class_threshold, dtype=int))
            else:
                data.append(np.array(probabilities[:, class_id] >= class_threshold, dtype=int))

        stacked = np.vstack(data)
        # Rows were collected per class; transpose back when classes are on axis 1
        return stacked if data_axis == 0 else stacked.T

    # binarization_type == 'frame_max': a value is active when it equals the
    # maximum over the class axis of its frame
    if data_axis == 0:
        return np.array((probabilities / np.max(probabilities, axis=0)) == 1, dtype=int)

    return np.array((probabilities.T / np.max(probabilities, axis=1)).T == 1, dtype=int)

In [None]:
# Container that collects the estimated events of every file.
# NOTE(review): `data_storage_path` is not defined in the cells visible here;
# it must already exist when this cell runs.
res = dcase_util.containers.MetaDataContainer(filename=os.path.join(data_storage_path, 'results_sed.csv'))

audio_folder_name = '/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project/validation_data/validation_wavs'
for audio_filename in os.listdir(audio_folder_name):
    # Extract features and add a leading batch axis of size one
    audio_path = os.path.join(audio_folder_name, audio_filename)
    features, hop_length_seconds = get_feature_matrix_script_features(audio_path)
    input_data = features.reshape(1, features.shape[0], features.shape[1])

    # Get network output
    item_probabilities_seq = model.predict(x=input_data)
    # Merge along the first axis (NumPy is imported as `np` in this notebook)
    item_probabilities = np.vstack(item_probabilities_seq)

    # Event activity: 1 where the score reaches the threshold
    event_activity = dcase_util.data.ProbabilityEncoder().binarization(
        probabilities=item_probabilities,
        binarization_type='global_threshold',
        threshold=0.5
    )
    current_estimated = dcase_util.containers.MetaDataContainer()
    for event_id, event_label in enumerate(['speech', 'music']):
        # Convert active frames into segments and translate frame indices
        # into timestamps using the per-frame duration returned with the
        # features, rather than a hard-coded constant.
        event_segments = dcase_util.data.DecisionEncoder().find_contiguous_regions(
            activity_array=event_activity[:, event_id]
        ) * hop_length_seconds

        # Form event items
        for event in event_segments:
            current_estimated.append(
                {
                    'filename': audio_filename,
                    'onset': event[0],
                    'offset': event[1],
                    'event_label': event_label
                }
            )

        # Merge events of the same class that are less than 0.5 s apart
        current_estimated = current_estimated.process_events(minimum_event_gap=0.5)
        # Remove events shorter than 0.5 s
        current_estimated = current_estimated.process_events(minimum_event_length=0.5)

    # Store result into results container
    res += current_estimated

# Save results container
res.save().show(mode='print')