In [2]:
!pip install -r /content/drive/MyDrive/5th_sem/Ee603_mlsp/Project/requirements.txt

Collecting dcase_util>=0.2.8
  Downloading dcase_util-0.2.18.tar.gz (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 5.1 MB/s 
Collecting sed_eval>=0.2.0
  Downloading sed_eval-0.2.1.tar.gz (21 kB)
Collecting validators>=0.12.0
  Downloading validators-0.18.2-py3-none-any.whl (19 kB)
Collecting python-magic>=0.4.13
  Downloading python_magic-0.4.24-py2.py3-none-any.whl (12 kB)
Building wheels for collected packages: dcase-util, sed-eval
  Building wheel for dcase-util (setup.py) ... [?25l[?25hdone
  Created wheel for dcase-util: filename=dcase_util-0.2.18-py3-none-any.whl size=2147231 sha256=b23fb54c0eddc56923ed27ce83392783d119260204f234ade22a8cfd25d22b58
  Stored in directory: /root/.cache/pip/wheels/f2/75/d1/e4fc6f415d1100dc30c39c955aa2bfbd0d88c6138076e74793
  Building wheel for sed-eval (setup.py) ... [?25l[?25hdone
  Created wheel for sed-eval: filename=sed_eval-0.2.1-py3-none-any.whl size=26124 sha256=e8942a70b0ea03a07ae94da444416afc94f63686501cf6473364926e5dc0c442
 

In [3]:
import tensorflow.compat.v1 as tf
#tf.disable_v2_behavior()

In [4]:
import numpy as np
import os
import librosa
import librosa.display

In [5]:
! pip install dcase_util
import dcase_util



In [7]:
import dcase_util
from dcase_util.containers import AudioContainer
import os, numpy
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sed_eval
%matplotlib inline

# Handy tool to print data in HTML form
log = dcase_util.ui.FancyHTMLPrinter()

# Paths to store data
data_storage_path = '/content/data'
dataset_storage_path = os.path.join(data_storage_path, 'datasets')
feature_storage_path = os.path.join(data_storage_path, 'features_sed')
dcase_util.utils.Path().create(
    [data_storage_path, dataset_storage_path, feature_storage_path]
)

# Filename for acoustic model
model_filename = os.path.join(data_storage_path, 'model_sed.h5')  
    
# Setup Keras to use tensorflow as backend
dcase_util.keras.setup_keras(backend='tensorflow')
import keras
from keras.layers import *
from keras.models import Model

In [8]:
def csv_to_meta_container(csv_file_path):
  dict_container = dcase_util.containers.ListDictContainer(filename = csv_file_path)
  dict_container.load()
  train_meta = dcase_util.containers.MetaDataContainer(dict_container)
  print(train_meta)
  return train_meta

In [9]:
train_meta = csv_to_meta_container('/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project/dataset_updated/labels_updated.csv')

MetaDataContainer :: Class
Items                               : 2160 
Unique
  Files                             : 1020 
  Scene labels                      : 0 
  Event labels                      : 2 
  Tags                              : 0 
  Identifiers                       : 0 
  Source labels                     : 0 

Meta data
  Source                  Onset   Offset   Scene             Event             Tags              Identifier   
  --------------------   ------   ------   ---------------   ---------------   ---------------   -----   
  S1                       0.00     5.00   -                 music             -                 -       
  S1                       5.00    10.00   -                 speech            -                 -       
  S2                       0.00     5.00   -                 music             -                 -       
  S2                       5.00    10.00   -                 speech            -                 -       
  S3                 

In [10]:
#################                MODEL     Starts ##########################

In [11]:
feature_vector_length = 513   # Number of mel bands
sequence_length = 313

In [12]:
input_layer = Input(
    shape=(feature_vector_length, sequence_length), 
    name='Input'
)
x = Reshape(
    target_shape=(feature_vector_length, sequence_length, 1), 
    name='Input_Reshape'
)(input_layer)


In [13]:
print('Output shape','(sequence, frequency, time, channel)', x.shape)

Output shape (sequence, frequency, time, channel) (None, 513, 313, 1)


**Two convolutional groups** are used to capture small shifts in time and frequency. 

Similar groups as in sound classification example, except max **pooling done only along frequency** axis as time axis is retained for the detection.


In [14]:
print('Input shape','(sequence, frequency, time, channel)', x.shape)

Input shape (sequence, frequency, time, channel) (None, 513, 313, 1)


In [15]:
# Convolution
x = Conv2D(filters=64, kernel_size=(3, 3), activation='linear', kernel_initializer='random_normal',
           padding='same', data_format='channels_last', name='Conv1')(x)
# Batch normalization
x = BatchNormalization(axis=-1, name='Conv1_BatchNorm')(x)
# Activation
x = Activation(activation='relu', name='Conv1_Activation')(x)
# Max pooling along frequency axis
x = MaxPooling2D(pool_size=(5, 1), name='Conv1_Pooling')(x)
# Drop out
x = Dropout(rate=0.2, name='Conv1_DropOut')(x)


In [16]:
print('Output shape', '(sequence, frequency, time, feature)', x.shape)

Output shape (sequence, frequency, time, feature) (None, 102, 313, 64)


In [17]:
print('Input shape','(sequence, frequency, time, channel)', x.shape)

Input shape (sequence, frequency, time, channel) (None, 102, 313, 64)


In [18]:
# Convolution
x = Conv2D(filters=64, kernel_size=(3, 3), activation='linear', kernel_initializer='random_normal',
           padding='same', data_format='channels_last', name='Conv2')(x)
# Batch normalization
x = BatchNormalization(axis=-1, name='Conv2_BatchNorm')(x)
# Activation
x = Activation(activation='relu', name='Conv2_Activation')(x)
# Max pooling along frequency axis
x = MaxPooling2D(pool_size=(4, 1), name='Conv2_Pooling')(x)
# Drop out
x = Dropout(rate=0.2, name='Conv2_DropOut')(x)


In [19]:
print('Output shape', '(sequence, frequency, time, feature)', x.shape)

Output shape (sequence, frequency, time, feature) (None, 25, 313, 64)


To **connect** convolutional layers and recurrent layers, output of the last convolutional group has to be  **Reordered** and **Reshaped**:

In [20]:
print('Input shape', '(sequence, frequency, time, feature)', x.shape)

Input shape (sequence, frequency, time, feature) (None, 25, 313, 64)


In [21]:
x = Permute(
    dims=(1, 3, 2), 
    name='Permute'
)(x)

x = Reshape(
    target_shape=(sequence_length, -1), 
    name='Reshape'
)(x)


In [22]:
print('Output shape', '(sequence, time, feature)', x.shape)

Output shape (sequence, time, feature) (None, 313, 1600)


Two **bidirectional** **recurrent** layers (Gated Recurrent Units) are used to integrate information from large time window:

In [23]:
print('Input shape', '(sequence, time, feature)', x.shape)

Input shape (sequence, time, feature) (None, 313, 1600)


In [24]:
x = Bidirectional(
    SimpleRNN(
        units=32, activation='tanh', 
        dropout=0.2, recurrent_dropout=0.2, 
        return_sequences=True, kernel_initializer='random_normal'
    ), merge_mode='mul', name='Recurrent_1'
)(x)
x = Bidirectional(
    SimpleRNN(units=32, activation='tanh', 
        dropout=0.2, recurrent_dropout=0.2, 
        return_sequences=True, kernel_initializer='random_normal'
    ), merge_mode='mul', name='Recurrent_2'
)(x)

In [25]:
print('Output shape', '(sequence, time, feature)', x.shape)

Output shape (sequence, time, feature) (None, 313, 32)


**Recognition** is done with two **fully-connected** layers using information extracted by the previous layers. 

Layers are wrapped with `TimeDistributed` class to apply layers independently to each time step.

**Output layer** (last fully-connected layer) is with sigmoid activation.

In [26]:
x = TimeDistributed(
    Dense(units=32, kernel_initializer='random_normal'), name='FC1', 
)(x)
x = Dropout(rate=0.2, name='FC_DropOut')(x)
x = TimeDistributed(
    Dense(units=2, kernel_initializer='random_normal'), name='Output'
)(x)
output_layer = Activation('sigmoid', name='Output_Activation')(x)

In [27]:
print('Output shape', '(sequence, time, classes)', output_layer.shape)

Output shape (sequence, time, classes) (None, 313, 2)


Create a model network:

In [28]:
model = Model(inputs=input_layer, outputs=output_layer)

In [29]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 513, 313)]        0         
                                                                 
 Input_Reshape (Reshape)     (None, 513, 313, 1)       0         
                                                                 
 Conv1 (Conv2D)              (None, 513, 313, 64)      640       
                                                                 
 Conv1_BatchNorm (BatchNorma  (None, 513, 313, 64)     256       
 lization)                                                       
                                                                 
 Conv1_Activation (Activatio  (None, 513, 313, 64)     0         
 n)                                                              
                                                                 
 Conv1_Pooling (MaxPooling2D  (None, 102, 313, 64)     0     

## Training

One should evaluate validation data with **same metric** which is used in actual system evaluation with test set 

For sound event detection, `keras` does not provide any suitable metric (such as *segment-based error rate (ER)* or *f-score (F1)*)

Default `keras` training process needs to be modified by halting it after each epoch:
- Validation data is evaluated with current model **outside the training process**
- Metric values are stored and used to control the training process (e.g. model selection or early stopping)

In [30]:
model = keras.models.load_model(os.path.join('/content/drive/MyDrive/5th_sem/Ee603_mlsp/Project/cnn_birnn_sed_weights', f'model_seds_direct_script_upto_30_epochs_new.h5')) # Load model

# Testing stage

## Going through all test material

In [31]:
import numpy as np

res = dcase_util.containers.MetaDataContainer(filename=os.path.join(data_storage_path, 'results_sed.csv'))

audio_folder_name = '/content/spectograms'

for audio_filename in os.listdir(audio_folder_name):
    
    features = np.load(os.path.join(audio_folder_name, audio_filename))
    input_data = features.reshape(1, features.shape[0], features.shape[1])

    # Get network output
    item_probabilities_seq = model.predict(x=input_data)       
    item_probabilities = numpy.vstack(item_probabilities_seq)   # Merge sequences together (2D matrix)

    # Event activity
    event_activity = dcase_util.data.ProbabilityEncoder().binarization(
        probabilities=item_probabilities,
        binarization_type='global_threshold',
        threshold=0.5
    )
    current_estimated = dcase_util.containers.MetaDataContainer()
    for event_id, event_label in enumerate(['speech', 'music']):
        # Convert active frames into segments and translate frame indices into timestamps
        event_segments = dcase_util.data.DecisionEncoder().find_contiguous_regions(
            activity_array=event_activity[:, event_id]
        ) * 0.03194888178

        # Form event items
        for event in event_segments:
            current_estimated.append(
                {
                    'filename': audio_filename,
                    'onset': event[0],
                    'offset': event[1],
                    'event_label': event_label
                }
            )
            
        # Merge events together from same class which are within 100ms
        current_estimated = current_estimated.process_events(minimum_event_gap=0.5)
        # Remove events which are < 100ms 
        current_estimated = current_estimated.process_events(minimum_event_length=0.5)
        
    # Store result into results container
    res += current_estimated
    
# Save results container
res.save().show(mode='print')

MetaDataContainer :: Class
Filename                            : /content/data/results_sed.csv 
Items                               : 31 
Unique
  Files                             : 10 
  Scene labels                      : 0 
  Event labels                      : 2 
  Tags                              : 0 
  Identifiers                       : 0 
  Source labels                     : 0 

Event statistics
  Event label             Count   Tot. Length   Avg. Length   
  --------------------   ------   -----------   -----------   
  music                      12         40.86          3.41   
  speech                     19         60.58          3.19   




In [33]:
train_meta = csv_to_meta_container('/content/data/results_sed.csv')

MetaDataContainer :: Class
Items                               : 31 
Unique
  Files                             : 10 
  Scene labels                      : 0 
  Event labels                      : 2 
  Tags                              : 0 
  Identifiers                       : 0 
  Source labels                     : 0 

Meta data
  Source                  Onset   Offset   Scene             Event             Tags              Identifier   
  --------------------   ------   ------   ---------------   ---------------   ---------------   -----   
  test_sample-0.npy        1.95     5.14   -                 music             -                 -       
  test_sample-0.npy        0.99     3.71   -                 speech            -                 -       
  test_sample-0.npy        5.88    10.00   -                 speech            -                 -       
  test_sample-6.npy        0.13     4.76   -                 music             -                 -       
  test_sample-6.npy      

In [39]:
tagged_data = []
for file_name in train_meta.unique_files:
  #print(file_name)
  file_data = [file_name.split('.')[0], '0', '0']
  event_list = train_meta.filter(filename=file_name)
  for event in event_list:
    if(event.event_label == 'speech'):
      #print("yo")
      file_data[1] = '1'
    elif(event.event_label == 'music'):
      file_data[2] = '1'
  
  tagged_data.append(file_data)

In [40]:
print(len(tagged_data), type(tagged_data), tagged_data)

10 <class 'list'> [['test_sample-0', '1', '1'], ['test_sample-1', '1', '1'], ['test_sample-2', '1', '1'], ['test_sample-3', '1', '1'], ['test_sample-4', '1', '1'], ['test_sample-5', '1', '1'], ['test_sample-6', '1', '1'], ['test_sample-7', '1', '1'], ['test_sample-8', '1', '1'], ['test_sample-9', '1', '1']]


In [41]:
import csv

In [42]:
with open('results_tagging.csv', 'w') as f:
  write = csv.writer(f)
  write.writerow(['filename', 'speech', 'music'])
  write.writerows(tagged_data)

In [43]:
# !pip install pyyaml

In [44]:
# import yaml

# with open(r'/content/task1.yaml') as file:
#     # The FullLoader parameter handles the conversion from YAML
#     # scalar values to Python the dictionary format
#     fruits_list = yaml.load(file)#, Loader=yaml.FullLoader)

#     print(fruits_list)

# Evaluation

`sed_eval` toolbox is used to get Error rate and F-score for test set:

In [None]:
print(validation_meta[0].filename)

S1


In [None]:
res = dcase_util.containers.MetaDataContainer().load(filename=os.path.join(data_storage_path, 'results_sed.csv'))

## Preparing data for evaluation

Prepare reference data and estimated to have filenames in uniform format:

In [None]:
reference_event_list = validation_meta
estimated_event_list = dcase_util.containers.MetaDataContainer(
    filename=os.path.join(data_storage_path, 'results_sed.csv')
).load()

# for item_id, item in enumerate(reference_event_list):
#   print(item.filename, item_id)
#   # reference_event_list[item_id][f'{filename}'] = os.path.split(item.filename)[-1]

for item_id, item in enumerate(estimated_event_list):
    estimated_event_list[item_id]['filename'] = item.filename.split('.')[0]    

In [None]:
print(estimated_event_list)

MetaDataContainer :: Class
Filename                            : /content/data/results_sed.csv 
Items                               : 38 
Unique
  Files                             : 12 
  Scene labels                      : 0 
  Event labels                      : 2 
  Tags                              : 0 
  Identifiers                       : 0 
  Source labels                     : 0 

Meta data
  Source                  Onset   Offset   Scene             Event             Tags              Identifier   
  --------------------   ------   ------   ---------------   ---------------   ---------------   -----   
  S161                     3.07     8.98   -                 music             -                 -       
  S161                     3.07     3.64   -                 speech            -                 -       
  S161                     8.27    10.00   -                 speech            -                 -       
  S138                     0.38     0.96   -                 m

In [None]:
print(reference_event_list)

MetaDataContainer :: Class
Items                               : 2160 
Unique
  Files                             : 1020 
  Scene labels                      : 0 
  Event labels                      : 2 
  Tags                              : 0 
  Identifiers                       : 0 
  Source labels                     : 0 

Meta data
  Source                  Onset   Offset   Scene             Event             Tags              Identifier   
  --------------------   ------   ------   ---------------   ---------------   ---------------   -----   
  S1                       0.00     5.00   -                 music             -                 -       
  S1                       5.00    10.00   -                 speech            -                 -       
  S2                       0.00     5.00   -                 music             -                 -       
  S2                       5.00    10.00   -                 speech            -                 -       
  S3                 

In [None]:
# Initialize evaluator with list of event labels to be evaluated and segment length
evaluator = sed_eval.sound_event.SegmentBasedMetrics(
    event_label_list=['speech', 'music'], 
    time_resolution=0.2                    # 1 second segments
)
# Loop file by file and accumulate intermediate statistics
for filename in reference_event_list.unique_files:
    evaluator.evaluate(
        reference_event_list=reference_event_list.filter(filename=filename),
        estimated_event_list=estimated_event_list.filter(filename=filename)
    ) 
metrics = evaluator.results_overall_metrics()

## Metric values

In [None]:
log.table(
    column_headers=['Metric', 'Value'],
    cell_data=[
        [
          '<strong>F-score</strong>',
          'Precision',
          'Recall',
          '<strong>Error rate</strong>',
            'Substitutions',
            'Deletions',
            'Insertions'
        ],
        [
            metrics['f_measure']['f_measure']*100.0,
            metrics['f_measure']['precision']*100.0,
            metrics['f_measure']['recall']*100.0,
            metrics['error_rate']['error_rate'],
            metrics['error_rate']['substitution_rate'],
            metrics['error_rate']['deletion_rate'],
            metrics['error_rate']['insertion_rate'],
        ]
    ],
    row_separators=[3],
    scaling=130
)

Metric,Value
F-score,1.46
Precision,49.45
Recall,0.74
Error rate,1.0
Substitutions,0.0
Deletions,0.99
Insertions,0.0


## Class-wise metrics

In [None]:
class_metrics = evaluator.results_class_wise_metrics()
Nref = []
Nsys = []
Fscore=[]
for event_label in list(class_metrics.keys()):
    Nref.append(class_metrics[event_label]['count']['Nref'])
    Nsys.append(class_metrics[event_label]['count']['Nsys'])
    Fscore.append(class_metrics[event_label]['f_measure']['f_measure']*100.0)
log.table(
    column_headers=['Event', 'Nref', 'Nsys', 'Fscore'],
    cell_data=[
        list(class_metrics.keys()),Nref,Nsys,Fscore
    ],
    column_types=['str25', 'int', 'int', 'float2'],
    column_separators=[0,2],
    scaling=130
)

Event,Nref,Nsys,Fscore
speech,15750,286,1.4
music,14250,237,1.1


Output directly from `sed_eval` evaluator:

In [None]:
print(evaluator)    

Segment based metrics
  Evaluated length                  : 116.19 sec
  Evaluated files                   : 12 
  Segment length                    : 200.00 ms

  Overall metrics (micro-average)
  F-measure
    F-measure (F1)                  : 41.85 %
    Precision                       : 32.61 %
    Recall                          : 58.40 %
  Error rate
    Error rate (ER)                 : 1.39 
    Substitution rate               : 0.24 
    Deletion rate                   : 0.18 
    Insertion rate                  : 0.97 
  Accuracy
    Sensitivity                     : 58.40 %
    Specificity                     : 40.96 %
    Balanced accuracy               : 49.68 %
    Accuracy                        : 46.69 %

  Class-wise average metrics (macro-average)
  F-measure
    F-measure (F1)                  : 41.59 %
    Precision                       : 33.30 %
    Recall                          : 59.44 %
  Error rate
    Error rate (ER)                 : 1.64 
    Deletion rate