In [15]:
import tensorflow as tf
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


# Functions definitons.

In [16]:
def normalization(x):
    return (x - np.mean(x, axis = 0))/np.std(x, axis = 0)

def frameToTime(frame):
    time = frame*(10/313)
    return time

def HardClassAssign(y_pred):
    y_pred_labels = np.argmax(y_pred,axis = 2)

    y_labels = np.zeros(y_pred.shape)
    for i in range(y_labels.shape[0]):
        for j in range(y_labels.shape[1]):
            y_labels[i,j, y_pred_labels[i,j]] = 1

    return y_labels

def audio_tag(y_pred):
    count_speech = 0
    count_music = 0
    y_audio = []
    for i in range(y_pred.shape[0]):
        for j in range(y_pred.shape[1]):
            if (np.array([0., 1., 0.]) == y_pred[i,:]).all():
                count_speech = count_speech+1

            elif (np.array([1., 0., 0.]) == y_pred[i,:]).all():
                count_music = count_music+1
                
    if count_music == 0 and count_speech != 0:
        y_audio = [0,1]
    elif count_music != 0 and count_speech == 0:
        y_audio = [1,0]
    else:
        y_audio = [1,1]

    return np.array(y_audio)

# Loading the training data, normalizing and shuffling through the batch-wise.

In [17]:
root_dir = '/content/gdrive/My Drive/Colab Notebooks/'
x_train = np.load(root_dir+'X_train.npy')
y_train = np.load(root_dir+'Y_train.npy')
seed = 0
np.random.seed(seed)
index = np.random.choice(np.arange(x_train.shape[0]), x_train.shape[0], replace = False)
x_train = x_train[index,:,:]
y_train = y_train[index,:,:]
x_train = np.reshape(x_train, [x_train.shape[0], x_train.shape[1], x_train.shape[2], 1])
x_train = normalization(x_train)

# Loading validation data and normalizing.

In [18]:
x_val = np.load(root_dir+'X_val.npy')
y_val = np.load(root_dir+'Y_val.npy')
x_val = np.reshape(x_val, [x_val.shape[0], x_val.shape[1], x_val.shape[2], 1])
x_val = normalization(x_val)

In [19]:
# converting to compatible datatypes for processing
y_train = y_train.astype(dtype = 'float32')
y_val = y_val.astype(dtype = 'float32')

# Convolutional Recurrent Neural Network Model 

In [35]:
class crnn_model(tf.keras.models.Model):
    def __init__(self, pooling_1, pooling_2, pooling_3, rnn_size, in_shape, num_of_classes):
        super(crnn_model, self).__init__()
        self.pooling = [pooling_1, pooling_2, pooling_3]
        self.rnn_size = rnn_size
        self.in_shape = in_shape
        
        self.conv_1 = tf.keras.layers.Conv2D(filters = 128, kernel_size = (3,3), input_shape = in_shape, padding = 'same')     
        self.bn1 = tf.keras.layers.BatchNormalization()     
        self.act1 = tf.keras.layers.Activation('relu')
        self.max1 = tf.keras.layers.MaxPool2D(pool_size = (pooling_1, 1))
        
        self.conv_2 = tf.keras.layers.Conv2D(filters = 128, kernel_size = (3,3), padding = 'same')
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.act2 = tf.keras.layers.Activation('relu')
        self.max2 = tf.keras.layers.MaxPool2D(pool_size = (pooling_2, 1))
        
        self.conv_3 = tf.keras.layers.Conv2D(filters = 128, kernel_size = (3,3), padding = 'same')
        self.bn3 = tf.keras.layers.BatchNormalization()
        self.act3 = tf.keras.layers.Activation('relu')
        self.max3 = tf.keras.layers.MaxPool2D(pool_size = (pooling_3, 1))
        
        self.rnn1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(rnn_size, return_sequences = True))
        self.rnn2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(rnn_size, return_sequences = True))
        
        self.dense1 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(rnn_size * 2))
        self.dense2 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_of_classes, activation = 'sigmoid'))
        
    def call(self, inputs):
        # convolutional layer 1
        c0_x = self.conv_1(inputs)
        b0_x = self.bn1(c0_x)
        a0_x = self.act1(b0_x)
        m0_x = self.max1(a0_x)   #pooling is only on frequency bins
        
        # convolutional layer 2
        c1_x = self.conv_2(m0_x)
        b1_x = self.bn2(c1_x)
        a1_x = self.act2(b1_x)
        m1_x = self.max2(a1_x)
        
        # convolutional layer 3
        c2_x = self.conv_3(m1_x)
        b2_x = self.bn3(c2_x)
        a2_x = self.act3(b2_x)
        m2_x = self.max3(a2_x) 
        
        #reshaping output for rnn input
        p_x = tf.reshape(m2_x, [-1, 4, 128, 313])
        re_x = tf.reshape(p_x, [-1, 313, 512])
        
        #rnn layers
        r1_x = self.rnn1(re_x)
        r2_x = self.rnn2(r1_x)                                     
        
        #output hidden dense layer 1
        r2_x = self.dense1(r2_x)                                                   
        
        #final dense layer for output
        frame_level_prob = self.dense2(r2_x)
        
        return frame_level_prob

In [36]:
# defining the parameters for architecture of the model
pool_1 = 4
pool_2 = 4
pool_3 = 2
rnn_len = 32
input_shape = (-1, 128, 313, 1)
num_class = 3

In [37]:
# instantiating the model
crnn = crnn_model(pool_1, pool_2, pool_3, rnn_len, input_shape, num_class)

In [38]:
# defining the loss function and accuracy metric
def f1(ground_truth, predicted):
    predicted = tf.keras.backend.round(predicted) 
    
    true_positive = tf.keras.backend.sum(tf.keras.backend.cast(ground_truth*predicted, 'float'), axis=0)
    true_negative = tf.keras.backend.sum(tf.keras.backend.cast((1-ground_truth)*(1-predicted), 'float'), axis=0)
    false_positive = tf.keras.backend.sum(tf.keras.backend.cast((1-ground_truth)*predicted, 'float'), axis=0)
    false_negative = tf.keras.backend.sum(tf.keras.backend.cast(ground_truth*(1-predicted), 'float'), axis=0)

    precision = true_positive / (true_positive + false_positive + tf.keras.backend.epsilon())
    recall = true_positive / (true_positive + false_negative + tf.keras.backend.epsilon())

    f1 = 2*precision*recall / (precision+recall+tf.keras.backend.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return tf.keras.backend.mean(f1)

def f1_loss(ground_truth, predicted):
    
    true_positive = tf.keras.backend.sum(tf.keras.backend.cast(ground_truth*predicted, 'float'), axis=0)
    true_negative = tf.keras.backend.sum(tf.keras.backend.cast((1-ground_truth)*(1-predicted), 'float'), axis=0)
    false_positive = tf.keras.backend.sum(tf.keras.backend.cast((1-ground_truth)*predicted, 'float'), axis=0)
    false_negative = tf.keras.backend.sum(tf.keras.backend.cast(ground_truth*(1-predicted), 'float'), axis=0)

    precision = true_positive / (true_positive + false_positive + tf.keras.backend.epsilon())
    recall = true_positive / (true_positive + false_negative + tf.keras.backend.epsilon())

    f1 = 2*precision*recall / (precision+recall+tf.keras.backend.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - tf.keras.backend.mean(f1)

In [39]:
# compiling the model
crnn.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=f1_loss, metrics=['accuracy', f1])

### Training the CRNN model

In [49]:
crnn.fit(x_train, y_train, epochs = 10, validation_data=(x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9db0275990>

In [44]:
# saving weights of the model
crnn.save_weights(root_dir+'/crnn_model_weights/CRNN_weights.h5')
crnn.summary()

Model: "crnn_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_6 (Conv2D)           multiple                  1280      
                                                                 
 batch_normalization_6 (Batc  multiple                 512       
 hNormalization)                                                 
                                                                 
 activation_6 (Activation)   multiple                  0         
                                                                 
 max_pooling2d_6 (MaxPooling  multiple                 0         
 2D)                                                             
                                                                 
 conv2d_7 (Conv2D)           multiple                  147584    
                                                                 
 batch_normalization_7 (Batc  multiple                

# Loading and normalizing test data.

In [53]:
x_test = np.load(root_dir+'/x_test.npy')
x_test = np.reshape(x_test, [x_test.shape[0], x_test.shape[1], x_test.shape[2], 1])
x_test = normalization(x_test)
print(x_test.shape)

(100, 128, 313, 1)


# Predicting with the model

In [54]:
y_pred = crnn.predict(x_test)


y_pred = HardClassAssign(y_pred)

# Sound event detection

In [55]:
n_batches = y_pred.shape[0]
n_frames = y_pred.shape[1]
i = 0
j = 0
start_music = 0
end_music = 0
start_speech = 0
end_speech = 0
name = 'test_sample-'
print('filename,event,onset,offset')
while j<n_batches:
    i=0
    start_music = 0
    end_music = 0
    start_speech = 0
    end_speech = 0
    temp = y_pred[j,:,:]
    temp = temp.astype(dtype='int')
    while i<n_frames-1:
        if (temp[i+1,:] == np.array([1,0,0])).all() and (temp[i,:] == np.array([0,0,1])).all():
            start_music = frameToTime(i+1)
        elif (temp[i,:] == np.array([1,0,0])).all() and (temp[i+1,:] == np.array([0,0,1])).all():
            end_music = frameToTime(i)
            print(name+str(j)+',Music,'+str(start_music)+','+str(end_music))
        elif (temp[i+1,:] == np.array([0,1,0])).all() and (temp[i,:] == np.array([0,0,1])).all():
            start_speech = frameToTime(i+1)
        elif (temp[i,:] == np.array([0,1,0])).all() and (temp[i+1,:] == np.array([0,0,1])).all():
            end_speech = frameToTime(i)
            print(name+str(j)+',Speech,'+str(start_speech)+','+str(end_speech))
        
        i = i+1
        
    j=j+1

filename,event,onset,offset
test_sample-0,Speech,0,9.648562300319488
test_sample-1,Music,0,0.22364217252396165
test_sample-1,Speech,2.0127795527156547,9.233226837060702
test_sample-2,Music,0,5.5910543130990416
test_sample-2,Speech,8.051118210862619,9.648562300319488
test_sample-3,Music,0,0.3194888178913738
test_sample-3,Speech,2.0447284345047922,9.26517571884984
test_sample-4,Music,0,5.143769968051118
test_sample-4,Speech,7.444089456869009,9.233226837060702
test_sample-5,Speech,0,9.648562300319488
test_sample-6,Music,0,0.8626198083067093
test_sample-6,Speech,5.015974440894569,5.079872204472843
test_sample-6,Speech,6.3897763578274756,6.741214057507987
test_sample-6,Speech,6.805111821086261,9.648562300319488
test_sample-7,Music,0,3.961661341853035
test_sample-7,Speech,5.175718849840256,6.261980830670926
test_sample-7,Speech,7.380191693290734,8.945686900958465
test_sample-8,Music,0,4.440894568690096
test_sample-9,Music,0,9.00958466453674
test_sample-10,Music,0,5.079872204472843
test_sampl

# Audio Tagging

In [56]:
y_label = y_pred
y_aud = []
for i in range(y_label.shape[0]):
    temp = audio_tag(y_label[i,:,:])
    y_aud.append(temp)

y_aud = np.array(y_aud)
n_batches = y_aud.shape[0]
i = 0
j = 0
name = 'test_sample-'
print('filename,Music,Speech')
while i<n_batches:
    if (y_aud[i,:] == np.array([1,0])).all():
        print(name+str(i)+',1,0')
    elif (y_aud[i,:] == np.array([0,1])).all():
        print(name+str(i)+',0,1')
    elif (y_aud[i,:] == np.array([1,1])).all():
        print(name+str(i)+',1,1')
    i=i+1

filename,Music,Speech
test_sample-0,0,1
test_sample-1,1,1
test_sample-2,1,1
test_sample-3,1,1
test_sample-4,1,1
test_sample-5,0,1
test_sample-6,1,1
test_sample-7,1,1
test_sample-8,1,0
test_sample-9,1,0
test_sample-10,1,1
test_sample-11,0,1
test_sample-12,1,1
test_sample-13,1,1
test_sample-14,1,1
test_sample-15,1,1
test_sample-16,0,1
test_sample-17,1,1
test_sample-18,0,1
test_sample-19,1,1
test_sample-20,0,1
test_sample-21,1,1
test_sample-22,1,1
test_sample-23,1,1
test_sample-24,1,1
test_sample-25,1,1
test_sample-26,0,1
test_sample-27,0,1
test_sample-28,0,1
test_sample-29,1,0
test_sample-30,1,1
test_sample-31,1,1
test_sample-32,0,1
test_sample-33,1,1
test_sample-34,1,1
test_sample-35,1,1
test_sample-36,1,1
test_sample-37,1,1
test_sample-38,1,0
test_sample-39,1,0
test_sample-40,0,1
test_sample-41,1,1
test_sample-42,0,1
test_sample-43,1,1
test_sample-44,1,1
test_sample-45,1,1
test_sample-46,1,1
test_sample-47,1,1
test_sample-48,0,1
test_sample-49,1,0
test_sample-50,1,0
test_sample-51,1,1
