In [17]:
import os
import platform
import random
import time
import sys

import numpy as np
import scipy as sp
import tensorflow as tf

In [18]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, Dense, MultiHeadAttention, LayerNormalization, Dropout, GlobalAveragePooling2D, TimeDistributed, Conv1D, Conv2D, GlobalAveragePooling1D, MaxPool2D, Flatten, add
from keras.models import Model
from keras.optimizers import Adam

# silence tensorflow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# getting rid of the warning messages about optimizer graph
tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(3)

In [19]:
# print Tensorflow and CUDA information
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))
print(f"Tensorflow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")
 
if tf.test.gpu_device_name():
    gpu_devices = tf.config.list_physical_devices('GPU')
    details = tf.config.experimental.get_device_details(gpu_devices[0])
    name = details.get('device_name', 'Unknown GPU')
    
    print(f"Using {name}")
else:
    print("No GPU found")

Num GPUs Available:  1
Num CPUs Available:  1
Tensorflow version: 2.11.0
Keras version: 2.11.0
Using NVIDIA GeForce RTX 3070 Laptop GPU


2023-05-09 14:07:06.697373: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-09 14:07:06.709154: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-09 14:07:06.709213: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-09 14:07:07.595607: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-09 14:07:07.595673: I tensorflow/core/co

In [20]:
import vggish_params as params


path = 'vggish_model.ckpt'

class VGGish(tf.keras.Model):
    def __init__(self, training=False):
        super(VGGish, self).__init__()
        self.training = training

        # The VGG stack of alternating convolutions and max-pools.
        self.conv1 = Conv2D(64, kernel_size=[3, 3], padding='same', activation=tf.nn.relu, trainable=self.training)
        self.pool1 = MaxPool2D(pool_size=[2, 2], padding='same', trainable=self.training)
        self.conv2 = Conv2D(128, kernel_size=[3, 3], padding='same', activation=tf.nn.relu, trainable=self.training)
        self.pool2 = MaxPool2D(pool_size=[2, 2], padding='same', trainable=self.training)
        self.conv3_1 = Conv2D(256, kernel_size=[3, 3], padding='same', activation=tf.nn.relu, trainable=self.training)
        self.conv3_2 = Conv2D(256, kernel_size=[3, 3], padding='same', activation=tf.nn.relu, trainable=self.training)
        self.pool3 = MaxPool2D(pool_size=[2, 2], padding='same', trainable=self.training)
        self.conv4_1 = Conv2D(512, kernel_size=[3, 3], padding='same', activation=tf.nn.relu, trainable=self.training)
        self.conv4_2 = Conv2D(512, kernel_size=[3, 3], padding='same', activation=tf.nn.relu, trainable=self.training)
        self.pool4 = MaxPool2D(pool_size=[2, 2], padding='same', trainable=self.training)

        # Flatten before entering fully-connected layers
        self.flatten = Flatten()
        self.fc1_1 = Dense(4096, activation=tf.nn.relu, trainable=self.training)
        self.fc1_2 = Dense(4096, activation=tf.nn.relu, trainable=self.training)
        # The embedding layer.
        self.fc2 = Dense(params.EMBEDDING_SIZE, activation=None, trainable=self.training)

    def call(self, inputs):
        net = self.conv1(inputs)
        net = self.pool1(net)
        net = self.conv2(net)
        net = self.pool2(net)
        net = self.conv3_1(net)
        net = self.conv3_2(net)
        net = self.pool3(net)
        net = self.conv4_1(net)
        net = self.conv4_2(net)
        net = self.pool4(net)

        net = self.flatten(net)
        net = self.fc1_1(net)
        net = self.fc1_2(net)
        net = self.fc2(net)
        
        return net

    def load_vggish_slim_checkpoint(self, checkpoint_path):
        """Loads a pre-trained VGGish-compatible checkpoint."""
        self.load_weights(checkpoint_path)

vggish = VGGish()
vggish.load_vggish_slim_checkpoint(path)


In [21]:
import vggish_input

class VGGishClassifier(tf.keras.Model):
    def __init__(self, vggish_model, num_classes):
        super(VGGishClassifier, self).__init__()
        self.vggish_model = vggish_model
        self.dense1 = Dense(512, activation='relu')
        self.dense2 = Dense(256, activation='relu')
        self.dense3 = Dense(128, activation='relu')
        self.skip1 = Dense(128, activation='relu')
        self.dense4 = Dense(num_classes, activation='sigmoid')
        self.dropout = Dropout(0.5)

    def call(self, inputs):
        x = self.vggish_model(inputs)
        x = self.dense1(x)
        x = self.dropout(x)
        skip = self.skip1(x)
        x = self.dense2(x)
        x = self.dropout(x)
        x = self.dense3(x)
        x = self.dropout(x)
        x = add([x, skip])
        x = self.dense4(x)
        return x



num_classes = 20  # Set the number of classes as needed
classifier_model = VGGishClassifier(vggish, num_classes)

# Compile the model
classifier_model.compile(optimizer=tf.keras.optimizers.Adam(),
                         loss=tf.keras.losses.BinaryCrossentropy(),
                         metrics=['accuracy'])

# Prepare the input data and labels
batch_size = 10
num_frames = params.NUM_FRAMES
num_bands = params.NUM_BANDS

input_data = np.random.rand(batch_size, num_frames, num_bands, 1).astype(np.float32)
# labels = np.random.randint(0, num_classes, size=(batch_size,))
# force all labels to be the same
labels = np.ones((batch_size,)) * 5

classifier_model.build(input_shape=(None, params.NUM_FRAMES, params.NUM_BANDS, 1))
classifier_model.summary()

# Train the classifier model
if False:
    classifier_model.fit(input_data, labels, epochs=10)

    predictions = classifier_model.predict(input_data)

    print(f"Predictions shape: {predictions.shape}")

Model: "vg_gish_classifier_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vg_gish_1 (VGGish)          multiple                  72141184  
                                                                 
 dense_16 (Dense)            multiple                  66048     
                                                                 
 dense_17 (Dense)            multiple                  131328    
                                                                 
 dense_18 (Dense)            multiple                  32896     
                                                                 
 dense_19 (Dense)            multiple                  65664     
                                                                 
 dense_20 (Dense)            multiple                  2580      
                                                                 
 dropout_4 (Dropout)         multiple         

In [27]:
import tensorflow as tf
from keras.layers import Dense, Dropout, Add
# import Rescaling
from keras.layers.preprocessing.image_preprocessing import Rescaling
from keras import layers

import tensorflow as tf
from keras.layers import Layer, MultiHeadAttention, Dense, LayerNormalization, Dropout, Reshape, Add

class TransformerBlock(Layer):
    def __init__(self, d_model, num_heads, ff_dim, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(d_model),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    @tf.function
    def call(self, inputs):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

class VGGishTransformerClassifier(tf.keras.Model):
    def __init__(self, vggish_model, num_classes, num_heads=4, ff_dim=512):
        super(VGGishTransformerClassifier, self).__init__()
        self.vggish_model = vggish_model
        self.reshape = Reshape((1, 128))  # Reshaping the output to (batch_size, 1, 128)
        self.transformer_block = TransformerBlock(d_model=128, num_heads=num_heads, ff_dim=ff_dim)
        self.dense1 = Dense(128, activation='relu')
        self.skip1 = Dense(128, activation='relu')
        self.flatten = Flatten()
        self.dense2 = Dense(num_classes, activation='sigmoid')
        self.dropout = Dropout(0.5)

    @tf.function
    def call(self, inputs, training=None):
        x = self.vggish_model(inputs)
        x = self.reshape(x)
        x = self.transformer_block(x)
        x = tf.squeeze(x, axis=1)  # Squeezing the output back to (batch_size, 128)
        x = self.dense1(x)
        x = self.dropout(x)
        skip = self.skip1(x)
        x = Add()([x, skip])
        x = self.flatten(x)
        x = self.dense2(x)
        return x


In [23]:
import os
import pandas as pd
import librosa
import numpy as np

# train csv path
train_csv_path = 'openmic-2018/partitions/split01_train.csv'
# test csv path
test_csv_path = 'openmic-2018/partitions/split01_test.csv'

# open csvs
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# convert to numpy arrays
train_df = train_df.to_numpy()
test_df = test_df.to_numpy()

# make each a single list
train_df = train_df.flatten()
test_df = test_df.flatten()

# print the first 5 rows of the train and test dataframes
print(train_df[:5])
print(test_df[:5])

# only use the first 10% of each csv
train_df = train_df[:int(len(train_df) * 0.15)]
test_df = test_df[:int(len(test_df) * 0.15)]

['000135_483840' '000139_119040' '000141_153600' '000144_30720'
 '000145_172800']
['000308_61440' '000312_184320' '000319_145920' '000321_218880'
 '000327_88320']


In [24]:
dataset_path = 'spectrograms'
labels_path = 'labels.csv'

# Read the labels CSV file
# ['filename' 'clarinet' 'flute' 'trumpet' 'saxophone' 'voice' 'accordion' 'ukulele' 'mallet_percussion' 'piano' 'guitar' 'mandolin' 'banjo' 'synthesizer' 'trombone' 'organ' 'drums' 'bass' 'cymbals' 'cello' 'violin']
labels_df = pd.read_csv(labels_path)

# Get the list of all the filenames
filenames = labels_df['filename'].values.tolist()

# load the spectrograms and labels
spectrograms_train = []
labels_train = []

spectrograms_test = []
labels_test = []

for filename in filenames:
    # if the filename is not in the train or test dataframe, skip it
    if filename not in train_df and filename not in test_df:
        continue

    # load the spectrogram
    spectrogram = np.load(os.path.join(dataset_path, filename + '.npy'))

    # the fist index is the filename, the next 20 are the labels and the last 20 are the masks
    label = labels_df[labels_df['filename'] == filename].values.tolist()[0][1:21]
    mask = labels_df[labels_df['filename'] == filename].values.tolist()[0][21:]

    # make a pair of the spectrogram and the label
    combined = list(zip(label, mask))

    # append each second seperatly
    if filename in train_df:
        for i in range(10):
            spectrograms_train.append(spectrogram[i])
            labels_train.append(combined)
    elif filename in test_df:
        for i in range(10):
            spectrograms_test.append(spectrogram[i])
            labels_test.append(combined)
    else:
        continue
        # print(f"Filename {filename} not found in train or test dataframes")

# convert the lists to numpy arrays
spectrograms_train = np.array(spectrograms_train)
labels_train = np.array(labels_train)

spectrograms_test = np.array(spectrograms_test)
labels_test = np.array(labels_test)

#spectrograms = spectrograms.reshape(spectrograms.shape[0], num_frames, num_bands, 1)
spectrograms_test = np.expand_dims(spectrograms_test, axis=-1)
spectrograms_train = np.expand_dims(spectrograms_train, axis=-1)


print(f"Spectrograms shape: {spectrograms_train.shape}")
print(f"Labels shape: {labels_train.shape}")

Spectrograms shape: (22370, 96, 64, 1)
Labels shape: (22370, 20, 2)


In [36]:
# custom partial binary crossentropy loss function
@tf.function
def partial_mean_squared_error(y_true, y_pred):
    # y_true and y_pred are tensors with shape (batch_size, 20, 2) and (batch_size, 20) respectively
    
    # Separate labels and masks from y_true
    labels = y_true[..., 0]
    masks = y_true[..., 1]

    # if the label is >.5 set it to 1, otherwise set it to 0
    labels = tf.cast(tf.greater_equal(labels, 0.5), tf.float32)
    
    # Compute the squared error between y_pred and labels
    squared_error = tf.square(labels - y_pred)
    
    # Apply the mask to the squared error
    masked_squared_error = squared_error * masks
    
    # Compute the mean of the masked squared error
    loss = tf.reduce_sum(masked_squared_error) / tf.reduce_sum(masks)
    
    return loss

@tf.function
def unmasked_accuracy(y_true, y_pred):
    # Separate labels and masks from y_true. The masks should be ignored.
    labels = y_true[..., 0]

    # Threshold predictions to convert them to binary values (assuming 0.5 as the threshold)
    binary_pred = tf.cast(tf.greater_equal(y_pred, 0.5), tf.float32)

    # Threshold the labels to convert them to binary values
    binary_true = tf.cast(tf.greater_equal(labels, 0.5), tf.float32)

    # Compute the element-wise equality between binary_pred and y_true
    correct_predictions = tf.cast(tf.equal(binary_pred, binary_true), tf.float32)

    # Calculate the accuracy
    accuracy = tf.reduce_mean(correct_predictions)

    return accuracy

@tf.function
def unmasked_f1_score(y_true, y_pred):
    # Separate labels and masks from y_true
    labels = y_true[..., 0]

    # Threshold predictions to convert them to binary values (assuming 0.5 as the threshold)
    binary_pred = tf.cast(tf.greater_equal(y_pred, 0.5), tf.float32)

    # Threshold the labels to convert them to binary values
    binary_true = tf.cast(tf.greater_equal(labels, 0.5), tf.float32)

    # Calculate true positives, false positives, and false negatives
    true_positives = tf.reduce_sum(binary_pred * binary_true)
    false_positives = tf.reduce_sum(binary_pred * (1 - binary_true))
    false_negatives = tf.reduce_sum((1 - binary_pred) * binary_true)

    # Calculate precision and recall
    precision = true_positives / (true_positives + false_positives + 1e-8)
    recall = true_positives / (true_positives + false_negatives + 1e-8)

    # Calculate the F1 score
    f1_score = 2 * (precision * recall) / (precision + recall + 1e-8)

    return f1_score

X_train = spectrograms_train
y_train = labels_train

X_test = spectrograms_test
y_test = labels_test

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (22370, 96, 64, 1)
y_train shape: (22370, 20, 2)
X_test shape: (7620, 96, 64, 1)
y_test shape: (7620, 20, 2)


In [37]:
# import f1
from tensorflow.keras import backend as K

num_classes = 20  # Set the number of classes as needed
# classifier_model = VGGishClassifier(vggish, num_classes)
classifier_model = VGGishTransformerClassifier(vggish, num_classes)

# Compile the model
classifier_model.compile(optimizer=tf.keras.optimizers.Adam(),
                         loss=partial_mean_squared_error,
                         metrics=[unmasked_accuracy, unmasked_f1_score])

classifier_model.build(input_shape=(None, params.NUM_FRAMES, params.NUM_BANDS, 1))

classifier_model.summary()

history = classifier_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=32, batch_size=128)

predictions = classifier_model.predict(X_test)
print(f"Predictions shape: {predictions.shape}")

# print the first prediction rounded to 2 decimal places
print(f"First prediction: {np.round(predictions[5], 2)}")
print(f"First label:      {y_test[5][0]}")
print(f"First mask:       {y_test[5][1]}")

Model: "vg_gish_transformer_classifier_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vg_gish_1 (VGGish)          multiple                  72141184  
                                                                 
 reshape_7 (Reshape)         multiple                  0         
                                                                 
 transformer_block_7 (Transf  multiple                 396032    
 ormerBlock)                                                     
                                                                 
 dense_53 (Dense)            multiple                  16512     
                                                                 
 dense_54 (Dense)            multiple                  16512     
                                                                 
 flatten_9 (Flatten)         multiple                  0         
                                  