In [9]:
import librosa
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd

# Root folder of the .ogg clips; every clip found below it is converted to a
# log-scaled mel spectrogram further down in this notebook.
path_to_ogg = 'openmic-2018/audio/'

# Aggregated label file. Columns, in order:
#   sample_key, instrument, relevance, num_responses
# header=0 combined with names= swaps the file's own header row for these names.
names = [
    'sample_key',
    'instrument',
    'relevance',
    'num_responses',
]
labels = pd.read_csv(
    'openmic-2018/openmic-2018-aggregated-labels.csv',
    sep=',',
    header=0,
    names=names,
)


In [10]:
# Distinct instrument names present in the label table.
unique_labels = labels['instrument'].unique()
print(unique_labels)

['clarinet' 'flute' 'trumpet' 'saxophone' 'voice' 'accordion' 'ukulele'
 'mallet_percussion' 'piano' 'guitar' 'mandolin' 'banjo' 'synthesizer'
 'trombone' 'organ' 'drums' 'bass' 'cymbals' 'cello' 'violin']


In [11]:
# --- Architecture -----------------------------------------------------------
# Shape of one input mel-spectrogram patch and size of the embedding layer.
NUM_FRAMES = 96        # frames per patch
NUM_BANDS = 64         # frequency bands per patch
EMBEDDING_SIZE = 128   # embedding layer size

# --- Feature / example generation --------------------------------------------
SAMPLE_RATE = 16000    # Hz

# STFT framing, in seconds.
STFT_WINDOW_LENGTH_SECONDS = 0.025
STFT_HOP_LENGTH_SECONDS = 0.010

# Mel filterbank: one bin per patch band, limited to [MEL_MIN_HZ, MEL_MAX_HZ].
NUM_MEL_BINS = NUM_BANDS
MEL_MIN_HZ = 125
MEL_MAX_HZ = 7500

# Offset that stabilizes the log of the input mel-spectrogram.
LOG_OFFSET = 0.01

# Each example holds 96 frames of 10 ms; window == hop, so examples do not overlap.
EXAMPLE_WINDOW_SECONDS = 0.96
EXAMPLE_HOP_SECONDS = 0.96

In [12]:
import vggish_input

# np.save does not create missing directories, so make sure the output folder exists.
os.makedirs('spectrograms', exist_ok=True)

# Column position of each instrument inside the label / mask vectors.
label_index = {name: col for col, name in enumerate(unique_labels)}

# Group the annotations by clip once, instead of scanning the whole label
# table again for every audio file.
annotations = {}
for key, instrument, relevance in zip(labels['sample_key'], labels['instrument'], labels['relevance']):
    annotations.setdefault(key, []).append((instrument, relevance))

# One row per clip: [sample_key, <label per instrument>, <mask per instrument>]
labels_to_save = []

# Only sub-directories are walked: a stray file in the audio root (e.g. .DS_Store)
# would make os.listdir() on it raise. Sorting makes the row order of labels.csv
# reproducible (os.listdir returns entries in arbitrary order).
folders = sorted(
    entry for entry in os.listdir(path_to_ogg)
    if os.path.isdir(os.path.join(path_to_ogg, entry))
)

for idx, folder in enumerate(folders):
    folder_path = os.path.join(path_to_ogg, folder)

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".ogg"):
            continue

        # file name without the extension; used as key into the label table
        sample_key = os.path.splitext(filename)[0]

        # load audio file and resample to SAMPLE_RATE
        y, sr = librosa.load(os.path.join(folder_path, filename), sr=SAMPLE_RATE)

        # convert the waveform with vggish_input.waveform_to_examples and store the result
        spec = vggish_input.waveform_to_examples(y, sr)
        np.save('spectrograms/' + sample_key + '.npy', spec)

        # label holds the relevance of each annotated instrument (0 elsewhere);
        # mask is 1 where an annotation exists for that instrument, 0 otherwise.
        label = np.zeros(len(unique_labels))
        mask = np.zeros(len(unique_labels))
        for instrument, relevance in annotations.get(sample_key, ()):
            col = label_index.get(instrument)
            if col is None:
                # no matching column: leave the vectors untouched
                continue
            label[col] = relevance
            mask[col] = 1

        # the mixed str/float row is stored as an array of strings
        labels_to_save.append(np.append(sample_key, np.append(label, mask)))

    # print percentage of folders processed ('\r' keeps the progress on one line)
    print(f'Percentage of folders processed: {round((idx + 1) / len(folders) * 100, 2)}%', end='\r')

# Header row: 'filename', then one relevance column per instrument, then one
# '<instrument>_mask' column per instrument (same instrument order in both halves).
headers = np.append(['filename'], np.append(unique_labels, unique_labels + '_mask'))
print(headers)

# Stack header + rows; this also works when no clip was found (header-only file).
labels_to_save = np.vstack([headers, *labels_to_save])

# save labels to csv
np.savetxt('labels.csv', labels_to_save, delimiter=',', fmt='%s')

['filename' 'clarinet' 'flute' 'trumpet' 'saxophone' 'voice' 'accordion'
 'ukulele' 'mallet_percussion' 'piano' 'guitar' 'mandolin' 'banjo'
 'synthesizer' 'trombone' 'organ' 'drums' 'bass' 'cymbals' 'cello'
 'violin' 'clarinet_mask' 'flute_mask' 'trumpet_mask' 'saxophone_mask'
 'voice_mask' 'accordion_mask' 'ukulele_mask' 'mallet_percussion_mask'
 'piano_mask' 'guitar_mask' 'mandolin_mask' 'banjo_mask'
 'synthesizer_mask' 'trombone_mask' 'organ_mask' 'drums_mask' 'bass_mask'
 'cymbals_mask' 'cello_mask' 'violin_mask']
