In [None]:
from IPython.display import Audio
import matplotlib.pyplot as plt
import tensorflow as tf
import librosa
import pandas as pd
import numpy as np
import os
import glob
import re

In [None]:
# Root directory of the audio database; replace the placeholder with your own path.
# When this value is passed as the `db_root` argument of the classes in this file,
# they look for `<db_root>/samples/` (audio clips) and `<db_root>/meta.csv`
# (tab-separated annotations), and write their .npy outputs under the same root.
db_root = '.../Audio'   # add your own database directory path

In [None]:
class log_mel_spec_builder:
    """Convert every audio clip under ``<db_root>/samples`` to a log-mel spectrogram.

    Each clip is loaded at ``sr``, zero-padded or cropped to ``clip_duration``
    seconds so that all spectrograms share one shape, converted to a log-mel
    spectrogram and stacked into a single array that is saved as
    ``<db_root>/melspec_npy.npy``.

    Args:
        db_root: Database root directory; clips are read from ``db_root + '/samples'``.
        sr: Sampling rate passed to ``librosa.load`` and to the mel filterbank.
        n_fft: FFT window size for the spectrogram.
        hop_length: Hop (in samples) between successive spectrogram frames.
        n_mels: Number of mel bands.
        clip_duration: Length in seconds every clip is padded/cropped to
            (default 10, the value that used to be hard-coded).
    """

    def __init__(self, db_root, sr, n_fft, hop_length, n_mels, clip_duration=10):
        self.db_root = db_root
        self.sr = sr
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.clip_duration = clip_duration
        self.all_audio_root = db_root + '/samples'
        # glob returns files in arbitrary (filesystem) order; sort so that the
        # row order of the saved array is reproducible between runs/machines.
        self.all_audio_files = sorted(glob.glob(self.all_audio_root + "/*.*"))
        self.logmel_saver_path = os.path.join(db_root, 'melspec_npy')

    @staticmethod
    def _clip_name(audio_path):
        """Return the first run of digits in the file name, or its stem if it has none."""
        base_name = os.path.basename(audio_path)
        # Search the file name only: digits in parent directories must not win.
        digit_runs = re.findall(r'[0-9]+', base_name)
        return digit_runs[0] if digit_runs else os.path.splitext(base_name)[0]

    @staticmethod
    def _fit_length(samples, target_length):
        """Crop or right-pad a 1-D signal with zeros to exactly ``target_length`` samples."""
        samples = samples[:target_length]
        missing = target_length - samples.shape[0]
        if missing > 0:
            # np.pad keeps the input dtype (np.append with float64 zeros did not).
            samples = np.pad(samples, (0, missing), mode='constant')
        return samples

    def x_to_log_mel_spec(self, samples):
        """Return the log-mel spectrogram (in dB) of a 1-D audio signal."""
        # `y` must be passed by keyword: it is keyword-only in librosa >= 0.10.
        samples_mel_spec = librosa.feature.melspectrogram(
            y=samples,
            sr=self.sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
        )
        return librosa.power_to_db(samples_mel_spec)

    def samples_to_logmel_converter(self):
        """Build, save and return the stacked log-mel spectrograms of all clips.

        Returns:
            Array of shape ``(n_clips, n_mels, n_frames)``, ordered like
            ``self.all_audio_files``. The same array is written with
            ``np.save`` to ``self.logmel_saver_path`` (numpy appends ``.npy``).

        Raises:
            ValueError: If no audio file was found under ``self.all_audio_root``.
        """
        if not self.all_audio_files:
            raise ValueError(
                "no audio files found in {}".format(self.all_audio_root)
            )

        audio_samples = []
        for count, audio in enumerate(self.all_audio_files, start=1):
            audio_name = self._clip_name(audio)
            samples, sr = librosa.load(audio, sr=self.sr)

            # Force a common length so every spectrogram has the same frame count.
            target_length = int(round(sr * self.clip_duration))
            samples = self._fit_length(samples, target_length)

            audio_samples.append(self.x_to_log_mel_spec(samples))
            print("count: {}, audio-file: {}.wav".format(count, audio_name))

        audio_samples = np.stack(audio_samples, axis=0)
        np.save(self.logmel_saver_path, audio_samples)    # saving all log-melspectogram in npy files
        return audio_samples





    

In [None]:
def plotSpec(x):
    """Display the 2-D array ``x`` (e.g. a spectrogram) as an image.

    Draws ``x`` with ``plt.imshow`` on the current axes and then calls
    ``plt.show()`` to render the figure.
    """
    plt.imshow(x)
    plt.show()

In [None]:
class audio_tasks_encoder:
    """Build ground-truth label arrays from the annotation table ``<db_root>/meta.csv``.

    The table is read as tab-separated and must provide the columns
    ``filename``, ``onset``, ``offset`` and ``event_label``. Two targets can
    be produced: frame-level multi-hot labels (sound event detection) and
    clip-level multi-hot tags (audio tagging). Files are ordered by first
    appearance in the table; classes are ordered by sorted label name.

    The ground-truth methods never modify ``self.audio_hard_timestamp``, so
    they can be called in any order and any number of times.

    Args:
        db_root: Database root directory containing ``meta.csv``.
        no_of_frames: Number of frames per clip in the frame-level target.
        frame_duration_ms: Duration in milliseconds of one frame, used to
            convert onset/offset times to frame numbers (default 20, the
            value that used to be hard-coded).
    """

    def __init__(self, db_root, no_of_frames, frame_duration_ms=20):
        self.db_root = db_root
        self.audio_hard_timestamp = pd.read_csv(db_root + '/meta.csv', sep='\t')
        self.audio_frame_label_path = os.path.join(db_root, 'audio_frame_label')
        self.no_of_frames = no_of_frames
        self.frame_duration_ms = frame_duration_ms
        self.audio_tag_path = os.path.join(self.db_root, 'audio_tag')

    def _integer_labeling(self):
        """Map each event label to its index (0..N-1) in sorted label order."""
        labels = sorted(self.audio_hard_timestamp['event_label'].unique())
        return {label: index for index, label in enumerate(labels)}

    def _file_index(self):
        """Map each file name to its row index, in order of first appearance."""
        names = self.audio_hard_timestamp['filename'].unique().tolist()
        return {name: index for index, name in enumerate(names)}

    def one_hot_encoding(self):
        """Return ``(integer_labeling, one_hot_encoded_labels)``.

        ``integer_labeling`` maps label -> class index; row ``i`` of
        ``one_hot_encoded_labels`` is the one-hot vector of class ``i``.
        """
        integer_labeling = self._integer_labeling()
        one_hot_encoded_labels = tf.keras.utils.to_categorical(
            list(integer_labeling.values()), num_classes=len(integer_labeling)
        )
        return integer_labeling, one_hot_encoded_labels

    def framelevel_label_ground_truth(self):
        """Build, save and return the frame-level multi-hot labels.

        Returns:
            Array of shape ``(n_files, no_of_frames, n_classes)`` holding 1.0
            where a class is active in a frame and 0.0 elsewhere. The array is
            also written with ``np.save`` to ``self.audio_frame_label_path``.
        """
        integer_labeling = self._integer_labeling()
        file_index = self._file_index()
        meta = self.audio_hard_timestamp
        frame_ms = self.frame_duration_ms

        # Timestamp (seconds) -> 1-based frame number:
        #     frame_no = ceil((t * 1000 + frame_ms) / frame_ms)
        # NOTE: 20 ms corresponds to a hop of 320 samples at 16 kHz; a hop of
        # 512 samples at 16 kHz is 32 ms. Set `frame_duration_ms` so that it
        # matches the hop actually used for the spectrograms.
        frame_bounds = (
            (((meta[['onset', 'offset']] * 1000) + frame_ms) / frame_ms)
            .apply(np.ceil)
            .astype(np.int64)
            .to_numpy()
        )

        frame_labels = np.zeros(
            (len(file_index), self.no_of_frames, len(integer_labeling))
        )
        rows = zip(meta['filename'], meta['event_label'], frame_bounds)
        for filename, event_label, (start_frame, stop_frame) in rows:
            # Frame numbers are 1-based; clamp so a negative bound cannot wrap
            # around to the end of the clip. Slicing clips the upper bound.
            first = max(int(start_frame) - 1, 0)
            last = max(int(stop_frame) - 1, 0)
            # Assign instead of accumulating: overlapping events of the same
            # class must still yield a binary target.
            frame_labels[file_index[filename], first:last, integer_labeling[event_label]] = 1

        np.save(self.audio_frame_label_path, frame_labels)
        return frame_labels

    def audio_tag_ground_truth(self):
        """Build, save and return the clip-level multi-hot tags.

        Returns:
            Array of shape ``(n_files, n_classes)`` holding 1.0 for every class
            that occurs at least once in the file. The array is also written
            with ``np.save`` to ``self.audio_tag_path``.
        """
        integer_labeling = self._integer_labeling()
        file_index = self._file_index()
        meta = self.audio_hard_timestamp

        cliplevel_labels = np.zeros((len(file_index), len(integer_labeling)))
        for filename, event_label in zip(meta['filename'], meta['event_label']):
            cliplevel_labels[file_index[filename], integer_labeling[event_label]] = 1

        np.save(self.audio_tag_path, cliplevel_labels)
        return cliplevel_labels
    
