Overview

Author: stephankokkas

This notebook defines a pipeline that takes an input directory of audio files and converts them to images using mel-spectrogram transformation and preprocessing techniques.

In [1]:
# disable warnings to tidy up output
import warnings
warnings.filterwarnings("ignore")

# some basic libraries 
from platform import python_version
#import pandas as pd
#import seaborn as sns
import numpy as np
import os
import random
from os import listdir
from os.path import isfile, join
import shutil
import torch
import pandas as pd

# plot support
import matplotlib.pyplot as plt

# tensorflow support
import tensorflow as tf
#import tensorflow_transform as tft
import tensorflow_io as tfio
#from tensorflow.contrib.framework.python.ops import audio_ops

# scipy
import scipy
from pydub import AudioSegment, effects

# turn off tensorflow warnings
tf.get_logger().setLevel('ERROR')

# turn off absl warnings
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# print system information
print('Python Version     : ', python_version())
print('TensorFlow Version : ', tf.__version__)

2022-11-25 20:11:55.449501: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Python Version     :  3.10.0
TensorFlow Version :  2.11.0


In [2]:
# Reproducibility set-up, adapted from the Keras FAQ:
# https://keras.io/getting_started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development

# Python's built-in random number generator.
random.seed(123)

# NumPy's global random number generator.
np.random.seed(123)

# TensorFlow's global seed; see
# https://www.tensorflow.org/api_docs/python/tf/random/set_seed
tf.random.set_seed(1234)

In [3]:
# set system parameters
# Root folder of the dataset. Set the BIRDCLEF_DATASET_PATH environment variable to
# point the notebook at another location without editing this cell; when it is not
# set, the original hard-coded path is used.
DATASET_PATH  = os.environ.get('BIRDCLEF_DATASET_PATH', '/Users/stephankokkas/Downloads/birdclef2022/')

## Preprocessing Pipeline

This pipeline will go through a root directory and find all the audio files that exist and are of an accepted format. Then, depending on the params set, it will normalise, trim and split the data.

In [143]:
class raw_file_pre_processing():
    """Convert a tree of raw audio files into uniform clips and optionally split them.

    Typical use: create an instance, call get_raw_file_paths() to index the raw
    recordings (one leaf directory per label), then call audio_preprocessing()
    to export the clips and, optionally, distribute them over TRAIN, VALIDATION
    and TEST directories.

    WARNING: with clean_dir=True (the default) creating an instance deletes the
    OUTPUT, TRAIN, TEST and VALIDATION directories under the dataset root.
    """

    def __init__(self, dataset_path: str = None, clean_dir: bool = True) -> None:
        """Set up the pipeline parameters and working directories.

        Args:
            dataset_path: root directory that holds the generated OUTPUT, TRAIN,
                TEST and VALIDATION directories. Defaults to the notebook-level
                DATASET_PATH.
            clean_dir: when True, remove directories generated by a previous run
                before doing anything else.
        """
        root_dir = DATASET_PATH if dataset_path is None else dataset_path

        self.CLIP_LENGTH   = 5000   # length of every exported clip in milliseconds; longer files are cut into chunks of this size
        self.BITRATE = "32k"        # bitrate argument handed to export() for every clip (a bitrate, not a sample rate)
        self.labels = []
        self.raw_dirs = {}          # label -> list of raw audio file paths, filled by get_raw_file_paths()
        self.dataset = pd.DataFrame(columns=['Label', 'FileName', 'FileType', 'Directory'])
        self.TARGET_FORMAT = 'mp3'
        self.ACCEPTED_FORMAT = ['.mp3', '.flac', '.aiff', '.mp4', '.m4a', '.wav', '.ogg']

        self.OUTPUT_DIR = os.path.join(root_dir, 'OUTPUT')
        self.TRAIN_DIR = os.path.join(root_dir, 'TRAIN')
        self.TEST_DIR = os.path.join(root_dir, 'TEST')
        self.VALIDATION_DIR = os.path.join(root_dir, 'VALIDATION')

        self.CLEAN_DIR_ = clean_dir

        self.TRAIN_SPLIT = 0.8
        self.VALIDATION_SPLIT = 0.1

        if self.CLEAN_DIR_:
            self.clean_dir()

    def clean_dir(self):
        """Delete the OUTPUT, TRAIN, TEST and VALIDATION directories if they exist."""
        for generated_dir in (self.OUTPUT_DIR, self.TRAIN_DIR, self.TEST_DIR, self.VALIDATION_DIR):
            if os.path.exists(generated_dir):
                shutil.rmtree(generated_dir)

    def get_raw_file_paths(self, directory):
        """Index the raw audio files below `directory`, grouped by label.

        Every directory without sub-directories is treated as one label (the
        directory name) and its files with an accepted extension are stored in
        self.raw_dirs[label]. Directories generated by this pipeline (OUTPUT,
        TRAIN, TEST, VALIDATION) are skipped so that clips from a previous run
        are never picked up as raw data.

        Args:
            directory: root directory to search.
        """
        print(f'Looking for files... acceptable formats include: {self.ACCEPTED_FORMAT}')
        generated_dirs = [os.path.abspath(path) for path in
                          (self.OUTPUT_DIR, self.TRAIN_DIR, self.TEST_DIR, self.VALIDATION_DIR)]
        accepted = tuple(ext.lower() for ext in self.ACCEPTED_FORMAT)

        for root, subdirs, files in os.walk(directory):
            if subdirs:
                continue  # only leaf directories hold the recordings of a label

            root_abs = os.path.abspath(root)
            if any(root_abs == gen or root_abs.startswith(gen + os.sep) for gen in generated_dirs):
                continue

            label = os.path.basename(os.path.normpath(root))
            # Compare the real extension (case-insensitively) so that names such as
            # "clip.mp3.txt" are rejected and no file is listed twice; sorting keeps
            # the order, and therefore the later split, stable between runs.
            audio_files = [os.path.join(root, name) for name in sorted(files)
                           if os.path.splitext(name)[1].lower() in accepted]
            self.raw_dirs.update({label: audio_files})

        for key in self.raw_dirs:
            print(f'FOUND: {key} -> {len(self.raw_dirs[key])}')

    def audio_preprocessing(self, TRIM_AUDIO:bool = False, NORM_AUDIO:bool = False, TRAIN_TEST_SPLIT:bool = False):
        """Export every indexed file in the target format and record it in self.dataset.

        Args:
            TRIM_AUDIO: cut each file into CLIP_LENGTH-millisecond clips, padding
                the last clip with silence; otherwise export the file whole.
            NORM_AUDIO: normalise each file before exporting.
            TRAIN_TEST_SPLIT: move the exported clips into TRAIN, VALIDATION and
                TEST directories and delete OUTPUT afterwards.

        Returns:
            The dataset DataFrame (columns Label, FileName, FileType, Directory),
            which is also available as self.dataset.
        """
        print('\nConvering audio files....')
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

        # Rows are collected in a list and turned into a frame once; growing a
        # DataFrame with concat for every clip is quadratic.
        records = []
        for key, item in self.raw_dirs.items():
            print(f'Converting {key} data ->> ...')
            tmp_dir_key = os.path.join(self.OUTPUT_DIR, key)
            os.makedirs(tmp_dir_key, exist_ok=True)

            for src_path in item:
                try:
                    self._convert_file(src_path, key, tmp_dir_key, TRIM_AUDIO, NORM_AUDIO, records)
                except Exception as e:
                    # Best effort: one unreadable file must not stop the run, but
                    # say which file was skipped.
                    print(f'Skipping {src_path}: {e}')

        if records:
            new_rows = pd.DataFrame(records, columns=self.dataset.columns)
            if self.dataset.empty:
                self.dataset = new_rows
            else:
                self.dataset = pd.concat([self.dataset, new_rows], ignore_index=True)

        if TRAIN_TEST_SPLIT:
            print(f'\nSplitting data into sub-directories Train, Test and Validation...')
            self._split_into_subsets()

        return self.dataset

    def _convert_file(self, src_path, label, out_dir, trim, normalise, records):
        """Export one raw file (whole or as clips) into out_dir and append a row per exported file to records."""
        file_name = os.path.basename(src_path)
        stem = file_name.split('.')[0]
        src_format = os.path.splitext(file_name)[1].lstrip('.').lower()
        raw_sound = AudioSegment.from_file(src_path, format=src_format)

        if normalise:
            raw_sound = effects.normalize(raw_sound)

        if trim:
            for count_sample, start in enumerate(range(0, len(raw_sound), self.CLIP_LENGTH)):
                sample = raw_sound[start:start + self.CLIP_LENGTH]
                # pad the last, shorter clip with trailing silence
                if len(sample) < self.CLIP_LENGTH:
                    sample = sample + AudioSegment.silent(duration=self.CLIP_LENGTH - len(sample))
                out_name = stem + '_raw_trim_sample_' + str(count_sample) + '.' + self.TARGET_FORMAT
                self._export(sample, label, out_dir, out_name, records)
        else:
            out_name = stem + '_raw_' + '.' + self.TARGET_FORMAT
            self._export(raw_sound, label, out_dir, out_name, records)

    def _export(self, sound, label, out_dir, out_name, records):
        """Write one audio segment to out_dir/out_name and record it."""
        out_path = os.path.join(out_dir, out_name)
        sound.export(out_path, format=self.TARGET_FORMAT, bitrate=self.BITRATE)
        records.append({"Label": label,
                        "FileName": out_name,
                        "FileType": self.TARGET_FORMAT,
                        "Directory": out_path})

    def _split_counts(self, total):
        """Return (train_count, validation_count) for a label with `total` clips; the rest is the test set."""
        train_count = int(round(total * self.TRAIN_SPLIT, 0))
        # NOTE(review): labels with an odd number of clips give up one training clip
        # to the test set; kept exactly as the original rule - confirm it is intended.
        if total % 2 != 0:
            train_count -= 1
        validation_count = int(round(total * self.VALIDATION_SPLIT, 0))
        return train_count, validation_count

    def _split_into_subsets(self):
        """Move the exported clips into TRAIN/VALIDATION/TEST per label, then delete OUTPUT."""
        subset_dirs = (self.TRAIN_DIR, self.VALIDATION_DIR, self.TEST_DIR)
        moved = {}

        for key, item in self.dataset['Label'].value_counts().to_dict().items():
            for subset_dir in subset_dirs:
                os.makedirs(os.path.join(subset_dir, key), exist_ok=True)

            train_count, validation_count = self._split_counts(item)
            label_paths = self.dataset.loc[self.dataset['Label'] == key, 'Directory'].to_list()
            validation_end = train_count + validation_count
            subsets = (label_paths[:train_count],
                       label_paths[train_count:validation_end],
                       label_paths[validation_end:])

            for subset_dir, paths in zip(subset_dirs, subsets):
                for path in paths:
                    # Build the target from the subset directory instead of replacing
                    # the text 'OUTPUT' in the path, which would also rewrite any
                    # other 'OUTPUT' in the dataset root or in a file name.
                    target = os.path.join(subset_dir, key, os.path.basename(path))
                    os.replace(path, target)
                    moved[path] = target

        # keep the Directory column pointing at files that still exist
        self.dataset['Directory'] = self.dataset['Directory'].map(lambda path: moved.get(path, path))

        shutil.rmtree(self.OUTPUT_DIR)

In [144]:
data_preprocessing_pipeline = raw_file_pre_processing()

data_preprocessing_pipeline.get_raw_file_paths(DATASET_PATH)
data_preprocessing_pipeline.audio_preprocessing(TRIM_AUDIO=True, NORM_AUDIO=True, TRAIN_TEST_SPLIT=True)

# The table of exported clips is kept on the pipeline's `dataset` attribute, so read
# it from there rather than relying on the value returned by the call above.
dataset = data_preprocessing_pipeline.dataset

Looking for files... acceptable formats include: ['.mp3', '.flac', '.aiff', '.mp4', '.m4a', '.wav', '.ogg']
FOUND: jabwar -> 78
FOUND: wiltur -> 76
FOUND: sheowl -> 128
FOUND: brant -> 135
FOUND: spodov -> 107

Convering audio files....
Converting jabwar data ->> ...
Converting wiltur data ->> ...
Converting sheowl data ->> ...
Converting brant data ->> ...
Converting spodov data ->> ...

Splitting data into subdirectories Train, Test and Validation...


## Melspectrogram Pipeline

This pipeline will get all the new mp3 files, convert them to tfio tensors, convert to spectrograms, convert again to mel-spectrograms, then convert to db scale mel-spectrograms. It will then save the tensors as .pt files which can be read again.

In [11]:
class mel_spectrogram_pipeline():
    """Turn the pre-processed mp3 clips into (dB-scaled) mel-spectrogram tensors.

    Typical use: clean_dirs(), get_preprocessed_files(), then
    generate_mel_spectrograms(). Tensors are written with torch.save() below
    the 'tensors' directory of the dataset root, one sub-directory per
    "<label>-<subset>" key.
    """

    def __init__(self, dataset_path: str = None) -> None:
        """Set up the directories and spectrogram parameters.

        Args:
            dataset_path: root directory holding the pre-processed audio
                (OUTPUT, or TRAIN/TEST/VALIDATION). Defaults to the
                notebook-level DATASET_PATH.
        """
        self.DATASET_DIR = DATASET_PATH if dataset_path is None else dataset_path
        self.target_dir = ''
        self.labels = []
        self.augmented_dirs = {}    # "<label>-<subset>" -> list of mp3 paths, filled by get_preprocessed_files()
        self.OUTPUT_DIR = {}        # subset name -> directory, filled by get_output_dir()
        self.ACCEPTED_FORMAT = '.mp3'
        self.TENSOR_OUTPUT_DIR = os.path.join(self.DATASET_DIR, 'tensors')

        # arguments of tfio.audio.spectrogram
        self.NFFT = 512
        self.WINDOW = 512
        self.STRIDE = 256
        # arguments of tfio.audio.melscale
        # NOTE(review): RATE is a fixed value and is not read from the audio files -
        # confirm it matches the sample rate of the exported clips.
        self.RATE = 16000
        self.MELS = 128
        self.FMIN = 0
        self.FMAX = 8000
        # argument of tfio.audio.dbscale
        self.TOP_DB = 80

    def clean_dirs(self):
        """Delete the tensor output directory if it exists."""
        print('Cleaning tensor directory')
        if os.path.exists(self.TENSOR_OUTPUT_DIR):
            shutil.rmtree(self.TENSOR_OUTPUT_DIR)

    def get_output_dir(self):
        """Locate the directories that hold the pre-processed audio.

        TRAIN, TEST and VALIDATION are used when all three exist directly under
        the dataset root; OUTPUT is used when there is no TRAIN directory. The
        result replaces the contents of self.OUTPUT_DIR.

        Raises:
            ValueError: if neither layout is found (this includes a dataset root
                that does not exist).
        """
        print('Finding all pre-processed files')

        def subset_dir(name):
            return os.path.join(self.DATASET_DIR, name)

        def has(name):
            return os.path.isdir(subset_dir(name))

        found = {}
        if has("TRAIN"):
            # a partial split (TRAIN without TEST or VALIDATION) counts as not found
            if has("TEST") and has("VALIDATION"):
                found = {name: subset_dir(name) for name in ("TRAIN", "TEST", "VALIDATION")}
        elif has("OUTPUT"):
            found = {"OUTPUT": subset_dir("OUTPUT")}

        if not found:
            raise ValueError('Cant find any directories with pre-processed data. Looking for OUTPUT or TRAIN, TEST, and VALIDATION')

        # drop entries from an earlier call so a changed layout leaves nothing stale
        self.OUTPUT_DIR.clear()
        self.OUTPUT_DIR.update(found)
        print(f'Found the following directories {self.OUTPUT_DIR}')

    def get_preprocessed_files(self):
        """Index the pre-processed mp3 files per label and subset.

        Every directory without sub-directories below a located subset directory
        becomes one "<label>-<subset>" key in self.augmented_dirs.
        """
        self.get_output_dir()

        accepted = self.ACCEPTED_FORMAT.lower()
        for key, item in self.OUTPUT_DIR.items():
            for root, subdirs, files in os.walk(item):
                if subdirs:
                    continue
                tmp_lable = os.path.basename(os.path.normpath(root)) + "-" + key
                # match the extension at the end of the name, not anywhere inside it
                tmp_file_dir = [os.path.join(root, name) for name in sorted(files)
                                if name.lower().endswith(accepted)]
                self.augmented_dirs.update({tmp_lable: tmp_file_dir})

        for key in self.augmented_dirs:
            print(f'FOUND: {key} -> {len(self.augmented_dirs[key])}')

    def generate_mel_spectrograms(self, DB_SCALE:bool = False, MEL_SPECTRO:bool = False):
        """Compute and save spectrogram tensors for every indexed mp3 file.

        Args:
            DB_SCALE: save the dB-scaled mel-spectrogram
                ("<name>_dbscale_raw_mel_spectrogram.pt").
            MEL_SPECTRO: save the mel-spectrogram ("<name>_raw_mel_spectrogram.pt").

        The mel-spectrogram is computed whenever either flag is set, so DB_SCALE
        can be used on its own.
        """
        print(f'\nGenerating melspectrogram and db-melspectrogram per mp3 file... \n')
        if not (DB_SCALE or MEL_SPECTRO):
            print('Nothing to generate: DB_SCALE and MEL_SPECTRO are both False')
            return

        for key, item in self.augmented_dirs.items():
            tmp_dir_key = os.path.join(self.TENSOR_OUTPUT_DIR, key)
            if item:
                os.makedirs(tmp_dir_key, exist_ok=True)

            for audio_path in item:
                file_stem = os.path.basename(audio_path).split(".")[0]

                tmp_audio = tfio.audio.AudioIOTensor(audio_path)
                tmp_audio_t = tf.cast(tmp_audio.to_tensor(), tf.float32)
                # The spectrogram frames the LAST axis of its input, so the
                # [samples, channels] audio tensor has to be reduced to a 1-D mono
                # signal first; averaging the channels handles mono and stereo.
                if len(tmp_audio_t.shape) > 1:
                    tmp_audio_t = tf.reduce_mean(tmp_audio_t, axis=-1)

                # Convert to spectrogram
                spectrogram = tfio.audio.spectrogram(
                    tmp_audio_t, nfft=self.NFFT, window=self.WINDOW, stride=self.STRIDE)

                # Convert to mel-spectrogram
                # ValueError: upper_edge_hertz must not be larger than the Nyquist frequency (sample_rate / 2)
                mel_spectrogram = tfio.audio.melscale(
                    spectrogram, rate=self.RATE, mels=self.MELS, fmin=self.FMIN, fmax=self.FMAX)
                if MEL_SPECTRO:
                    torch.save(mel_spectrogram, os.path.join(tmp_dir_key, f'{file_stem}_raw_mel_spectrogram.pt'))

                if DB_SCALE:
                    # Convert to db scale mel-spectrogram
                    dbscale_mel_spectrogram = tfio.audio.dbscale(
                        mel_spectrogram, top_db=self.TOP_DB)
                    torch.save(dbscale_mel_spectrogram, os.path.join(tmp_dir_key, f'{file_stem}_dbscale_raw_mel_spectrogram.pt'))

        print('Done')

                # if FREQ_SHIFT:
                #     #frequency shift
                #     octave = -0.5
                #     new_sample_rate = int(sample.frame_rate * (2.0 ** octave))
                #     freg_shift_sample = sample._spawn(sample.raw_data, overrides={'frame_rate': new_sample_rate})
                    
                #     # export freq_shift file
                #     tmp_freq_new_dir = os.path.join(tmp_dir_key, tmp_file_name + '_freq_' + str(count_sample) + '.' + self.TARGET_FORMAT)
                #     freg_shift_sample.export(tmp_freq_new_dir, format=self.TARGET_FORMAT, bitrate=self.BITRATE)
                #     tmp_arr_mp3_dir.append(tmp_freq_new_dir)

                # # Freq masking
                # freq_mask = tfio.audio.freq_mask(dbscale_mel_spectrogram, param=10)
                # print(freq_mask)
                # input()

                # Time masking
                # time_mask = tfio.audio.time_mask(dbscale_mel_spectrogram, param=10)

In [12]:
# Build the spectrogram pipeline and clear tensors left over from an earlier run.
data_spectro_pipeline = mel_spectrogram_pipeline()
data_spectro_pipeline.clean_dirs()

# Index the pre-processed mp3 files, then save mel-spectrograms only (no dB scaling).
data_spectro_pipeline.get_preprocessed_files()
data_spectro_pipeline.generate_mel_spectrograms(
    MEL_SPECTRO=True,
    DB_SCALE=False,
)

Cleaning tensor directory


TypeError: mel_spectrogram_pipeline.get_output_dir() takes 1 positional argument but 2 were given