Overview

Author: stephankokkas

This notebook defines a pipeline that takes an input directory of audio files and converts them to images using mel-spectrogram transformation and preprocessing techniques.

In [121]:
# disable warnings to tidy up output
# NOTE: the filter is installed before the third-party imports below, so it is
# already active while those packages are being imported.
import warnings
warnings.filterwarnings("ignore")

# standard-library helpers plus numpy
from platform import python_version
#import pandas as pd
#import seaborn as sns
import numpy as np
import os
import random
from os import listdir
from os.path import isfile, join


# plot support
import matplotlib.pyplot as plt

# tensorflow support
import tensorflow as tf
#import tensorflow_transform as tft
#import tensorflow_io as tfio
#from tensorflow.contrib.framework.python.ops import audio_ops

# scipy, and pydub (AudioSegment is used to decode and re-encode audio files)
import scipy
from pydub import AudioSegment

# turn off tensorflow warnings: only messages at ERROR level or above are logged
tf.get_logger().setLevel('ERROR')

# turn off absl warnings: only messages at ERROR level or above are logged
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

# print system information so the run records the interpreter and TensorFlow versions
print('Python Version     : ', python_version())
print('TensorFlow Version : ', tf.__version__)

Python Version     :  3.10.0
TensorFlow Version :  2.11.0


In [122]:
# Reproducibility setup, adapted from the Keras FAQ:
# https://keras.io/getting_started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development

# Core Python: put the built-in random module into a well-defined initial state.
random.seed(123)

# NumPy: put the global NumPy random generator into a well-defined initial state.
np.random.seed(123)

# TensorFlow: give random number generation in the TensorFlow backend a
# well-defined initial state (note: this seed is 1234, not 123).
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/random/set_seed
tf.random.set_seed(1234)

In [123]:
# set system parameters

# Root folder of the dataset. The default is the original author's local path;
# set the BIRDCLEF_DATASET_PATH environment variable to point the notebook at a
# different location without editing this cell.
DATASET_PATH  = os.environ.get('BIRDCLEF_DATASET_PATH', '/Users/stephankokkas/Downloads/birdclef2022/')

SAMPLE_RATE   = 32000   # all the samples are converted to a sample rate of 32000 (Samples/Second)
MIN_FREQUENCY = 16      # minimum frequency (Hz) for the Fast Fourier Transform related functions
MAX_FREQUENCY = 4096*3  # maximum frequency (Hz) for the Fast Fourier Transform related functions (= 12288)
HOP_LENGTH    = 128     # the number of samples to slide spectrogram window along the audio samples
NUMBER_FFT    = 2048    # the number of FFT to execute within a single spectrogram window
NUMBER_MELS   = 128     # the number of Mel-Spectrogram groups to split the frequency dimension
CLIP_LENGTH   = 5       # only look at the first 5 seconds of clip at the start of loaded audio file

In [128]:
class raw_file_pre_processing():
    """Collect raw audio files per label folder and convert them to one target format.

    Typical use is ``get_raw_file_paths(directory)`` followed by
    ``convert_audo_files()``. The first call fills ``raw_dirs``; the second
    writes converted copies below ``OUTPUT_DIR`` and fills ``mp3_dirs``.
    """

    def __init__(self) -> None:
        # Not populated by any method of this class.
        self.labels = []
        # label (leaf folder name) -> list of accepted raw audio file paths
        self.raw_dirs = {}
        # label -> list of converted file paths written below OUTPUT_DIR
        self.mp3_dirs = {}
        # Format (and file extension) every file is converted to.
        self.TARGET_FORMAT = 'mp3'
        # File extensions (with leading dot) that are treated as audio input.
        self.ACCEPTED_FORMAT = ['.mp3', '.flac', '.aiff', '.mp4', '.m4a', '.wav', '.ogg']
        # Converted files are written to <DATASET_PATH>/output/<label>/.
        self.OUTPUT_DIR = os.path.join(DATASET_PATH, 'output')

    def _is_in_output_dir(self, path):
        """Return True when ``path`` is ``OUTPUT_DIR`` itself or lies below it."""
        output_root = os.path.abspath(self.OUTPUT_DIR)
        candidate = os.path.abspath(path)
        return candidate == output_root or candidate.startswith(output_root + os.sep)

    def get_raw_file_paths(self, directory):
        """Scan ``directory`` and record the audio files found in each leaf folder.

        Only folders without sub-folders are considered; the folder name is
        used as the label. Results are stored in ``self.raw_dirs`` as
        ``{label: [file paths]}`` and a per-label count is printed.

        The ``OUTPUT_DIR`` subtree is skipped, so re-running the notebook does
        not pick up previously converted files as raw input. Files are matched
        on their real (case-insensitive) extension, so each file is listed at
        most once and names such as ``clip.mp3.txt`` are ignored.

        Note: two leaf folders with the same name share one label; the one
        visited last replaces the earlier entry.

        Args:
            directory: root folder to search recursively.
        """
        print(f'Looking for files... acceptable formats include: {self.ACCEPTED_FORMAT}')
        accepted_extensions = {ext.lower() for ext in self.ACCEPTED_FORMAT}

        for root, dirnames, filenames in os.walk(directory):
            if self._is_in_output_dir(root):
                # Do not descend into the conversion output at all.
                dirnames[:] = []
                continue
            if dirnames:
                # Not a leaf folder; label folders are expected to hold files only.
                continue

            # normpath drops a trailing separator so the label is never empty.
            label = os.path.basename(os.path.normpath(root))
            self.raw_dirs[label] = [
                os.path.join(root, file_name)
                for file_name in filenames
                if os.path.splitext(file_name)[1].lower() in accepted_extensions
            ]

        for key in self.raw_dirs:
            print(f'FOUND: {key} -> {len(self.raw_dirs[key])}')

    def convert_audo_files(self):
        """Convert every file in ``self.raw_dirs`` to ``TARGET_FORMAT``.

        Each converted file is written to ``OUTPUT_DIR/<label>/<stem>.<TARGET_FORMAT>``
        where ``<stem>`` is the source file name without its final extension.
        Successfully written paths are stored in ``self.mp3_dirs`` as
        ``{label: [file paths]}``.

        Conversion is best-effort: a file that cannot be decoded or exported
        is reported on stdout and skipped; the remaining files are still
        processed.
        """
        print('\nConvering audio files....')
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

        for label, source_paths in self.raw_dirs.items():
            label_output_dir = os.path.join(self.OUTPUT_DIR, label)
            os.makedirs(label_output_dir, exist_ok=True)

            converted_paths = []
            for source_path in source_paths:
                # splitext keeps dots inside the name ('XC1.2.ogg' -> 'XC1.2').
                stem, extension = os.path.splitext(os.path.basename(source_path))
                target_path = os.path.join(label_output_dir, stem + '.' + self.TARGET_FORMAT)
                # An empty format lets pydub/ffmpeg detect the container itself.
                source_format = extension.lstrip('.').lower() or None
                try:
                    segment = AudioSegment.from_file(source_path, format=source_format)
                    # export() hands back the open output file object; close it
                    # explicitly instead of relying on garbage collection.
                    exported = segment.export(target_path, format=self.TARGET_FORMAT)
                    exported.close()
                    converted_paths.append(target_path)
                except Exception as e:
                    # Keep going, but say which file failed.
                    print(f'FAILED: {source_path} -> {e}')

            self.mp3_dirs[label] = converted_paths

In [129]:
# Build the pre-processing helper (its output folder is derived from DATASET_PATH).
data_preprocessing_pipeline = raw_file_pre_processing()

# Step 1: index the accepted audio files under DATASET_PATH, grouped by label folder.
data_preprocessing_pipeline.get_raw_file_paths(DATASET_PATH)
# Step 2: convert every indexed file to the target format under the output folder.
data_preprocessing_pipeline.convert_audo_files() 

Looking for files... acceptable formats include: ['.mp3', '.flac', '.aiff', '.mp4', '.m4a', '.wav', '.ogg']
FOUND: jabwar -> 78
FOUND: wiltur -> 76
FOUND: sheowl -> 128
FOUND: brant -> 135
FOUND: spodov -> 107

Convering audio files....
