Overview

Author: stephankokkas

This notebook defines a pipeline that takes an input directory of audio files and converts them to images using mel-spectrogram transformation and preprocessing techniques.

In [None]:
# disable warnings to tidy up output
import warnings
warnings.filterwarnings("ignore")

# some basic libraries 
from platform import python_version
#import pandas as pd
#import seaborn as sns
import numpy as np
import os
import random
from os import listdir
from os.path import isfile, join


# plot support
import matplotlib.pyplot as plt

# tensorflow support
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_io as tfio

# `tensorflow.contrib` was removed in TensorFlow 2.x, while this notebook
# relies on TF 2.x APIs such as `tf.random.set_seed`. Try the legacy location
# first (TF 1.x) and fall back to the generated audio ops module that ships
# with TF 2.x, so `audio_ops` stays available under the same name.
try:
    from tensorflow.contrib.framework.python.ops import audio_ops
except ImportError:
    from tensorflow.python.ops import gen_audio_ops as audio_ops

# scipy
import scipy

# reading audio datasets
import librosa
import librosa.display

# Quieten the two logging channels TensorFlow writes to: absl first, then
# TensorFlow's own logger. Only messages at ERROR level or above remain.
import absl.logging

absl.logging.set_verbosity(absl.logging.ERROR)
tf.get_logger().setLevel('ERROR')

# Report the interpreter and key library versions, one per line.
print(
    '%s %s' % ('Python Version     : ', python_version()),
    '%s %s' % ('TensorFlow Version : ', tf.__version__),
    '%s %s' % ('Librosa Version    : ', librosa.__version__),
    sep='\n',
)

In [None]:
# Reproducibility setup, adapted from the Keras FAQ:
# https://keras.io/getting_started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development
#
# Each random number source used by the notebook is put into a fixed,
# well-defined initial state.

# Core Python random number generator.
random.seed(a=123)

# NumPy global random number generator.
np.random.seed(seed=123)

# TensorFlow backend random number generation. For details see:
# https://www.tensorflow.org/api_docs/python/tf/random/set_seed
tf.random.set_seed(seed=1234)

In [None]:
# set system parameters

# Root folder of the dataset. When the DATASET_PATH environment variable is set
# to a non-empty value it overrides the original author's local default, so the
# notebook can run on other machines without editing this cell.
# os.path.join(..., '') guarantees a trailing path separator and leaves the
# default string unchanged.
DATASET_PATH = os.path.join(
    os.environ.get('DATASET_PATH')
    or 'C:/Users/Andrew/OneDrive - Deakin University/DataSets/birdclef2022/',
    '',
)
FILE_FORMAT = ''  # NOTE(review): empty here; its meaning is not visible in this cell - confirm against usage

SAMPLE_RATE   = 32000   # all the samples are converted to a sample rate of 32000 (Samples/Second)
MIN_FREQUENCY = 16      # minimum frequency (Hz) for the Fast Fourier Transform related functions
MAX_FREQUENCY = 4096*3  # maximum frequency (Hz) for the Fast Fourier Transform related functions (12288 Hz)
HOP_LENGTH    = 128     # the number of samples to slide spectrogram window along the audio samples
NUMBER_FFT    = 2048    # FFT size for a single spectrogram window (presumably in samples, as n_fft - TODO confirm)
NUMBER_MELS   = 128     # the number of Mel-Spectrogram groups to split the frequency dimension
# Only look at this many seconds of clip at the start of the loaded audio file.
# NOTE(review): the comment on this line used to say 10 seconds while the value is 5 - confirm which is intended.
CLIP_LENGTH   = 5