In [1]:
########################################################################################
# library imports
########################################################################################

# disable warnings
import warnings
warnings.filterwarnings("ignore")

# generic libraries
from platform import python_version
import functools
import diskcache as dc
import time
import hashlib
import numpy as np

# tensor flow / keras related libraries
import tensorflow as tf
import tensorflow_io as tfio
from keras.utils import dataset_utils

# image processing related libraries
import librosa 
import imageio
import PIL

# print system information
print('Python Version        : ', python_version())
print('TensorFlow Version    : ', tf.__version__)
print('TensorFlow IO Version : ', tfio.__version__)
print('Librosa Version       : ', librosa.__version__)

Python Version        :  3.8.16
TensorFlow Version    :  2.10.1
TensorFlow IO Version :  0.27.0
Librosa Version       :  0.10.0


In [2]:
########################################################################################
# system constants
########################################################################################
AUDIO_DATA_DIRECTORY = "d:\\data\\bc"
CACHE_DIRETORY       = "d:\\pipeline_cache"

AUDIO_NFFT = 512
AUDIO_WINDOW = 512
AUDIO_STRIDE = 512
AUDIO_SAMPLE_RATE = int(44100/2)
AUDIO_MELS = 128
AUDIO_FMIN = 0
AUDIO_FMAX = int(AUDIO_SAMPLE_RATE)/2
AUDIO_TOP_DB = 80
        
MODEL_INPUT_IMAGE_WIDTH = 256
MODEL_INPUT_IMAGE_HEIGHT = 256
MODEL_INPUT_IMAGE_CHANNELS = 3

CLASSIFIER_BATCH_SIZE=2


In [3]:
########################################################################################
# Create a DiskCache instance
# This cache will allow us store intermediate function results to speed up the 
# data processing pipeline
########################################################################################
cache = dc.Cache(CACHE_DIRETORY, cull_limit=0, size_limit=10**9) 


########################################################################################
# a helper function to create a hash key from a function signature and arguments
########################################################################################
def create_function_key(func, *args, **kwargs):
    partial_func = functools.partial(func, *args, **kwargs)
    func_name = partial_func.func.__name__
    func_module = partial_func.func.__module__
    args_repr = repr(partial_func.args)
    kwargs_repr = repr(sorted(partial_func.keywords.items()))

    key = f"{func_module}.{func_name}:{args_repr}:{kwargs_repr}"
    # Use hashlib to create a hash of the key for shorter and consistent length
    key_hash = hashlib.sha256(key.encode()).hexdigest()

    return key, key_hash, partial_func


########################################################################################
# Execute a function and cache the result
# If already executed, retrieve function output from the cache instead
########################################################################################
def execute_cached_function(func, *args, **kwargs):
    key_string,key,partial_func = create_function_key(func, *args, **kwargs)
    #print(f'key: {key_string} {key}')
    # Check if the result is in the cache
    if key in cache:
        result = cache[key]
        print(f"Result loaded from cache: {result}")
    else:
        # If not in cache, call the slow operation and store the result in cache
        result = partial_func()
        cache[key] = result
        print(f"Result calculated and stored in cache: {result}")
    return result

In [4]:
########################################################################################
# these helper functions load the audio data into a 'dataset' using only paths
# just dealing with paths at this early stage means the entire dataset can be shuffled in
# memory and split before loading the actual audio data into memory
########################################################################################
def paths_and_labels_to_dataset(image_paths, labels, num_classes):
    path_ds = tf.data.Dataset.from_tensor_slices(image_paths)
    label_ds = dataset_utils.labels_to_dataset(
        labels, 
        'categorical', 
        num_classes)
    zipped_path_ds = tf.data.Dataset.zip((path_ds, label_ds))
    return zipped_path_ds

def create_datasets(audio_files, train_split=0.7, val_split=0.2):
    file_paths, labels, class_names = dataset_utils.index_directory(
            audio_files,
            labels="inferred",
            formats=('.ogg','.mp3','.wav','.flac'),
            class_names=None,
            shuffle=False,
            seed=42,
            follow_links=False)

    dataset = paths_and_labels_to_dataset(
        image_paths=file_paths,
        labels=labels,
        num_classes=len(class_names))
    
    # Calculate the size of the dataset
    dataset_size = len(dataset)
    
    # Calculate the number of elements for each dataset split
    train_size = int(train_split * dataset_size)
    val_size = int(val_split * dataset_size)
    test_size = dataset_size - train_size - val_size

    # Shuffle the dataset
    dataset = dataset.shuffle(buffer_size=dataset_size, seed=42)

    # Split the dataset
    train_ds = dataset.take(train_size)
    val_ds = dataset.skip(train_size).take(val_size)
    test_ds = dataset.skip(train_size + val_size).take(test_size)
    
    return train_ds, val_ds, test_ds, class_names

In [5]:
# create the dataset
train_ds, val_ds, test_ds, class_names = create_datasets(AUDIO_DATA_DIRECTORY,train_split=0.8, val_split=0.19)
print("Class names: ", class_names)
print(f"Training   dataset length: {len(train_ds)}")
print(f"Validation dataset length: {len(val_ds)}")
print(f"Test       dataset length: {len(test_ds)}")

Found 524 files belonging to 5 classes.
Class names:  ['brant', 'jabwar', 'sheowl', 'spodov', 'wiltur']
Training   dataset length: 419
Validation dataset length: 99
Test       dataset length: 6


In [6]:
# show what the pipeline looks like at this stage
for item in train_ds.take(10):
    print(item)

(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\sheowl\\XC666501.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 1., 0., 0.], dtype=float32)>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\wiltur\\XC317966.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 0., 0., 1.], dtype=float32)>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\wiltur\\XC618595.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 0., 0., 1.], dtype=float32)>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\brant\\XC540354.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 0., 0., 0., 0.], dtype=float32)>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\sheowl\\XC295378.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 1., 0., 0.], dtype=float32)>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\spodov\\XC443310.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0

In [7]:
def load_random_subsection(path, duration_secs):

    # read the file data
    file_contents=tf.io.read_file(path)

    try:
        tmp_audio_t = tfio.audio.decode_flac(input=file_contents)
    except:
        pass

    try:
        tmp_audio_t = tfio.audio.decode_vorbis(input=file_contents)
    except:
        pass

    print(tmp_audio_t)

    # cast and keep left channel only
    tmp_audio_t = tf.cast(tmp_audio_t, tf.float32)[:,-1]
    
    print(tmp_audio_t.shape)

    # resample the sample rate
    tmp_audio_t = tfio.audio.resample(tmp_audio_t, tfio.audio.AudioIOTensor(path)._rate.numpy(), AUDIO_SAMPLE_RATE)

    # Determine the audio file's duration in seconds
    audio_duration_secs = tf.shape(tmp_audio_t)[0] / AUDIO_SAMPLE_RATE
    
    if audio_duration_secs>duration_secs:
    
        print(f'duration {audio_duration_secs} {duration_secs}')

        # Calculate the starting point of the 5-second subsection
        max_start = tf.cast(audio_duration_secs - duration_secs, tf.float32)
        start_time_secs = tf.random.uniform((), 0.0, max_start, dtype=tf.float32)
        
        print(f'start_time_secs {start_time_secs} max_start_time {max_start}')

        start_index = tf.cast(start_time_secs * AUDIO_SAMPLE_RATE, dtype=tf.int32)
        print(f'start_index {start_index}')

        # Load the 5-second subsection
        end_index = tf.cast(start_index + tf.cast(duration_secs, tf.int32) * AUDIO_SAMPLE_RATE, tf.int32)
        
        print(f'end_index {start_index}')
        
        subsection = tmp_audio_t[start_index : end_index]
    
    else:
        print(f' padding it ')
        # Pad the subsection with silence if it's shorter than 5 seconds
        padding_length = duration_secs * AUDIO_SAMPLE_RATE - tf.shape(tmp_audio_t)[0]
        padding = tf.zeros([padding_length], dtype=tmp_audio_t.dtype)
        subsection = tf.concat([tmp_audio_t, padding], axis=0)

    print(f'subsection {subsection.shape}')

    return subsection

clip = load_random_subsection('d:\\data\\bc\\spodov\\XC441823.ogg', duration_secs=5.0)

tf.Tensor(
[[-8.7266113e-07 -1.0969178e-07]
 [ 1.8670749e-05  2.4067995e-05]
 [ 1.0681602e-05  1.3049542e-05]
 ...
 [ 1.3494791e-03  1.9154038e-03]
 [ 2.3285199e-04  5.0018949e-04]
 [ 4.7588945e-04  4.7343917e-04]], shape=(275712, 2), dtype=float32)
(275712,)
duration 8.615963718820861 5.0
start_time_secs 0.8278378844261169 max_start_time 3.6159636974334717
start_index 18253
end_index 18253
subsection (110250,)


In [8]:
def dataset_example_pipeline(path, label):
    
    print(f'path {path}')
    
    tmp_audio_t = load_random_subsection(path, duration_secs=5)
    
    print(f'tmp_audio_t shape {tmp_audio_t}')
        
    # Convert to spectrogram
    image = tfio.audio.spectrogram(
        tmp_audio_t,
        nfft=AUDIO_NFFT, 
        window=AUDIO_WINDOW, 
        stride=AUDIO_STRIDE)
    
    # Convert to melspectrogram
    image = tfio.audio.melscale(
        image, 
        rate=AUDIO_SAMPLE_RATE, 
        mels=AUDIO_MELS, 
        fmin=AUDIO_FMIN, 
        fmax=AUDIO_FMAX)
    
    print(f'image shape {image.shape}')

    # reshape into standard 3 channels to add the color channel
    image = tf.expand_dims(image, -1)
    
    # most pre-trained model expect 3 color channels
    image = tf.repeat(image, MODEL_INPUT_IMAGE_CHANNELS, axis=2)
    
    print(f'image shape {image.shape}')
    
    image = tf.ensure_shape(image, [216, 128, MODEL_INPUT_IMAGE_CHANNELS])
    image = tf.image.resize(image, (MODEL_INPUT_IMAGE_WIDTH,MODEL_INPUT_IMAGE_HEIGHT), 
                            method=tf.image.ResizeMethod.LANCZOS5)
    
    # for some reason the melspecs seem rotated by 90 degrees. This corrects that.
    image = tf.image.rot90(image, k=1)
    
    # rescale to range [0,1]
    image = image - tf.reduce_min(image) 
    image = image / (tf.reduce_max(image)+0.00001)
    
    return image, label

In [9]:
# this will allow python execution
def dataset_example_pipeline_wrapper(path, label):
    # Use a lambda function to pass two arguments to the dataset_example_pipeline function
    return tf.py_function(func=lambda x, y: dataset_example_pipeline(x, y), inp=(path, label), Tout=(tf.float32, label.dtype))


In [10]:
########################################################################################
# create the datasets useful for training a classification model
########################################################################################
train_dataset = (train_ds
                 .map(dataset_example_pipeline_wrapper)
                 .batch(CLASSIFIER_BATCH_SIZE)          
)

# validation_dataset = (val_ds
#                       .map(dataset_example_pipeline_wrapper)
#                       .batch(CLASSIFIER_BATCH_SIZE)
# )

# test_dataset = (test_ds
#                 .map(dataset_example_pipeline_wrapper)
#                 .batch(CLASSIFIER_BATCH_SIZE)
#)

In [16]:
# show what the pipeline looks like at this stage
for melspectrogram,label in train_dataset.take(1):
    print(f' sample info: {melspectrogram.shape}, {label}')

path b'd:\\data\\bc\\jabwar\\XC191196.ogg'
tf.Tensor(
[[ 1.4516487e-05  1.4623534e-05]
 [-7.2555854e-06 -1.4011867e-05]
 [ 1.6997699e-05  1.3320816e-05]
 ...
 [-3.3115927e-04  8.5126521e-05]
 [-5.6644093e-04 -2.4861511e-04]
 [-2.0296275e-04 -3.6039841e-05]], shape=(3046087, 2), dtype=float32)
(3046087,)
duration 95.19020408163266 5
start_time_secs 54.84065628051758 max_start_time 90.19020080566406
start_index 1209236
end_index 1209236
subsection (110250,)
tmp_audio_t shape [ 0.00392026  0.0064552   0.00502223 ...  0.00150487  0.00057983
 -0.00167692]
image shape (216, 128)
image shape (216, 128, 3)
path b'd:\\data\\bc\\sheowl\\XC431869.ogg'
tf.Tensor(
[[-5.3238455e-06 -8.2403221e-06]
 [ 2.3792973e-05  1.2055902e-05]
 [-1.3287732e-05 -8.7958533e-06]
 ...
 [-1.9378622e-06 -5.5549265e-08]
 [ 2.5502404e-05  2.9752951e-05]
 [ 3.5742160e-06  7.2412176e-06]], shape=(4482816, 2), dtype=float32)
(4482816,)
duration 140.08798185941043 5
start_time_secs 112.33496856689453 max_start_time 135.08798

In [12]:
# testing the cache works
class ArrayProcessor:
    def sum_plus_five(self, arr, v2):
        array_sum = np.sum(arr)
        return array_sum + 5.0 + v2

# Usage example
processor = ArrayProcessor()

# Create a 2D NumPy array
arr = np.random.rand(1024, 1024)

In [13]:
%timeit -r1 -n1 processor.sum_plus_five(arr, 18)

1.3 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [14]:
%timeit -r1 -n1 execute_cached_function(processor.sum_plus_five, arr, 18)

Result calculated and stored in cache: 523760.31744759623
4.94 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [15]:
%timeit -r1 -n1 execute_cached_function(processor.sum_plus_five,arr, 18)

Result loaded from cache: 523760.31744759623
736 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
