In [1]:
########################################################################################
# library imports
########################################################################################

# disable warnings
import warnings
warnings.filterwarnings("ignore")

# generic libraries
from platform import python_version
import functools
import diskcache as dc
import time
import hashlib
import numpy as np

# tensor flow / keras related libraries
import tensorflow as tf
import tensorflow_io as tfio
from keras.utils import dataset_utils

# image processing related libraries
import librosa 
import imageio
import PIL

# print system information
print('Python Version        : ', python_version())
print('TensorFlow Version    : ', tf.__version__)
print('TensorFlow IO Version : ', tfio.__version__)
print('Librosa Version       : ', librosa.__version__)

Python Version        :  3.8.16
TensorFlow Version    :  2.10.1
TensorFlow IO Version :  0.27.0
Librosa Version       :  0.10.0


In [2]:
########################################################################################
# system constants
########################################################################################
AUDIO_DATA_DIRECTORY = "d:\\data\\bc"
CACHE_DIRETORY       = "d:\\pipeline_cache"

AUDIO_NFFT = 512
AUDIO_WINDOW = 512
AUDIO_STRIDE = 512
AUDIO_SAMPLE_RATE = int(44100/2)
AUDIO_MELS = 128
AUDIO_FMIN = 0
AUDIO_FMAX = int(AUDIO_SAMPLE_RATE)/2
AUDIO_TOP_DB = 80
        
MODEL_INPUT_IMAGE_WIDTH = 256
MODEL_INPUT_IMAGE_HEIGHT = 256
MODEL_INPUT_IMAGE_CHANNELS = 3

CLASSIFIER_BATCH_SIZE=32


In [3]:
########################################################################################
# Create a DiskCache instance
# This cache will allow us store intermediate function results to speed up the 
# data processing pipeline
########################################################################################
cache = dc.Cache(CACHE_DIRETORY, cull_limit=0, size_limit=10**9) 


########################################################################################
# a helper function to create a hash key from a function signature and arguments
########################################################################################
def create_function_key(func, *args, **kwargs):
    partial_func = functools.partial(func, *args, **kwargs)
    func_name = partial_func.func.__name__
    func_module = partial_func.func.__module__
    args_repr = repr(partial_func.args)
    kwargs_repr = repr(sorted(partial_func.keywords.items()))

    key = f"{func_module}.{func_name}:{args_repr}:{kwargs_repr}"
    # Use hashlib to create a hash of the key for shorter and consistent length
    key_hash = hashlib.sha256(key.encode()).hexdigest()

    return key, key_hash, partial_func


########################################################################################
# Execute a function and cache the result
# If already executed, retrieve function output from the cache instead
########################################################################################
def execute_cached_function(func, *args, **kwargs):
    key_string,key,partial_func = create_function_key(func, *args, **kwargs)
    #print(f'key: {key_string} {key}')
    # Check if the result is in the cache
    if key in cache:
        result = cache[key]
        print(f"Result loaded from cache: {result}")
    else:
        # If not in cache, call the slow operation and store the result in cache
        result = partial_func()
        cache[key] = result
        print(f"Result calculated and stored in cache: {result}")
    return result

In [4]:
########################################################################################
# these helper functions load the audio data into a 'dataset' using only paths
# just dealing with paths at this early stage means the entire dataset can be shuffled in
# memory and split before loading the actual audio data into memory
########################################################################################
def paths_and_labels_to_dataset(image_paths, labels, num_classes):
    path_ds = tf.data.Dataset.from_tensor_slices(image_paths)
    label_ds = dataset_utils.labels_to_dataset(
        labels, 
        'categorical', 
        num_classes)
    zipped_path_ds = tf.data.Dataset.zip((path_ds, label_ds))
    return zipped_path_ds

def create_datasets(audio_files, train_split=0.7, val_split=0.2):
    file_paths, labels, class_names = dataset_utils.index_directory(
            audio_files,
            labels="inferred",
            formats=('.ogg','.mp3','.wav','.flac'),
            class_names=None,
            shuffle=False,
            seed=42,
            follow_links=False)

    dataset = paths_and_labels_to_dataset(
        image_paths=file_paths,
        labels=labels,
        num_classes=len(class_names))
    
    # Calculate the size of the dataset
    dataset_size = len(dataset)
    
    # Calculate the number of elements for each dataset split
    train_size = int(train_split * dataset_size)
    val_size = int(val_split * dataset_size)
    test_size = dataset_size - train_size - val_size

    # Shuffle the dataset
    dataset = dataset.shuffle(buffer_size=dataset_size, seed=42)

    # Split the dataset
    train_ds = dataset.take(train_size)
    val_ds = dataset.skip(train_size).take(val_size)
    test_ds = dataset.skip(train_size + val_size).take(test_size)
    
    return train_ds, val_ds, test_ds, class_names

In [5]:
# create the dataset
train_ds, val_ds, test_ds, class_names = create_datasets(AUDIO_DATA_DIRECTORY,train_split=0.8, val_split=0.19)
print("Class names: ", class_names)
print(f"Training   dataset length: {len(train_ds)}")
print(f"Validation dataset length: {len(val_ds)}")
print(f"Test       dataset length: {len(test_ds)}")

Found 524 files belonging to 5 classes.
Class names:  ['brant', 'jabwar', 'sheowl', 'spodov', 'wiltur']
Training   dataset length: 419
Validation dataset length: 99
Test       dataset length: 6


In [6]:
# show what the pipeline looks like at this stage
for item in train_ds.take(10):
    print(item)

(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\sheowl\\XC666501.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 1., 0., 0.], dtype=float32)>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\wiltur\\XC317966.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 0., 0., 1.], dtype=float32)>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\wiltur\\XC618595.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 0., 0., 1.], dtype=float32)>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\brant\\XC540354.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 0., 0., 0., 0.], dtype=float32)>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\sheowl\\XC295378.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 1., 0., 0.], dtype=float32)>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'd:\\data\\bc\\spodov\\XC443310.ogg'>, <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0

In [7]:
def dataset_example_pipeline(path, label):
    
    tmp_audio_t = None
    file_contents=tf.io.read_file(path)
    
    print(path)
    # print(file_contents)
    
    # try:
    #     if tmp_audio_t == None:
    #         tmp_audio_t = tfio.audio.decode_flac(input=file_contents, dtype=tf.int16)
    # except:
    #     pass
    
    # try:
    #     if tmp_audio_t == None:
    #         tmp_audio_t = tfio.audio.decode_flac(input=file_contents, dtype=tf.int32)
    # except:
    #     pass
    
    try:
        if tmp_audio_t == None:
            tmp_audio_t = tfio.audio.decode_vorbis(input=file_contents)
    except:
        pass    
           
    tmp_audio_t = tf.cast(tmp_audio_t, tf.float32)
    
    print(f'tmp_audio_t shape {tmp_audio_t}')
        
    # tmp_audio_t = tfio.audio.resample(tmp_audio_t, tfio.audio.AudioIOTensor(path)._rate.numpy(), AUDIO_SAMPLE_RATE)

    # Convert to spectrogram
    image = tfio.audio.spectrogram(
        tmp_audio_t[:, 0], # left channel only
        nfft=AUDIO_NFFT, 
        window=AUDIO_WINDOW, 
        stride=AUDIO_STRIDE)
    
    # Convert to melspectrogram
    image = tfio.audio.melscale(
        image, 
        rate=AUDIO_SAMPLE_RATE, 
        mels=AUDIO_MELS, 
        fmin=AUDIO_FMIN, 
        fmax=AUDIO_FMAX)
    
    print(f'image shape {image.shape}')

    # reshape into standard 3 channels to add the color channel
    image = tf.expand_dims(image, -1)
    
    # most pre-trained model expect 3 color channels
    image = tf.repeat(image, MODEL_INPUT_IMAGE_CHANNELS, axis=2)
    
    print(f'image shape {image.shape}')
    
    image = tf.ensure_shape(image, [216, 128, MODEL_INPUT_IMAGE_CHANNELS])
    image = tf.image.resize(image, (MODEL_INPUT_IMAGE_WIDTH,MODEL_INPUT_IMAGE_HEIGHT), 
                            method=tf.image.ResizeMethod.LANCZOS5)
    
    # for some reason the melspecs seem rotated by 90 degrees. This corrects that.
    image = tf.image.rot90(image, k=1)
    
    # rescale to range [0,1]
    image = image - tf.reduce_min(image) 
    image = image / (tf.reduce_max(image)+0.00001)
    
    return image, label

In [8]:
# this will allow python execution
def dataset_example_pipeline_wrapper(path, label):
    # Use a lambda function to pass two arguments to the dataset_example_pipeline function
    return tf.py_function(func=lambda x, y: dataset_example_pipeline(x, y), inp=(path, label), Tout=(tf.float32, label.dtype))


In [9]:
########################################################################################
# create the datasets useful for training a classification model
########################################################################################
train_dataset = (train_ds
                 .map(dataset_example_pipeline_wrapper)
                 .batch(CLASSIFIER_BATCH_SIZE)          
)

# validation_dataset = (val_ds
#                       .map(dataset_example_pipeline_wrapper)
#                       .batch(CLASSIFIER_BATCH_SIZE)
# )

# test_dataset = (test_ds
#                 .map(dataset_example_pipeline_wrapper)
#                 .batch(CLASSIFIER_BATCH_SIZE)
#)

In [10]:
# show what the pipeline looks like at this stage
for item in train_dataset.take(1):
    print(item)

tf.Tensor(b'd:\\data\\bc\\sheowl\\XC431869.ogg', shape=(), dtype=string)
tmp_audio_t shape [[-5.3238455e-06 -8.2403221e-06]
 [ 2.3792973e-05  1.2055902e-05]
 [-1.3287732e-05 -8.7958533e-06]
 ...
 [-1.9378622e-06 -5.5549265e-08]
 [ 2.5502404e-05  2.9752951e-05]
 [ 3.5742160e-06  7.2412176e-06]]
image shape (8756, 128)
tf.Tensor(b'd:\\data\\bc\\spodov\\XC181467.ogg', shape=(), dtype=string)
tmp_audio_t shape [[-5.3643462e-06  1.3893226e-06]
 [-4.5676023e-07  1.0300678e-05]
 [ 3.0567719e-06  2.5023275e-05]
 ...
 [ 1.1289783e-04  3.7449958e-05]
 [-4.3799012e-04 -3.3382158e-04]
 [-1.5377003e-04 -5.7742064e-04]]
image shape (482, 128)


UnknownError: {{function_node __wrapped__IteratorGetNext_output_types_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} InvalidArgumentError: {{function_node __wrapped__EnsureShape_device_/job:localhost/replica:0/task:0/device:GPU:0}} Shape of tensor input [8756,128,3] is not compatible with expected shape [216,128,3]. [Op:EnsureShape]
Traceback (most recent call last):

  File "d:\miniconda3\envs\dev\lib\site-packages\tensorflow\python\ops\script_ops.py", line 269, in __call__
    return func(device, token, args)

  File "d:\miniconda3\envs\dev\lib\site-packages\tensorflow\python\ops\script_ops.py", line 147, in __call__
    outputs = self._call(device, args)

  File "d:\miniconda3\envs\dev\lib\site-packages\tensorflow\python\ops\script_ops.py", line 154, in _call
    ret = self._func(*args)

  File "d:\miniconda3\envs\dev\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "C:\Users\Andrew\AppData\Local\Temp\__autograph_generated_filexkvl894n.py", line 12, in <lambda>
    retval_ = ag__.converted_call(ag__.ld(tf).py_function, (), dict(func=ag__.autograph_artifact((lambda x, y: ag__.converted_call(ag__.ld(dataset_example_pipeline), (ag__.ld(x), ag__.ld(y)), None, fscope))), inp=(ag__.ld(path), ag__.ld(label)), Tout=(ag__.ld(tf).float32, ag__.ld(label).dtype)), fscope)

  File "d:\miniconda3\envs\dev\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 335, in converted_call
    return _call_unconverted(f, args, kwargs, options, False)

  File "d:\miniconda3\envs\dev\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 459, in _call_unconverted
    return f(*args)

  File "C:\Users\Andrew\AppData\Local\Temp\ipykernel_37152\550811495.py", line 65, in dataset_example_pipeline
    image = tf.ensure_shape(image, [216, 128, MODEL_INPUT_IMAGE_CHANNELS])

  File "d:\miniconda3\envs\dev\lib\site-packages\tensorflow\python\util\traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None

  File "d:\miniconda3\envs\dev\lib\site-packages\tensorflow\python\framework\ops.py", line 7209, in raise_from_not_ok_status
    raise core._status_to_exception(e) from None  # pylint: disable=protected-access

tensorflow.python.framework.errors_impl.InvalidArgumentError: {{function_node __wrapped__EnsureShape_device_/job:localhost/replica:0/task:0/device:GPU:0}} Shape of tensor input [8756,128,3] is not compatible with expected shape [216,128,3]. [Op:EnsureShape]


	 [[{{node EagerPyFunc}}]] [Op:IteratorGetNext]

In [None]:
# testing the cache works
class ArrayProcessor:
    def sum_plus_five(self, arr, v2):
        array_sum = np.sum(arr)
        return array_sum + 5.0 + v2

# Usage example
processor = ArrayProcessor()

# Create a 2D NumPy array
arr = np.random.rand(1024, 1024)

In [None]:
%timeit -r1 -n1 processor.sum_plus_five(arr, 18)

In [None]:
%timeit -r1 -n1 execute_cached_function(processor.sum_plus_five, arr, 18)

In [None]:
%timeit -r1 -n1 execute_cached_function(processor.sum_plus_five,arr, 18)

In [None]:
def dataset_transforms(image, label):
    # reshape into standard 3 channels
    image = tf.io.parse_tensor(image, tf.float32)
    image = tf.expand_dims(image, -1)
    
    # most pre-trained model expect 3 color channels
    image = tf.repeat(image, MODEL_INPUT_IMAGE_CHANNELS, axis=2)
    
    image = tf.ensure_shape(image, [216, 128, MODEL_INPUT_IMAGE_CHANNELS])
    image = tf.image.resize(image, (MODEL_INPUT_IMAGE_WIDTH,MODEL_INPUT_IMAGE_HEIGHT), 
                            method=tf.image.ResizeMethod.LANCZOS5)
    
    # for some reason the melspecs seem rotated by 90 degrees. This corrects that.
    image = tf.image.rot90(image, k=1)
    
    # rescale to range [0,1]
    image = image - tf.reduce_min(image) 
    image = image / (tf.reduce_max(image)+0.00001)
    
    return image,label

In [None]:
train_dataset_b = ( 
                  train_dataset       
                  .shuffle(20000)
                  .map(dataset_transforms)
                  .batch(baseline_config.batch_size)
                  .cache()           
                )

validation_dataset_b = ( 
                  validation_dataset
                  .map(dataset_transforms)
                  .batch(baseline_config.batch_size)
                  .cache()
                )

test_dataset_b = ( 
                  test_dataset
                  .map(dataset_transforms)
                  .batch(baseline_config.batch_size)
                  .cache()
                )

train_dataset, class_names = create_dataset('TRAIN/')
test_dataset, _            = create_dataset('TEST/')
validation_dataset, _      = create_dataset('VALIDATION/')
print("class names: ", class_names)