Overview

Author: stephankokkas

This notebook defines a pipeline that tasks an input directory of audio files and converts them to images using mel-spectrogram transofrmation and preprocessing techniques.

In [109]:
# disable warnings to tidy up output
import warnings
warnings.filterwarnings("ignore")

# some basic libraries 
from platform import python_version
#import pandas as pd
#import seaborn as sns
import numpy as np
import os
import random
from os import listdir
from os.path import isfile, join
import shutil
import torch
from IPython.display import Audio
import pandas as pd

# plot support
import matplotlib.pyplot as plt

# tensorflow support
import tensorflow as tf
#import tensorflow_transform as tft
import tensorflow_io as tfio
#from tensorflow.contrib.framework.python.ops import audio_ops

# scipy
import scipy
from pydub import AudioSegment, effects

# turn off tensorflow warnings
tf.get_logger().setLevel('ERROR')

# turn off absl warnings
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)

from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# print system information
print('Python Version     : ', python_version())
print('TensorFlow Version : ', tf.__version__)

Python Version     :  3.10.0
TensorFlow Version :  2.11.0


In [2]:
# below code adapted from:
# https://keras.io/getting_started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.
np.random.seed(123)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.
random.seed(123)

# The below set_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/random/set_seed
tf.random.set_seed(1234)

In [3]:
# set system parameters
DATASET_PATH  = '/Users/stephankokkas/Downloads/birdclef2022/'

if '/' in DATASET_PATH:
    DATASET_PATH = DATASET_PATH.split('/')
elif '\\' in DATASET_PATH:
    DATASET_PATH = DATASET_PATH.split('\\')

## Preprocessing Pipeline

This pipeline will go through a root directory and find all the audio files that exist and are of accepted format. Then, depending on the params set, it with normalise, trim and split the data.

In [126]:
class raw_file_pre_processing():
    def __init__(self) -> None:
        self.CLIP_LENGTH   = 5000   # only look at 5000 milliseconds of clip at the start of loaded audio file
        self.BITRATE = "32k"        # all the samples are converted to bit rate of 32000 (Samples/Second)
        self.labels = []
        self.raw_dirs = {}
        self.dataset = pd.DataFrame(columns=['Label', 'FileName', 'FileType', 'Directory'])
        self.TARGET_FORMAT = 'mp3'
        self.ACCEPTED_FORMAT = ['.mp3', '.flac', '.aiff', '.mp4', '.m4a', '.wav', '.ogg']

        self.OUTPUT_DIR = os.path.join(*DATASET_PATH, 'OUTPUT')
        self.TRAIN_DIR = os.path.join(*DATASET_PATH, 'TRAIN')
        self.TEST_DIR = os.path.join(*DATASET_PATH, 'TEST')
        self.VALIDATION_DIR = os.path.join(*DATASET_PATH, 'VALIDATION')

        self.CLEAN_DIR_ = True

        self.TRAIN_SPLIT = 0.8
        self.VALIDATION_SPLIT = 0.1

        def clean_dir(self):
            if os.path.exists(self.OUTPUT_DIR):
                shutil.rmtree(self.OUTPUT_DIR)
            if os.path.exists(self.TRAIN_DIR):
                shutil.rmtree(self.TRAIN_DIR)
            if os.path.exists(self.TEST_DIR):
                shutil.rmtree(self.TEST_DIR)
            if os.path.exists(self.VALIDATION_DIR):
                shutil.rmtree(self.VALIDATION_DIR)
        if self.CLEAN_DIR_: clean_dir(self)
            

    def get_raw_file_paths(self, directory):
        print(f'Looking for files... acceptable formats include: {self.ACCEPTED_FORMAT}')
        for root, dir, files in os.walk(os.path,join(*directory)):
            if dir == []:
                tmp_lable = os.path.split(root)[-1]
                tmp_file_dir = []
                for file in files:
                    for ext in self.ACCEPTED_FORMAT:
                        if ext in str(file):
                            tmp_file_dir.append(os.path.join(root, file))
                        
                self.raw_dirs.update({tmp_lable:tmp_file_dir})

        for key in self.raw_dirs:
            print(f'FOUND: {key} -> {len(self.raw_dirs[key])}')

    def audio_preprocessing(self, TRIM_AUDIO:bool = False, 
                                  NORM_AUDIO:bool = False, 
                                  TRAIN_TEST_SPLIT:bool = False):
        print('\nConvering audio files....')
        if not os.path.exists(self.OUTPUT_DIR):
            os.makedirs(self.OUTPUT_DIR)


        for key, item in self.raw_dirs.items():
            print(f'Converting {key} data ->> ...')
            tmp_dir_key = os.path.join(self.OUTPUT_DIR, key)
            if not os.path.exists(tmp_dir_key):
                os.makedirs(tmp_dir_key)

            for dir in item:
                try:
                    # read file

                    tmp_file_name = os.path.split(dir)[-1].split('.')[0]
                    raw_sound = AudioSegment.from_file(dir, format=dir.split('.')[-1])

                    if NORM_AUDIO:
                        # normalise file
                        raw_sound = effects.normalize(raw_sound)

                    # trim file
                    if TRIM_AUDIO:
                        arr_split_file = [raw_sound[idx:idx + self.CLIP_LENGTH] for idx in range(0, len(raw_sound), self.CLIP_LENGTH)]             
                        for count_sample, sample in enumerate(arr_split_file):
                            # padding audio < 5s
                            if len(sample) < self.CLIP_LENGTH:
                                silence = AudioSegment.silent(duration=((self.CLIP_LENGTH-len(sample))))
                                sample = sample + silence  # Adding silence after the audio

                            # export raw file
                            tmp_raw_new_dir = os.path.join(tmp_dir_key, tmp_file_name + '_raw_trim_sample_' + str(count_sample) + '.' + self.TARGET_FORMAT)
                            sample.export(tmp_raw_new_dir, format=self.TARGET_FORMAT, bitrate=self.BITRATE)

                            new_row = pd.Series({"Label": key,
                                        "FileName": tmp_file_name + '_raw_trim_sample_' + str(count_sample) + '.' + self.TARGET_FORMAT,
                                        "FileType": self.TARGET_FORMAT,
                                        "Directory": tmp_raw_new_dir})
                            self.dataset = pd.concat([self.dataset, new_row.to_frame().T], ignore_index=True)
                    else:
                        tmp_raw_new_dir = os.path.join(tmp_dir_key, tmp_file_name + '_raw_' + '.' + self.TARGET_FORMAT)
                        raw_sound.export(tmp_raw_new_dir, format=self.TARGET_FORMAT, bitrate=self.BITRATE)

                        new_row = pd.Series({"Label": key,
                                    "FileName": tmp_file_name + '_raw_' + '.' + self.TARGET_FORMAT,
                                    "FileType": self.TARGET_FORMAT,
                                    "Directory": tmp_raw_new_dir})
                        self.dataset = pd.concat([self.dataset, new_row.to_frame().T], ignore_index=True)
                except Exception as e:
                    print(e)

        if TRAIN_TEST_SPLIT:
            print(f'\nSplitting data into sub-directories Train, Test and Validation...')

            if not os.path.exists(self.TRAIN_DIR):
                os.mkdir(self.TRAIN_DIR)
            if not os.path.exists(self.TEST_DIR):
                os.mkdir(self.TEST_DIR)
            if not os.path.exists(self.VALIDATION_DIR):
                os.mkdir(self.VALIDATION_DIR)


            dict_keys = self.dataset['Label'].value_counts().to_dict()
            for key, item in dict_keys.items():
                if not os.path.exists(os.path.join(self.TRAIN_DIR, key)):
                    os.mkdir(os.path.join(self.TRAIN_DIR, key))
                if not os.path.exists(os.path.join(self.TEST_DIR, key)):
                    os.mkdir(os.path.join(self.TEST_DIR, key))
                if not os.path.exists(os.path.join(self.VALIDATION_DIR, key)):
                    os.mkdir(os.path.join(self.VALIDATION_DIR, key))

                if item % 2 == 0:
                    train_split_count_ = int(round(item*self.TRAIN_SPLIT,0))
                else:
                    train_split_count_ = int(round(item*self.TRAIN_SPLIT,0) -1)
                validation_split_count_ = int(round(item*self.VALIDATION_SPLIT,0))

                tmp_train_dirs = self.dataset.loc[self.dataset['Label'] == key][:train_split_count_]['Directory'].to_list()
                tmp_validation_dirs = self.dataset.loc[self.dataset['Label'] == key][train_split_count_:train_split_count_+validation_split_count_]['Directory'].to_list()
                tmp_test_dirs = self.dataset.loc[self.dataset['Label'] == key][train_split_count_+validation_split_count_:]['Directory'].to_list()

                for i in tmp_train_dirs:
                    os.replace(i, i.replace('OUTPUT', 'TRAIN'))
                for i in tmp_validation_dirs:
                    os.replace(i, i.replace('OUTPUT', 'VALIDATION'))
                for i in tmp_test_dirs:
                    os.replace(i, i.replace('OUTPUT', 'TEST'))

            shutil.rmtree(self.OUTPUT_DIR)

In [119]:
data_preprocessing_pipeline = raw_file_pre_processing()

data_preprocessing_pipeline.get_raw_file_paths(DATASET_PATH)

print('This next process will take approx 15 mins for the current bird dataset')
dataset = data_preprocessing_pipeline.audio_preprocessing(TRIM_AUDIO=True, NORM_AUDIO=True, TRAIN_TEST_SPLIT=True)

Looking for files... acceptable formats include: ['.mp3', '.flac', '.aiff', '.mp4', '.m4a', '.wav', '.ogg']
FOUND: jabwar -> 78
FOUND: wiltur -> 76
FOUND: sheowl -> 128
FOUND: brant -> 135
FOUND: spodov -> 107

Convering audio files....
Converting jabwar data ->> ...
Converting wiltur data ->> ...
Converting sheowl data ->> ...
Converting brant data ->> ...
Converting spodov data ->> ...

Splitting data into sub-directories Train, Test and Validation...


## Melspectrogram Pipeline

This pipeline will get all the new mp3 files, convert them to tfio tensors, convert to spectrograms, convert again to mel-spectrograms, then congert to db scale mel-spectrograms. It will then save the tensors as .pt files which can be read again

In [124]:
class mel_spectrogram_pipeline():
    def __init__(self) -> None:
        self.target_dir = ''
        self.labels = []
        self.augmented_dirs = {}
        self.OUTPUT_DIR = {}
        self.ACCEPTED_FORMAT = '.mp3'
        self.TENSOR_OUTPUT_DIR = os.path.join(*DATASET_PATH, 'tensors')

        self.NFFT = 512
        self.WINDOW = 512
        self.STRIDE = 512
        self.RATE = 16000
        self.MELS = 128
        self.FMIN = 0
        self.FMAX = 8000
        self.TOP_DB = 80

    
    def clean_dirs(self) -> None:
        print('Cleaning tensor directory')
        if os.path.exists(self.TENSOR_OUTPUT_DIR):
            shutil.rmtree(self.TENSOR_OUTPUT_DIR)

    def get_output_dir(self) -> None:
        print('Finding all pre-processed files')
        for root, dir, files in os.walk(os.path.join(*DATASET_PATH)):
            if "TRAIN" not in dir:
                if "OUTPUT" not in dir:
                    raise ValueError('Cant find any directories with pre-processed data. Looking for OUTPUT or TRAIN, TEST, and VAIDATION')
                else:
                    self.OUTPUT_DIR.update({"OUTPUT": os.path.join(root, "OUTPUT")})
            else:
                if "TEST" in dir and "VALIDATION" in dir:
                    self.OUTPUT_DIR.update({"TRAIN": os.path.join(root, "TRAIN")})
                    self.OUTPUT_DIR.update({"TEST": os.path.join(root, "TEST")})
                    self.OUTPUT_DIR.update({"VALIDATION": os.path.join(root, "VALIDATION")})
            if not self.OUTPUT_DIR:
                raise ValueError('Cant find any directories with pre-processed data. Looking for OUTPUT or TRAIN, TEST, and VAIDATION')

            print(f'\nFound the following directories {self.OUTPUT_DIR}\n')
            break
            
    def get_preprocessed_files(self) -> None:
        self.get_output_dir()
        
        for key, item in self.OUTPUT_DIR.items():
            for root, dir, files in os.walk(item):
                if dir == []:
                    tmp_lable = str(os.path.split(root)[-1]) + "-" + key
                    tmp_file_dir = []
                    for file in files:
                        if self.ACCEPTED_FORMAT in str(file):
                            tmp_file_dir.append(os.path.join(root, file))
                            
                    self.augmented_dirs.update({tmp_lable:tmp_file_dir})

        for key in self.augmented_dirs:
            print(f'FOUND: {key} -> {len(self.augmented_dirs[key])}')

    def generate_mel_spectrograms(self, MEL_SPECTRO:bool = False, 
                                        SHOW_PLOT:bool = False, 
                                        FREQ_MASK:bool = False, 
                                        TIME_MASK:bool = False) -> None:
                                        
        print(f'\nGenerating tensors... \n')
        for key, item in self.augmented_dirs.items():
            for dir in item:
                tmp_dir_key = os.path.join(self.TENSOR_OUTPUT_DIR, key)
                if not os.path.exists(tmp_dir_key):
                    os.makedirs(tmp_dir_key)
       
                file_contents=tf.io.read_file(dir)
                tmp_audio_t = tfio.audio.decode_mp3(file_contents)

                # Convert to spectrogram
                spectrogram = tfio.audio.spectrogram(
                    tmp_audio_t[:, 0], nfft=self.NFFT, window=self.WINDOW, stride=self.STRIDE)

                if SHOW_PLOT:
                    plt.figure()
                    plt.imshow(tf.math.log(spectrogram).numpy())

                if MEL_SPECTRO:
                    # # Convert to mel-spectrogram
                    mel_spectrogram = tfio.audio.melscale(
                        spectrogram, rate=self.RATE, mels=self.MELS, fmin=self.FMIN, fmax=self.FMAX)

                    if SHOW_PLOT:
                        plt.figure()
                        plt.imshow(tf.math.log(mel_spectrogram).numpy())

                    torch.save(mel_spectrogram, str(os.path.join(self.TENSOR_OUTPUT_DIR, key, os.path.split(dir)[-1].split('.')[0])) + '_raw_mel_spectrogram.pt')

                    if FREQ_MASK and "TEST" not in dir and "VALIDATION" not in dir:
                        freq_mask = tfio.audio.freq_mask(mel_spectrogram, param=10)
                        torch.save(freq_mask, str(os.path.join(self.TENSOR_OUTPUT_DIR, key, os.path.split(dir)[-1].split('.')[0])) + '_freq_mask_mel_spectrogram.pt')
                    
                    if TIME_MASK and "TEST" not in dir and "VALIDATION" not in dir:
                        time_mask = tfio.audio.time_mask(mel_spectrogram, param=10)
                        torch.save(time_mask, str(os.path.join(self.TENSOR_OUTPUT_DIR, key, os.path.split(dir)[-1].split('.')[0])) + '_time_mask_mel_spectrogram.pt')

        print("\nTensors complete.\n")

In [125]:
data_spectro_pipeline = mel_spectrogram_pipeline()

data_spectro_pipeline.clean_dirs()
data_spectro_pipeline.get_preprocessed_files()

print('This process will take approx 3 mins to complete')
data_spectro_pipeline.generate_mel_spectrograms(MEL_SPECTRO=True, SHOW_PLOT=False, FREQ_MASK=True, TIME_MASK=True)

Cleaning tensor directory
Finding all pre-processed files

Found the following directories {'TRAIN': '/Users/stephankokkas/Downloads/birdclef2022/TRAIN', 'TEST': '/Users/stephankokkas/Downloads/birdclef2022/TEST', 'VALIDATION': '/Users/stephankokkas/Downloads/birdclef2022/VALIDATION'}

FOUND: jabwar-TRAIN -> 704
FOUND: wiltur-TRAIN -> 1006
FOUND: sheowl-TRAIN -> 837
FOUND: brant-TRAIN -> 797
FOUND: spodov-TRAIN -> 654
FOUND: jabwar-TEST -> 89
FOUND: wiltur-TEST -> 126
FOUND: sheowl-TEST -> 104
FOUND: brant-TEST -> 99
FOUND: spodov-TEST -> 82
FOUND: jabwar-VALIDATION -> 88
FOUND: wiltur-VALIDATION -> 126
FOUND: sheowl-VALIDATION -> 105
FOUND: brant-VALIDATION -> 100
FOUND: spodov-VALIDATION -> 82

Generating tensors... 


Tensors complete.



## Pipeline to load data into memory for model training

This pipeline will load all the tensors into a train, test, and validation data structure and prepare it for inputs into a model for training

In [107]:
class train_test_vali_pipeline():
    def __init__(self) -> None:
        self.TENSOR_OUTPUT_DIR = os.path.join(DATASET_PATH, 'tensors')
        self.VALID_FILES = False

        self.PATHS = []
        # self.train_data

    
    def check_valid_dirs(self) -> None:
        def contains_test(arr):
            if any("TEST" in item for item in arr): return True
            return False
        def contains_train(arr):
            if any("TRAIN" in item for item in arr): return True
            return False
        def contains_vali(arr):
            if any("VALIDATION" in item for item in arr): return True
            return False

        print('Checking to find train, test and vali directories inside tensors folder...')
        for root, dir, files in os.walk(self.TENSOR_OUTPUT_DIR):
            for i in dir:
                self.PATHS.append(os.path.join(root, i))
            if contains_test(dir) and contains_train(dir) and contains_vali(dir):
                self.VALID_FILES = True
                print('PASS')
            else:
                raise ValueError('Cannot find folders from previous pipline which include train, test and validation directories')
            break

    def load_data(self, LOAD_RAW:bool = False, LOAD_FREQ:bool = False, LOAD_TIME:bool = False) -> None:
        for path in self.PATHS:
            if "TRAIN" in path:
                for file in [f for f in listdir(path) if isfile(join(path, f))]:
                    print(file)
                    tmp_data = torch.load(os.path.join(path, file))
                    print(tmp_data)
                    input()

            if "TEST" in self.PATHS:
                pass

            if "VALIDATION" in self.PATHS:
                pass

        


In [108]:
model_data_pipeline = train_test_vali_pipeline()

model_data_pipeline.check_valid_dirs()
model_data_pipeline.load_data(LOAD_RAW=True, LOAD_FREQ=True, LOAD_TIME=True)

Checking to find train, test and vali directories inside tensors folder...
PASS
XC134155_raw_trim_sample_48_time_mask_mel_spectrogram.pt
tf.Tensor(
[[0.0000000e+00 7.0380890e-03 2.1710135e-03 ... 2.2849226e-06
  3.5207192e-06 4.3118348e-06]
 [0.0000000e+00 8.1175258e-03 2.5039834e-03 ... 2.8480540e-06
  3.7891521e-06 2.9156388e-06]
 [0.0000000e+00 3.6041860e-03 1.1117700e-03 ... 3.0358758e-06
  2.7417418e-06 3.4143338e-06]
 ...
 [0.0000000e+00 1.2513897e-02 3.8601162e-03 ... 3.2041526e-06
  6.0907114e-06 5.5250848e-06]
 [0.0000000e+00 5.7932683e-03 1.7870283e-03 ... 3.0217129e-06
  3.8724802e-06 3.1444392e-06]
 [0.0000000e+00 2.8787870e-02 8.8800890e-03 ... 1.4713195e-02
  1.4917713e-02 1.5132349e-02]], shape=(313, 128), dtype=float32)


KeyboardInterrupt: Interrupted by user