In [1]:
#import necessary libraries
import os
import random
import librosa
import argparse
import warnings
import numpy as np
import configparser
from tqdm import tqdm
import IPython.display
import librosa.display
import soundfile as sf
import matplotlib.pyplot as plt
from scipy.ndimage import binary_dilation, binary_erosion

In [2]:
# Suppress every warning for the rest of the session (keeps the cell output clean).
warnings.filterwarnings('ignore')

# Get parameters from configuration file
CONFIG_PATH = 'parameters.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed. Check the result so a missing file is reported as
# such instead of surfacing later as a confusing KeyError: 'audio'.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        "Could not read configuration file '{}'".format(CONFIG_PATH))

win_len_ms = int(config['audio']['win_len_ms'])   # analysis window length (ms)
overlap = float(config['audio']['overlap'])       # fraction of overlap between windows
sr = int(config['audio']['sampling_rate'])        # sampling rate (Hz)


# Derive audio processing values
win_len = int((win_len_ms * sr) / 1000)           # window length in samples
hop_len = int(win_len * (1 - overlap))            # samples between successive windows

# Warnings are suppressed above, so degenerate values would otherwise slip
# through silently (log2(0) below) and only fail deep inside the STFT call.
if win_len < 1 or hop_len < 1:
    raise ValueError(
        'Invalid audio settings: win_len={} and hop_len={} samples '
        '(check win_len_ms, sampling_rate and overlap in {})'.format(
            win_len, hop_len, CONFIG_PATH))

nfft = int(2 ** np.ceil(np.log2(win_len)))        # smallest power of two >= win_len

In [4]:
#These functions get the indices corresponding to audio and noise in a file

def compute_audio_mask(norm_specgram, hop_len, category='audio'):
    """ Compute the section of signal corresponding to audio or noise
    This follows the approach described in
    Sprengel, E., Jaggi, M., Kilcher, Y., & Hofmann, T. (2016).
    Audio based bird species identification using deep learning techniques

    A spectrogram cell is selected when it exceeds `threshold` times both the
    median of its column and the median of its row (threshold is 3 for
    'audio' and 2.5 for 'noise'). The binary image is cleaned with a 4x4
    erosion followed by a 4x4 dilation, every column that still contains a
    selected cell is flagged, the per-column indicator is smoothed with two
    4x1 dilations, and each column is finally expanded to `hop_len` samples.

    Args:
        norm_specgram: input spectrogram with values in range [0,1]; a 2-D
            array whose columns are the frames that get expanded to samples
        hop_len: hop length used to generate the spectrogram
        category: whether 'audio' or 'noise'
    Returns:
        mask: boolean array of length norm_specgram.shape[1] * hop_len that is
            True for the samples belonging to 'category'. For 'noise' this is
            the complement of the columns selected with the lower threshold.
    Raises: ValueError if the category is not known
    """

    if category == 'audio':
        threshold = 3
    elif category == 'noise':
        threshold = 2.5
    else:
        raise ValueError('Unknown category')

    # A cell must stand out against both its own column and its own row.
    col_mask = norm_specgram > threshold * np.median(norm_specgram, axis=0)
    row_mask = (norm_specgram.T > threshold * np.median(norm_specgram, axis=1)).T
    mask = col_mask & row_mask

    # Erosion followed by dilation removes isolated cells while restoring the
    # extent of the regions that survive.
    structure = np.ones((4, 4))
    cleaned_mask = binary_dilation(binary_erosion(mask, structure), structure)

    # One flag per column: does it still contain any selected cell?
    selected_col = cleaned_mask.any(axis=0)

    # Smooth the indicator with two 4x1 dilations. The result was previously
    # computed but then discarded, so the sample mask was built from the
    # unsmoothed indicator.
    smoothing = np.ones((4, 1))
    smoothed_col = binary_dilation(selected_col[:, None], smoothing)
    smoothed_col = binary_dilation(smoothed_col, smoothing)

    # translate to audio samples: every column covers hop_len samples
    selection_mtx = np.ones((norm_specgram.shape[1], hop_len), dtype=bool) & smoothed_col
    audio_indx = selection_mtx.flatten()

    if category == 'audio':
        return audio_indx
    return ~audio_indx



def get_audio_noise(audio_array, nfft, hop_len):
    """ Split a recording into its signal samples and its noise samples
    Args:
        audio_array: an array of audio
        nfft: FFT length
        hop_len: hop length
    Returns:
        (signal, noise): the samples of audio_array selected by the 'audio'
        mask and by the 'noise' mask respectively
    """
    n_samples = len(audio_array)

    # Magnitude spectrogram, scaled by its peak (the epsilon guards against
    # dividing by zero on an all-zero spectrogram).
    magnitude = np.abs(librosa.stft(audio_array, n_fft=nfft, hop_length=hop_len))
    scaled = magnitude / (magnitude.max() + 1e-8)

    # Each mask is truncated to the length of the recording before it is
    # used as a boolean index.
    masks = {
        label: compute_audio_mask(scaled, hop_len, label)[:n_samples]
        for label in ('audio', 'noise')
    }

    return audio_array[masks['audio']], audio_array[masks['noise']]

In [5]:
def audio_noise_save(rec_dir, sr, nfft, hop_len, audio_dir, noise_dir):
    """ Save separated audio and noise parts
    Every sub-directory of rec_dir is treated as one species. Each recording
    in it is loaded, split with get_audio_noise, and written as two WAV files:
    the signal part to audio_dir/<species>/ and the noise part to noise_dir.
    Recordings that cannot be loaded or segmented are reported and skipped.

    Args:
        rec_dir: recordings' directory (one sub-directory per species)
        sr: sampling rate used to load and to write the audio
        nfft: FFT length
        hop_len: hop length
        audio_dir: path to save audio segments (created if missing)
        noise_dir: path to save noise segments (created if missing)
    Raises: FileNotFoundError if rec_dir is not an existing directory
    """
    if not os.path.isdir(rec_dir):
        # os.walk() yields nothing for a missing directory, which would
        # otherwise leak a bare StopIteration out of next().
        raise FileNotFoundError(
            'Recordings directory not found: {}'.format(rec_dir))

    # NOTE: noise files of all species share this single directory, so two
    # recordings with the same file name overwrite each other.
    os.makedirs(noise_dir, exist_ok=True)

    birds_dir = next(os.walk(rec_dir))[1]
    for species in birds_dir:
        print('Segmenting & saving', species, 'audio files.')
        species_in = os.path.join(rec_dir, species)
        species_out = os.path.join(audio_dir, species)  # path to store recordings of a given species
        os.makedirs(species_out, exist_ok=True)

        for file in tqdm(os.listdir(species_in)):
            try:
                y, _ = librosa.load(os.path.join(species_in, file), sr=sr)
                audio, noise = get_audio_noise(y, nfft, hop_len)
            except Exception as e:
                # One unreadable file must not abort the remaining files and
                # species; report it (without breaking the progress bar) and
                # move on.
                tqdm.write('Skipping {}: {}'.format(
                    os.path.join(species, file), e))
                continue

            # Swap only the extension: str.replace('mp3', 'wav') would also
            # rewrite 'mp3' inside the name and miss other extensions.
            wav_name = os.path.splitext(file)[0] + '.wav'
            sf.write(os.path.join(species_out, wav_name), audio, sr)
            sf.write(os.path.join(noise_dir, wav_name), noise, sr)

In [6]:
audio_noise_save('./xeno-canto',
                 sr,
                 nfft,
                 hop_len,
                 './noiseless-xeno-canto',
                 './noise')

  0%|                                                                                           | 0/60 [00:00<?, ?it/s]

Segmenting & saving Grey-backed Camaroptera audio files.


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [03:08<00:00,  3.15s/it]
  0%|                                                                                           | 0/45 [00:00<?, ?it/s]

Segmenting & saving Rattling Cisticola audio files.


100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [02:04<00:00,  2.77s/it]
  0%|                                                                                           | 0/98 [00:00<?, ?it/s]

Segmenting & saving Rüppell's Robin-Chat audio files.


100%|██████████████████████████████████████████████████████████████████████████████████| 98/98 [07:12<00:00,  4.42s/it]
  0%|                                                                                           | 0/47 [00:00<?, ?it/s]

Segmenting & saving Spotted Palm Thrush audio files.


100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [01:59<00:00,  2.54s/it]
