# Features Generation
In this notebook, we will compute the melspectrograms of the audio files. A melspectrogram is obtained by converting the frequency axis of a spectrogram to the mel scale. A spectrogram is computed by applying the short-time Fourier transform (STFT) to an audio signal, in our case using a Hamming window of size 1024 with 75% overlap. Audio files that are shorter than the threshold length are padded with noise to ensure that they are at least as long as the threshold. The computed spectrograms are then converted to melspectrograms using 40 mel filters.

In [None]:
import os
import csv
import random
import librosa
import numpy as np
import configparser
from tqdm import tqdm
import IPython.display
import librosa.display
import soundfile as sf
import matplotlib.pyplot as plt

## Load the parameters required to compute the spectrograms.

In [None]:
# Get parameters from configuration file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse. Fail early with a clear message instead of hitting a
# confusing KeyError on the 'audio' section below.
if not config.read('parameters.ini'):
    raise FileNotFoundError("Could not read configuration file 'parameters.ini'")

win_len_ms = int(config['audio']['win_len_ms'])        # window length in milliseconds
overlap = float(config['audio']['overlap'])            # fractional overlap between frames
sampling_rate = int(config['audio']['sampling_rate'])  # samples per second

# Derive audio processing values
win_len = int((win_len_ms * sampling_rate) / 1000)     # window length in samples
# Round away floating-point noise before truncating: 1 - 0.9 evaluates to
# 0.09999999999999998, which would otherwise turn a hop of 100 samples into 99.
hop_len = int(round(win_len * (1 - overlap), 6))
nfft = int(2 ** np.ceil(np.log2(win_len)))             # next power of two >= win_len

## Audio padding
For our classification models, we need fixed-size inputs. However, both before and after signal and noise separation, some files are shorter than the required threshold length. To ensure these files are at least as long as the threshold, we will pad them with noise. Padding a signal with noise also helps in developing the model by exposing it to the noise it will encounter in the field.

Let's load a signal file and plot its spectrogram.

In [None]:
# Load the example recording at the configured sampling rate
signal, _ = librosa.load('./sample_data/signal.wav', sr=sampling_rate)

# Magnitude of the short-time Fourier transform of the recording
specgram = np.abs(librosa.stft(signal, n_fft=nfft, hop_length=hop_len))

# Display the magnitudes in dB, referenced to the largest value
librosa.display.specshow(
    librosa.amplitude_to_db(specgram, ref=np.max),
    sr=sampling_rate,
    hop_length=hop_len,
    x_axis='time',
    y_axis='linear',
)

Now, let's pad it with noise to ensure it is at least 3 seconds long and visualize the result.

In [None]:
duration = 3

# A recording shorter than `duration` seconds gets the noise clip appended
# repeatedly until it is long enough.
if len(signal) / sampling_rate < duration:
    noise, _ = librosa.load('./sample_data/noise.wav', sr=sampling_rate)
    while True:
        signal = np.concatenate((signal, noise))
        if len(signal) / sampling_rate >= duration:
            break

# Keep at most duration * sampling_rate + 1 samples
signal = signal[:int(duration * sampling_rate) + 1]

# Magnitude of the short-time Fourier transform of the padded signal
specgram = np.abs(librosa.stft(signal, n_fft=nfft, hop_length=hop_len))

# Display the magnitudes in dB, referenced to the largest value
librosa.display.specshow(
    librosa.amplitude_to_db(specgram, ref=np.max),
    sr=sampling_rate,
    hop_length=hop_len,
    x_axis='time',
    y_axis='linear',
)

Now let's apply the above steps to the signal files we obtained by separating signal and noise in the downloaded audio files. We will then compute the melspectrograms of the files and store them as NumPy array (`.npy`) files.

In [None]:
def pad_audio(signal, duration, sampling_rate, noise_dir):
    """ Pad signal if necessary to ensure it is at least duration seconds long
    Args:
        signal: the signal to be padded
        duration: the minimum duration in seconds
        sampling_rate: the sampling rate
        noise_dir: the noise directory; only accessed when padding is needed
    Returns:
        the original signal, unchanged, if it is already long enough;
        otherwise the signal followed by repetitions of a randomly chosen
        noise file, cut to int(duration * sampling_rate) + 1 samples
    Raises:
        ValueError: if the chosen noise file contains no samples
    """

    if len(signal) / sampling_rate >= duration:
        return signal

    # Only touch the noise directory when padding is actually required
    filename = random.choice(os.listdir(noise_dir))
    noise_signal, _ = librosa.load(os.path.join(noise_dir, filename),
                                   sr=sampling_rate)
    if len(noise_signal) == 0:
        # An empty noise clip would never lengthen the signal (endless loop)
        raise ValueError('noise file {} contains no samples'.format(filename))

    # Collect the pieces first and concatenate once, instead of copying the
    # growing array on every repetition
    pieces = [signal]
    total_len = len(signal)
    while total_len / sampling_rate < duration:
        pieces.append(noise_signal)
        total_len += len(noise_signal)

    return np.concatenate(pieces)[:int(duration * sampling_rate) + 1]
    
    
def features_extraction(audio_dir,
                         name,
                         species,
                         file,
                         noise_dir,
                         sampling_rate,
                         duration,
                         nfft,
                         win_length,
                         hop_length,
                         num_mels=40,
                         melspectrogram_dir=None):

    """Compute the melspectrogram of one recording and optionally save it
    Args:
        audio_dir: directory containing audio
        name: name to save the melspectrogram with
        species: name of the subdirectory containing a given species' recordings
        file: name of the recording
        noise_dir: directory with noise samples for padding
        sampling_rate: audio sampling rate
        duration: minimum duration of files
        nfft: FFT length
        win_length: window length
        hop_length: number of samples between adjacent frames
        num_mels: number of melspectrogram channels
        melspectrogram_dir: directory to save the melspectrogram in; it is
            created if missing. If None, nothing is written to disk.
    Returns:
        the melspectrogram array, or None if the recording could not be
        processed because of a FileNotFoundError or EOFError
    """

    try:
        y, _ = librosa.load(os.path.join(audio_dir, species, file),
                            sr=sampling_rate)
        signal = pad_audio(y, duration, sampling_rate, noise_dir)

        # The audio must be passed by keyword: recent librosa releases made
        # the arguments of melspectrogram keyword-only.
        file_features = librosa.feature.melspectrogram(y=signal,
                                                       sr=sampling_rate,
                                                       n_fft=nfft,
                                                       hop_length=hop_length,
                                                       win_length=win_length,
                                                       window='hamming',
                                                       n_mels=num_mels)

        if melspectrogram_dir is not None:
            os.makedirs(melspectrogram_dir, exist_ok=True)
            np.save(os.path.join(melspectrogram_dir, name), file_features)
    except (FileNotFoundError, EOFError) as e:
        # Report the underlying error: it may concern the recording itself,
        # the noise directory or the output location.
        print('{} not found or unreadable: {}'.format(file, e))
        return None

    return file_features
    
       

Saving melspectrograms and annotation csv file.

In [None]:
audio_dir = './noiseless-xenocanto'
noise_dir = './noise'
melspectrogram_dir = './melspectrograms'
duration = 3

# Each sub-directory of audio_dir holds the recordings of one species.
# Sorted so that the numbering of the saved files is reproducible.
recordings = sorted(next(os.walk(audio_dir))[1])

labels_list = []
field_names = ['no.', 'name', 'label']
no = 0

for species in recordings:
    files = tqdm(sorted(os.listdir(os.path.join(audio_dir, species))))
    files.set_description("Processing %s files" % species)
    for indx, file in enumerate(files):
        name = species + str(indx) + '.npy'
        features_extraction(audio_dir,
                            name,
                            species,
                            file,
                            noise_dir,
                            sampling_rate,
                            duration,
                            nfft,
                            win_len,
                            hop_len,
                            num_mels=40,
                            melspectrogram_dir=melspectrogram_dir)

        # Only annotate recordings whose melspectrogram exists on disk, so
        # that labels.csv never points at a file that failed to be written.
        if not os.path.exists(os.path.join(melspectrogram_dir, name)):
            continue
        labels_list.append({'no.': no, 'name': name, 'label': species})
        no += 1

# newline='' lets the csv module control line endings (avoids blank rows
# between records on Windows).
with open('labels.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(labels_list)