# Generate FFT Feature Matrix and Labels

In [8]:
# adjust as necessary: upper bound on how many entries of the data file list
# are processed by this notebook
max_file_count = 25

## Read Data

In [9]:
import os
import sys
# add the directory two levels up (relative path) to the module search path;
# presumably that is where the project-local `helper` package lives
sys.path.append(os.path.join(os.pardir, os.pardir))
from helper.load_data import load_data

In [10]:
# locate `src/data`: two directory levels above the current working directory
cwd = os.getcwd()
src_dir = os.path.join(cwd, os.pardir, os.pardir)
data_dir = os.path.join(src_dir, 'data')

# hand the data directory to the project helper; its result is used below
# as a sequence of file paths
data_file_paths = load_data(data_dir)

In [11]:
# never ask for more files than are actually available
available_file_count = len(data_file_paths)
if max_file_count > available_file_count:
    max_file_count = available_file_count

## Feature Matrix and Labels

In [12]:
import librosa
import numpy as np
np.set_printoptions(precision=4, suppress=True)
import pandas as pd
from scipy.fftpack import fft
from tqdm import tqdm

# silence every warning, unless warning options were passed to the
# interpreter explicitly (in which case those are respected)
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter('ignore')

In [13]:
def get_feature(waveform, sample_rate):
    """Return the FFT of `waveform` as computed by `scipy.fftpack.fft`.

    `sample_rate` is accepted for interface symmetry but is not used by
    this implementation.
    """
    spectrum = fft(waveform)
    return spectrum

def get_label(label_str):
    """Map a textual label to a numeric class.

    Returns -1 when `label_str` equals 'No_Whistle' and 1 for anything else.
    """
    if label_str == 'No_Whistle':
        return -1
    return 1

In [14]:
# Collect one [waveform, sample_rate] entry and one numeric label per labelled
# segment. `audio_data[i]` and `labels[i]` always describe the same segment.
audio_data = []
labels = []

for file in tqdm(data_file_paths[:max_file_count], unit='file'):
    base, extension = os.path.splitext(file)
    label_path = base + '.csv'

    # only FLAC recordings that have a CSV file next to them are used
    if extension != '.flac' or not os.path.isfile(label_path):
        continue

    label_df = pd.read_csv(label_path)

    for _, row in label_df.iterrows():
        # a negative start is clamped to the beginning of the recording
        start = max(0, row['start'])
        end = row['end']
        duration = end - start

        # `not duration > 0` rejects empty and negative segments and, unlike
        # `duration <= 0`, also rejects NaN (e.g. a missing `end` value),
        # which must not be handed to `librosa.load`
        if not duration > 0:
            continue

        waveform, sample_rate = librosa.load(
            file,
            sr=None,    # do not resample file
            mono=True,
            offset=start,
            duration=duration)

        audio_data.append([waveform, sample_rate])
        labels.append(get_label(row['label']))

100%|██████████| 25/25 [00:19<00:00,  1.28file/s]


In [15]:
# length of the longest loaded waveform; 0 when nothing was loaded
max_length = max((len(entry[0]) for entry in audio_data), default=0)
print(max_length)

9837074


In [16]:
feature_matrix = []

for waveform, sample_rate in tqdm(audio_data, unit='file'):
    missing = max_length - len(waveform)

    # zero-pad at the end so that every segment has `max_length` samples
    if missing > 0:
        padded_audio = np.pad(waveform, (0, missing), mode='constant')
    else:
        padded_audio = waveform

    feature_matrix.append(get_feature(padded_audio, sample_rate))

100%|██████████| 202/202 [05:55<00:00,  1.76s/file]


In [17]:
# Convert the collected features and labels to NumPy arrays.
# No `.squeeze()` here: for several equal-length rows it changes nothing,
# but it would collapse a single-segment matrix of shape (1, n) to a 1-D
# array of shape (n,), so the row axis would be lost.
feature_matrix = np.array(feature_matrix)
labels = np.array(labels)

In [18]:
# write both arrays to the working directory; the file count is part of each name
features_file = f'fft_feature_{max_file_count}.npy'
labels_file = f'labels_{max_file_count}.npy'

np.save(features_file, feature_matrix)
np.save(labels_file, labels)