# Cut Audio Files in 1 Second Snippets

Cut audio files in 1 second snippets. Label correspondingly.

In [1]:
# adjust as necessary (None → all files)
max_file_count = None

SAMPLE_RATE = 22_050    # 22.05 kHz (default)

LABEL_NO_WHISTLE = -1
LABEL_WHISTLE = 1

In [2]:
import os
import sys
sys.path.append(os.path.join(os.pardir, os.pardir))
from helper.load_data import load_data

In [3]:
# determine path to `src/data`
cwd = os.getcwd()
src_dir = os.path.join(cwd, os.path.join(os.pardir, os.pardir))
data_dir = os.path.join(src_dir, 'data')

# load files
data_file_paths = load_data(data_dir)

In [4]:
if max_file_count is not None:
    max_file_count = min(max_file_count, len(data_file_paths))

In [5]:
import librosa
import numpy as np
np.set_printoptions(precision=4, suppress=True)
import pandas as pd
from tqdm import tqdm
from math import floor, ceil

# do not print warnings
if not sys.warnoptions:
    import warnings
    warnings.simplefilter('ignore')

In [6]:
def get_label(label_str):
    return LABEL_NO_WHISTLE if (label_str == 'No_Whistle') else LABEL_WHISTLE

- time frame of whistles are usually shorter than 1 second
    - observation: mostly less than 0.4 seconds
- if rounding `start` and `end` values, duration will often be 0 seconds
    - e.g. $(\text{start},~\text{end}) = (5.1,~5.4) \approx (5,~5)$ $\Rightarrow$ will be dismissed
- therefore, we introduce a `THRESHOLD` constant of 0.1 seconds
- see `round_start(time)` and `round_end(time)`

In [7]:
THRESHOLD = 0.1

def round_start(time):
    diff = round(time % 1, 4)
    if diff < 1 - THRESHOLD:
        # diff < 0.95
        return floor(time)
    else:
        # diff >= 0.05
        return floor(time)

def round_end(time):
    diff = round(time % 1, 4)
    if diff < THRESHOLD:
        # < 0.05
        return max(0, floor(time))
    else:
        # > 0.05
        return ceil(time)

In [8]:
real_whistle_count = 0

def cut(file_path, csv_path, snippet_duration = 1):
    global real_whistle_count
    
    label_df = pd.read_csv(csv_path)
    
    audio_snippets = []
    
    waveform, sr = librosa.load(
        file_path,
        sr=SAMPLE_RATE,
        mono=True)
    
    samples_per_snippet = int(snippet_duration * SAMPLE_RATE)
    
    for i in range(0, len(waveform) - samples_per_snippet, samples_per_snippet):
        start = i
        end = i + samples_per_snippet
        audio_snippets.append(waveform[start : end])
    
    label_snippets = [LABEL_NO_WHISTLE] * len(audio_snippets)
    
    label_df = pd.read_csv(csv_path)
    for index, row in label_df.iterrows():
        start = row['start']
        end = row['end']
        label = get_label(row['label'])
        
        if label == LABEL_WHISTLE:
            real_whistle_count += 1
            start = round_start(start)
            end = round_end(end)
        else:
            start = round(start)
            end = round(start)
        
        for sec in range(start, end, 1):
            if sec >= len(label_snippets):
                assert label != LABEL_WHISTLE, f'{file} in row {index}'
                break
            label_snippets[sec] = label
            
    return np.array(audio_snippets), np.array(label_snippets)

In [9]:
audio_data = []
labels = []

paths = data_file_paths[:max_file_count] if max_file_count is not None else data_file_paths

for file in tqdm(paths, unit='file'):
    base = os.path.splitext(file)[0]
    extension = os.path.splitext(file)[1]
    
    csv_path = base + '.csv'
    if extension == '.flac' and os.path.isfile(csv_path):
        audio, label = cut(file, csv_path)
        for a in audio:
            audio_data.append(a)
        for l in label:
            labels.append(l)

audio_data = np.array(audio_data)
labels = np.array(labels)

100%|██████████| 120/120 [01:59<00:00,  1.00file/s]


In [10]:
print(audio_data.shape)
print(labels.shape)

(39704, 22050)
(39704,)


In [11]:
print(labels[labels == LABEL_WHISTLE].shape)
print(real_whistle_count)

(408,)
355


du to rounding errors, we have more `whistle` labels in the snippet version than originally perceived

In [15]:
desc = max_file_count if max_file_count is not None else 'all'

np.save(f'cut_waveform_{desc}.npy', audio_data)
np.save(f'cut_labels_{desc}.npy', labels)