# A New Approach: Cutting Audio Files into 1-Second Snippets

- Cut each audio file into 1-second snippets.
- Label each snippet accordingly (whistle / no whistle).
- Explanation of the new approach: 🔗 [`new_cut_approach.pdf`](new_cut_approach.pdf)
  and the `cut_labels` function below

In [1]:
# --- File selection (adjust as necessary) ---------------------------------
# Upper bound on the number of files to process; None → all files.
max_file_count = None
# Stand-in for `max_file_count` in output file names when no limit is set.
none_replacement = 'all'

# --- Audio parameters -----------------------------------------------------
SAMPLE_RATE = 22050     # 22.05 kHz (default)
SNIPPET_DURATION = 1    # length of one snippet

# --- Labels ---------------------------------------------------------------
LABEL_NO_WHISTLE, LABEL_WHISTLE = -1, 1

# Compared against the whistle portion at a snippet border in `cut_labels`.
LABEL_THRESHOLD = 0.1

## Load Original Audio and Label Files

In [2]:
import os
import sys
sys.path.append(os.path.join(os.pardir, os.pardir))
from helper.load_data import load_data

In [3]:
# Resolve `src/data`, which lives two directory levels above the working directory.
cwd = os.getcwd()
src_dir = os.path.join(cwd, os.pardir, os.pardir)
data_dir = os.path.join(src_dir, 'data')

# Collect the data file paths via the project helper.
data_file_paths = load_data(data_dir)

In [4]:
# Never ask for more files than were actually found.
if max_file_count is not None and max_file_count > len(data_file_paths):
    max_file_count = len(data_file_paths)

In [5]:
# Silence warnings, but only when no warning options were given to the
# interpreter (an explicit user choice takes precedence).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter(action='ignore')

## Waveform Function

In [6]:
import librosa
import numpy as np

def get_waveform_and_cut(audio_file_path, snippet_duration=1):
    """Load an audio file and split it into equal-length snippets.

    The file is loaded as mono audio resampled to ``SAMPLE_RATE`` and cut
    into consecutive, non-overlapping snippets. Trailing samples that do not
    fill a complete snippet are discarded.

    Args:
        audio_file_path: Path of the audio file, passed to ``librosa.load``.
        snippet_duration: Length of one snippet in seconds.

    Returns:
        A 2-D array of shape ``(snippet_count, samples_per_snippet)``. When
        the audio is shorter than one snippet the shape is
        ``(0, samples_per_snippet)``.

    Raises:
        ValueError: If ``snippet_duration`` yields no samples per snippet.
    """
    waveform, _ = librosa.load(
        audio_file_path,
        sr=SAMPLE_RATE,
        mono=True)

    samples_per_snippet = int(snippet_duration * SAMPLE_RATE)
    if samples_per_snippet <= 0:
        raise ValueError(
            f'snippet_duration must cover at least one sample, got {snippet_duration!r}')

    # Integer division counts every complete snippet, including the last one
    # when the audio length is an exact multiple of the snippet length
    # (the previous range()-based loop dropped that final snippet).
    snippet_count = len(waveform) // samples_per_snippet
    usable_samples = snippet_count * samples_per_snippet

    return np.reshape(waveform[:usable_samples], (snippet_count, samples_per_snippet))

## Label Functions

In [7]:
def get_label(label_str):
    """Translate a label string into its numeric label.

    Returns ``LABEL_NO_WHISTLE`` when ``label_str`` equals ``'No_Whistle'``
    and ``LABEL_WHISTLE`` for every other value.
    """
    if label_str == 'No_Whistle':
        return LABEL_NO_WHISTLE
    return LABEL_WHISTLE

In [8]:
def log(msg, print_debug):
    """Print ``msg`` when ``print_debug`` is truthy; otherwise do nothing."""
    if not print_debug:
        return
    print(msg)

In [9]:
from math import floor
import pandas as pd

def cut_labels(label_file_path, len_audio_snippets, snippet_duration=1, print_debug=False):
    """Derive one label per audio snippet from a label CSV file.

    Every snippet starts as ``LABEL_NO_WHISTLE``. For each whistle row in the
    CSV (columns ``label``, ``start`` and ``end``; times in the same unit as
    ``snippet_duration``):

    * the snippet containing the whistle start is labelled as whistle when
      the time from the start to the end of that snippet is at least
      ``LABEL_THRESHOLD``; otherwise the snippet is scheduled for removal,
    * the snippet containing the whistle end is handled the same way, using
      the time from the beginning of that snippet to the whistle end,
    * every snippet strictly between those two is labelled as whistle.

    Snippet indices without a corresponding audio snippet (outside
    ``0 .. len_audio_snippets - 1``) are skipped instead of raising.

    Args:
        label_file_path: Path of the CSV file, read with ``pandas.read_csv``.
        len_audio_snippets: Number of audio snippets that need a label.
        snippet_duration: Length of one snippet; must match the value used
            to cut the audio so labels and snippets line up.
        print_debug: When true, progress messages are printed via ``log``.

    Returns:
        A tuple ``(labels, to_remove)``: ``labels`` is an array with
        ``len_audio_snippets`` entries and ``to_remove`` is a list of snippet
        indices (possibly containing duplicates) that should be dropped.
    """
    labels = [LABEL_NO_WHISTLE] * len_audio_snippets
    to_remove = []

    def has_snippet(idx):
        # Guards against IndexError and against negative indices silently
        # wrapping around to the end of the list.
        return 0 <= idx < len_audio_snippets

    label_df = pd.read_csv(label_file_path)
    for _, row in label_df.iterrows():
        if get_label(row['label']) == LABEL_NO_WHISTLE:
            continue

        # looking at whistle
        start = row['start']
        end = row['end']

        log(f'Whistle from {start:.3f} to {end:.3f}', print_debug)

        # Whistle portion between its start and the end of the start snippet,
        # and between the beginning of the end snippet and the whistle end.
        # NOTE(review): when a whistle starts and ends in the same snippet,
        # both values measure the distance to the snippet borders rather than
        # the whistle's own length - confirm against new_cut_approach.pdf.
        duration_start_sec = snippet_duration - round(start % snippet_duration, 3)
        duration_end_sec = round(end % snippet_duration, 3)

        start_idx = floor(start / snippet_duration)
        end_idx = floor(end / snippet_duration)

        for idx, covered in ((start_idx, duration_start_sec), (end_idx, duration_end_sec)):
            if not has_snippet(idx):
                log(f'skipped {idx}', print_debug)
            elif covered >= LABEL_THRESHOLD:
                # enough whistle in this snippet → whole snippet is whistle
                labels[idx] = LABEL_WHISTLE
                log(f'added to {idx}', print_debug)
            else:
                # whistle but too short in this snippet → remove the snippet
                to_remove.append(idx)
                log(f'removed {idx}', print_debug)

        # Snippets strictly between start and end are entirely covered by the
        # whistle; previously they were left labelled as "no whistle".
        for idx in range(start_idx + 1, end_idx):
            if has_snippet(idx):
                labels[idx] = LABEL_WHISTLE
                log(f'added to {idx}', print_debug)
        log('', print_debug)

    return np.array(labels), to_remove

### Debug print from `cut_labels`

```
Whistle from 144.352 to 144.559
added to 144
added to 144

Whistle from 357.588 to 357.948
added to 357
added to 357

Whistle from 504.216 to 504.668
added to 504
added to 504

Whistle from 581.950 to 582.118
removed 581
added to 582

Whistle from 611.612 to 611.736
added to 611
added to 611

Whistle from 745.607 to 745.667
added to 745
added to 745

Whistle from 746.250 to 746.306
added to 746
added to 746
```

## Cut Function

In [10]:
def cut(file_path, csv_path, snippet_duration=1):
    """Cut one audio file into labelled snippets.

    The audio is split with ``get_waveform_and_cut`` and labelled with
    ``cut_labels``, using the same ``snippet_duration`` for both so that
    snippets and labels stay aligned. Snippets that ``cut_labels`` reports
    as containing a too-short whistle portion are removed from both arrays.

    Args:
        file_path: Path of the audio file.
        csv_path: Path of the label CSV file belonging to the audio file.
        snippet_duration: Length of one snippet.

    Returns:
        A tuple ``(audio_snippets, labels)`` with matching first dimension.
    """
    audio_snippets = get_waveform_and_cut(file_path, snippet_duration=snippet_duration)
    labels, to_remove = cut_labels(
        csv_path,
        len(audio_snippets),
        snippet_duration=snippet_duration,  # was hard-coded to 1
        print_debug=False)

    # remove too short whistle data
    audio_snippets = np.delete(audio_snippets, to_remove, axis=0)
    labels = np.delete(labels, to_remove)

    return audio_snippets, labels

## Actual Cut

In [11]:
from tqdm import tqdm

# Accumulators for the snippets and labels of all processed files.
waveforms = []
labels = []

# Honour the optional file limit.
if max_file_count is None:
    paths = data_file_paths
else:
    paths = data_file_paths[:max_file_count]

for file in tqdm(paths, unit='file'):
    base, extension = os.path.splitext(file)
    csv_path = base + '.csv'

    # Only FLAC files that have a label file next to them are processed.
    if extension != '.flac' or not os.path.isfile(csv_path):
        continue

    audio, label = cut(file, csv_path, SNIPPET_DURATION)
    waveforms.extend(audio)
    labels.extend(label)

waveforms = np.array(waveforms)
labels = np.array(labels)

100%|██████████| 120/120 [02:13<00:00,  1.12s/file]


In [12]:
# Show the shapes of the snippet and label arrays, one per line.
print(waveforms.shape, labels.shape, sep='\n')

(39622, 22050)
(39622,)


## Save Arrays

In [13]:
# Build the file-name suffix with an explicit None check: `max_file_count or ...`
# would treat a limit of 0 as "all" and overwrite the full-dataset files.
file_suffix = none_replacement if max_file_count is None else max_file_count

np.save(f'waveform_new_cut_{file_suffix}.npy', waveforms)
np.save(f'labels_new_cut_{file_suffix}.npy', labels)