## Preamble

**Requires access to the BirdCLEF 2025 dataset**
- https://www.kaggle.com/competitions/birdclef-2025/data

The code was run on a Kaggle Jupyter server with the dataset attached.

In [7]:
import os
import shutil
import zipfile
from ast import literal_eval

import numpy as np
import pandas as pd

import librosa
import soundfile as sf
from IPython.display import Audio

import ipywidgets as widgets
from IPython.display import display, Audio, clear_output, HTML

import matplotlib.pyplot as plt
plt.rcParams['axes.xmargin'] = 0

## Make app

In [8]:
def zip(zip_filename, folder_to_zip):
    """Pack every file found under `folder_to_zip` into a deflate-compressed archive.

    Args:
        zip_filename: Path of the archive to create (opened in 'w' mode, so an
            existing file is overwritten).
        folder_to_zip: Directory that is walked recursively; each file is stored
            under its path relative to this directory.

    Note:
        This function shadows Python's builtin `zip` for the rest of the notebook.
        Only files are written, so directories that contain no files do not
        appear in the archive.
    """
    archive = zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED)
    with archive:
        for dirpath, _subdirs, names in os.walk(folder_to_zip):
            for name in names:
                source = os.path.join(dirpath, name)
                # Store entries relative to the zipped folder, not the full path.
                relative = os.path.relpath(source, start=folder_to_zip)
                archive.write(source, arcname=relative)

def save_audio(path, audio, sr):
    """Write `audio` to `path` via `sf.write`, creating the parent directory if needed.

    Args:
        path: Destination file path. May be a bare file name with no directory part.
        audio: Audio data, passed unchanged to `sf.write`.
        sr: Sample rate, passed unchanged to `sf.write`.
    """
    newdir = os.path.dirname(path)
    # `os.path.dirname` returns '' for a bare file name, and `os.makedirs('')`
    # raises FileNotFoundError, so only create a directory when there is one.
    if newdir:
        os.makedirs(newdir, exist_ok=True)
    sf.write(path, audio, sr)

In [9]:
def app_chunk(chunk, sr, start, filename, dir_, autoplay=False, label='Label', ctr=1):
    """Display one audio chunk with Save / Save None / Skip buttons.

    Each button optionally saves the chunk with `save_audio`, clears the cell
    output and then shows the next chunk by calling `app_chunk` again.

    The next chunk is pulled from the notebook-level iterators `chunks_`,
    `starting_points_` and `filenames_`, and the label list printed above the
    player is read from the notebook-level `df_K`; all of them must exist
    before this function is called.

    Args:
        chunk: Audio samples of the chunk to review.
        sr: Sample rate used for playback, saving and the seconds suffix.
        start: Offset of the chunk in samples; `start // sr` becomes the file suffix.
        filename: Source file name of the chunk, used as the key into `df_K`.
        dir_: Root output directory for saved chunks.
        autoplay: Whether the audio player starts playing immediately.
        label: Label under which "Save <Label>" stores the chunk.
        ctr: Running chunk number shown in the header line.
    """
    button_save = widgets.Button(description=f"Save {label.capitalize()}", button_style='success')
    button_none = widgets.Button(description="Save None", button_style='warning')
    button_skip = widgets.Button(description="Skip", button_style='info')
    output_box = widgets.Output()

    # Use os.path helpers rather than `split('/')[1]` / `[:-4]` so that names
    # without a directory part or with a non-4-character extension still work.
    basename = os.path.basename(filename)
    stem = os.path.splitext(basename)[0]
    # Offset in whole seconds keeps chunks of the same recording apart.
    suffix = f'_{start//sr}.ogg'

    # Files whose name does not already contain the label are filed under the label.
    if label not in filename:
        nfilename = label + '/' + basename
    else:
        nfilename = filename

    save_path = os.path.join(dir_, os.path.splitext(nfilename)[0] + suffix)
    none_path = os.path.join(dir_, 'none', stem + suffix)

    def show_next():
        """Clear the output and display the next chunk, or a notice when none is left."""
        clear_output(wait=True)
        try:
            next_chunk = next(chunks_)
            next_start = next(starting_points_)
            next_filename = next(filenames_)
        except StopIteration:
            # Without this guard the callback dies with an unhandled StopIteration.
            print("No more chunks to review.")
            return
        app_chunk(next_chunk, sr, next_start, next_filename, dir_, autoplay, label, ctr+1)

    def save_click(b):
        save_audio(save_path, chunk, sr)
        show_next()

    def none_click(b):
        save_audio(none_path, chunk, sr)
        show_next()

    def skip_click(b):
        show_next()

    button_save.on_click(save_click)
    button_none.on_click(none_click)
    button_skip.on_click(skip_click)

    ui = widgets.VBox([
        widgets.HBox([button_save, button_none, button_skip]),
        output_box
    ])

    with output_box:
        print(f"Chunk n. {ctr} -- Labels: {df_K.loc[filename]} -- file {filename} \n\n")
        display(Audio(chunk, rate=sr, autoplay=autoplay, normalize=False))
        print('\n')

    display(ui)

## Prepare data

In [10]:
### Choose class 0 -- 205 to select a species, ordered by frequency
# `topK` indexes the primary-label frequency ranking (0 = most frequent label).
topK = 0
# True: recordings are ordered by descending `rating`.
# False: recordings are shuffled with a fixed random seed instead.
sort_rating = True

In [11]:
# Training metadata; the species to review is the `topK`-th most frequent primary label.
df = pd.read_csv('/kaggle/input/birdclef-2025/train.csv')
class_ = df.primary_label.value_counts().index[topK]

# Index by file name, optionally putting the best-rated recordings first.
df_fn = df.set_index('filename', drop=True)
if sort_rating:
    df_fn = df_fn.sort_values('rating', ascending=False)
# From here on `df_fn` maps each file name to its list of labels (primary first).
df_fn = df_fn.apply(
    lambda row: [row.primary_label] + literal_eval(row.secondary_labels),
    axis=1,
)

# Keep only the recordings that mention the selected class at all.
has_class = df_fn.apply(lambda labels: class_ in labels)
df_K = df_fn[has_class]
if not sort_rating:
    df_K = df_K.sample(frac=1.0, replace=False, random_state=2025)

# Show the taxonomy row of the selected class as the cell output.
tax = pd.read_csv('/kaggle/input/birdclef-2025/taxonomy.csv')
tax[tax.primary_label==class_]

In [12]:
dir_ = '/kaggle/input/birdclef-2025/train_audio/'
# Soft cap: checked after each file, so the final count can exceed it by the
# number of chunks in the last file loaded.
MAX_CHUNKS = 200
SAMPLE_RATE = 16000   # resampling rate passed to librosa
CHUNK_SECONDS = 5     # length of each review chunk

# Defined before the loop so `sr` exists even when `df_K` is empty.
sr = SAMPLE_RATE
chunk_len = sr * CHUNK_SECONDS

chunks = []; starting_points = []; filenames = []
for filename in df_K.index:
    audio, sr = librosa.load(dir_+filename, sr=SAMPLE_RATE)
    if len(audio) >= chunk_len:
        # Start offsets of every complete chunk. The upper bound
        # `len(audio) - chunk_len + 1` keeps the last chunk when the recording
        # length is an exact multiple of the chunk length, and only drops a
        # trailing partial chunk.
        SP = list(range(0, len(audio) - chunk_len + 1, chunk_len))
        chunks.extend([audio[start:start+chunk_len] for start in SP])
        starting_points.extend(SP)
        filenames.extend([filename]*len(SP))
    if len(chunks) >= MAX_CHUNKS:
        break

## Run App

In [13]:
# 1-based number of the first chunk to review; raise it to resume a session.
first_chunk = 1
# Convert to a 0-based offset. Clamp at 0 because a negative slice start counts
# from the end of the list: `first_chunk = 0` would give `chunks[-1:]`, i.e.
# only the last chunk.
first_offset = max(first_chunk - 1, 0)
chunks_ = iter(chunks[first_offset:])
starting_points_ = iter(starting_points[first_offset:])
filenames_ = iter(filenames[first_offset:])
print(f"Class {class_}, {len(chunks)} chunks")

In [14]:
# Start the review loop with the first queued chunk; saved files go under 'train_chunks'.
app_chunk(
    next(chunks_),
    sr,
    next(starting_points_),
    next(filenames_),
    'train_chunks',
    autoplay=True,
    label=class_,
    ctr=first_chunk,
)

## Save

In [15]:
# Bundle everything saved under 'train_chunks' into one archive for download.
# `zip` here is the notebook's own helper, not the Python builtin.
zip(zip_filename="train_chunks.zip", folder_to_zip="train_chunks")

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>

<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br>