In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import librosa


# Seed NumPy's legacy global random generator with a fixed value so that any
# np.random-based step executed after this cell is repeatable across runs.
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Input/output locations, relative to the notebook's working directory.
# Parquet table with one row per recording (presumably the cleaned training
# split, going by the file name -- TODO confirm).
input_file = "../data/cleaned/80_20_cleaned_train.parquet"
# Directory holding the audio files that the rows of the table refer to.
input_audio_dir = "../data/raw/audio/xeno_canto"
# NOTE(review): the directory name is spelled 'spectograms' (sic). It is a
# path on disk, so renaming it must be coordinated with whatever reads it.
output_spectrogram_dir = '../data/processed/bird-whisperer/spectograms'
# Create the output directory (and missing parents); no error if it exists.
os.makedirs(output_spectrogram_dir, exist_ok=True)

# Load the whole table into memory; later cells read it as `original_df`.
original_df = pd.read_parquet(input_file)

In [3]:
# Target sample rate in Hz (16 kHz).
SAMPLE_RATE = 16_000

In [5]:
# Decide how many augmented copies to create per original recording so that
# species with few recordings move toward a common target count.
target_recordings_per_species = 100
# Number of rows per distinct value of the 'en' column (used here as the
# species label).
species_counts = original_df['en'].value_counts()
# Per-species tally initialised to zero; it is not updated in this cell.
species_counts_after_aug = {species: 0 for species in species_counts.index}

augmentation_factors = {}
# The loop runs once per species, so the progress-bar total must be the number
# of distinct species, not the number of rows in the dataframe.
for species, count in tqdm(species_counts.items(), total=len(species_counts)):
    # Shortfall against the target; zero for species already at or above it.
    needed_augmentations = max(0, target_recordings_per_species - count)

    # Upper bound on copies per original recording: the fewer recordings a
    # species has, the more copies each one may contribute.
    # With the current target of 100, the `count > 100` tier never influences
    # the result, because such species have no shortfall and get factor 0.
    if count > 100:
        max_augment_per_sample = 1
    elif count > 50:
        max_augment_per_sample = 2
    elif count > 25:
        max_augment_per_sample = 5
    else:
        max_augment_per_sample = 10

    if needed_augmentations > 0:
        # Spread the shortfall evenly over the existing recordings, using at
        # least one copy each, then clamp to the tier's upper bound.
        augmentation_per_sample = max(1, needed_augmentations // count)
        augmentation_factors[species] = min(augmentation_per_sample, max_augment_per_sample)
    else:
        augmentation_factors[species] = 0

  1%|          | 360/66148 [00:00<00:00, 1015433.38it/s]


In [17]:
# Load each recording listed in the dataframe from disk at SAMPLE_RATE and trim
# leading/trailing silence. Files that are missing or cannot be decoded are
# reported and skipped so one bad file does not stop the whole pass.
for index, row in tqdm(original_df.iterrows(), total=original_df.shape[0]):
    # NOTE(review): development cap. This compares the dataframe's index
    # *label*, not the row position, so it only means "first 1001 rows" if the
    # index is a default 0..N-1 range -- confirm before relying on it. The
    # progress-bar total above still reflects the full dataframe.
    if index > 1000:
        break

    # Local files are named "<id>.<extension>", where the extension is whatever
    # follows the last "." in the original file name.
    original_audio_file_name = row["file-name"]
    file_extension = original_audio_file_name.split(".")[-1]
    new_audio_file_name = f"{row['id']}.{file_extension}"
    full_audio_file_path = os.path.join(input_audio_dir, new_audio_file_name)

    if not os.path.exists(full_audio_file_path):
        print(f"Audio file {new_audio_file_name} not found. Skipping.")
        continue

    try:
        # Passing sr=SAMPLE_RATE makes librosa resample during loading when
        # the file's native rate differs, replacing the separate
        # load-then-resample step.
        audio_content, _ = librosa.load(full_audio_file_path, sr=SAMPLE_RATE)

        # Trim silence from start and end (top_db is the threshold, in dB
        # below the reference level, under which audio counts as silence).
        audio_content, _ = librosa.effects.trim(audio_content, top_db=20)
    except Exception as e:
        # Deliberately broad: decoding can fail in many ways for corrupt
        # files, and the intent is to log and move on.
        print(f"Error loading audio file {new_audio_file_name}: {e}. Skipping.")
        continue

  1%|          | 339/66148 [00:14<34:23, 31.89it/s]Note: Illegal Audio-MPEG-Header 0x302f1700 at offset 427279.
Note: Trying to resync...
Note: Hit end of (available) data during resync.
  1%|          | 406/66148 [00:17<47:31, 23.06it/s]Note: Illegal Audio-MPEG-Header 0x4c595249 at offset 308571.
Note: Trying to resync...
Note: Hit end of (available) data during resync.
  1%|          | 431/66148 [00:18<47:41, 22.97it/s]Note: Illegal Audio-MPEG-Header 0x41525449 at offset 57804.
Note: Trying to resync...
Note: Hit end of (available) data during resync.
  1%|          | 510/66148 [00:21<45:40, 23.95it/s]Note: Illegal Audio-MPEG-Header 0x00000000 at offset 295618.
Note: Trying to resync...
Note: Hit end of (available) data during resync.
  1%|          | 664/66148 [00:27<41:05, 26.56it/s]Note: Illegal Audio-MPEG-Header 0x00004745 at offset 196776.
Note: Trying to resync...
Note: Hit end of (available) data during resync.
  1%|▏         | 908/66148 [00:36<34:59, 31.07it/s]Note: Illegal A