In [1]:
import os
import pandas as pd
from tqdm import tqdm

from utils.datasets.dataset_utils import get_dataloaders

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
original_train_df = pd.read_parquet("../data/cleaned/80_20_cleaned_train.parquet")
original_test_df = pd.read_parquet("../data/cleaned/80_20_cleaned_test.parquet")

augmented_train_df = pd.read_parquet("../data/processed/bird-whisperer/train.parquet")
augmented_test_df = pd.read_parquet("../data/processed/bird-whisperer/test.parquet")

Ensure that the number of species stayed the same before and after the augmentation process.

In [3]:
print(f"Number of original train species: {len(original_train_df['en'].unique())}")
print(f"Number of augmented train species: {len(augmented_train_df['species'].unique())}")
print()
print(f"Number of original test species: {len(original_test_df['en'].unique())}")
print(f"Number of augmented test species: {len(augmented_test_df['species'].unique())}")

assert len(original_train_df["en"].unique()) == len(augmented_train_df["species"].unique())

Number of original train species: 360
Number of augmented train species: 360

Number of original test species: 360
Number of augmented test species: 360


Instantiate the data loader

In [4]:
dataset_root = "../data/processed/bird-whisperer"
train_loader, test_loader, labels_unique = get_dataloaders(dataset_root)

In [5]:
labels_unique

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [6]:
print(f"Train dataset size: {train_loader.dataset.__len__()}")
print(f"Test dataset size: {test_loader.dataset.__len__()}")

Train dataset size: 134378
Test dataset size: 16538


Ensure that the first audio file in the train loader is the same as the first audio file in the original train df

In [7]:
(audio_file, label) = (train_loader.dataset.audio_files_path[0], train_loader.dataset.labels[0])
print(f"First audio file: {audio_file}")
print(f"First int label: {label}")
print(f"First label: {train_loader.dataset.label2bird_dict[label]}")

recording_id = audio_file.split("_")[0] if "_" in audio_file else audio_file.split(".")[0]
original_species = original_train_df.query(f"id == '{recording_id}'")["en"].values[0]

assert original_species == train_loader.dataset.label2bird_dict[label]

First audio file: 822605.pt
First int label: 118
First label: Eurasian Oystercatcher


Do this for all the entires in the train loader

In [10]:
for i in tqdm(range(len(train_loader.dataset.audio_files_path))):
  audio_file, label = train_loader.dataset.audio_files_path[i], train_loader.dataset.labels[i]
  recording_id = audio_file.split("_")[0] if "_" in audio_file else audio_file.split(".")[0]
  original_species = original_train_df.query(f"id == '{recording_id}'")["en"].values[0]

  assert original_species == train_loader.dataset.label2bird_dict[label]

100%|██████████| 134378/134378 [04:59<00:00, 448.90it/s]
