# Upload the BEANS enabirds recordings to HF

#### 1. Download the recordings the way BEANS does it on their [GitHub](https://github.com/earthspecies/beans)

First, we navigate into the mounted data_birdset folder so that the temporary files from the repo are downloaded there, and install wget & unzip, as they are not available in the university's bash environment.

In [1]:
# Move into the mounted BEANS data folder; the relative paths used by the
# later cells (data/enabirds/...) are resolved from this directory.
%cd '../../../data_birdset/beans'
!pwd
# Install the command-line tools that the download script below calls.
!sudo apt install wget
!sudo apt install unzip

/workspace/data_birdset/beans
/workspace/data_birdset/beans


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
wget is already the newest version (1.21.2-2ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unzip is already the newest version (6.0-26ubuntu3.2).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


Then we will run their script to download the metadata and recordings to have the same splits.

In [2]:
# Their script:
from collections import defaultdict
import json
from pathlib import Path
import pandas as pd
from plumbum import local, FG
import sys

import torch
import torchaudio
import math


def divide_waveform_to_chunks(path, target_dir, chunk_size, target_sample_rate):
    """Cut one audio file into consecutive chunks and save each as a WAV file.

    The loaded waveform is averaged over its first dimension (the channel
    axis of ``torchaudio.load``'s default output) so that a single channel
    remains, and is resampled when the file's sample rate differs from
    ``target_sample_rate``. Chunks are written to ``target_dir`` as
    ``<stem>.<NNN>.wav``, where ``NNN`` is the zero-padded chunk index.
    The last chunk holds whatever samples remain and may be shorter.

    Args:
        path: Path of the audio file to split.
        target_dir: Directory the chunk files are written to.
        chunk_size: Chunk length in seconds.
        target_sample_rate: Sample rate of the saved chunks.

    Returns:
        The paths of the written chunk files as strings, in chunk order.
    """
    audio, source_rate = torchaudio.load(path)
    # Collapse all channels into one while keeping a 2-D (1, samples) tensor.
    audio = torch.mean(audio, dim=0).unsqueeze(0)
    if source_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(source_rate, target_sample_rate)
        audio = resampler(audio)

    duration_secs = audio.shape[1] / target_sample_rate
    chunk_count = math.ceil(duration_secs / chunk_size)

    out_dir = Path(target_dir)
    stem = Path(path).stem
    written = []
    for idx in range(chunk_count):
        # Sample offsets of this chunk; slicing clips the final chunk to the
        # end of the recording.
        begin = int(idx * chunk_size * target_sample_rate)
        end = int((idx + 1) * chunk_size * target_sample_rate)
        out_path = out_dir / f'{stem}.{idx:03d}.wav'
        torchaudio.save(
            out_path,
            audio[:, begin:end],
            sample_rate=target_sample_rate
        )
        written.append(str(out_path))

    return written


def divide_annotation_to_chunks(annotations, chunk_size):
    """Distribute annotations over fixed-length chunks.

    Each annotation is a dict with at least ``'st'`` and ``'ed'`` (start and
    end, in seconds, relative to the whole recording). An annotation that
    spans several chunks is copied into every chunk it touches, with its
    start/end clipped to the chunk and expressed relative to the chunk start.
    All other keys are copied unchanged; the input dicts are not modified.

    Args:
        annotations: Iterable of annotation dicts.
        chunk_size: Chunk length in seconds.

    Returns:
        A ``defaultdict(list)`` mapping chunk index to the list of
        chunk-local annotation dicts.
    """
    per_chunk = defaultdict(list)
    for entry in annotations:
        start, end = entry['st'], entry['ed']     # in seconds
        first_idx = int(start // chunk_size)
        last_idx = int(end // chunk_size)

        for idx in range(first_idx, last_idx + 1):
            offset = idx * chunk_size
            # Only the first chunk keeps the real start and only the last
            # chunk keeps the real end; everything else is clipped to the
            # chunk borders.
            clipped_start = start - offset if idx == first_idx else 0
            clipped_end = end - offset if idx == last_idx else chunk_size
            per_chunk[idx].append(
                {**entry, 'st': clipped_start, 'ed': clipped_end}
            )

    return per_chunk


def get_wav_length_in_secs(path):
    """Return the duration of the audio file at ``path`` in seconds.

    The duration is derived from the metadata reported by
    ``torchaudio.info`` (frame count divided by sample rate).
    """
    metadata = torchaudio.info(path)
    frame_count, rate = metadata.num_frames, metadata.sample_rate
    return frame_count / rate


# Length of the audio chunks each recording is cut into.
CHUNK_SIZE = 60     # in seconds
# Sample rate every chunk is resampled to before it is saved.
TARGET_SAMPLE_RATE = 32_000

# Create the output folder for the chunked WAV files (-p: no error if it exists).
local['mkdir']['-p', 'data/enabirds/wav']()
# Download and unpack the recordings. `& FG` runs the command in the
# foreground, so its output shows up in the notebook.
local['wget']['https://datadryad.org/stash/downloads/file_stream/641808', '-O', 'data/enabirds/wav_Files.zip'] & FG
local['unzip']['data/enabirds/wav_Files.zip', '-d', 'data/enabirds/'] & FG

# Download and unpack the annotation (selection table) files into the same folder.
local['wget']['https://datadryad.org/stash/downloads/file_stream/641805', '-O', 'data/enabirds/annotation_Files.zip'] & FG
local['unzip']['data/enabirds/annotation_Files.zip', '-d', 'data/enabirds/'] & FG

def get_split(chunk_id, total_num_chunks):
    """Pick the dataset split for a chunk from its relative position.

    The position ``chunk_id / total_num_chunks`` is compared against fixed
    thresholds: below 0.12 -> ``'train-low'``, below 0.6 -> ``'train'``,
    below 0.8 -> ``'valid'``, otherwise ``'test'``.

    Args:
        chunk_id: Zero-based index of the chunk within its recording.
        total_num_chunks: Number of chunks of that recording (non-zero).

    Returns:
        The split name as a string.
    """
    position = chunk_id / total_num_chunks
    # Upper bounds are exclusive and checked in ascending order.
    boundaries = ((.12, 'train-low'), (.6, 'train'), (.8, 'valid'))
    for upper_bound, split_name in boundaries:
        if position < upper_bound:
            return split_name
    return 'test'

# Maps split name -> list of chunk records ('path', 'length', 'annotations').
datasets = defaultdict(list)

# Process every downloaded recording in sorted order.
for wav_path in sorted(Path('data/enabirds/').glob('Recording_?/*.wav')):
    print(f'Converting {wav_path} ...', file=sys.stderr)

    # Cut the recording into CHUNK_SIZE-second WAV files under data/enabirds/wav.
    target_paths = divide_waveform_to_chunks(
        path=wav_path,
        target_dir='data/enabirds/wav',
        chunk_size=CHUNK_SIZE,
        target_sample_rate=TARGET_SAMPLE_RATE
    )

    # The tab-separated selection table sits next to the recording and shares its stem.
    df = pd.read_csv(str(wav_path.parent / wav_path.stem) + '.Table.1.selections.txt', sep='\t')

    # Keep only begin time, end time and species label of every selection.
    annotations = []
    for _, row in df.iterrows():
        st, ed = row['Begin Time (s)'], row['End Time (s)']
        annotations.append({'st': st, 'ed': ed, 'label': row['Species']})

    # Group the annotations by chunk index, with times relative to the chunk start.
    chunks = divide_annotation_to_chunks(
        annotations=annotations,
        chunk_size=CHUNK_SIZE)

    # Assign each chunk to a split based on its position within this recording.
    for chunk, path in enumerate(target_paths):
        split = get_split(chunk, len(target_paths))
        datasets[split].append({
            'path': path,
            'length': get_wav_length_in_secs(path),
            # `chunks` is a defaultdict, so chunks without annotations get [].
            'annotations': chunks[chunk]
        })

# Write one JSON-lines file per split (one chunk record per line).
for split in ['train', 'train-low', 'valid', 'test']:
    with open(f'data/enabirds/{split}.jsonl', mode='w') as f:
        if split == 'train':    # 'train' = 'train' + 'train-low'
            for data in datasets['train-low']:
                print(json.dumps(data), file=f)
        for data in datasets[split]:
            print(json.dumps(data), file=f)

--2024-04-06 21:48:03--  https://datadryad.org/stash/downloads/file_stream/641808
Resolving datadryad.org (datadryad.org)... 52.37.27.175, 34.215.3.207, 35.160.180.199, ...
Connecting to datadryad.org (datadryad.org)|52.37.27.175|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://dryad-assetstore-merritt-west.s3.us-west-2.amazonaws.com/ark%3A/13030/m5799nzg%7C5%7Cproducer/wav_Files.zip?response-content-disposition=attachment%3B%20filename%3Dwav_Files.zip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIA2KERHV5E3OITXZXC%2F20240406%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20240406T214805Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=2257e81f7d851fdecf29f1c085134476dc42f3b25f3ade2bb788fab2798220e8 [following]
--2024-04-06 21:48:04--  https://dryad-assetstore-merritt-west.s3.us-west-2.amazonaws.com/ark%3A/13030/m5799nzg%7C5%7Cproducer/wav_Files.zip?response-content-disposition=attachment%3B%20filename%3Dwav_Files.zip&X-Amz-Algorith

Archive:  data/enabirds/wav_Files.zip
  inflating: data/enabirds/Recording_1/Recording_1_Segment_01.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_02.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_03.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_04.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_05.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_06.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_07.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_08.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_09.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_10.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_11.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_12.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_13.wav  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_14.wav  
  inflatin

--2024-04-06 21:49:56--  https://datadryad.org/stash/downloads/file_stream/641805
Resolving datadryad.org (datadryad.org)... 52.37.27.175, 35.160.180.199, 34.215.3.207, ...
Connecting to datadryad.org (datadryad.org)|52.37.27.175|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://dryad-assetstore-merritt-west.s3.us-west-2.amazonaws.com/ark%3A/13030/m5799nzg%7C1%7Cproducer/annotation_Files.zip?response-content-disposition=attachment%3B%20filename%3Dannotation_Files.zip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIA2KERHV5E3OITXZXC%2F20240406%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20240406T214957Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=8c10472c781da3139210508d90aab396a6020ebecfdc242dca3e5d4d69ced5ff [following]
--2024-04-06 21:49:57--  https://dryad-assetstore-merritt-west.s3.us-west-2.amazonaws.com/ark%3A/13030/m5799nzg%7C1%7Cproducer/annotation_Files.zip?response-content-disposition=attachment%3B%20filename%3Dannotat

Archive:  data/enabirds/annotation_Files.zip
  inflating: data/enabirds/Recording_1/Recording_1_Segment_01.Table.1.selections.txt  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_02.Table.1.selections.txt  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_03.Table.1.selections.txt  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_04.Table.1.selections.txt  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_05.Table.1.selections.txt  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_06.Table.1.selections.txt  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_07.Table.1.selections.txt  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_08.Table.1.selections.txt  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_09.Table.1.selections.txt  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_10.Table.1.selections.txt  
  inflating: data/enabirds/Recording_1/Recording_1_Segment_11.Table.1.selections.txt 

Converting data/enabirds/Recording_1/Recording_1_Segment_02.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_03.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_04.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_05.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_06.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_07.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_08.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_09.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_10.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_11.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_12.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_13.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_14.wav ...
Converting data/enabirds/Recording_1/Recording_1_Segment_15.wav ...
Converting data/enabirds/Recording_1/Recording_1

#### 2. Convert to HF format

In [3]:
from datasets import Dataset, Audio
import pandas as pd
import jsonlines

def load_dataset(split_name):
    """Load one split's JSONL file as a Hugging Face ``Dataset``.

    Reads ``data/enabirds/<split_name>.jsonl``, builds a dataset from its
    records and casts the ``'path'`` column to the ``Audio`` feature.

    Args:
        split_name: Name of the split file to load (without ``.jsonl``).

    Returns:
        The resulting ``Dataset``.
    """
    jsonl_path = f'data/enabirds/{split_name}.jsonl'
    with jsonlines.open(jsonl_path, 'r') as reader:
        records = list(reader)
    frame = pd.DataFrame(records)
    return Dataset.from_pandas(frame).cast_column('path', Audio())

# Load every split file written by the download step.
splits = ['train', 'train-low', 'valid', 'test']
datasets = {split: load_dataset(split) for split in splits}
# pop + re-insert also moves this split to the end of the dict.
datasets['train_low'] = datasets.pop('train-low') # Rename split from train-low to train_low as HF does not accept -
# Print the first example of each split as a quick sanity check.
for split, dataset in datasets.items():
    # NOTE(review): the 'data/dogs' path below looks like a leftover from
    # another notebook — confirm the target folder before enabling this line.
    #dataset.save_to_disk(f'data/dogs/{split}')
    print(dataset[0])



{'path': {'path': 'data/enabirds/wav/Recording_1_Segment_01.000.wav', 'array': array([ 0.00021362,  0.0015564 , -0.00015259, ...,  0.00085449,
       -0.00286865, -0.00134277]), 'sampling_rate': 32000}, 'length': 60.0, 'annotations': []}
{'path': {'path': 'data/enabirds/wav/Recording_1_Segment_01.003.wav', 'array': array([-0.00064087, -0.00317383, -0.00048828, ...,  0.00128174,
        0.00180054,  0.00073242]), 'sampling_rate': 32000}, 'length': 60.0, 'annotations': []}
{'path': {'path': 'data/enabirds/wav/Recording_1_Segment_01.004.wav', 'array': array([-0.00039673,  0.0012207 ,  0.00064087, ..., -0.00265503,
       -0.00259399, -0.00054932]), 'sampling_rate': 32000}, 'length': 60.0, 'annotations': []}
{'path': {'path': 'data/enabirds/wav/Recording_1_Segment_01.000.wav', 'array': array([ 0.00021362,  0.0015564 , -0.00015259, ...,  0.00085449,
       -0.00286865, -0.00134277]), 'sampling_rate': 32000}, 'length': 60.0, 'annotations': []}


#### 3. Upload the datasets to HF

In [4]:
# Push every split to the Hub repository under its own split name.
for split, dataset in datasets.items():
    dataset.push_to_hub('DBD-research-group/beans_enabirds', split=split)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/115 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/460 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/566 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/667 [00:00<?, ?B/s]

---
# Download the dataset from HF

Download all splits from the Hub. Note that even when a single split is requested, all splits are still downloaded! Use `streaming=True` and `cache_dir='...'` for shorter loading times.

In [5]:
from datasets import load_dataset, DatasetDict

# Alternative: request a single split only.
#dataset = load_dataset(path='DBD-research-group/beans_enabirds', split='test')
# Load all splits of the 'default' configuration from the Hub repository.
dataset: DatasetDict = load_dataset(name='default', path='DBD-research-group/beans_enabirds')

Now you can interact with the dataset. 

In [8]:
# Print the number of samples in the train split, then show its first example.
print(f"Number of samples: {len(dataset['train'])}")
dataset['train'][0]
# The 'array' entry inside 'path' contains the audio data.

Number of samples: 702


{'path': {'path': '2015-09-04_08-04-59_unit03.000.wav',
  'array': array([-0.00088501, -0.01126099, -0.00408936, ..., -0.00488281,
         -0.00430298,  0.00384521]),
  'sampling_rate': 16000},
 'length': 60.0,
 'annotations': [{'ed': 30.857, 'label': 'OVEN', 'st': 30.707},
  {'ed': 45.624, 'label': 'SWTH', 'st': 45.474},
  {'ed': 59.246, 'label': 'OVEN', 'st': 59.096}]}