# Upload the BEANS dcase recordings to HF

#### 1. Download the recordings the way BEANS does it on their [GitHub](https://github.com/earthspecies/beans)

First, we navigate into the mounted `data_birdset` folder, where the temporary files from their repo will be downloaded, and install `wget` and `unzip`, as they are not available in the university's bash environment.

In [1]:
%cd '../../../../data_birdset/beans'
!pwd
!sudo apt install wget
!sudo apt install unzip

/workspace/data_birdset/beans
/workspace/data_birdset/beans


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
wget is already the newest version (1.21.2-2ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unzip is already the newest version (6.0-26ubuntu3.2).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


Then we will run their script to download the metadata and recordings to have the same splits.

In [7]:
# Their script:
from collections import defaultdict
import json
import sys
from pathlib import Path
import torch
import torchaudio
import math

import pandas as pd
from plumbum import local, FG

def divide_waveform_to_chunks(path, target_dir, chunk_size, target_sample_rate):
    """Split one recording into consecutive fixed-length WAV chunks.

    The audio at ``path`` is loaded, averaged over dimension 0 into a single
    channel, resampled to ``target_sample_rate`` if the file uses another
    rate, and then written to ``target_dir`` as ``<stem>.<NNN>.wav`` files
    covering ``chunk_size`` seconds each. The last chunk holds whatever
    samples remain.

    Returns the written file paths as strings, in chunk order.
    """
    audio, source_rate = torchaudio.load(path)
    # Collapse dimension 0 by averaging, then restore it with size 1.
    audio = torch.mean(audio, dim=0).unsqueeze(0)
    if source_rate != target_sample_rate:
        audio = torchaudio.transforms.Resample(source_rate, target_sample_rate)(audio)

    duration_secs = audio.shape[1] / target_sample_rate
    stem = Path(path).stem
    out_dir = Path(target_dir)

    written = []
    for index in range(math.ceil(duration_secs / chunk_size)):
        out_path = out_dir / f'{stem}.{index:03d}.wav'
        # Sample offsets of this chunk; slicing past the end is harmless.
        first_sample = int(index * chunk_size * target_sample_rate)
        last_sample = int((index + 1) * chunk_size * target_sample_rate)
        torchaudio.save(
            out_path,
            audio[:, first_sample:last_sample],
            sample_rate=target_sample_rate
        )
        written.append(str(out_path))

    return written


def divide_annotation_to_chunks(annotations, chunk_size):
    """Distribute annotations over fixed-length chunks.

    Each annotation is a dict with ``'st'`` and ``'ed'`` times in seconds.
    An annotation is copied into every chunk it touches, with its times
    rewritten relative to that chunk's start and clipped to
    ``[0, chunk_size]``.

    Returns a ``defaultdict(list)`` mapping chunk index to the annotation
    copies that fall into that chunk.
    """
    per_chunk = defaultdict(list)
    for event in annotations:
        start, end = event['st'], event['ed']     # in seconds
        first_chunk = int(start // chunk_size)
        last_chunk = int(end // chunk_size)

        for index in range(first_chunk, last_chunk + 1):
            offset = index * chunk_size
            # Only the first/last touched chunk keeps the real boundary;
            # every other edge is clipped to the chunk limits.
            clipped_start = start - offset if index == first_chunk else 0
            clipped_end = end - offset if index == last_chunk else chunk_size

            piece = dict(event)
            piece['st'], piece['ed'] = clipped_start, clipped_end
            per_chunk[index].append(piece)

    return per_chunk


def get_wav_length_in_secs(path):
    """Return the duration of the audio file at ``path`` in seconds.

    The value is the frame count divided by the sample rate, both taken from
    the file's metadata.
    """
    metadata = torchaudio.info(path)
    frames, rate = metadata.num_frames, metadata.sample_rate
    return frames / rate


# Length of every audio chunk; also used to window the annotations.
CHUNK_SIZE = 60     # in seconds
# Sample rate passed to the chunking step as the resampling target.
TARGET_SAMPLE_RATE = 16_000

def get_split(chunk_id, total_num_chunks):
    """Pick the split for a chunk from its relative position in a recording.

    With ``r = chunk_id / total_num_chunks``: ``r < 0.12`` gives
    ``'train-low'``, ``r < 0.6`` gives ``'train'``, ``r < 0.8`` gives
    ``'valid'`` and anything else gives ``'test'``.
    """
    position = chunk_id / total_num_chunks
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, split_name in ((.12, 'train-low'), (.6, 'train'), (.8, 'valid')):
        if position < upper_bound:
            return split_name
    return 'test'

# Create the folder that will receive the chunked WAV files (-p also creates
# missing parent folders).
local['mkdir']['-p', 'data/dcase/wav']()
# Download the development set archive from Zenodo. `& FG` runs the command
# in the foreground, so wget's output shows up in the cell output.
local['wget']['https://zenodo.org/record/5412896/files/Development_Set.zip?download=1', '-O', 'data/dcase/Development_set.zip'] & FG
# Unpack the archive into data/dcase/.
local['unzip']['data/dcase/Development_set.zip', '-d', 'data/dcase/'] & FG

# One list of chunk records per split name, filled while walking the recordings.
datasets = defaultdict(list)

# These species have very few annotations and would result in zero samples in
# either the train or the test set after the split, so they are left out.
excluded_species = {'AGGM', 'SOCM'}

for wav_path in sorted(Path('data/dcase/Development_Set/').glob('**/*.wav')):
    # The annotation table sits next to the recording under the same name.
    csv_path = wav_path.with_suffix('.csv')
    print(f'Converting {wav_path} and {csv_path} ...', file=sys.stderr)

    target_paths = divide_waveform_to_chunks(
        path=wav_path,
        target_dir='data/dcase/wav/',
        chunk_size=CHUNK_SIZE,
        target_sample_rate=TARGET_SAMPLE_RATE,
    )
    total_chunks = len(target_paths)
    print(f'num_chunks = {total_chunks}', file=sys.stderr)

    annotations = []
    for _, row in pd.read_csv(csv_path).iterrows():
        st, ed = row['Starttime'], row['Endtime']

        # Everything from the fourth column on is read as (species, label);
        # only 'POS' labels of non-excluded species become annotations.
        for species, label in row.iloc[3:].items():
            if label != 'POS' or species in excluded_species:
                continue
            annotations.append({'st': st, 'ed': ed, 'label': species})

    chunks = divide_annotation_to_chunks(
        annotations=annotations,
        chunk_size=CHUNK_SIZE,
    )

    # The split of a chunk depends on its position within its own recording.
    for chunk, path in enumerate(target_paths):
        datasets[get_split(chunk, total_chunks)].append({
            'path': path,
            'length': get_wav_length_in_secs(path),
            'annotations': chunks[chunk],
        })

# Write one JSON-lines file per split, one chunk record per line.
for split in ['train', 'train-low', 'valid', 'test']:
    if split == 'train':    # 'train' = 'train' + 'train-low'
        records = datasets['train-low'] + datasets[split]
    else:
        records = datasets[split]

    with open(f'data/dcase/{split}.jsonl', mode='w') as f:
        for data in records:
            print(json.dumps(data), file=f)

--2024-04-06 17:48:23--  https://zenodo.org/record/5412896/files/Development_Set.zip?download=1
Resolving zenodo.org (zenodo.org)... 188.184.103.159, 188.185.79.172, 188.184.98.238, ...
Connecting to zenodo.org (zenodo.org)|188.184.103.159|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/5412896/files/Development_Set.zip [following]
--2024-04-06 17:48:23--  https://zenodo.org/records/5412896/files/Development_Set.zip
Reusing existing connection to zenodo.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 1922049816 (1.8G) [application/octet-stream]
Saving to: ‘data/dcase/Development_set.zip’

     0K .......... .......... .......... .......... ..........  0%  665K 47m2s
    50K .......... .......... .......... .......... ..........  0% 65.0M 23m45s
   100K .......... .......... .......... .......... ..........  0% 1.63M 22m4s
   150K .......... .......... .......... .......... ..........  0% 1.74M 20m57s
   200K ..........

Archive:  data/dcase/Development_set.zip
   creating: data/dcase/Development_Set/
   creating: data/dcase/Development_Set/Training_Set/
   creating: data/dcase/Development_Set/Training_Set/BV/
  inflating: data/dcase/Development_Set/Training_Set/BV/2015-09-04_08-04-59_unit03.csv  
  inflating: data/dcase/Development_Set/Training_Set/BV/2015-09-04_08-04-59_unit03.wav  

.... .......... .......... 99% 43.4M 0s
1875800K .......... .......... .......... .......... .......... 99% 35.7M 0s
1875850K .......... .......... .......... .......... .......... 99% 86.2M 0s
1875900K .......... .......... .......... .......... .......... 99%  108M 0s
1875950K .......... .......... .......... .......... .......... 99% 49.2M 0s
1876000K .......... .......... .......... .......... .......... 99% 44.8M 0s
1876050K .......... .......... .......... .......... .......... 99% 52.1M 0s
1876100K .......... .......... .......... .......... .......... 99% 37.5M 0s
1876150K .......... .......... .......... .......... .......... 99% 82.6M 0s
1876200K .......... .......... .......... .......... .......... 99% 94.3M 0s
1876250K .......... .......... .......... .......... .......... 99% 73.4M 0s
1876300K .......... .......... .......... .......... .......... 99% 42.0M 0s
1876350K .......... .......... .......... .......... .......... 99% 84.9M 0s
1876400K .......... .......... .....


  inflating: data/dcase/Development_Set/Training_Set/BV/2015-09-11_06-00-00_unit07.csv  
  inflating: data/dcase/Development_Set/Training_Set/BV/2015-09-11_06-00-00_unit07.wav  
  inflating: data/dcase/Development_Set/Training_Set/BV/2015-09-21_06-00-00_unit05.csv  
  inflating: data/dcase/Development_Set/Training_Set/BV/2015-09-21_06-00-00_unit05.wav  
  inflating: data/dcase/Development_Set/Training_Set/BV/2015-09-25_04-00-00_unit10.csv  
  inflating: data/dcase/Development_Set/Training_Set/BV/2015-09-25_04-00-00_unit10.wav  
  inflating: data/dcase/Development_Set/Training_Set/BV/2015-10-14_23-59-59_unit05.csv  
  inflating: data/dcase/Development_Set/Training_Set/BV/2015-10-14_23-59-59_unit05.wav  
   creating: data/dcase/Development_Set/Training_Set/HT/
  inflating: data/dcase/Development_Set/Training_Set/HT/e1.csv  
  inflating: data/dcase/Development_Set/Training_Set/HT/e1.wav  
  inflating: data/dcase/Development_Set/Training_Set/HT/h1.csv  
  inflating: data/dcase/Development

Converting data/dcase/Development_Set/Training_Set/BV/2015-09-04_08-04-59_unit03.wav and data/dcase/Development_Set/Training_Set/BV/2015-09-04_08-04-59_unit03.csv ...
num_chunks = 120
Converting data/dcase/Development_Set/Training_Set/BV/2015-09-11_06-00-00_unit07.wav and data/dcase/Development_Set/Training_Set/BV/2015-09-11_06-00-00_unit07.csv ...
num_chunks = 120
Converting data/dcase/Development_Set/Training_Set/BV/2015-09-21_06-00-00_unit05.wav and data/dcase/Development_Set/Training_Set/BV/2015-09-21_06-00-00_unit05.csv ...
num_chunks = 120
Converting data/dcase/Development_Set/Training_Set/BV/2015-09-25_04-00-00_unit10.wav and data/dcase/Development_Set/Training_Set/BV/2015-09-25_04-00-00_unit10.csv ...
num_chunks = 120
Converting data/dcase/Development_Set/Training_Set/BV/2015-10-14_23-59-59_unit05.wav and data/dcase/Development_Set/Training_Set/BV/2015-10-14_23-59-59_unit05.csv ...
num_chunks = 120
Converting data/dcase/Development_Set/Training_Set/HT/e1.wav and data/dcase/Deve

#### 2. Convert to HF format

In [9]:
from datasets import Dataset, Audio
import pandas as pd
import jsonlines

def load_dataset(split_name):
    """Load ``data/dcase/<split_name>.jsonl`` as a Hugging Face ``Dataset``.

    All records of the file are read into a DataFrame, converted to a
    ``Dataset``, and the ``'path'`` column is cast to the ``Audio`` feature.
    """
    jsonl_path = f'data/dcase/{split_name}.jsonl'
    with jsonlines.open(jsonl_path, 'r') as reader:
        records = list(reader)
    frame = pd.DataFrame(records)
    return Dataset.from_pandas(frame).cast_column('path', Audio())

splits = ['train', 'train-low', 'valid', 'test']
datasets = {}
for split in splits:
    datasets[split] = load_dataset(split)

# Rename split from train-low to train_low as HF does not accept -
# (the renamed split ends up last in the dict).
datasets['train_low'] = datasets.pop('train-low')

# Show the first example of every split as a sanity check.
for dataset in datasets.values():
    print(dataset[0])



{'path': {'path': 'data/dcase/wav/2015-09-04_08-04-59_unit03.000.wav', 'array': array([-0.00088501, -0.01126099, -0.00408936, ..., -0.00488281,
       -0.00430298,  0.00384521]), 'sampling_rate': 16000}, 'length': 60.0, 'annotations': [{'ed': 30.857, 'label': 'OVEN', 'st': 30.707}, {'ed': 45.624, 'label': 'SWTH', 'st': 45.474}, {'ed': 59.246, 'label': 'OVEN', 'st': 59.096}]}
{'path': {'path': 'data/dcase/wav/2015-09-04_08-04-59_unit03.072.wav', 'array': array([ 0.00228882, -0.00302124,  0.00704956, ..., -0.00802612,
       -0.00085449,  0.00527954]), 'sampling_rate': 16000}, 'length': 60.0, 'annotations': [{'ed': 2.5339999999996508, 'label': 'SWTH', 'st': 2.3840000000000146}, {'ed': 45.22900000000027, 'label': 'OVEN', 'st': 45.07899999999972}]}
{'path': {'path': 'data/dcase/wav/2015-09-04_08-04-59_unit03.096.wav', 'array': array([-0.00421143,  0.0007019 , -0.003479  , ...,  0.00302124,
        0.00247192, -0.00302124]), 'sampling_rate': 16000}, 'length': 60.0, 'annotations': []}
{'path

#### 3. Upload the datasets to HF

In [10]:
# Push every split to the same Hub repository under its own split name.
hub_repo = 'DBD-research-group/beans_dcase'
for split_name, split_dataset in datasets.items():
    split_dataset.push_to_hub(hub_repo, split=split_name)

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/463 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/568 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/670 [00:00<?, ?B/s]

---
# Download the dataset from HF

Download all splits from the hub. Even when specifying a specific split it still downloads everything! Use `streaming=True` and `cache_dir='...'` for shorter loading times.

In [5]:
from datasets import load_dataset, DatasetDict

# Single-split alternative:
# dataset = load_dataset(path='DBD-research-group/beans_dcase', split='test')
# Load all splits of the 'default' configuration at once.
dataset: DatasetDict = load_dataset(
    path='DBD-research-group/beans_dcase',
    name='default',
)

Now you can interact with the dataset. 

In [8]:
# print number of samples and number of distinct classes
print(f"Number of samples: {len(dataset['train'])}")
dataset['train'][0]
# path -> Array contains the Audio data

Number of samples: 702


{'path': {'path': '2015-09-04_08-04-59_unit03.000.wav',
  'array': array([-0.00088501, -0.01126099, -0.00408936, ..., -0.00488281,
         -0.00430298,  0.00384521]),
  'sampling_rate': 16000},
 'length': 60.0,
 'annotations': [{'ed': 30.857, 'label': 'OVEN', 'st': 30.707},
  {'ed': 45.624, 'label': 'SWTH', 'st': 45.474},
  {'ed': 59.246, 'label': 'OVEN', 'st': 59.096}]}