# Upload the BEANS rfcx recordings to HF

#### 1. Download the recordings the way BEANS does it on their [GitHub](https://github.com/earthspecies/beans)

First we will navigate into the mounted data_birdset folder, where the temporary files from their repo will be downloaded, and install wget & unzip, as they are not preinstalled in the university's bash environment.

In [1]:
%cd '../../../../data_birdset/beans'
!pwd
!sudo apt install wget
!sudo apt install unzip

/workspace/data_birdset/beans
/workspace/data_birdset/beans


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
wget is already the newest version (1.21.2-2ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unzip is already the newest version (6.0-26ubuntu3.2).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


Then we will run their script to download the metadata and recordings to have the same splits.

In [5]:
# Their script:
import json
import os
import sys
import torchaudio
import pathlib
from plumbum import local, FG
import subprocess
import pandas as pd
from sklearn.model_selection import train_test_split

def get_wav_length_in_secs(path):
    """Return the duration, in seconds, of the audio file at `path`.

    The duration is the frame count reported by ``torchaudio.info`` divided
    by the sample rate reported for the same file.
    """
    metadata = torchaudio.info(path)
    frame_count = metadata.num_frames
    frames_per_second = metadata.sample_rate
    return frame_count / frames_per_second

# Maps each recording id (the flac file stem) to its metadata record:
# wav path, length in seconds and the list of annotations.
dataset = {}

# Download steps of the original BEANS script, left disabled in this notebook:
#local['mkdir']['-p', 'data/rfcx/wav']()
#local['kaggle']['competitions', 'download', '-p', 'data/rfcx', 'rfcx-species-audio-detection'] & FG
#local['unzip']['data/rfcx/rfcx-species-audio-detection.zip', '-d', 'data/rfcx/'] & FG

dest_dir = pathlib.Path('data/rfcx/wav')
# The mkdir step above is disabled, so make sure the output folder exists
# before sox tries to write into it.
dest_dir.mkdir(parents=True, exist_ok=True)
for src_file in sorted(pathlib.Path('data/rfcx/train').glob('*.flac')):
    dest_file = dest_dir / (src_file.stem + '.wav')
    if not dest_file.exists():
        print(f"Converting {src_file} ...", file=sys.stderr)
        try:
            # check=True: stop at the failing conversion instead of carrying on
            # and failing later when the wav file is inspected.
            subprocess.run(
                ['sox', str(src_file), '-r', '48000', '-R', str(dest_file)],
                check=True,
            )
        except subprocess.CalledProcessError:
            # Remove a possibly incomplete wav so that a re-run converts it
            # again instead of skipping it because the file already exists.
            if dest_file.exists():
                dest_file.unlink()
            raise
    dataset[src_file.stem] = {
        'path': str(dest_file),
        'length': get_wav_length_in_secs(dest_file),
        'annotations': []}


# Attach every annotation row of train_tp.csv to its recording.
df = pd.read_csv('data/rfcx/train_tp.csv')
for _, row in df.iterrows():
    dataset[row['recording_id']]['annotations'].append(
        {'st': row['t_min'], 'ed': row['t_max'], 'label': row['species_id']})

# split to train:valid:test = 6:2:2
dataset = list(dataset.values())
df_train, df_valid_test = train_test_split(dataset, test_size=0.4, random_state=42, shuffle=True)
df_valid, df_test = train_test_split(df_valid_test, test_size=0.5, random_state=42, shuffle=True)
# train-low keeps the part of the train split that remains after setting 66% aside.
df_train_low, _ = train_test_split(df_train, test_size=0.66, random_state=42, shuffle=True)

# Sort every split by wav path and write it as one JSON object per line.
split_records = {
    'train': df_train,
    'train-low': df_train_low,
    'valid': df_valid,
    'test': df_test,
}
for split_name, records in split_records.items():
    records.sort(key=lambda x: x['path'])
    with open(f'data/rfcx/{split_name}.jsonl', mode='w') as f:
        for data in records:
            print(json.dumps(data), file=f)

Converting data/rfcx/train/63a82d848.flac ...
Converting data/rfcx/train/63a97cbdf.flac ...
Converting data/rfcx/train/63b01d3ee.flac ...
Converting data/rfcx/train/63b4d63b0.flac ...
Converting data/rfcx/train/63b5691ba.flac ...
Converting data/rfcx/train/63b6c797f.flac ...
Converting data/rfcx/train/63c6ce9e6.flac ...
Converting data/rfcx/train/63cc4d598.flac ...
Converting data/rfcx/train/63e47e918.flac ...
Converting data/rfcx/train/63fbf2f2e.flac ...
Converting data/rfcx/train/6400b3087.flac ...
Converting data/rfcx/train/64024bf74.flac ...
Converting data/rfcx/train/64025c7e6.flac ...
Converting data/rfcx/train/640b20641.flac ...
Converting data/rfcx/train/64132f653.flac ...
Converting data/rfcx/train/6414d3225.flac ...
Converting data/rfcx/train/64336bff0.flac ...
Converting data/rfcx/train/643cb10ea.flac ...
Converting data/rfcx/train/643e1b455.flac ...
Converting data/rfcx/train/6444d49cc.flac ...
Converting data/rfcx/train/644f71ae8.flac ...
Converting data/rfcx/train/64877c7

#### 2. Convert to HF format

In [6]:
from datasets import Dataset, Audio
import pandas as pd
import jsonlines

def load_dataset(split_name):
    """Build a HF ``Dataset`` from the JSONL metadata file of one split.

    Reads ``data/rfcx/{split_name}.jsonl``, collects its records into a
    DataFrame, converts that into a ``Dataset`` and casts the ``path``
    column to the ``Audio`` feature.
    """
    jsonl_path = f'data/rfcx/{split_name}.jsonl'
    with jsonlines.open(jsonl_path, mode='r') as reader:
        records = [record for record in reader]
    frame = pd.DataFrame(records)
    return Dataset.from_pandas(frame).cast_column('path', Audio())

# Load every split written by the BEANS script, keyed by its file name.
splits = ['train', 'train-low', 'valid', 'test']
datasets = {}
for split in splits:
    datasets[split] = load_dataset(split)

# HF does not accept '-' in split names, so re-key train-low as train_low.
datasets['train_low'] = datasets.pop('train-low')

# Show the first example of each split as a quick sanity check.
for split, dataset in datasets.items():
    first_example = dataset[0]
    print(first_example)



{'path': {'path': 'data/rfcx/wav/003b04435.wav', 'array': array([ 0.00210571,  0.00793457,  0.00518799, ..., -0.01141357,
       -0.0168457 , -0.02178955]), 'sample_rate': 48000}, 'length': 60.0, 'annotations': []}
{'path': {'path': 'data/rfcx/wav/00204008d.wav', 'array': array([0.00753784, 0.0071106 , 0.00616455, ..., 0.0050354 , 0.00152588,
       0.00112915]), 'sample_rate': 48000}, 'length': 60.0, 'annotations': []}
{'path': {'path': 'data/rfcx/wav/00834f88e.wav', 'array': array([-0.06729126, -0.06689453, -0.05938721, ..., -0.01675415,
       -0.01593018, -0.00247192]), 'sample_rate': 48000}, 'length': 60.0, 'annotations': []}
{'path': {'path': 'data/rfcx/wav/003b04435.wav', 'array': array([ 0.00210571,  0.00793457,  0.00518799, ..., -0.01141357,
       -0.0168457 , -0.02178955]), 'sample_rate': 48000}, 'length': 60.0, 'annotations': []}


#### 3. Upload the datasets to HF

In [7]:
# Push each split to the same hub repository under its own split name.
hub_repo_id = 'DBD-research-group/beans_rfcx'
for split, dataset in datasets.items():
    dataset.push_to_hub(repo_id=hub_repo_id, split=split)

Uploading the dataset shards:   0%|          | 0/33 [00:00<?, ?it/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/11 [00:00<?, ?it/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/466 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/11 [00:00<?, ?it/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/572 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/12 [00:00<?, ?it/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/675 [00:00<?, ?B/s]

---
# Download the dataset from HF

Download all splits from the hub. Even when only a single split is requested, everything is still downloaded! Use `streaming=True` and `cache_dir='...'` for shorter loading times.

In [8]:
from datasets import load_dataset, DatasetDict

# To request a single split instead:
#dataset = load_dataset(path='DBD-research-group/beans_rfcx', split='test')
dataset: DatasetDict = load_dataset(
    path='DBD-research-group/beans_rfcx',
    name='default',
)

Downloading readme:   0%|          | 0.00/793 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/33 [00:00<?, ?it/s]

KeyboardInterrupt: 

Now you can interact with the dataset. 

In [8]:
# Report the size of the train split, then display its first example.
train_split = dataset['train']
print(f"Number of samples: {len(train_split)}")
train_split[0]
# In the displayed example, the 'array' entry under 'path' contains the audio data.

Number of samples: 702


{'path': {'path': '2015-09-04_08-04-59_unit03.000.wav',
  'array': array([-0.00088501, -0.01126099, -0.00408936, ..., -0.00488281,
         -0.00430298,  0.00384521]),
  'sample_rate': 16000},
 'length': 60.0,
 'annotations': [{'ed': 30.857, 'label': 'OVEN', 'st': 30.707},
  {'ed': 45.624, 'label': 'SWTH', 'st': 45.474},
  {'ed': 59.246, 'label': 'OVEN', 'st': 59.096}]}