# Helper

In [1]:
from datasets import load_dataset, concatenate_datasets
from datasets import DatasetDict, Dataset
import matplotlib.pyplot as plt

In [2]:
def get_dataset(name: str) -> DatasetDict:
    """
    Load one configuration of the "DBD-research-group/BirdSet" dataset.

    Args:
        name: configuration name handed to `load_dataset`; it is also appended
            to "../../data_birdset/" to form the cache directory.

    Returns:
        Whatever `load_dataset` returns for that configuration.
    """
    cache_location = "../../data_birdset/" + name
    return load_dataset(
        path="DBD-research-group/BirdSet",
        name=name,
        cache_dir=cache_location,
    )

In [3]:
def cut_underscores(path: str, num: int) -> str:
    """
    Cut up to 'num' underscore-separated segments from the end of 'path'.

    Each cut removes everything from the last "_" (inclusive) to the end of
    the string. When 'path' runs out of underscores before 'num' cuts were
    made, cutting stops and the remaining string is returned as is.

    Args:
        path: string to shorten.
        num: maximum number of trailing "_"-segments to remove.

    Returns:
        'path' without its last 'num' underscore segments (or fewer, if it
        contains fewer underscores).
    """
    for _ in range(num):
        cut_at = path.rfind("_")
        if cut_at == -1:
            # str.rfind reports "not found" as -1; slicing with it would
            # silently drop the last character instead of a whole segment.
            break
        path = path[:cut_at]
    return path

In [28]:
def split_dataset(dataset : Dataset, split_from_idx : int, desired_test_split: float, strip_file_comparison: callable) -> DatasetDict:
    """
    Split 'dataset' into train and test parts whose borders fall between files.

    Rows are grouped into files by comparing
    `strip_file_comparison(row["filepath"])`. The test part is one contiguous
    row range: it starts at the file border nearest to 'split_from_idx' and
    ends at the file border that brings its share of rows closest to
    'desired_test_split'. All remaining rows (before and after that range)
    form the train part.

    NOTE(review): the scans stop at the first row with a different key, so
    this assumes rows of one file are stored next to each other - confirm for
    the dataset in use.

    Args:
        dataset: dataset to split; every row needs a "filepath" entry.
        split_from_idx: row index around which the test part should start.
        desired_test_split: wanted share of rows in the test part (0..1).
        strip_file_comparison: maps a filepath to the key that identifies the
            file a row belongs to.

    Returns:
        DatasetDict with the keys 'train' and 'test'.
    """
    num_rows = len(dataset)
    print(f"dataset length: {num_rows}\n")

    def file_key(idx: int):
        # Comparison key of the file that row 'idx' belongs to.
        return strip_file_comparison(dataset[idx]["filepath"])

    # find start of test split
    split_file = file_key(split_from_idx)
    print("File splitting used: ", split_file)

    # Scan downwards for the first row of the file containing split_from_idx.
    for idx in range(split_from_idx-1, -1, -1):
        if file_key(idx) != split_file:
            bottom_start_idx = idx + 1
            break
    else:
        # No other file below (or split_from_idx is 0): the file starts at row 0.
        bottom_start_idx = 0

    # Scan upwards for the first row of the next file; None means the file
    # containing split_from_idx runs to the last row of the dataset.
    top_start_idx = None
    for idx in range(split_from_idx+1, num_rows):
        if file_key(idx) != split_file:
            top_start_idx = idx
            break

    print(f"bottom start idx: {bottom_start_idx} \ndata at idx: {dataset[bottom_start_idx]}\n")
    if top_start_idx is None:
        # Without a later file border, the lower border is the only start that
        # does not cut a file in two.
        print("top start idx: none (file runs to the end of the dataset)\n")
        nearest_start_idx = bottom_start_idx
    else:
        print(f"top start idx: {top_start_idx} \ndata at idx: {dataset[top_start_idx]}\n")
        if split_from_idx - bottom_start_idx > top_start_idx - split_from_idx:
            nearest_start_idx = top_start_idx
        else:
            nearest_start_idx = bottom_start_idx

    # find end of test split
    # Clamp to the last row: start + wanted length can point past the dataset,
    # in which case the test part simply extends to the end.
    desired_end_idx = min(nearest_start_idx + int(num_rows * desired_test_split), num_rows - 1)
    split_file = file_key(desired_end_idx)

    bottom_end_idx = desired_end_idx
    top_end_idx = num_rows

    # Scan downwards for the first row of the file at desired_end_idx, but
    # never go below the start of the test part.
    for idx in range(desired_end_idx-1, -1, -1):
        if idx <= nearest_start_idx:
            bottom_end_idx = nearest_start_idx
            break

        if file_key(idx) != split_file:
            bottom_end_idx = idx + 1
            break

    # Scan upwards for the first row of the following file (exclusive end).
    for idx in range(desired_end_idx, num_rows):
        if file_key(idx) != split_file:
            top_end_idx = idx
            break

    bottom_test_split = ((bottom_end_idx - nearest_start_idx) / num_rows)
    top_test_split = ((top_end_idx - nearest_start_idx) / num_rows)
    print(f"bottom end idx: {bottom_end_idx} \ndata at idx: {dataset[bottom_end_idx]}\n")
    print(f"top end idx: {top_end_idx-1} \ndata at idx: {dataset[top_end_idx-1]}\n")
    print(f"bottom split percentage: {bottom_test_split} \ntop split percentage: {top_test_split}\n")

    # Pick the end border whose resulting test share is closest to the wanted one.
    if desired_test_split - bottom_test_split > top_test_split - desired_test_split:
        nearest_end_idx = top_end_idx
    else:
        nearest_end_idx = bottom_end_idx

    # build datasets and dataset dict
    first_train_split = dataset.select(range(nearest_start_idx))
    test_split = dataset.select(range(nearest_start_idx, nearest_end_idx))
    if nearest_end_idx != num_rows:
        second_train_split = dataset.select(range(nearest_end_idx, num_rows))
    else:
        second_train_split = dataset.select(range(0))

    print("first train dataset:\n" , first_train_split, first_train_split[-1] if len(first_train_split) != 0 else "empty")
    # Guard the row previews like the train previews do: an empty test part
    # cannot be indexed.
    print("test dataset:\n", test_split, test_split[0] if len(test_split) != 0 else "empty")
    if len(test_split) != 0:
        print(test_split[-1])
    print("second train dataset:\n" , second_train_split, second_train_split[0] if len(second_train_split) != 0 else "empty")

    train_split = concatenate_datasets([first_train_split, second_train_split])
    dataset_dict = DatasetDict({'train':train_split, 'test':test_split})
    return dataset_dict

In [30]:
def split_into_k_datasets(dataset: Dataset, k: int, strip_file_comparison: callable) -> list[DatasetDict]:
    """
    Build 'k' train/test splits of 'dataset', one per fold.

    Fold i asks `split_dataset` for a test part that starts around row
    int(len(dataset) * i / k) and covers 1/k of the rows.

    Args:
        dataset: dataset to split.
        k: number of folds.
        strip_file_comparison: forwarded unchanged to `split_dataset`.

    Returns:
        List with one DatasetDict per fold, in fold order.
    """
    fold_fraction = 1/k
    total_rows = len(dataset)

    def build_fold(fold_no: int) -> DatasetDict:
        # Row index around which this fold's test part should begin.
        seed_idx = int(total_rows * (fold_fraction * fold_no))
        return split_dataset(dataset, seed_idx, fold_fraction, strip_file_comparison)

    return [build_fold(fold_no) for fold_no in range(k)]

# NES

In [5]:
# Load the "NES_scape" configuration and show which splits it contains.
dataset = get_dataset("NES_scape")
print(dataset)

# Keep the "test_5s" split for the cells below and display one row of it.
test_5s = dataset["test_5s"]
next(iter(test_5s))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    test: Dataset({
        features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist'],
        num_rows: 6952
    })
    test_5s: Dataset({
        features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist'],
        num_rows: 24480
    })
})


{'audio': {'bytes': None,
  'path': 'd:\\Programmierzeugs\\BirdSet\\data_birdset\\NES_scape\\downloads\\extracted\\35644542a0998491c586fd647ca3e015195cd45c9d8ca749e7fc4dbb61bd5e79\\NES_001_S01_20190914_043000_000_005.ogg'},
 'filepath': 'd:\\Programmierzeugs\\BirdSet\\data_birdset\\NES_scape\\downloads\\extracted\\35644542a0998491c586fd647ca3e015195cd45c9d8ca749e7fc4dbb61bd5e79\\NES_001_S01_20190914_043000_000_005.ogg',
 'start_time': 0.0,
 'end_time': 5.0,
 'low_freq': None,
 'high_freq': None,
 'ebird_code': None,
 'ebird_code_multilabel': [],
 'ebird_code_secondary': None,
 'call_type': None,
 'sex': None,
 'lat': 5.59,
 'long': -75.85,
 'length': None,
 'microphone': 'Soundscape',
 'license': 'Creative Commons Attribution 4.0 International Public License',
 'source': 'https://zenodo.org/record/7525349',
 'local_time': '4:30:29',
 'detected_events': None,
 'event_cluster': None,
 'peaks': None,
 'quality': None,
 'recordist': None}

In [31]:
def get_file_comparison_name(path: str) -> str:
    """
    Return 'path' with its last two underscore-separated segments removed.

    Used as the file key when splitting: rows whose paths are equal after
    this cut are treated as belonging to the same file.
    """
    return cut_underscores(path, 2)

# Five folds over the "test_5s" split, with test borders aligned to file borders.
split_into_k_datasets(dataset=test_5s, k=5, strip_file_comparison=get_file_comparison_name)

dataset length: 24480

File splitting used:  d:\Programmierzeugs\BirdSet\data_birdset\NES_scape\downloads\extracted\35644542a0998491c586fd647ca3e015195cd45c9d8ca749e7fc4dbb61bd5e79\NES_001_S01_20190914_043000
bottom start idx: 0 
data at idx: {'audio': {'bytes': None, 'path': 'd:\\Programmierzeugs\\BirdSet\\data_birdset\\NES_scape\\downloads\\extracted\\35644542a0998491c586fd647ca3e015195cd45c9d8ca749e7fc4dbb61bd5e79\\NES_001_S01_20190914_043000_000_005.ogg'}, 'filepath': 'd:\\Programmierzeugs\\BirdSet\\data_birdset\\NES_scape\\downloads\\extracted\\35644542a0998491c586fd647ca3e015195cd45c9d8ca749e7fc4dbb61bd5e79\\NES_001_S01_20190914_043000_000_005.ogg', 'start_time': 0.0, 'end_time': 5.0, 'low_freq': None, 'high_freq': None, 'ebird_code': None, 'ebird_code_multilabel': [], 'ebird_code_secondary': None, 'call_type': None, 'sex': None, 'lat': 5.59, 'long': -75.85, 'length': None, 'microphone': 'Soundscape', 'license': 'Creative Commons Attribution 4.0 International Public License', 'so

[DatasetDict({
     train: Dataset({
         features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist'],
         num_rows: 19440
     })
     test: Dataset({
         features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist'],
         num_rows: 5040
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length