In [1]:
from datasets import load_dataset, Audio, DatasetInfo

In [2]:
birdset1k = load_dataset("DBD-research-group/na_metadata1k_ogg_nodecode") 
birdset1k = birdset1k.cast_column(
    column="audio",
    feature=Audio(
        sampling_rate=32_000,
        mono=True,
        decode=True
    )
)

Found cached dataset parquet (/home/lukas/.cache/huggingface/datasets/DBD-research-group___parquet/DBD-research-group--na_metadata1k_ogg_nodecode-2458d6d017e1feb5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
birdset1k["train"].info

In [4]:
birdset1k

DatasetDict({
    train: Dataset({
        features: ['lat', 'lng', 'type', 'sex', 'primary', 'continent', 'secondary', 'audio', 'ebird_code', 'id', 'file'],
        num_rows: 800
    })
    test: Dataset({
        features: ['lat', 'lng', 'type', 'sex', 'primary', 'continent', 'secondary', 'audio', 'ebird_code', 'id', 'file'],
        num_rows: 200
    })
})

In [5]:
birdset1k.cache_files

{'train': [{'filename': '/home/lukas/.cache/huggingface/datasets/DBD-research-group___parquet/DBD-research-group--na_metadata1k_ogg_nodecode-2458d6d017e1feb5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/parquet-train.arrow'}],
 'test': [{'filename': '/home/lukas/.cache/huggingface/datasets/DBD-research-group___parquet/DBD-research-group--na_metadata1k_ogg_nodecode-2458d6d017e1feb5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/parquet-test.arrow'}]}

In [15]:
birdset1k.num_rows

{'train': 800, 'test': 200}

In [16]:
birdset1k.column_names

{'train': ['lat',
  'lng',
  'type',
  'sex',
  'primary',
  'continent',
  'secondary',
  'audio',
  'ebird_code',
  'id',
  'file'],
 'test': ['lat',
  'lng',
  'type',
  'sex',
  'primary',
  'continent',
  'secondary',
  'audio',
  'ebird_code',
  'id',
  'file']}

In [17]:
birdset1k.shape

{'train': (800, 11), 'test': (200, 11)}

In [6]:
birdset1k["train"].features

{'lat': Value(dtype='float64', id=None),
 'lng': Value(dtype='float64', id=None),
 'type': Value(dtype='string', id=None),
 'sex': Value(dtype='string', id=None),
 'primary': ClassLabel(names=['acanthis hornemanni', 'accipiter cooperii', 'actitis macularius', 'aegolius acadicus', 'aegolius funereus', 'agelaius phoeniceus', 'aimophila rufescens', 'aimophila ruficeps', 'alopochen aegyptiaca', 'amazilia rutila', 'amazilia tzacatl', 'amazilia yucatanensis', 'amazona autumnalis', 'amazona finschi', 'amazona guatemalae', 'amazona leucocephala', 'amblycercus holosericeus', 'ammodramus savannarum', 'ammospiza leconteii', 'ammospiza nelsoni', 'amphispiza bilineata', 'anabacerthia variegaticeps', 'anas carolinensis', 'anas platyrhynchos', 'anser caerulescens', 'anser rossii', 'antigone canadensis', 'antrostomus arizonae', 'antrostomus ridgwayi', 'antrostomus saturatus', 'antrostomus vociferus', 'aphelocoma unicolor', 'aphelocoma wollweberi', 'aphelocoma woodhouseii', 'ara militaris', 'archilochu

In [7]:
# decode was turned off in preprocessing
birdset1k["train"][0]["audio"]

{'path': 'XC146305.ogg',
 'array': array([ 2.25619879e-05,  3.81759601e-05,  4.19990392e-05, ...,
        -4.22009546e-03,  5.75777609e-03,  5.16055990e-03]),
 'sampling_rate': 32000}

**set transform**
- applied right before returning objects in __getitem__
- can be applied to specific columns but can also return the unformatted ones!
- cast_column is very similar (but introduced as .astype of a column)
- set_format = set the __getitem__ return format 
# 

In [3]:
birdset1k_audio = birdset1k.select_columns(['audio', 'primary'])

In [20]:
birdset1k_audio["train"][0]

{'primary': 485,
 'audio': {'path': 'XC146305.ogg',
  'array': array([ 2.25619879e-05,  3.81759601e-05,  4.19990392e-05, ...,
         -4.22009546e-03,  5.75777609e-03,  5.16055990e-03]),
  'sampling_rate': 32000}}

In [21]:
birdset1k_audio.set_format(type='pt')

# also works with "with_format" that returns a new object on-the-fly (could be better!)

In [22]:
birdset1k_audio["train"][0]

{'primary': tensor(485),
 'audio': {'path': 'XC146305.ogg',
  'array': tensor([ 2.2562e-05,  3.8176e-05,  4.1999e-05,  ..., -4.2201e-03,
           5.7578e-03,  5.1606e-03]),
  'sampling_rate': tensor(32000)}}

**Load Data**

In [5]:
from tqdm import tqdm
from torch.utils.data import DataLoader

In [25]:
for i in tqdm(birdset1k_audio["train"]):
    pass

# running through 800 training instances took 2 minutes

100%|██████████| 800/800 [02:11<00:00,  6.10it/s]


with dataloader

- padding is neccessary

In [45]:
birdset1k_audio.set_format(type='np')

In [6]:
from transformers import BatchFeature
from transformers import SequenceFeatureExtractor
import numpy as np 
# we could incorporate some kind of event detector in the customfeatureextractor

class CustomFeatureExtractor(SequenceFeatureExtractor):
    model_input_names = ["input_values"]

    def __init__(
        self,
        feature_size=1,
        sampling_rate=32_000,
        padding_value=0.0,
        return_attention_mask=False
    ):
        # initialize sequencefeatureextractor
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value)
        self.return_attention_mask = return_attention_mask

    def __call__(
        self, 
        raw_audio,
        padding = False, 
        max_length = None,
        truncation = False, 
        return_tensors = None,
        sampling_rate = None
    ) -> BatchFeature:
        
        # control/check sampling rate 
        if self.sampling_rate is not None: 
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f"{self.sampling_rate}. Make sure that the provided `raw_audio`input was sampled with"
                    f"{self.sampling_rate} and not {sampling_rate}."
                )
        else:
            print( "It is strongly recommended to pass the ``sampling_rate`` argument to this function. \
                    Failing to do so can result in silent errors that might be hard to debug.")
        # check batch input
        is_batched_numpy = isinstance(raw_audio, np.ndarray) and len(raw_audio.shape) > 1
        is_batched = is_batched_numpy or (
            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
        )

        if not is_batched:
            raw_audio = [raw_audio]

        encoded_inputs = BatchFeature({"input_values": raw_audio})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            return_attention_mask=None,
            truncation=truncation
        )
        # return_to_tensors comes from: transformers/birdset/transformers/feature_extraction_utils.py
        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
                
        return padded_inputs 

- custom feature extractor does not work when birdset1k_audio.set_format(type='pt')
- can be done via .map
- !! map could also load in all data and then detect events with equal lengths and choose this for the complete dataset
It is helpful to understand how this works, so you can come up with your own ways to use batch mapping. At this point, you may be wondering how you can control the size of the generated datamodule. The answer is: the mapped function does not have to return an output batch of the same size.

In other words, your mapped function input can be a batch of size N and return a batch of size M. The output M can be greater than or less than N. **This means you can concatenate your examples, divide it up, and even add more examples!**

from datasets import Dataset

dataset = Dataset.from_dict({"a": [0, 1, 2]})

dataset_with_duplicates = datamodule.map(lambda batch: {"b": batch["a"] * 2}, remove_columns=["a"], batched=True)

len(dataset_with_duplicates)


In [5]:
birdset1k_audio["train"][0]["audio"]

{'path': 'XC146305.ogg',
 'array': array([ 2.25619879e-05,  3.81759601e-05,  4.19990392e-05, ...,
        -4.22009546e-03,  5.75777609e-03,  5.16055990e-03]),
 'sampling_rate': 32000}

In [6]:
feature_extractor = CustomFeatureExtractor()

def preprocess_function(samples):
    audio_arrays = [x["array"] for x in samples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        padding=True,
        max_length=16_000*5,
        truncation=True,
        return_tensors="pt"
    )
    return inputs


encoded_birdset = birdset1k_audio.map(
    preprocess_function,
    remove_columns=['audio'],
    batched=True,
    batch_size=100
)

# truncate and pad

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Exception ignored from cffi callback <function SoundFile._init_virtual_io.<locals>.vio_read at 0x7ff7e2347430>:
Traceback (most recent call last):
  File "/home/lukas/miniconda3/envs/dal-toolbox/lib/python3.9/site-packages/soundfile.py", line 1244, in vio_read
    try:
KeyboardInterrupt: 


In [1]:
encoded_birdset["train"][0]

NameError: name 'encoded_birdset' is not defined

In [60]:
import torch
torch.cuda.device_count()

1

In [61]:
dataloader = DataLoader(
    encoded_birdset["train"],
    batch_size=16,
    shuffle=False,
    num_workers=4,
)

In [66]:
next(iter(dataloader))["input_values"].shape

torch.Size([16, 80000])

In [67]:
next(iter(dataloader))["input_values"]

tensor([[ 2.2562e-05,  3.8176e-05,  4.1999e-05,  ...,  1.3886e-02,
          1.5060e-02,  8.7285e-03],
        [ 3.7719e-07,  2.6892e-08, -3.3760e-08,  ...,  3.0991e-03,
         -4.7870e-03, -8.9598e-03],
        [ 1.1176e-08, -3.4459e-08, -5.5879e-09,  ..., -3.4147e-02,
         -3.4990e-02, -3.4313e-02],
        ...,
        [ 1.0459e-11,  5.6843e-12,  3.1832e-12,  ..., -8.9108e-04,
         -4.2660e-04, -4.8647e-03],
        [ 1.1921e-07, -6.2864e-08, -3.5856e-08,  ...,  2.0346e-34,
          2.0346e-34,  2.0346e-34],
        [-4.4518e-05, -4.2392e-05, -1.9335e-05,  ..., -2.4559e-02,
          3.0974e-03,  2.5544e-02]])

In [82]:
for _ in tqdm(dataloader):
    pass

# 1.6 seconds 

100%|██████████| 50/50 [00:01<00:00, 30.06it/s]


In [16]:
# without .map
birdset1k_audio["train"][0]["audio"]

{'path': 'XC146305.ogg',
 'array': array([ 2.25619879e-05,  3.81759601e-05,  4.19990392e-05, ...,
        -4.22009546e-03,  5.75777609e-03,  5.16055990e-03]),
 'sampling_rate': 32000}

In [7]:
from transformers import DataCollatorWithPadding

In [9]:
from tqdm import tqdm
from torch.utils.data import DataLoader

In [7]:
from dataclasses import dataclass
from typing import Any
from transformers import BatchFeature

@dataclass
class CustomCollatorWithPadding:

    feature_extractor: Any
    padding: bool = True
    truncation: bool = True
    max_length: int = None
    return_tensors: str = "pt"
    preprocessed: bool=False

    def __call__(self, batch):

        # preprocessed means that the .map function was applied
        # here, the feature extractor is only used for padding
        if self.preprocessed:
            batch = self.feature_extractor.pad(
                batch,
                padding=self.padding,
                max_length=self.max_length,
                truncation=self.truncation,
                return_tensors=self.return_tensors,
                return_attention_mask = None
            )

        # here, we first have to format the input and then pad it
        # note that everything regarding resampling is not implemented here    
        else:
            audio_arrays = [x["audio"]["array"] for x in batch]
            labels = [x["primary"] for x in batch]

            # batch feature is just a dictionary
            encoded_inputs = BatchFeature({"input_values": audio_arrays})
            batch = {**encoded_inputs, "labels": labels}

            batch = self.feature_extractor.pad(
                batch,
                padding=self.padding,
                max_length=self.max_length,
                truncation=self.truncation,
                return_tensors=self.return_tensors,
                return_attention_mask = None
            )

        if "label" in batch: 
            batch["labels"] = batch["label"]
            del batch["label"]

        if "target" in batch:
            batch["labels"] = batch["target"]
            del batch["target"]
        
        if "primary" in batch:
            batch["labels"] = batch["primary"]
            del batch["primary"]

        
        return batch


In [12]:
feature_extractor = CustomFeatureExtractor()
dataloader = DataLoader(
    birdset1k_audio["train"],
    collate_fn=CustomCollatorWithPadding(
        feature_extractor,
        padding='longest',
        return_tensors="pt",
        truncation=False        
    ),
    batch_size=16,
    shuffle=False,
    num_workers=4,
)

In [13]:
next(iter(dataloader))

{'input_values': tensor([[ 2.2562e-05,  3.8176e-05,  4.1999e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 3.7719e-07,  2.6892e-08, -3.3760e-08,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.1176e-08, -3.4459e-08, -5.5879e-09,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 1.0459e-11,  5.6843e-12,  3.1832e-12,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.1921e-07, -6.2864e-08, -3.5856e-08,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-4.4518e-05, -4.2392e-05, -1.9335e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]]), 'labels': tensor([485, 391, 191, 273,   6, 187, 513, 127, 427, 219, 180, 363, 489, 196,
        116,  38])}

In [14]:
for batch in tqdm(dataloader):
    pass

# takes 44 seconds

100%|██████████| 50/50 [00:44<00:00,  1.11it/s]


### Test for Birdset5k

In [1]:
from datasets import load_dataset, Audio, DatasetInfo

In [5]:
birdset5k = load_dataset("DBD-research-group/na_metadata5k_ogg_nodecode") 
birdset5k = birdset5k.cast_column(
    column="audio",
    feature=Audio(
        sampling_rate=32_000,
        mono=True,
        decode=True
    )
)

Found cached dataset parquet (/home/lukas/.cache/huggingface/datasets/DBD-research-group___parquet/DBD-research-group--na_metadata5k_ogg_nodecode-de5164d492ad6e9e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
birdset5k_audio = birdset5k.select_columns(['audio', 'primary'])

In [7]:
birdset5k_audio

DatasetDict({
    train: Dataset({
        features: ['primary', 'audio'],
        num_rows: 4316
    })
    test: Dataset({
        features: ['primary', 'audio'],
        num_rows: 1080
    })
})

In [9]:
birdset5k_audio["train"][0]

{'primary': 12,
 'audio': {'path': 'XC402197.ogg',
  'array': array([ 8.73114914e-11, -1.45519152e-10, -5.38420863e-10, ...,
          2.28490215e-03,  1.52546330e-03,  2.12723855e-04]),
  'sampling_rate': 32000}}

In [4]:
from tqdm import tqdm
from torch.utils.data import DataLoader

In [6]:
for instance in tqdm(birdset5k_audio["train"]):
    pass

# running through 800 training instances took 2 minutes
# now we have 5.395 times the data with 4316 training instances
# it took 11 minutes

100%|██████████| 4316/4316 [11:22<00:00,  6.33it/s]


**With Mapping**

- idea is to use this as event detection + transformation to spectrograms
- set_transform could also be used instead of cast_column to load only parts of the audio file if required

In [5]:
from transformers import BatchFeature
from transformers import SequenceFeatureExtractor
import numpy as np 
# we could incorporate some kind of event detector in the customfeatureextractor

class CustomFeatureExtractor(SequenceFeatureExtractor):
    model_input_names = ["input_values"]

    def __init__(
        self,
        feature_size=1,
        sampling_rate=32_000,
        padding_value=0.0,
        return_attention_mask=False
    ):
        # initialize sequencefeatureextractor
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value)
        self.return_attention_mask = return_attention_mask

    def __call__(
        self, 
        raw_audio,
        padding = False, 
        max_length = None,
        truncation = False, 
        return_tensors = None,
        sampling_rate = None
    ) -> BatchFeature:
        
        # control/check sampling rate 
        if self.sampling_rate is not None: 
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f"{self.sampling_rate}. Make sure that the provided `raw_audio`input was sampled with"
                    f"{self.sampling_rate} and not {sampling_rate}."
                )
        else:
            print( "It is strongly recommended to pass the ``sampling_rate`` argument to this function. \
                    Failing to do so can result in silent errors that might be hard to debug.")
        # check batch input
        is_batched_numpy = isinstance(raw_audio, np.ndarray) and len(raw_audio.shape) > 1
        is_batched = is_batched_numpy or (
            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
        )

        if not is_batched:
            raw_audio = [raw_audio]

        encoded_inputs = BatchFeature({"input_values": raw_audio})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            return_attention_mask=None,
            truncation=truncation
        )
        # return_to_tensors comes from: transformers/birdset/transformers/feature_extraction_utils.py
        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
                
        return padded_inputs 

In [24]:
feature_extractor = CustomFeatureExtractor()

def preprocess_function(samples):
    audio_arrays = [x["array"] for x in samples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        padding=True,
        max_length=32_000*5,
        truncation=True,
        return_tensors="pt"
    )
    return inputs


encoded_birdset = birdset5k_audio.map(
    preprocess_function,
    remove_columns=['audio'],
    batched=True,
    batch_size=100,
    load_from_cache_file=True
)

# took ~14 mionutes?

# truncate and pad

Loading cached processed dataset at /home/lukas/.cache/huggingface/datasets/DBD-research-group___parquet/DBD-research-group--na_metadata5k_ogg_nodecode-de5164d492ad6e9e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-c041687f9d2f3ce9.arrow
Loading cached processed dataset at /home/lukas/.cache/huggingface/datasets/DBD-research-group___parquet/DBD-research-group--na_metadata5k_ogg_nodecode-de5164d492ad6e9e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-3273e49b4e01136d.arrow


In [26]:
encoded_birdset["train"].data.nbytes /1e9

1.381171792

In [27]:
birdset5k_audio["train"].data.nbytes /1e9

2.015389166

In [11]:
encoded_birdset

DatasetDict({
    train: Dataset({
        features: ['primary', 'input_values'],
        num_rows: 4316
    })
    test: Dataset({
        features: ['primary', 'input_values'],
        num_rows: 1080
    })
})

In [13]:
# important! otherwise, dataloader does not work
encoded_birdset.set_format("np")

In [14]:
encoded_birdset["train"][0]

{'primary': 12,
 'input_values': array([ 8.7311491e-11, -1.4551915e-10, -5.3842086e-10, ...,
        -6.6355248e-03, -6.7564198e-03, -3.9427751e-03], dtype=float32)}

In [22]:
encoded_birdset["train"]

Dataset({
    features: ['primary', 'input_values'],
    num_rows: 4316
})

In [17]:
dataloader = DataLoader(
    encoded_birdset["train"],
    batch_size=16,
    shuffle=False,
    num_workers=4,
)

In [16]:
next(iter(dataloader))

{'primary': tensor([12, 12, 37,  6, 29, 16, 35,  6, 12, 28, 15,  6,  3, 16, 38, 19]),
 'input_values': tensor([[ 8.7311e-11, -1.4552e-10, -5.3842e-10,  ..., -6.6355e-03,
          -6.7564e-03, -3.9428e-03],
         [-1.3824e-10,  2.1828e-11, -1.4916e-10,  ..., -6.9000e-03,
          -4.3055e-03,  1.6439e-03],
         [ 1.3097e-10, -3.2742e-10, -4.0018e-11,  ..., -2.4993e-03,
          -6.6736e-03, -5.9759e-03],
         ...,
         [-1.0516e-12, -1.1369e-13,  4.5475e-13,  ..., -2.9336e-02,
          -3.9038e-02, -4.6530e-02],
         [-3.9246e-06,  6.9001e-05,  1.1259e-04,  ..., -2.7581e-02,
          -1.6591e-02, -1.2605e-02],
         [ 2.5702e-06,  2.7131e-06,  2.9221e-06,  ..., -8.6000e-03,
          -1.3283e-02, -1.1424e-02]])}

In [21]:
for batch in tqdm(dataloader): 
    pass

# takes 0.3seconds

100%|██████████| 270/270 [00:00<00:00, 752.98it/s]


In [23]:
for instance in tqdm(encoded_birdset["train"]):
    pass

100%|██████████| 4316/4316 [00:00<00:00, 9957.35it/s] 


## Without Mapping

In [28]:
from dataclasses import dataclass
from typing import Any
from transformers import BatchFeature

@dataclass
class CustomCollatorWithPadding:

    feature_extractor: Any
    padding: bool = True
    truncation: bool = True
    max_length: int = None
    return_tensors: str = "pt"
    preprocessed: bool=False

    def __call__(self, batch):

        # preprocessed means that the .map function was applied
        # here, the feature extractor is only used for padding
        if self.preprocessed:
            batch = self.feature_extractor.pad(
                batch,
                padding=self.padding,
                max_length=self.max_length,
                truncation=self.truncation,
                return_tensors=self.return_tensors,
                return_attention_mask = None
            )

        # here, we first have to format the input and then pad it
        # note that everything regarding resampling is not implemented here    
        else:
            audio_arrays = [x["audio"]["array"] for x in batch]
            labels = [x["primary"] for x in batch]

            # batch feature is just a dictionary
            encoded_inputs = BatchFeature({"input_values": audio_arrays})
            batch = {**encoded_inputs, "labels": labels}

            batch = self.feature_extractor.pad(
                batch,
                padding=self.padding,
                max_length=self.max_length,
                truncation=self.truncation,
                return_tensors=self.return_tensors,
                return_attention_mask = None
            )

        if "label" in batch: 
            batch["labels"] = batch["label"]
            del batch["label"]

        if "target" in batch:
            batch["labels"] = batch["target"]
            del batch["target"]
        
        if "primary" in batch:
            batch["labels"] = batch["primary"]
            del batch["primary"]

        
        return batch


In [29]:
feature_extractor = CustomFeatureExtractor()
dataloader = DataLoader(
    birdset5k_audio["train"],
    collate_fn=CustomCollatorWithPadding(
        feature_extractor,
        padding='longest',
        return_tensors="pt",
        truncation=False        
    ),
    batch_size=16,
    shuffle=False,
    num_workers=4,
)

In [30]:
next(iter(dataloader))

{'input_values': tensor([[ 8.7311e-11, -1.4552e-10, -5.3842e-10,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.3824e-10,  2.1828e-11, -1.4916e-10,  ...,  0.0000e+00,
          0.0000e+00, -1.8190e-12],
        [ 1.3097e-10, -3.2742e-10, -4.0018e-11,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [-1.0516e-12, -1.1369e-13,  4.5475e-13,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-3.9246e-06,  6.9001e-05,  1.1259e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 2.5702e-06,  2.7131e-06,  2.9221e-06,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]]), 'labels': tensor([12, 12, 37,  6, 29, 16, 35,  6, 12, 28, 15,  6,  3, 16, 38, 19])}

In [31]:
for batch in tqdm(dataloader):
    pass

# took 3 minutes with loading and padding (without .map feature extraction!)

100%|██████████| 270/270 [03:26<00:00,  1.31it/s]
