In [1]:
from datasets import load_dataset

import matplotlib.pyplot as plt 
import librosa
from IPython.display import Audio
import numpy as np 

In [2]:
esc = load_dataset("ashraq/esc50", split="train")

Found cached dataset parquet (/home/lukas/.cache/huggingface/datasets/ashraq___parquet/ashraq--esc50-1000c3b73cc1500f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Note: Currently datasets make use of torchaudio and librosa for audio loading and resampling. If you wish to implement your own costumized data loading/sampling,

 feel free to just make use of the "path" column instead and disregard the "audio" column.


In [36]:
esc

Dataset({
    features: ['filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take', 'audio'],
    num_rows: 2000
})

In [34]:
from datasets import Audio

In [35]:
esc = esc.cast_column("audio", Audio(sampling_rate=16_000))

In [34]:
esc[10]["audio"]

{'path': None,
 'array': array([1.85793069e-05, 3.60396007e-05, 1.33315079e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]),
 'sampling_rate': 16000}

In [35]:
waveform = esc[101]["audio"]["array"]
sampling_rate = esc[101]["audio"]["sampling_rate"]

from IPython.display import Audio
Audio(data=waveform, rate=sampling_rate)

**Filtering**

In [None]:
MAX_DURATION_IN_SECONDS = 20.0

def is_audio_length_in_range(input_length):
    return input_length < MAX_DURATION_IN_SECONDS

In [39]:
# no path here, no audio files, only arrays 
new_column = [librosa.get_duration(filename=x) for x in esc["audio"]["path"]]

TypeError: list indices must be integers or slices, not str

**Feature Extractor**

In [3]:
#from transformers import SequenceFeatureExtractor
from transformers import BatchFeature

from transformers import SequenceFeatureExtractor
# does not work out of the box, has to be inherited from imo 
# extractor = SequenceFeatureExtractor(
#     feature_size = 1,
#     sampling_rate = 16_000,
#     padding_value = 0.0
# )


In [4]:
# we could incorporate some kind of event detector in the customfeatureextractor

class CustomFeatureExtractor(SequenceFeatureExtractor):
    model_input_names = ["input_values"]

    def __init__(
        self,
        feature_size=1,
        sampling_rate=16_000,
        padding_value=0.0,
        return_attention_mask=False
    ):
        # initialize sequencefeatureextractor
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value)
        self.return_attention_mask = return_attention_mask

    def __call__(
        self, 
        raw_audio,
        padding = False, 
        max_length = None,
        truncation = False, 
        return_tensors = None,
        sampling_rate = None
    ) -> BatchFeature:
        
        # control/check sampling rate 
        if self.sampling_rate is not None: 
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f"{self.sampling_rate}. Make sure that the provided `raw_audio`input was sampled with"
                    f"{self.sampling_rate} and not {sampling_rate}."
                )
        else:
            print( "It is strongly recommended to pass the ``sampling_rate`` argument to this function. \
                    Failing to do so can result in silent errors that might be hard to debug.")
        # check batch input
        is_batched_numpy = isinstance(raw_audio, np.ndarray) and len(raw_audio.shape) > 1
        is_batched = is_batched_numpy or (
            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
        )

        if not is_batched:
            raw_audio = [raw_audio]

        encoded_inputs = BatchFeature({"input_values": raw_audio})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            return_attention_mask=None,
            truncation=truncation
        )
        # return_to_tensors comes from: transformers/src/transformers/feature_extraction_utils.py
        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
                
        return padded_inputs 

In [6]:
from datasets import Audio
# cast column probably casts for the complete dataset and saves? 
esc = esc.cast_column("audio", Audio(sampling_rate=16_000))
# convert into format for padding
test_sample = esc[10]["audio"]["array"]

In [7]:
test_sample.shape

(80000,)

In [5]:
feature_extractor = CustomFeatureExtractor()

In [20]:
test_input = feature_extractor(
    test_sample,
    sampling_rate=16_000,
    padding=True,
    return_tensors="pt",
    max_length=16_000*3,
    truncation=True)

In [49]:
esc = esc.cast_column("audio", Audio(sampling_rate=16_000))
# convert into format for padding
test_sample = esc[1:5]

In [51]:
test_samples = [x["array"] for x in test_sample["audio"]]

In [52]:
test_samples

[array([-0.04138798, -0.12544751, -0.04440723, ..., -0.03225712,
         0.05051001,  0.05712459]),
 array([-0.00718514, -0.00815905, -0.00541599, ..., -0.03492695,
         0.14444694,  0.14199632]),
 array([ 0.31204545,  0.23226222,  0.50212479, ...,  0.11517357,
        -0.05800427,  0.10114383]),
 array([-0.00044059, -0.00042982,  0.00094823, ...,  0.00283315,
         0.002939  ,  0.00285092])]

In [53]:
test_inputs = feature_extractor(
    test_samples,
    sampling_rate=16_000,
    padding=True,
    return_tensors="pt",
    max_length=16_000*3,
    truncation=True)

In [54]:
test_inputs

{'input_values': tensor([[-4.1388e-02, -1.2545e-01, -4.4407e-02,  ..., -4.8154e-02,
         -1.3149e-02,  2.1486e-02],
        [-7.1851e-03, -8.1590e-03, -5.4160e-03,  ...,  5.8837e-01,
         -4.7827e-02,  7.1412e-02],
        [ 3.1205e-01,  2.3226e-01,  5.0212e-01,  ...,  7.1962e-03,
         -3.8336e-01,  1.7940e-01],
        [-4.4059e-04, -4.2982e-04,  9.4823e-04,  ..., -6.4784e-03,
         -7.1633e-03, -6.5774e-03]])}

**Preprocess Before**

In [None]:
# from transformers import AutoFeatureExtractor

# feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

In [5]:
feature_extractor = CustomFeatureExtractor()

In [6]:
def preprocess_function(samples):
    audio_arrays = [x["array"] for x in samples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        padding=True,
        max_length=16_000*2,
        truncation=True,
        return_tensors="pt"
    )
    return inputs

In [7]:
encoded_esc = esc.map(
    preprocess_function,
    remove_columns=["audio", "filename", "fold", "category", "esc10","src_file", "take"],
    batched=True,
    batch_size=100
)
# add preprocessed flag for later convenience
encoded_esc.preprocessed = True

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

- mapping and caching as tensor: 

- map ignores formatting while writing a cache file
- needed: set_format("pt", columns=["input_values"], output_all_columns=True)
- with collator would be the best solution i guess


In [78]:
# strange: tensors are calculated but not saved for the dataset:
# 
encoded_esc[1]["input_values"][:10]

[-0.041387978941202164,
 -0.12544751167297363,
 -0.04440722614526749,
 0.03091111034154892,
 0.19001446664333344,
 0.03600512444972992,
 -0.14083096385002136,
 -0.003211977891623974,
 0.05811677128076553,
 0.1373983770608902]

In [83]:
encoded_esc.set_format(type="torch", columns=["input_values", "target"])

In [84]:
encoded_esc[0]

{'target': tensor(0), 'input_values': tensor([0., 0., 0.,  ..., 0., 0., 0.])}

In [80]:
encoded_esc.preprocessed = True

In [81]:
encoded_esc.preprocessed

True

In [85]:
from transformers import DataCollatorWithPadding

collator = DataCollatorWithPadding(
    tokenizer=CustomFeatureExtractor(),
    padding='longest',
    max_length=16_000*2,
    return_tensors='pt'
)

In [86]:
from torch.utils.data import DataLoader

dataloader = DataLoader(encoded_esc, batch_size=10, collate_fn=collator, shuffle=True)

In [90]:
for batch in dataloader:
    print(batch["input_values"].shape)
    break

torch.Size([10, 32000])


In [29]:
def custom_collator(batch):
    batch = batch[-1] #no list view
    wave = batch["audio"]["array"]
    sr = batch["audio"]["sampling_rate"]
    target = batch["target"]

    inputs = feature_extractor
    return batch 

In [30]:
for batch in dataloader:
    print(batch)
    break

[{'filename': '3-140199-A-8.wav', 'fold': 3, 'target': 8, 'category': 'sheep', 'esc10': False, 'src_file': 140199, 'take': 'A', 'audio': {'path': None, 'array': array([0.0214437 , 0.04072496, 0.03580027, ..., 0.00351946, 0.02853519,
       0.04049726]), 'sampling_rate': 16000}}, {'filename': '1-94231-B-32.wav', 'fold': 1, 'target': 32, 'category': 'keyboard_typing', 'esc10': False, 'src_file': 94231, 'take': 'B', 'audio': {'path': None, 'array': array([-5.05744356e-05,  3.53314390e-05,  3.47646710e-04, ...,
       -9.88819302e-05, -4.06061590e-04, -1.17571116e-03]), 'sampling_rate': 16000}}, {'filename': '5-103420-A-2.wav', 'fold': 5, 'target': 2, 'category': 'pig', 'esc10': False, 'src_file': 103420, 'take': 'A', 'audio': {'path': None, 'array': array([-0.06226345,  0.22140728,  0.34831095, ...,  0.        ,
        0.        ,  0.        ]), 'sampling_rate': 16000}}, {'filename': '4-182039-A-30.wav', 'fold': 4, 'target': 30, 'category': 'door_wood_knock', 'esc10': False, 'src_file': 

In [91]:
from dataclasses import dataclass
from typing import Any
from transformers import BatchFeature

@dataclass
class CustomCollatorWithPadding:

    feature_extractor: Any
    padding: bool = True
    truncation: bool = True
    max_length: int = None
    return_tensors: str = "pt"
    preprocessed: bool=False

    def __call__(self, batch):

        # preprocessed means that the .map function was applied
        # here, the feature extractor is only used for padding
        if self.preprocessed:
            batch = self.feature_extractor.pad(
                batch,
                padding=self.padding,
                max_length=self.max_length,
                truncation=self.truncation,
                return_tensors=self.return_tensors,
                return_attention_mask = None
            )

        # here, we first have to format the input and then pad it
        # note that everything regarding resampling is not implemented here    
        else:
            audio_arrays = [x["audio"]["array"] for x in batch]
            labels = [x["target"] for x in batch]

            # batch feature is just a dictionary
            encoded_inputs = BatchFeature({"input_values": audio_arrays})
            batch = {**encoded_inputs, "labels": labels}

            batch = self.feature_extractor.pad(
                batch,
                padding=self.padding,
                max_length=self.max_length,
                truncation=self.truncation,
                return_tensors=self.return_tensors,
                return_attention_mask = None
            )

        if "label" in batch: 
            batch["labels"] = batch["label"]
            del batch["label"]

        if "target" in batch:
            batch["labels"] = batch["target"]
            del batch["target"]

        
        return batch


In [98]:
esc.preprocessed = False

In [105]:
from torch.utils.data import DataLoader
dataset = esc
#dataset = encoded_esc

collator = CustomCollatorWithPadding(
    feature_extractor=feature_extractor,
    padding=True,
    max_length=16_000*1,
    truncation=True,
    preprocessed=datamodule.preprocessed
)

dataloader = DataLoader(dataset, batch_size=5, collate_fn=collator, shuffle=True)

In [106]:
for i in dataloader: 
    print(i)
    break

{'input_values': tensor([[-0.0059, -0.0043, -0.0032,  ..., -0.0016, -0.0023, -0.0029],
        [ 0.0031,  0.0032,  0.0030,  ...,  0.0045,  0.0046,  0.0046],
        [ 0.1048,  0.0818,  0.0605,  ...,  0.0041,  0.0035,  0.0033],
        [ 0.0238,  0.0264,  0.0277,  ..., -0.0342, -0.0351, -0.0328],
        [ 0.0193,  0.0099, -0.0108,  ...,  0.0835,  0.0750,  0.0831]]), 'labels': tensor([48, 31,  6, 27,  1])}


In [104]:
for i in dataloader: 
    print(i)
    break

{'input_values': tensor([[ 2.5940e-03,  7.0190e-04, -1.9531e-03,  ...,  2.7283e-02,
          4.2145e-02,  5.3131e-02],
        [ 3.0518e-05,  1.5259e-04,  9.1553e-05,  ...,  3.6621e-04,
          2.1362e-04,  1.2207e-04],
        [-9.7656e-04,  6.1035e-05,  1.0986e-03,  ...,  7.9651e-03,
          7.5989e-03,  6.8054e-03],
        [ 5.9143e-02,  5.5573e-02,  4.3732e-02,  ...,  1.0165e-01,
          9.5581e-02,  8.7677e-02],
        [ 1.1597e-02,  1.0620e-02,  1.1139e-02,  ..., -3.9337e-02,
         -4.0405e-02, -3.6377e-02]]), 'labels': tensor([27, 36, 14,  3,  5])}


**return_attention_mask**: Whether the model should make use of an attention_mask for batched inference. In general, models should always make use of the attention_mask to mask padded tokens.

 However, due to a very specific design choice of Wav2Vec2's "base" checkpoint, better results are achieved when using no attention_mask. This is not recommended for other speech models. 
 
 For more information, one can take a look at this issue. Important If you want to use this notebook to fine-tune large-lv60, this parameter should be set to True.

The first component of Wav2Vec2 consists of a stack of CNN layers that are used to extract acoustically meaningful - but contextually independent - 

features from the raw speech signal. This part of the model has already been sufficiently trained during pretrainind and as stated in the paper does not need to be fine-tuned anymore. 

Thus, we can set **the requires_grad to False for all parameters of the feature extraction part**.


In [1]:
#model.freeze_feature_extractor()