# Inizializzazione

In [None]:
!pip install transformers

In [9]:
!pip install datasets




[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
!pip install librosa




[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
!pip install soundfile




[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Import AST Pretrained and test

## Import dataset huggingface

In [None]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification
from datasets import load_dataset
import torch

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True)
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

## Import AST huggingface

In [123]:
# ast feature extractor
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [124]:
# ast pretrained
ast_huggingface = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
ast_huggingface

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTSdpaAttention(
            (attention): ASTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
       

## Test pretrained model

In [125]:
# audio file is decoded on the fly
inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

with torch.no_grad():
    logits = ast_huggingface(**inputs).logits

predicted_class_ids = torch.argmax(logits, dim=-1).item()
predicted_label = ast_huggingface.config.id2label[predicted_class_ids]
print(predicted_label)

# compute loss - target_label is e.g. "down"
target_label = ast_huggingface.config.id2label[0]
inputs["labels"] = torch.tensor([ast_huggingface.config.label2id[target_label]])
loss = ast_huggingface(**inputs).loss
round(loss.item(), 2)

Speech


0.17

# Prompt Tuning

## Retrieve Output size

In [126]:
from transformers import ASTModel
import torch

ast_model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
ast_model

ASTModel(
  (embeddings): ASTEmbeddings(
    (patch_embeddings): ASTPatchEmbeddings(
      (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ASTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ASTLayer(
        (attention): ASTSdpaAttention(
          (attention): ASTSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ASTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ASTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUAct

In [127]:
# audio file is decoded on the fly
inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
with torch.no_grad():
    outputs = ast_model(**inputs)

last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

[1, 1214, 768]

## Model and testing

In [107]:
from functools import reduce
from operator import mul
import math
import torch
import torch.nn as nn

class AST_PromptTuning(nn.Module):

    # dropout apply dropout after each prompt
    # str = "none" --> only head tuning
    def __init__(self, prompt_tokens: int = 5, prompt_dropout: float = 0.0, prompt_type: str = 'deep'):
        super().__init__()

        # load vit model
        self.encoder = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

        # hidden_size = depth of the model
        self.classifier = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, 384),
            # nn.Linear(self.encoder.config.hidden_size, 192),
            # nn.Linear(self.encoder.config.hidden_size, 96),
            nn.Linear(384, 15)
        )

        # freeze
        for n, p in self.encoder.named_parameters():
            p.requires_grad = False

        self.prompt_type = prompt_type # "shallow" "deep" or None

        if prompt_type is not None:

            # prompt
            self.prompt_tokens = prompt_tokens  # number of prompted tokens
            self.prompt_dropout = nn.Dropout(prompt_dropout)
            self.prompt_dim = self.encoder.config.hidden_size

            # initiate prompt (random)
            val = math.sqrt(6. / float(3 * reduce(mul, (self.encoder.config.patch_size, self.encoder.config.patch_size), 1) + self.prompt_dim))

            # my vector of learnable parameters (how many (prompt_tokens) and dimension (prompt_dim))
            self.prompt_embeddings = nn.Parameter(torch.zeros(1, self.prompt_tokens, self.prompt_dim))

            # xavier_uniform initialization
            nn.init.uniform_(self.prompt_embeddings.data, -val, val)

            if self.prompt_type == 'deep':
                self.total_d_layer = self.encoder.config.num_hidden_layers
                self.deep_prompt_embeddings = nn.Parameter(
                    # - 1 cause shallow already inserted
                    torch.zeros(self.total_d_layer-1, self.prompt_tokens, self.prompt_dim)
                )
                # xavier_uniform initialization
                nn.init.uniform_(self.deep_prompt_embeddings.data, -val, val)

    def train(self, mode=True):
        # set train status for this class: disable all but the prompt-related modules
        if mode:
            # training:
            self.encoder.eval()
            if self.prompt_type is not None:
              # enable dropout and batch normalization
                self.prompt_dropout.train()
        else:
            # eval:
            for module in self.children():
                module.train(mode)

    def incorporate_prompt(self, x, prompt_embeddings, n_prompt: int = 0):
        # x shape: (batch size, n_tokens, hidden_dim)
        # pompt_embeddings shape: (1, n_prompt, hidden_dim)
        B = x.shape[0]

        # peek the class token, add prompts, add sequence

        # concat prompts: (batch size, cls_token + n_prompt + n_patches, hidden_dim)
        x = torch.cat((
            x[:, :1, :],
            self.prompt_dropout(prompt_embeddings.expand(B, -1, -1)),
            x[:, (1+n_prompt):, :]
        ), dim=1)

        return x

    def forward_features(self, x):

        # go through the encoder embeddings
        x = self.encoder.embeddings(x)

        # add prompts
        x = self.incorporate_prompt(x, self.prompt_embeddings)

        if self.prompt_type == 'deep':
            # deep mode
            x = model.encoder.encoder.layer[0](x)[0]
            for i in range(1, self.total_d_layer):
                x = self.incorporate_prompt(x, self.deep_prompt_embeddings[i-1], self.prompt_tokens)
                x = model.encoder.encoder.layer[i](x)[0]
        else:
            # shallow mode
            x = self.encoder.encoder(x)["last_hidden_state"]

        x = self.encoder.layernorm(x)
        #print(x.shape)
        return x

    def forward(self, x):
        if self.prompt_type is not None:
            x = self.forward_features(x)[:, 0, :]
        else:
          # pass x, take the classification token
            x = self.encoder(x)["last_hidden_state"][:, 0, :]

        x = self.classifier(x)
        return x

In [108]:
ast_prompt = AST_PromptTuning(prompt_type=None)
# count number of parameters
print("AST params:", sum(p.numel() for p in ast_prompt.parameters()))
# count number of trainable parameters
print("Head fine-tuning:", sum(p.numel() for p in ast_prompt.parameters() if p.requires_grad))
ast_prompt_shallow = AST_PromptTuning(prompt_type='shallow')
# count number of trainable parameters
print("Shallow prompt-tuning:", sum(p.numel() for p in ast_prompt_shallow.parameters() if p.requires_grad))
ast_prompt_deep = AST_PromptTuning(prompt_type='deep')
# count number of trainable parameters
print("Deep prompt-tuning:", sum(p.numel() for p in ast_prompt_deep.parameters() if p.requires_grad))

AST params: 86488335
Head fine-tuning: 301071
Shallow prompt-tuning: 304911
Deep prompt-tuning: 347151


In [129]:
# audio file is decoded on the fly
inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

with torch.no_grad():
    outputs = ast_prompt(inputs['input_values'])

predicted_class_ids = torch.argmax(outputs, dim=-1).item()
predicted_class_ids

14

In [170]:
import torch.nn.functional as F

softmax = F.softmax(outputs, dim=1)
softmax

tensor([[0.0868, 0.0701, 0.0472, 0.0732, 0.0544, 0.0472, 0.0497, 0.0480, 0.0695,
         0.0493, 0.0593, 0.0988, 0.1038, 0.0229, 0.1197]])

# Implementation

## Utilities

In [304]:
import os
import librosa

def load_audio(audio_path):
    audio, sample_rate = librosa.load(audio_path, sr=16000)

    return audio, sample_rate

## TUT17 Dataset

In [319]:
from torch.utils.data import Dataset
import random

# from folder to PyTorch Dataset
class TUT17(Dataset):
    def __init__(self, root_dir, split = 'train', seed = 42, val_frac= 0.1, test_frac= 0.1):
        super().__init__()

        # we use seed because every time we instantiate the dataset we shuffle all the data
        # we call at least 3 times (train, validation, test) --> overlapping area
        # with seed we are sure that the dataset is shuffled always in the same way
        random.seed(seed)
        self.root_dir = root_dir
        
        audio_names = os.listdir(os.path.join(root_dir, 'Audio'))
        
        num_val = int(len(audio_names)*val_frac)
        num_test = int(len(audio_names)*test_frac)
        num_train = len(audio_names) - num_val - num_test

        random.shuffle(audio_names)
    
        # at this step we are only using images names - we are not using images
        if split == 'train':
            self.data = audio_names[:num_train]
        elif split == 'val':
            self.data = audio_names[num_train:num_train+num_val]
        elif split == 'test':
            self.data = audio_names[-num_test:]
        else:
          raise ValueError('Invalid split value.')
    
    # optional
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        audio_path = os.path.join(self.root_dir, 'audio', self.data[idx])

        audio_name = audio_path.split('/')[-1][6:].replace('\\', '/')

        label_path = os.path.join(self.root_dir, 'labels\evaluate.txt')

        with open(label_path, "r") as f:
            while line := f.readline():
                if line.split('\t')[0] == audio_name:
                    f.close()
                    audio, sample_rate = load_audio(audio_path)
                    return {'audio': audio, 'sample_rate': sample_rate, 'label': line.split('\t')[1][:-1]}

In [321]:
train_dataset = TUT17(root_dir = "c:/Users/cerru/Desktop/TUT17", split='train')
val_dataset = TUT17(root_dir = 'c:/Users/cerru/Desktop/TUT17', split='val')
test_dataset = TUT17(root_dir = 'c:/Users/cerru/Desktop/TUT17', split='test')

In [344]:
from torch.utils.data import DataLoader

# Define loaders
train_loader = DataLoader(train_dataset, batch_size=4, num_workers=0, shuffle=True, drop_last=True)
val_loader   = DataLoader(val_dataset,   batch_size=4, num_workers=0, shuffle=False, drop_last=True)
test_loader  = DataLoader(test_dataset,  batch_size=4, num_workers=0, shuffle=False, drop_last=True)

batch = next(iter(train_loader))

feature_extractor(batch["audio"], sampling_rate=batch["sample_rate"])

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

## Train

In [342]:
from tqdm import tqdm

def train(model, feature_extractor, criterion, epochs, dev, lr=0.001, load_checkpoint = False, save_every = 10, save_path = 'weights'):
    try:
        if not os.path.isdir(save_path):
            os.mkdir(save_path)
            
        # Move model to CUDA
        model = model.to(dev)
        # MOVE criterior to CUDA
        criterion = criterion.to(dev)

        # create optimizer
        optimizer = optim.Adam(model.parameters(), lr = lr)

        
        labels_l = []
        predictions_l = []
        
        # load checkpoints
        if load_checkpoint:
            if os.path.isfile(os.path.join(save_path,'weights.pt')):
                print('Loading weights...')
                # it is possible to load a state dict that doesn't match the networck architecture by passing asserting the strict mode
                model.load_state_dict(torch.load(os.path.join(save_path,'weights.pt')))
            if os.path.isfile(os.path.join(save_path,'optim.pt')):
                print('Loading optimizer...')
                optimizer.load_state_dict(torch.load(os.path.join(save_path,'optim.pt')))
            print('Loading completed!')

        # Initialize history
        history_loss = {"train": [], "val": [], "test": []}
        history_accuracy = {"train": [], "val": [], "test": []}

        # Process each epoch
        for epoch in range(epochs):
            # Initialize epoch variables
            sum_loss = {"train": 0, "val": 0, "test": 0}
            sum_accuracy = {"train": 0, "val": 0, "test": 0}
            
            # Process each split
            for split in ["train", "val", "test"]:
                #Select train() or eval() mode
                if split == 'train':
                  model.train()
                else:
                  model.eval()
                    
                # Process each batch
                for batch in loaders[split]:
                    # Move to CUDA
                    input_audio = batch['audio'].to(dev)
                    sample_rate = batch['sample_rate'].to(dev)
                    target = batch['label'].squeeze(1).to(dev)

                    # Reset gradients
                    if split == 'train':
                        optimizer.zero_grad()
                    
                    # Compute output
                    ast_imput = feature_extractor(input_audio, sampling_rate=sample_rate, return_tensors="pt")
                    output = model(ast_input)

                    # Compute loss 
                    loss = criterion(output, target.long())
                    
                    # Update loss
                    sum_loss[split] += loss.item()
                    
                    # Check parameter update
                    if split == "train":
                        # Compute gradients
                        loss.backward()
                        # Optimize
                        optimizer.step()
                        
                    # Compute accuracy
                    pred = torch.argmax(output,1)
                    batch_accuracy = (pred == target).sum().item()/target.numel()
                    
                    # Update accuracy
                    sum_accuracy[split] += batch_accuracy

                # checkpoint
                if epoch%save_every == 0 and split == 'train':
                    torch.save(model.state_dict(), os.path.join(save_path, 'weights.pt'))
                    torch.save(optimizer.state_dict(), os.path.join(save_path, 'optim.pt'))
                
                
            # Compute epoch loss/accuracy
            epoch_loss = {split: sum_loss[split]/len(loaders[split]) for split in ["train", "val", "test"]}
            epoch_accuracy = {split: sum_accuracy[split]/len(loaders[split]) for split in ["train", "val", "test"]}
            # Update history
            for split in ["train", "val", "test"]:
                history_loss[split].append(epoch_loss[split])
                history_accuracy[split].append(epoch_accuracy[split])
            # Print info
            print(f"Epoch {epoch+1}:",
                  f"TrL={epoch_loss['train']:.4f},",
                  f"TrA={epoch_accuracy['train']:.4f},",
                  f"VL={epoch_loss['val']:.4f},",
                  f"VA={epoch_accuracy['val']:.4f},",
                  f"TeL={epoch_loss['test']:.4f},",
                  f"TeA={epoch_accuracy['test']:.4f},")
    except KeyboardInterrupt:
        print("Interrupted")
    finally:
        # Plot loss
        plt.title("Loss")
        for split in ["train", "val", "test"]:
            plt.plot(history_loss[split], label=split)
        plt.legend()
        plt.show()
        # Plot accuracy
        plt.title("Accuracy")
        for split in ["train", "val", "test"]:
            plt.plot(history_accuracy[split], label=split)
        plt.legend()
        plt.show()

def train_one_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    model.to(device)
    criterion.to(device)
    running_loss = 0.0
    for batch in tqdm(train_loader):
        audio = batch["audio"]
        sample_rate = batch["sample_rate"]
        labels = batch["label"]

        audio, sample_rate = audio.to(device), labels.to(device)
        #optimizer.zero_grad()
        #outputs = model(inputs)
        #loss = criterion(outputs, labels)
        #loss.backward()
        #optimizer.step()
        #running_loss += loss.item()
    #return running_loss / len(train_loader)

In [338]:
def validate(model, val_loader, criterion, device):
    model.eval()
    model.to(device)
    criterion.to(device)
    running_loss = 0.0
    for (inputs, labels) in tqdm(val_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        running_loss += loss.item()
    return running_loss / len(val_loader)

In [339]:
def train(model, train_loader, val_loader, criterion, optimizer, device, n_epochs: int = 10):
    for epoch in tqdm(range(n_epochs)):
        #train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
        train_one_epoch(model, train_loader, criterion, optimizer, device)
        #val_loss = validate(model, val_loader, criterion, device)
        #print(f'Epoch {epoch+1}/{n_epochs} : Train Loss {train_loss:.4f} : Val Loss {val_loss:.4f}')

In [340]:
def test(model, test_loader, criterion, device):
    model.eval()
    model.to(device)
    criterion.to(device)
    running_loss = 0.0
    labels_l = []
    predictions_l = []
    for (inputs, labels) in tqdm(test_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predictions = torch.argmax(outputs, dim=1)
        # compute accuracy
        labels_l.append(labels)
        predictions_l.append(predictions)

    labels = torch.cat(labels_l, dim=0)
    predictions = torch.cat(predictions_l, dim=0)

    accuracy = (predictions == labels).sum().item() / len(labels)
    return accuracy

In [336]:
ast_prompt = AST_PromptTuning(prompt_type=None)
ast_prompt_shallow = AST_PromptTuning(prompt_type='shallow')
ast_prompt_deep = AST_PromptTuning(prompt_type='deep')

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_one_epoch(model, train_loader, criterion, optimizer, torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

  1%|▌                                                                                 | 2/324 [00:00<00:24, 13.04it/s]

{'audio': tensor([[-1.8764e-04, -2.4411e-04, -1.8040e-04,  ...,  2.6641e-03,
          3.0369e-03,  0.0000e+00],
        [ 6.9736e-04,  1.5239e-03,  1.7926e-03,  ..., -3.9335e-03,
         -1.0351e-02,  0.0000e+00],
        [-8.8113e-05, -1.1776e-04, -8.4949e-05,  ..., -1.3294e-04,
         -1.4187e-04,  0.0000e+00],
        [ 4.3056e-03,  6.6272e-03,  5.0238e-03,  ..., -8.2879e-03,
         -1.1731e-02,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['forest_path', 'city_center', 'office', 'cafe/restaurant']}
tensor([[-1.8764e-04, -2.4411e-04, -1.8040e-04,  ...,  2.6641e-03,
          3.0369e-03,  0.0000e+00],
        [ 6.9736e-04,  1.5239e-03,  1.7926e-03,  ..., -3.9335e-03,
         -1.0351e-02,  0.0000e+00],
        [-8.8113e-05, -1.1776e-04, -8.4949e-05,  ..., -1.3294e-04,
         -1.4187e-04,  0.0000e+00],
        [ 4.3056e-03,  6.6272e-03,  5.0238e-03,  ..., -8.2879e-03,
         -1.1731e-02,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['f

  2%|█▌                                                                                | 6/324 [00:00<00:26, 12.09it/s]

{'audio': tensor([[-0.0042, -0.0069, -0.0059,  ...,  0.0093,  0.0119,  0.0000],
        [-0.0019, -0.0041, -0.0045,  ..., -0.0045, -0.0055,  0.0000],
        [-0.0052, -0.0086, -0.0074,  ...,  0.0037,  0.0040,  0.0000],
        [-0.0005, -0.0008, -0.0006,  ..., -0.0010, -0.0013,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['grocery_store', 'park', 'car', 'forest_path']}
tensor([[-0.0042, -0.0069, -0.0059,  ...,  0.0093,  0.0119,  0.0000],
        [-0.0019, -0.0041, -0.0045,  ..., -0.0045, -0.0055,  0.0000],
        [-0.0052, -0.0086, -0.0074,  ...,  0.0037,  0.0040,  0.0000],
        [-0.0005, -0.0008, -0.0006,  ..., -0.0010, -0.0013,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['grocery_store', 'park', 'car', 'forest_path']
{'audio': tensor([[-0.0063, -0.0105, -0.0087,  ...,  0.0158,  0.0210,  0.0000],
        [ 0.0009,  0.0019,  0.0020,  ..., -0.0010, -0.0015,  0.0000],
        [-0.0020, -0.0037, -0.0038,  ...,  0.0060,  0.0054,  0.0000],
        [ 0

  2%|██                                                                                | 8/324 [00:00<00:26, 11.82it/s]

{'audio': tensor([[-1.3624e-04,  1.1135e-03,  3.9968e-03,  ..., -1.5293e-02,
         -1.7948e-02,  0.0000e+00],
        [-1.9540e-02, -3.2813e-02, -2.8338e-02,  ..., -2.8979e-02,
         -3.3219e-02,  0.0000e+00],
        [-9.0841e-05, -3.1393e-04, -5.0443e-04,  ..., -4.9155e-04,
          2.3185e-04,  0.0000e+00],
        [ 8.5528e-05,  1.0414e-04, -4.4602e-05,  ..., -2.6228e-04,
         -2.7247e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['train', 'bus', 'residential_area', 'office']}
tensor([[-1.3624e-04,  1.1135e-03,  3.9968e-03,  ..., -1.5293e-02,
         -1.7948e-02,  0.0000e+00],
        [-1.9540e-02, -3.2813e-02, -2.8338e-02,  ..., -2.8979e-02,
         -3.3219e-02,  0.0000e+00],
        [-9.0841e-05, -3.1393e-04, -5.0443e-04,  ..., -4.9155e-04,
          2.3185e-04,  0.0000e+00],
        [ 8.5528e-05,  1.0414e-04, -4.4602e-05,  ..., -2.6228e-04,
         -2.7247e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['train', 'bus',

  4%|███                                                                              | 12/324 [00:00<00:26, 11.94it/s]

{'audio': tensor([[-1.3511e-02, -2.1407e-02, -2.1955e-02,  ..., -2.2219e-02,
         -2.1327e-02,  0.0000e+00],
        [-3.4188e-05,  2.1089e-04,  1.5366e-04,  ..., -1.8785e-03,
         -1.7878e-03,  0.0000e+00],
        [ 2.7353e-04,  1.0721e-03, -7.0572e-05,  ..., -1.6345e-03,
         -1.8741e-03,  0.0000e+00],
        [-1.4287e-02, -2.1274e-02, -1.1655e-02,  ...,  3.1093e-02,
          3.2549e-02,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['city_center', 'metro_station', 'library', 'city_center']}
tensor([[-1.3511e-02, -2.1407e-02, -2.1955e-02,  ..., -2.2219e-02,
         -2.1327e-02,  0.0000e+00],
        [-3.4188e-05,  2.1089e-04,  1.5366e-04,  ..., -1.8785e-03,
         -1.7878e-03,  0.0000e+00],
        [ 2.7353e-04,  1.0721e-03, -7.0572e-05,  ..., -1.6345e-03,
         -1.8741e-03,  0.0000e+00],
        [-1.4287e-02, -2.1274e-02, -1.1655e-02,  ...,  3.1093e-02,
          3.2549e-02,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['ci

  4%|███▌                                                                             | 14/324 [00:01<00:26, 11.54it/s]

{'audio': tensor([[-5.5635e-03, -9.1519e-03, -7.5676e-03,  ..., -4.7393e-02,
         -5.5831e-02,  0.0000e+00],
        [ 4.2677e-03,  7.1920e-03,  6.4576e-03,  ..., -3.1504e-02,
         -3.6569e-02,  0.0000e+00],
        [ 2.5836e-05, -3.1287e-05, -1.1036e-04,  ..., -6.6740e-05,
          1.4680e-06,  0.0000e+00],
        [-2.1464e-03, -7.2466e-03, -8.9443e-03,  ..., -2.1542e-02,
         -2.8332e-02,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['tram', 'car', 'library', 'city_center']}
tensor([[-5.5635e-03, -9.1519e-03, -7.5676e-03,  ..., -4.7393e-02,
         -5.5831e-02,  0.0000e+00],
        [ 4.2677e-03,  7.1920e-03,  6.4576e-03,  ..., -3.1504e-02,
         -3.6569e-02,  0.0000e+00],
        [ 2.5836e-05, -3.1287e-05, -1.1036e-04,  ..., -6.6740e-05,
          1.4680e-06,  0.0000e+00],
        [-2.1464e-03, -7.2466e-03, -8.9443e-03,  ..., -2.1542e-02,
         -2.8332e-02,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['tram', 'car', 'libr

  5%|████                                                                             | 16/324 [00:01<00:26, 11.59it/s]

{'audio': tensor([[-1.9793e-04, -3.2484e-04, -2.1027e-04,  ..., -3.1048e-05,
         -1.3128e-04,  0.0000e+00],
        [ 8.0956e-03,  1.3977e-02,  1.2827e-02,  ...,  3.5226e-03,
          2.4298e-03,  0.0000e+00],
        [ 4.4128e-03,  7.7163e-03,  7.1604e-03,  ..., -3.4944e-02,
         -4.0328e-02,  0.0000e+00],
        [ 7.5541e-03,  1.2844e-02,  1.1502e-02,  ...,  3.4933e-04,
          2.8949e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['forest_path', 'bus', 'tram', 'residential_area']}
tensor([[-1.9793e-04, -3.2484e-04, -2.1027e-04,  ..., -3.1048e-05,
         -1.3128e-04,  0.0000e+00],
        [ 8.0956e-03,  1.3977e-02,  1.2827e-02,  ...,  3.5226e-03,
          2.4298e-03,  0.0000e+00],
        [ 4.4128e-03,  7.7163e-03,  7.1604e-03,  ..., -3.4944e-02,
         -4.0328e-02,  0.0000e+00],
        [ 7.5541e-03,  1.2844e-02,  1.1502e-02,  ...,  3.4933e-04,
          2.8949e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['forest_pat

  6%|█████                                                                            | 20/324 [00:01<00:24, 12.39it/s]

{'audio': tensor([[ 0.0018,  0.0032,  0.0029,  ..., -0.0019, -0.0021,  0.0000],
        [-0.0002, -0.0003, -0.0001,  ..., -0.0001, -0.0002,  0.0000],
        [ 0.0229,  0.0383,  0.0332,  ..., -0.0084, -0.0099,  0.0000],
        [ 0.0007,  0.0024,  0.0015,  ...,  0.0029,  0.0003,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['cafe/restaurant', 'office', 'bus', 'home']}
tensor([[ 0.0018,  0.0032,  0.0029,  ..., -0.0019, -0.0021,  0.0000],
        [-0.0002, -0.0003, -0.0001,  ..., -0.0001, -0.0002,  0.0000],
        [ 0.0229,  0.0383,  0.0332,  ..., -0.0084, -0.0099,  0.0000],
        [ 0.0007,  0.0024,  0.0015,  ...,  0.0029,  0.0003,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['cafe/restaurant', 'office', 'bus', 'home']
{'audio': tensor([[ 7.1387e-04,  1.2182e-03,  1.0898e-03,  ...,  3.8495e-03,
          5.1014e-03,  0.0000e+00],
        [-4.0032e-05,  8.6922e-05,  2.3711e-04,  ...,  1.9334e-04,
          4.1771e-04,  0.0000e+00],
        [ 1.9622e-04,

  7%|█████▌                                                                           | 22/324 [00:01<00:23, 12.84it/s]

{'audio': tensor([[ 1.0456e-04,  5.6972e-04,  7.4362e-04,  ..., -1.9797e-05,
         -3.4515e-05,  0.0000e+00],
        [-6.3523e-04, -9.9755e-04, -7.4254e-04,  ...,  1.0414e-03,
          8.5526e-04,  0.0000e+00],
        [-4.5543e-03, -8.4557e-03, -8.4365e-03,  ..., -2.7297e-03,
         -2.6237e-03,  0.0000e+00],
        [-2.0339e-04, -3.3966e-04, -3.6760e-04,  ..., -3.3573e-05,
         -2.3736e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['cafe/restaurant', 'car', 'train', 'office']}
tensor([[ 1.0456e-04,  5.6972e-04,  7.4362e-04,  ..., -1.9797e-05,
         -3.4515e-05,  0.0000e+00],
        [-6.3523e-04, -9.9755e-04, -7.4254e-04,  ...,  1.0414e-03,
          8.5526e-04,  0.0000e+00],
        [-4.5543e-03, -8.4557e-03, -8.4365e-03,  ..., -2.7297e-03,
         -2.6237e-03,  0.0000e+00],
        [-2.0339e-04, -3.3966e-04, -3.6760e-04,  ..., -3.3573e-05,
         -2.3736e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['cafe/restaurant

  8%|██████▌                                                                          | 26/324 [00:02<00:23, 12.74it/s]

{'audio': tensor([[-6.7202e-04, -1.7282e-03, -1.1863e-03,  ..., -6.9840e-04,
         -9.3095e-04,  0.0000e+00],
        [ 3.0755e-03,  6.1614e-03,  2.9063e-03,  ..., -9.3027e-04,
         -1.0930e-03,  0.0000e+00],
        [ 1.2248e-05, -2.8301e-05,  7.8967e-05,  ...,  4.5245e-04,
          5.4226e-04,  0.0000e+00],
        [ 4.6979e-05,  8.7899e-05,  3.1958e-05,  ..., -1.3103e-04,
         -2.1819e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['office', 'beach', 'library', 'office']}
tensor([[-6.7202e-04, -1.7282e-03, -1.1863e-03,  ..., -6.9840e-04,
         -9.3095e-04,  0.0000e+00],
        [ 3.0755e-03,  6.1614e-03,  2.9063e-03,  ..., -9.3027e-04,
         -1.0930e-03,  0.0000e+00],
        [ 1.2248e-05, -2.8301e-05,  7.8967e-05,  ...,  4.5245e-04,
          5.4226e-04,  0.0000e+00],
        [ 4.6979e-05,  8.7899e-05,  3.1958e-05,  ..., -1.3103e-04,
         -2.1819e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['office', 'beach', 'l

  9%|███████                                                                          | 28/324 [00:02<00:23, 12.84it/s]

{'audio': tensor([[ 0.0118,  0.0113, -0.0035,  ...,  0.0194,  0.0388,  0.0000],
        [-0.0006, -0.0004, -0.0002,  ..., -0.0008, -0.0004,  0.0000],
        [ 0.0001,  0.0002,  0.0002,  ...,  0.0001,  0.0001,  0.0000],
        [-0.0007, -0.0011, -0.0007,  ..., -0.0004, -0.0004,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['city_center', 'residential_area', 'office', 'forest_path']}
tensor([[ 0.0118,  0.0113, -0.0035,  ...,  0.0194,  0.0388,  0.0000],
        [-0.0006, -0.0004, -0.0002,  ..., -0.0008, -0.0004,  0.0000],
        [ 0.0001,  0.0002,  0.0002,  ...,  0.0001,  0.0001,  0.0000],
        [-0.0007, -0.0011, -0.0007,  ..., -0.0004, -0.0004,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['city_center', 'residential_area', 'office', 'forest_path']
{'audio': tensor([[-0.0162, -0.0260, -0.0251,  ..., -0.0015, -0.0023,  0.0000],
        [ 0.0091,  0.0144,  0.0134,  ..., -0.0063, -0.0073,  0.0000],
        [ 0.0003,  0.0004,  0.0003,  ...,  0.0010,  0.0

 10%|████████                                                                         | 32/324 [00:02<00:22, 12.92it/s]

{'audio': tensor([[ 0.0013,  0.0016, -0.0008,  ...,  0.0348,  0.0402,  0.0000],
        [ 0.0062,  0.0107,  0.0091,  ..., -0.0186, -0.0192,  0.0000],
        [ 0.0262,  0.0442,  0.0321,  ..., -0.0281, -0.0348,  0.0000],
        [ 0.0018,  0.0032,  0.0030,  ...,  0.0008,  0.0012,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['bus', 'residential_area', 'city_center', 'home']}
tensor([[ 0.0013,  0.0016, -0.0008,  ...,  0.0348,  0.0402,  0.0000],
        [ 0.0062,  0.0107,  0.0091,  ..., -0.0186, -0.0192,  0.0000],
        [ 0.0262,  0.0442,  0.0321,  ..., -0.0281, -0.0348,  0.0000],
        [ 0.0018,  0.0032,  0.0030,  ...,  0.0008,  0.0012,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['bus', 'residential_area', 'city_center', 'home']
{'audio': tensor([[-2.1605e-05,  4.0666e-05,  1.4191e-04,  ...,  4.4170e-04,
          5.8762e-04,  0.0000e+00],
        [ 1.6701e-03,  5.3799e-03,  7.4097e-03,  ...,  8.2139e-03,
          8.0700e-03,  0.0000e+00],
        [

 10%|████████▌                                                                        | 34/324 [00:02<00:22, 13.00it/s]

{'audio': tensor([[ 3.0008e-05,  8.3516e-05,  1.3430e-04,  ...,  3.7243e-04,
          7.8138e-05,  0.0000e+00],
        [ 1.0882e-06,  6.6537e-05,  3.7023e-05,  ..., -1.4316e-04,
         -1.6072e-04,  0.0000e+00],
        [-7.2679e-04, -3.1629e-03, -3.5755e-03,  ...,  1.7255e-03,
          2.3706e-03,  0.0000e+00],
        [ 1.1911e-04, -1.1953e-03,  8.9990e-04,  ...,  9.1994e-04,
         -3.7350e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['office', 'office', 'grocery_store', 'home']}
tensor([[ 3.0008e-05,  8.3516e-05,  1.3430e-04,  ...,  3.7243e-04,
          7.8138e-05,  0.0000e+00],
        [ 1.0882e-06,  6.6537e-05,  3.7023e-05,  ..., -1.4316e-04,
         -1.6072e-04,  0.0000e+00],
        [-7.2679e-04, -3.1629e-03, -3.5755e-03,  ...,  1.7255e-03,
          2.3706e-03,  0.0000e+00],
        [ 1.1911e-04, -1.1953e-03,  8.9990e-04,  ...,  9.1994e-04,
         -3.7350e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['office', 'offic

 11%|█████████                                                                        | 36/324 [00:02<00:22, 12.77it/s]

{'audio': tensor([[-0.0001, -0.0002, -0.0002,  ...,  0.0004,  0.0006,  0.0000],
        [-0.0004, -0.0008, -0.0006,  ..., -0.0100, -0.0086,  0.0000],
        [ 0.0018,  0.0019,  0.0011,  ...,  0.0186,  0.0205,  0.0000],
        [-0.0005, -0.0008, -0.0006,  ...,  0.0013,  0.0005,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['library', 'office', 'tram', 'forest_path']}
tensor([[-0.0001, -0.0002, -0.0002,  ...,  0.0004,  0.0006,  0.0000],
        [-0.0004, -0.0008, -0.0006,  ..., -0.0100, -0.0086,  0.0000],
        [ 0.0018,  0.0019,  0.0011,  ...,  0.0186,  0.0205,  0.0000],
        [-0.0005, -0.0008, -0.0006,  ...,  0.0013,  0.0005,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['library', 'office', 'tram', 'forest_path']
{'audio': tensor([[-1.0588e-03,  3.6582e-04,  2.4836e-03,  ..., -1.2177e-03,
         -1.6354e-03,  0.0000e+00],
        [-9.4595e-04, -1.6663e-03, -2.9765e-04,  ..., -1.2849e-02,
         -1.8617e-02,  0.0000e+00],
        [-6.3233e-04,

 12%|██████████                                                                       | 40/324 [00:03<00:23, 11.89it/s]

{'audio': tensor([[-8.4171e-03, -5.9735e-04,  1.7398e-02,  ..., -1.0187e-02,
         -1.1438e-02,  0.0000e+00],
        [-1.5755e-02, -2.5616e-02, -2.0963e-02,  ..., -3.9192e-02,
         -4.5500e-02,  0.0000e+00],
        [-6.3642e-05, -5.0940e-05, -6.2148e-03,  ..., -2.7739e-03,
         -5.8336e-03,  0.0000e+00],
        [-7.2909e-03, -1.4166e-02, -1.5539e-02,  ...,  1.3057e-02,
          1.3359e-02,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['city_center', 'train', 'home', 'tram']}
tensor([[-8.4171e-03, -5.9735e-04,  1.7398e-02,  ..., -1.0187e-02,
         -1.1438e-02,  0.0000e+00],
        [-1.5755e-02, -2.5616e-02, -2.0963e-02,  ..., -3.9192e-02,
         -4.5500e-02,  0.0000e+00],
        [-6.3642e-05, -5.0940e-05, -6.2148e-03,  ..., -2.7739e-03,
         -5.8336e-03,  0.0000e+00],
        [-7.2909e-03, -1.4166e-02, -1.5539e-02,  ...,  1.3057e-02,
          1.3359e-02,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['city_center', 'train

 13%|██████████▌                                                                      | 42/324 [00:03<00:22, 12.45it/s]

{'audio': tensor([[-0.0002, -0.0001,  0.0003,  ..., -0.0001, -0.0006,  0.0000],
        [-0.0010, -0.0016, -0.0011,  ..., -0.0008, -0.0009,  0.0000],
        [ 0.0538,  0.0843,  0.0653,  ...,  0.0496,  0.0524,  0.0000],
        [-0.0004, -0.0006, -0.0007,  ...,  0.0007,  0.0018,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['residential_area', 'forest_path', 'tram', 'park']}
tensor([[-0.0002, -0.0001,  0.0003,  ..., -0.0001, -0.0006,  0.0000],
        [-0.0010, -0.0016, -0.0011,  ..., -0.0008, -0.0009,  0.0000],
        [ 0.0538,  0.0843,  0.0653,  ...,  0.0496,  0.0524,  0.0000],
        [-0.0004, -0.0006, -0.0007,  ...,  0.0007,  0.0018,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['residential_area', 'forest_path', 'tram', 'park']
{'audio': tensor([[ 3.0741e-04,  6.5529e-04,  6.9601e-04,  ..., -1.4189e-03,
         -1.5949e-03,  0.0000e+00],
        [ 2.9158e-03,  5.0830e-03,  3.4957e-03,  ..., -4.2555e-04,
          9.2919e-04,  0.0000e+00],
       

 14%|███████████▌                                                                     | 46/324 [00:03<00:21, 13.15it/s]

{'audio': tensor([[-0.0016, -0.0026, -0.0014,  ..., -0.0015, -0.0041,  0.0000],
        [-0.0270, -0.0441, -0.0373,  ...,  0.0088,  0.0110,  0.0000],
        [-0.0045, -0.0072, -0.0040,  ..., -0.0078, -0.0111,  0.0000],
        [-0.0010, -0.0016, -0.0011,  ...,  0.0022,  0.0025,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['park', 'train', 'cafe/restaurant', 'grocery_store']}
tensor([[-0.0016, -0.0026, -0.0014,  ..., -0.0015, -0.0041,  0.0000],
        [-0.0270, -0.0441, -0.0373,  ...,  0.0088,  0.0110,  0.0000],
        [-0.0045, -0.0072, -0.0040,  ..., -0.0078, -0.0111,  0.0000],
        [-0.0010, -0.0016, -0.0011,  ...,  0.0022,  0.0025,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['park', 'train', 'cafe/restaurant', 'grocery_store']
{'audio': tensor([[-0.0030, -0.0051, -0.0046,  ..., -0.0003, -0.0018,  0.0000],
        [ 0.0001,  0.0003, -0.0005,  ..., -0.0012,  0.0008,  0.0000],
        [ 0.0015,  0.0015,  0.0013,  ...,  0.0065,  0.0067,  0.0000],

 15%|████████████                                                                     | 48/324 [00:03<00:20, 13.32it/s]

{'audio': tensor([[ 0.0077,  0.0113,  0.0084,  ...,  0.0029,  0.0023,  0.0000],
        [-0.0024, -0.0028, -0.0027,  ..., -0.0011, -0.0004,  0.0000],
        [-0.0037, -0.0071, -0.0082,  ...,  0.0035,  0.0038,  0.0000],
        [-0.0007, -0.0021, -0.0008,  ..., -0.0019, -0.0018,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['tram', 'car', 'metro_station', 'residential_area']}
tensor([[ 0.0077,  0.0113,  0.0084,  ...,  0.0029,  0.0023,  0.0000],
        [-0.0024, -0.0028, -0.0027,  ..., -0.0011, -0.0004,  0.0000],
        [-0.0037, -0.0071, -0.0082,  ...,  0.0035,  0.0038,  0.0000],
        [-0.0007, -0.0021, -0.0008,  ..., -0.0019, -0.0018,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['tram', 'car', 'metro_station', 'residential_area']
{'audio': tensor([[ 6.1457e-03,  1.0320e-02,  9.1478e-03,  ...,  1.8629e-02,
          2.1425e-02,  0.0000e+00],
        [-5.5775e-03, -9.2519e-03, -8.8833e-03,  ...,  3.5871e-03,
          1.2101e-03,  0.0000e+00],
     

 16%|█████████████                                                                    | 52/324 [00:04<00:20, 13.01it/s]

{'audio': tensor([[-0.0430, -0.0712, -0.0617,  ..., -0.0182, -0.0152,  0.0000],
        [-0.0059, -0.0097, -0.0085,  ..., -0.0010, -0.0008,  0.0000],
        [-0.0001,  0.0012,  0.0010,  ...,  0.0005, -0.0122,  0.0000],
        [-0.0198, -0.0322, -0.0280,  ...,  0.0259,  0.0303,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['bus', 'train', 'cafe/restaurant', 'tram']}
tensor([[-0.0430, -0.0712, -0.0617,  ..., -0.0182, -0.0152,  0.0000],
        [-0.0059, -0.0097, -0.0085,  ..., -0.0010, -0.0008,  0.0000],
        [-0.0001,  0.0012,  0.0010,  ...,  0.0005, -0.0122,  0.0000],
        [-0.0198, -0.0322, -0.0280,  ...,  0.0259,  0.0303,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['bus', 'train', 'cafe/restaurant', 'tram']
{'audio': tensor([[-6.5961e-05,  8.7274e-05,  3.6109e-04,  ...,  4.0368e-04,
          4.3289e-04,  0.0000e+00],
        [ 6.5132e-04,  8.8496e-04,  4.4306e-04,  ...,  3.3260e-03,
          3.9672e-03,  0.0000e+00],
        [-1.2042e-04, -

 17%|█████████████▌                                                                   | 54/324 [00:04<00:20, 13.24it/s]

{'audio': tensor([[-0.0016, -0.0037, -0.0038,  ..., -0.0037, -0.0032,  0.0000],
        [-0.0014, -0.0022, -0.0018,  ...,  0.0028,  0.0045,  0.0000],
        [ 0.0029,  0.0040,  0.0016,  ..., -0.0018, -0.0028,  0.0000],
        [ 0.0013,  0.0018,  0.0010,  ..., -0.0004,  0.0005,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['beach', 'park', 'grocery_store', 'cafe/restaurant']}
tensor([[-0.0016, -0.0037, -0.0038,  ..., -0.0037, -0.0032,  0.0000],
        [-0.0014, -0.0022, -0.0018,  ...,  0.0028,  0.0045,  0.0000],
        [ 0.0029,  0.0040,  0.0016,  ..., -0.0018, -0.0028,  0.0000],
        [ 0.0013,  0.0018,  0.0010,  ..., -0.0004,  0.0005,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['beach', 'park', 'grocery_store', 'cafe/restaurant']
{'audio': tensor([[ 3.4004e-04,  8.5960e-05, -2.7312e-04,  ..., -5.0116e-04,
         -3.6932e-04,  0.0000e+00],
        [ 1.2410e-03,  1.2137e-03,  2.0229e-03,  ..., -6.0329e-03,
         -6.1455e-03,  0.0000e+00],
   

 18%|██████████████▌                                                                  | 58/324 [00:04<00:20, 13.01it/s]

{'audio': tensor([[-0.0206, -0.0349, -0.0302,  ..., -0.0222, -0.0263,  0.0000],
        [-0.0009, -0.0013, -0.0008,  ...,  0.0052,  0.0068,  0.0000],
        [-0.0020, -0.0043, -0.0088,  ...,  0.0003,  0.0008,  0.0000],
        [ 0.0007,  0.0012,  0.0010,  ..., -0.0011, -0.0013,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['bus', 'grocery_store', 'metro_station', 'metro_station']}
tensor([[-0.0206, -0.0349, -0.0302,  ..., -0.0222, -0.0263,  0.0000],
        [-0.0009, -0.0013, -0.0008,  ...,  0.0052,  0.0068,  0.0000],
        [-0.0020, -0.0043, -0.0088,  ...,  0.0003,  0.0008,  0.0000],
        [ 0.0007,  0.0012,  0.0010,  ..., -0.0011, -0.0013,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['bus', 'grocery_store', 'metro_station', 'metro_station']
{'audio': tensor([[ 0.0005,  0.0008,  0.0007,  ..., -0.0002, -0.0003,  0.0000],
        [ 0.0084,  0.0131,  0.0101,  ...,  0.0178,  0.0209,  0.0000],
        [ 0.0020,  0.0036,  0.0048,  ...,  0.0029,  0.0044,

 19%|███████████████                                                                  | 60/324 [00:04<00:20, 12.95it/s]

{'audio': tensor([[ 2.2392e-03,  9.7977e-03,  1.9135e-02,  ..., -1.2456e-01,
         -1.3938e-01,  0.0000e+00],
        [-7.4062e-03, -4.5045e-03,  1.1761e-03,  ...,  1.5027e-03,
          6.3159e-04,  0.0000e+00],
        [-1.5595e-03, -2.3513e-03, -1.7416e-03,  ..., -2.9630e-04,
         -6.0850e-04,  0.0000e+00],
        [ 7.0284e-05,  9.8511e-05,  8.2642e-05,  ...,  7.0693e-05,
          8.8125e-05,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['tram', 'residential_area', 'park', 'office']}
tensor([[ 2.2392e-03,  9.7977e-03,  1.9135e-02,  ..., -1.2456e-01,
         -1.3938e-01,  0.0000e+00],
        [-7.4062e-03, -4.5045e-03,  1.1761e-03,  ...,  1.5027e-03,
          6.3159e-04,  0.0000e+00],
        [-1.5595e-03, -2.3513e-03, -1.7416e-03,  ..., -2.9630e-04,
         -6.0850e-04,  0.0000e+00],
        [ 7.0284e-05,  9.8511e-05,  8.2642e-05,  ...,  7.0693e-05,
          8.8125e-05,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['tram', 'reside

 20%|████████████████                                                                 | 64/324 [00:05<00:20, 12.40it/s]

{'audio': tensor([[ 2.0775e-04,  2.4410e-04,  1.1138e-04,  ...,  1.1329e-03,
          1.2883e-03,  0.0000e+00],
        [-1.4409e-04, -2.4224e-04, -6.5595e-05,  ...,  3.7630e-04,
          2.5309e-04,  0.0000e+00],
        [ 6.3125e-04,  1.1190e-03,  9.9338e-04,  ..., -7.3846e-04,
         -6.7214e-04,  0.0000e+00],
        [ 2.3420e-03,  4.1658e-03,  8.9662e-04,  ...,  3.2036e-03,
          4.3049e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['library', 'forest_path', 'home', 'park']}
tensor([[ 2.0775e-04,  2.4410e-04,  1.1138e-04,  ...,  1.1329e-03,
          1.2883e-03,  0.0000e+00],
        [-1.4409e-04, -2.4224e-04, -6.5595e-05,  ...,  3.7630e-04,
          2.5309e-04,  0.0000e+00],
        [ 6.3125e-04,  1.1190e-03,  9.9338e-04,  ..., -7.3846e-04,
         -6.7214e-04,  0.0000e+00],
        [ 2.3420e-03,  4.1658e-03,  8.9662e-04,  ...,  3.2036e-03,
          4.3049e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['library', 'forest_

 20%|████████████████▌                                                                | 66/324 [00:05<00:22, 11.71it/s]

{'audio': tensor([[-0.0008, -0.0018, -0.0020,  ...,  0.0015,  0.0018,  0.0000],
        [-0.0028, -0.0036, -0.0025,  ..., -0.0027,  0.0005,  0.0000],
        [ 0.0003,  0.0005,  0.0005,  ...,  0.0012, -0.0021,  0.0000],
        [ 0.0032,  0.0056,  0.0054,  ...,  0.0003, -0.0032,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['park', 'city_center', 'metro_station', 'park']}
tensor([[-0.0008, -0.0018, -0.0020,  ...,  0.0015,  0.0018,  0.0000],
        [-0.0028, -0.0036, -0.0025,  ..., -0.0027,  0.0005,  0.0000],
        [ 0.0003,  0.0005,  0.0005,  ...,  0.0012, -0.0021,  0.0000],
        [ 0.0032,  0.0056,  0.0054,  ...,  0.0003, -0.0032,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['park', 'city_center', 'metro_station', 'park']
{'audio': tensor([[-8.0714e-03, -1.3554e-02, -1.2040e-02,  ..., -1.6144e-02,
         -1.8742e-02,  0.0000e+00],
        [ 5.9277e-04,  9.9877e-04,  9.1179e-04,  ...,  7.8388e-04,
          7.6160e-04,  0.0000e+00],
        [ 1.9

 21%|█████████████████                                                                | 68/324 [00:05<00:21, 11.79it/s]

{'audio': tensor([[-1.5505e-02, -2.8990e-02, -2.5273e-02,  ..., -3.2158e-03,
         -3.2503e-03,  0.0000e+00],
        [ 1.5489e-04,  3.0427e-04,  5.1343e-06,  ..., -4.9937e-05,
          1.9539e-04,  0.0000e+00],
        [ 9.8882e-03,  1.5316e-02,  1.1587e-02,  ...,  6.0915e-03,
          6.4441e-03,  0.0000e+00],
        [-1.9465e-03, -4.0656e-03, -4.5428e-03,  ..., -3.8912e-03,
         -2.5880e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['car', 'forest_path', 'car', 'city_center']}
tensor([[-1.5505e-02, -2.8990e-02, -2.5273e-02,  ..., -3.2158e-03,
         -3.2503e-03,  0.0000e+00],
        [ 1.5489e-04,  3.0427e-04,  5.1343e-06,  ..., -4.9937e-05,
          1.9539e-04,  0.0000e+00],
        [ 9.8882e-03,  1.5316e-02,  1.1587e-02,  ...,  6.0915e-03,
          6.4441e-03,  0.0000e+00],
        [-1.9465e-03, -4.0656e-03, -4.5428e-03,  ..., -3.8912e-03,
         -2.5880e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['car', 'forest_pa

 22%|██████████████████                                                               | 72/324 [00:05<00:23, 10.74it/s]

{'audio': tensor([[ 0.0002,  0.0004,  0.0003,  ...,  0.0001,  0.0003,  0.0000],
        [-0.0071, -0.0133, -0.0119,  ...,  0.0090,  0.0122,  0.0000],
        [-0.0012, -0.0019, -0.0016,  ..., -0.0011, -0.0009,  0.0000],
        [-0.0024, -0.0037, -0.0029,  ..., -0.0226, -0.0262,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['office', 'tram', 'grocery_store', 'grocery_store']}
tensor([[ 0.0002,  0.0004,  0.0003,  ...,  0.0001,  0.0003,  0.0000],
        [-0.0071, -0.0133, -0.0119,  ...,  0.0090,  0.0122,  0.0000],
        [-0.0012, -0.0019, -0.0016,  ..., -0.0011, -0.0009,  0.0000],
        [-0.0024, -0.0037, -0.0029,  ..., -0.0226, -0.0262,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['office', 'tram', 'grocery_store', 'grocery_store']
{'audio': tensor([[-0.0004, -0.0005, -0.0001,  ..., -0.0010,  0.0002,  0.0000],
        [-0.0016, -0.0031, -0.0024,  ...,  0.0007,  0.0007,  0.0000],
        [-0.0019, -0.0027, -0.0022,  ..., -0.0052, -0.0054,  0.0000],
 

 23%|██████████████████▌                                                              | 74/324 [00:06<00:22, 11.12it/s]

{'audio': tensor([[ 1.3562e-03,  2.7626e-03,  2.5075e-03,  ...,  8.2432e-04,
          1.6373e-03,  0.0000e+00],
        [-3.5587e-02, -5.9831e-02, -5.2428e-02,  ...,  2.8161e-02,
          3.2169e-02,  0.0000e+00],
        [-4.4712e-04, -2.8474e-04,  7.2965e-04,  ..., -1.7054e-03,
         -2.1021e-03,  0.0000e+00],
        [ 4.8570e-04,  5.7517e-05, -2.9247e-06,  ...,  4.5622e-04,
          4.0127e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['beach', 'train', 'metro_station', 'beach']}
tensor([[ 1.3562e-03,  2.7626e-03,  2.5075e-03,  ...,  8.2432e-04,
          1.6373e-03,  0.0000e+00],
        [-3.5587e-02, -5.9831e-02, -5.2428e-02,  ...,  2.8161e-02,
          3.2169e-02,  0.0000e+00],
        [-4.4712e-04, -2.8474e-04,  7.2965e-04,  ..., -1.7054e-03,
         -2.1021e-03,  0.0000e+00],
        [ 4.8570e-04,  5.7517e-05, -2.9247e-06,  ...,  4.5622e-04,
          4.0127e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['beach', 'train',

 24%|███████████████████▌                                                             | 78/324 [00:06<00:21, 11.53it/s]

{'audio': tensor([[-0.0012, -0.0010,  0.0006,  ...,  0.0025,  0.0028,  0.0000],
        [ 0.0020,  0.0033,  0.0035,  ..., -0.0081, -0.0100,  0.0000],
        [-0.0006, -0.0016, -0.0031,  ..., -0.0008,  0.0019,  0.0000],
        [ 0.0042,  0.0065,  0.0050,  ..., -0.0008, -0.0004,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['metro_station', 'car', 'beach', 'tram']}
tensor([[-0.0012, -0.0010,  0.0006,  ...,  0.0025,  0.0028,  0.0000],
        [ 0.0020,  0.0033,  0.0035,  ..., -0.0081, -0.0100,  0.0000],
        [-0.0006, -0.0016, -0.0031,  ..., -0.0008,  0.0019,  0.0000],
        [ 0.0042,  0.0065,  0.0050,  ..., -0.0008, -0.0004,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['metro_station', 'car', 'beach', 'tram']
{'audio': tensor([[ 5.0366e-05,  8.0339e-06, -6.2623e-05,  ..., -2.1188e-04,
         -3.0577e-04,  0.0000e+00],
        [-4.7597e-05, -5.7921e-05, -2.9894e-05,  ..., -2.9233e-05,
          5.3108e-06,  0.0000e+00],
        [-1.0899e-04, -3.34

 25%|████████████████████                                                             | 80/324 [00:06<00:20, 11.62it/s]

{'audio': tensor([[-4.4023e-03, -6.4308e-03, -4.9790e-03,  ..., -1.9383e-03,
         -1.6741e-03,  0.0000e+00],
        [ 3.2905e-04, -9.9049e-05, -2.7239e-04,  ..., -5.7288e-03,
         -5.9443e-03,  0.0000e+00],
        [-1.4092e-02, -2.2433e-02, -1.8719e-02,  ...,  3.2359e-03,
          1.2700e-03,  0.0000e+00],
        [ 6.9751e-03,  1.1887e-02,  1.0259e-02,  ...,  1.6677e-03,
          2.7338e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['cafe/restaurant', 'park', 'train', 'bus']}
tensor([[-4.4023e-03, -6.4308e-03, -4.9790e-03,  ..., -1.9383e-03,
         -1.6741e-03,  0.0000e+00],
        [ 3.2905e-04, -9.9049e-05, -2.7239e-04,  ..., -5.7288e-03,
         -5.9443e-03,  0.0000e+00],
        [-1.4092e-02, -2.2433e-02, -1.8719e-02,  ...,  3.2359e-03,
          1.2700e-03,  0.0000e+00],
        [ 6.9751e-03,  1.1887e-02,  1.0259e-02,  ...,  1.6677e-03,
          2.7338e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['cafe/restaurant',

 25%|████████████████████▌                                                            | 82/324 [00:06<00:22, 10.79it/s]

{'audio': tensor([[-0.0004, -0.0007, -0.0005,  ...,  0.0006,  0.0007,  0.0000],
        [ 0.0041,  0.0067,  0.0059,  ...,  0.0017,  0.0028,  0.0000],
        [-0.0018, -0.0025, -0.0025,  ...,  0.0016,  0.0006,  0.0000],
        [ 0.0022,  0.0026,  0.0022,  ..., -0.0068, -0.0064,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['park', 'residential_area', 'beach', 'cafe/restaurant']}
tensor([[-0.0004, -0.0007, -0.0005,  ...,  0.0006,  0.0007,  0.0000],
        [ 0.0041,  0.0067,  0.0059,  ...,  0.0017,  0.0028,  0.0000],
        [-0.0018, -0.0025, -0.0025,  ...,  0.0016,  0.0006,  0.0000],
        [ 0.0022,  0.0026,  0.0022,  ..., -0.0068, -0.0064,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['park', 'residential_area', 'beach', 'cafe/restaurant']
{'audio': tensor([[-0.0007,  0.0008,  0.0019,  ...,  0.0014,  0.0009,  0.0000],
        [-0.0003, -0.0004, -0.0003,  ..., -0.0006, -0.0007,  0.0000],
        [ 0.0012,  0.0017,  0.0014,  ..., -0.0006, -0.0007,  0.

 26%|█████████████████████                                                            | 84/324 [00:06<00:21, 11.18it/s]

{'audio': tensor([[ 1.3089e-04,  2.1425e-04,  1.9917e-04,  ...,  4.7943e-03,
          5.1022e-03,  0.0000e+00],
        [ 4.4004e-04,  8.9487e-04,  8.6508e-04,  ..., -3.5303e-05,
         -2.0317e-04,  0.0000e+00],
        [ 1.5141e-02,  2.4640e-02,  1.9209e-02,  ...,  1.0019e-02,
          7.7629e-03,  0.0000e+00],
        [-2.1028e-03, -3.5190e-03, -3.5441e-03,  ...,  6.2437e-03,
          6.3362e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['bus', 'forest_path', 'train', 'cafe/restaurant']}
tensor([[ 1.3089e-04,  2.1425e-04,  1.9917e-04,  ...,  4.7943e-03,
          5.1022e-03,  0.0000e+00],
        [ 4.4004e-04,  8.9487e-04,  8.6508e-04,  ..., -3.5303e-05,
         -2.0317e-04,  0.0000e+00],
        [ 1.5141e-02,  2.4640e-02,  1.9209e-02,  ...,  1.0019e-02,
          7.7629e-03,  0.0000e+00],
        [-2.1028e-03, -3.5190e-03, -3.5441e-03,  ...,  6.2437e-03,
          6.3362e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['bus', 'for

 27%|█████████████████████▌                                                           | 86/324 [00:07<00:23,  9.92it/s]

{'audio': tensor([[ 0.0041,  0.0069,  0.0061,  ..., -0.0018, -0.0023,  0.0000],
        [-0.0002,  0.0020,  0.0027,  ..., -0.0030, -0.0052,  0.0000],
        [-0.0110, -0.0164, -0.0093,  ..., -0.0023, -0.0043,  0.0000],
        [-0.0335, -0.0573, -0.0512,  ..., -0.0019, -0.0035,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['bus', 'beach', 'residential_area', 'tram']}
tensor([[ 0.0041,  0.0069,  0.0061,  ..., -0.0018, -0.0023,  0.0000],
        [-0.0002,  0.0020,  0.0027,  ..., -0.0030, -0.0052,  0.0000],
        [-0.0110, -0.0164, -0.0093,  ..., -0.0023, -0.0043,  0.0000],
        [-0.0335, -0.0573, -0.0512,  ..., -0.0019, -0.0035,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['bus', 'beach', 'residential_area', 'tram']
{'audio': tensor([[ 0.0052,  0.0007, -0.0100,  ...,  0.0637,  0.0758,  0.0000],
        [ 0.0022,  0.0041,  0.0048,  ...,  0.0037,  0.0031,  0.0000],
        [-0.0002, -0.0003, -0.0002,  ...,  0.0014,  0.0017,  0.0000],
        [ 0.0013,

 27%|██████████████████████▎                                                          | 89/324 [00:07<00:25,  9.34it/s]

{'audio': tensor([[-0.0014, -0.0021, -0.0014,  ..., -0.0014, -0.0018,  0.0000],
        [ 0.0029,  0.0042,  0.0033,  ...,  0.0010,  0.0003,  0.0000],
        [ 0.0030,  0.0053,  0.0055,  ...,  0.0029,  0.0030,  0.0000],
        [ 0.0434,  0.0712,  0.0603,  ...,  0.0011,  0.0017,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['residential_area', 'park', 'bus', 'car']}
tensor([[-0.0014, -0.0021, -0.0014,  ..., -0.0014, -0.0018,  0.0000],
        [ 0.0029,  0.0042,  0.0033,  ...,  0.0010,  0.0003,  0.0000],
        [ 0.0030,  0.0053,  0.0055,  ...,  0.0029,  0.0030,  0.0000],
        [ 0.0434,  0.0712,  0.0603,  ...,  0.0011,  0.0017,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['residential_area', 'park', 'bus', 'car']
{'audio': tensor([[-4.7114e-04, -8.3235e-04, -7.7281e-04,  ...,  1.3108e-03,
         -6.0268e-04,  0.0000e+00],
        [ 6.5169e-05, -2.1902e-04,  1.6745e-04,  ...,  3.8705e-04,
          3.0003e-04,  0.0000e+00],
        [-6.3714e-05, -2.

 28%|███████████████████████                                                          | 92/324 [00:07<00:21, 10.76it/s]

{'audio': tensor([[ 1.0550e-05,  6.4761e-05,  5.5189e-05,  ...,  1.1070e-04,
          1.3951e-04,  0.0000e+00],
        [-1.0361e-02, -1.6386e-02, -1.2444e-02,  ..., -1.6345e-03,
         -4.4454e-03,  0.0000e+00],
        [-5.4309e-03, -9.5361e-03, -9.0606e-03,  ...,  3.5165e-03,
          4.5240e-03,  0.0000e+00],
        [-5.6629e-04, -8.9568e-04, -5.6524e-04,  ...,  2.0576e-05,
         -2.7671e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['office', 'bus', 'tram', 'residential_area']}
tensor([[ 1.0550e-05,  6.4761e-05,  5.5189e-05,  ...,  1.1070e-04,
          1.3951e-04,  0.0000e+00],
        [-1.0361e-02, -1.6386e-02, -1.2444e-02,  ..., -1.6345e-03,
         -4.4454e-03,  0.0000e+00],
        [-5.4309e-03, -9.5361e-03, -9.0606e-03,  ...,  3.5165e-03,
          4.5240e-03,  0.0000e+00],
        [-5.6629e-04, -8.9568e-04, -5.6524e-04,  ...,  2.0576e-05,
         -2.7671e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['office', 'bus',

 29%|███████████████████████▌                                                         | 94/324 [00:07<00:22, 10.41it/s]

{'audio': tensor([[ 8.6510e-04,  1.5895e-03,  2.1472e-03,  ...,  5.3341e-04,
          8.6687e-04,  0.0000e+00],
        [-3.7651e-03, -7.4708e-03, -5.4736e-03,  ...,  7.9548e-04,
          1.4161e-03,  0.0000e+00],
        [-1.6461e-04, -3.0886e-04, -1.7353e-04,  ..., -1.4099e-04,
         -6.9087e-05,  0.0000e+00],
        [-1.8994e-04, -2.3001e-04, -6.4095e-05,  ...,  6.5595e-05,
          2.0813e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['park', 'park', 'office', 'library']}
tensor([[ 8.6510e-04,  1.5895e-03,  2.1472e-03,  ...,  5.3341e-04,
          8.6687e-04,  0.0000e+00],
        [-3.7651e-03, -7.4708e-03, -5.4736e-03,  ...,  7.9548e-04,
          1.4161e-03,  0.0000e+00],
        [-1.6461e-04, -3.0886e-04, -1.7353e-04,  ..., -1.4099e-04,
         -6.9087e-05,  0.0000e+00],
        [-1.8994e-04, -2.3001e-04, -6.4095e-05,  ...,  6.5595e-05,
          2.0813e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['park', 'park', 'office'

 30%|████████████████████████▌                                                        | 98/324 [00:08<00:18, 11.99it/s]

{'audio': tensor([[ 3.3596e-04,  5.7333e-04, -1.2422e-04,  ..., -7.8951e-04,
         -1.1675e-03,  0.0000e+00],
        [ 2.9885e-04,  3.2597e-04,  2.9072e-05,  ...,  4.0700e-03,
          5.4208e-03,  0.0000e+00],
        [-7.4410e-03, -1.1579e-02, -8.0650e-03,  ...,  6.4442e-03,
          7.8254e-03,  0.0000e+00],
        [-2.5587e-04, -4.6630e-04, -3.1629e-04,  ..., -1.3751e-03,
         -1.5485e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['park', 'park', 'grocery_store', 'forest_path']}
tensor([[ 3.3596e-04,  5.7333e-04, -1.2422e-04,  ..., -7.8951e-04,
         -1.1675e-03,  0.0000e+00],
        [ 2.9885e-04,  3.2597e-04,  2.9072e-05,  ...,  4.0700e-03,
          5.4208e-03,  0.0000e+00],
        [-7.4410e-03, -1.1579e-02, -8.0650e-03,  ...,  6.4442e-03,
          7.8254e-03,  0.0000e+00],
        [-2.5587e-04, -4.6630e-04, -3.1629e-04,  ..., -1.3751e-03,
         -1.5485e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['park', 'park

 31%|████████████████████████▋                                                       | 100/324 [00:08<00:19, 11.63it/s]

{'audio': tensor([[ 2.6741e-04,  4.9761e-04,  8.5071e-04,  ...,  1.6481e-04,
          2.6994e-05,  0.0000e+00],
        [ 3.1509e-04,  2.2324e-03,  1.8065e-03,  ...,  1.2527e-02,
          1.3677e-02,  0.0000e+00],
        [ 2.8664e-03,  4.6882e-03,  3.8578e-03,  ..., -1.1584e-02,
         -1.3295e-02,  0.0000e+00],
        [-2.0495e-04, -2.7607e-04, -1.8035e-04,  ...,  4.1930e-04,
          2.2793e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['cafe/restaurant', 'bus', 'train', 'forest_path']}
tensor([[ 2.6741e-04,  4.9761e-04,  8.5071e-04,  ...,  1.6481e-04,
          2.6994e-05,  0.0000e+00],
        [ 3.1509e-04,  2.2324e-03,  1.8065e-03,  ...,  1.2527e-02,
          1.3677e-02,  0.0000e+00],
        [ 2.8664e-03,  4.6882e-03,  3.8578e-03,  ..., -1.1584e-02,
         -1.3295e-02,  0.0000e+00],
        [-2.0495e-04, -2.7607e-04, -1.8035e-04,  ...,  4.1930e-04,
          2.2793e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['cafe/resta

 32%|█████████████████████████▋                                                      | 104/324 [00:08<00:17, 12.59it/s]

{'audio': tensor([[-9.0173e-04, -1.5828e-03, -1.4034e-03,  ...,  1.3242e-05,
         -1.7026e-04,  0.0000e+00],
        [ 2.3797e-03,  4.1347e-03,  3.6214e-03,  ..., -1.3138e-03,
         -1.2521e-03,  0.0000e+00],
        [-7.3090e-05, -1.2238e-03, -1.4891e-03,  ...,  1.1758e-02,
         -3.3171e-04,  0.0000e+00],
        [-7.2292e-04, -9.2891e-04, -7.0811e-04,  ...,  5.7055e-05,
         -1.2929e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['forest_path', 'car', 'beach', 'grocery_store']}
tensor([[-9.0173e-04, -1.5828e-03, -1.4034e-03,  ...,  1.3242e-05,
         -1.7026e-04,  0.0000e+00],
        [ 2.3797e-03,  4.1347e-03,  3.6214e-03,  ..., -1.3138e-03,
         -1.2521e-03,  0.0000e+00],
        [-7.3090e-05, -1.2238e-03, -1.4891e-03,  ...,  1.1758e-02,
         -3.3171e-04,  0.0000e+00],
        [-7.2292e-04, -9.2891e-04, -7.0811e-04,  ...,  5.7055e-05,
         -1.2929e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['forest_path'

 33%|██████████████████████████▋                                                     | 108/324 [00:09<00:17, 12.29it/s]

{'audio': tensor([[-2.7329e-02, -4.4575e-02, -3.7885e-02,  ..., -5.0550e-02,
         -5.9066e-02,  0.0000e+00],
        [-1.2358e-02, -1.9784e-02, -1.5354e-02,  ...,  7.7317e-03,
          8.6601e-03,  0.0000e+00],
        [-2.3824e-03, -4.0225e-03, -3.7028e-03,  ...,  6.2583e-03,
          9.3642e-03,  0.0000e+00],
        [ 1.9587e-04,  5.8258e-05,  4.4896e-04,  ..., -2.5939e-03,
         -2.8500e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['train', 'grocery_store', 'tram', 'metro_station']}
tensor([[-2.7329e-02, -4.4575e-02, -3.7885e-02,  ..., -5.0550e-02,
         -5.9066e-02,  0.0000e+00],
        [-1.2358e-02, -1.9784e-02, -1.5354e-02,  ...,  7.7317e-03,
          8.6601e-03,  0.0000e+00],
        [-2.3824e-03, -4.0225e-03, -3.7028e-03,  ...,  6.2583e-03,
          9.3642e-03,  0.0000e+00],
        [ 1.9587e-04,  5.8258e-05,  4.4896e-04,  ..., -2.5939e-03,
         -2.8500e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['train', '

 34%|███████████████████████████▏                                                    | 110/324 [00:09<00:17, 12.42it/s]

{'audio': tensor([[-0.0040, -0.0065, -0.0072,  ...,  0.0097,  0.0107,  0.0000],
        [-0.0098, -0.0169, -0.0144,  ..., -0.0773, -0.0957,  0.0000],
        [ 0.0007,  0.0009,  0.0017,  ...,  0.0015,  0.0016,  0.0000],
        [-0.0003, -0.0006, -0.0006,  ...,  0.0009,  0.0011,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['grocery_store', 'train', 'residential_area', 'library']}
tensor([[-0.0040, -0.0065, -0.0072,  ...,  0.0097,  0.0107,  0.0000],
        [-0.0098, -0.0169, -0.0144,  ..., -0.0773, -0.0957,  0.0000],
        [ 0.0007,  0.0009,  0.0017,  ...,  0.0015,  0.0016,  0.0000],
        [-0.0003, -0.0006, -0.0006,  ...,  0.0009,  0.0011,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['grocery_store', 'train', 'residential_area', 'library']
{'audio': tensor([[-2.5343e-03, -5.1744e-03, -5.2470e-03,  ...,  7.8379e-03,
          1.1474e-02,  0.0000e+00],
        [-1.5950e-02, -2.6810e-02, -2.3568e-02,  ..., -3.0914e-03,
         -3.6492e-03,  0.0000e+

 35%|████████████████████████████▏                                                   | 114/324 [00:09<00:18, 11.51it/s]

{'audio': tensor([[-0.0022, -0.0021,  0.0004,  ..., -0.0006, -0.0002,  0.0000],
        [ 0.0078,  0.0132,  0.0189,  ...,  0.0109,  0.0061,  0.0000],
        [ 0.0011,  0.0020,  0.0016,  ..., -0.0005, -0.0012,  0.0000],
        [-0.0002, -0.0001,  0.0003,  ..., -0.0065, -0.0075,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['home', 'train', 'park', 'train']}
tensor([[-0.0022, -0.0021,  0.0004,  ..., -0.0006, -0.0002,  0.0000],
        [ 0.0078,  0.0132,  0.0189,  ...,  0.0109,  0.0061,  0.0000],
        [ 0.0011,  0.0020,  0.0016,  ..., -0.0005, -0.0012,  0.0000],
        [-0.0002, -0.0001,  0.0003,  ..., -0.0065, -0.0075,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['home', 'train', 'park', 'train']
{'audio': tensor([[-0.0029, -0.0045, -0.0057,  ..., -0.0018, -0.0005,  0.0000],
        [ 0.0032,  0.0056,  0.0050,  ...,  0.0024,  0.0028,  0.0000],
        [-0.0025, -0.0083, -0.0107,  ...,  0.0021,  0.0028,  0.0000],
        [-0.0004, -0.0007, -0.0007,  

 36%|████████████████████████████▋                                                   | 116/324 [00:09<00:17, 12.23it/s]

{'audio': tensor([[-4.5132e-08,  1.3490e-05,  1.2615e-05,  ..., -2.3443e-04,
         -2.4858e-04,  0.0000e+00],
        [-5.4068e-04, -4.9715e-04, -1.4847e-04,  ..., -8.5489e-04,
         -7.1828e-04,  0.0000e+00],
        [-1.8882e-03, -3.8591e-03,  2.5352e-03,  ...,  8.4823e-05,
          2.5315e-03,  0.0000e+00],
        [-1.4619e-03, -2.5374e-03, -1.9500e-03,  ...,  1.6359e-03,
          2.1893e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['forest_path', 'home', 'park', 'grocery_store']}
tensor([[-4.5132e-08,  1.3490e-05,  1.2615e-05,  ..., -2.3443e-04,
         -2.4858e-04,  0.0000e+00],
        [-5.4068e-04, -4.9715e-04, -1.4847e-04,  ..., -8.5489e-04,
         -7.1828e-04,  0.0000e+00],
        [-1.8882e-03, -3.8591e-03,  2.5352e-03,  ...,  8.4823e-05,
          2.5315e-03,  0.0000e+00],
        [-1.4619e-03, -2.5374e-03, -1.9500e-03,  ...,  1.6359e-03,
          2.1893e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['forest_path'

 37%|█████████████████████████████▋                                                  | 120/324 [00:10<00:15, 13.11it/s]

{'audio': tensor([[ 8.2888e-05,  1.7292e-04,  1.2113e-04,  ..., -1.0264e-04,
         -1.2307e-04,  0.0000e+00],
        [-2.8906e-04, -1.2109e-03, -5.4911e-04,  ...,  1.6705e-03,
          2.5375e-03,  0.0000e+00],
        [-4.9044e-04, -9.3949e-04, -5.6104e-04,  ...,  6.7080e-04,
          4.3754e-04,  0.0000e+00],
        [-4.3667e-03, -7.1545e-03, -6.4293e-03,  ...,  2.3105e-03,
          1.5833e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['office', 'park', 'home', 'bus']}
tensor([[ 8.2888e-05,  1.7292e-04,  1.2113e-04,  ..., -1.0264e-04,
         -1.2307e-04,  0.0000e+00],
        [-2.8906e-04, -1.2109e-03, -5.4911e-04,  ...,  1.6705e-03,
          2.5375e-03,  0.0000e+00],
        [-4.9044e-04, -9.3949e-04, -5.6104e-04,  ...,  6.7080e-04,
          4.3754e-04,  0.0000e+00],
        [-4.3667e-03, -7.1545e-03, -6.4293e-03,  ...,  2.3105e-03,
          1.5833e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['office', 'park', 'home', 'b

 38%|██████████████████████████████▌                                                 | 124/324 [00:10<00:14, 13.90it/s]

{'audio': tensor([[ 2.8564e-02,  4.8959e-02,  4.4242e-02,  ..., -2.2525e-02,
         -2.7673e-02,  0.0000e+00],
        [-2.8650e-04, -6.3224e-05,  3.6205e-04,  ..., -2.2707e-03,
         -2.2584e-03,  0.0000e+00],
        [-2.1448e-05,  9.6860e-05,  5.1283e-05,  ..., -9.7297e-04,
         -7.8018e-04,  0.0000e+00],
        [ 5.9606e-03,  9.7654e-03,  8.3883e-03,  ...,  1.8055e-03,
          2.2511e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['car', 'bus', 'forest_path', 'metro_station']}
tensor([[ 2.8564e-02,  4.8959e-02,  4.4242e-02,  ..., -2.2525e-02,
         -2.7673e-02,  0.0000e+00],
        [-2.8650e-04, -6.3224e-05,  3.6205e-04,  ..., -2.2707e-03,
         -2.2584e-03,  0.0000e+00],
        [-2.1448e-05,  9.6860e-05,  5.1283e-05,  ..., -9.7297e-04,
         -7.8018e-04,  0.0000e+00],
        [ 5.9606e-03,  9.7654e-03,  8.3883e-03,  ...,  1.8055e-03,
          2.2511e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['car', 'bus', '

 39%|███████████████████████████████                                                 | 126/324 [00:10<00:15, 12.85it/s]

{'audio': tensor([[ 1.2635e-02,  2.1573e-02,  1.9848e-02,  ..., -1.2623e-02,
         -1.3489e-02,  0.0000e+00],
        [-3.9275e-02, -6.5410e-02, -5.5991e-02,  ...,  1.8763e-02,
          2.2289e-02,  0.0000e+00],
        [ 2.4633e-03,  4.3278e-03,  3.4019e-03,  ..., -1.8914e-03,
         -1.4184e-03,  0.0000e+00],
        [-1.1000e-04, -1.3281e-04, -9.7020e-05,  ...,  2.6914e-04,
          2.4544e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['car', 'train', 'cafe/restaurant', 'home']}
tensor([[ 1.2635e-02,  2.1573e-02,  1.9848e-02,  ..., -1.2623e-02,
         -1.3489e-02,  0.0000e+00],
        [-3.9275e-02, -6.5410e-02, -5.5991e-02,  ...,  1.8763e-02,
          2.2289e-02,  0.0000e+00],
        [ 2.4633e-03,  4.3278e-03,  3.4019e-03,  ..., -1.8914e-03,
         -1.4184e-03,  0.0000e+00],
        [-1.1000e-04, -1.3281e-04, -9.7020e-05,  ...,  2.6914e-04,
          2.4544e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['car', 'train', 'c

 40%|███████████████████████████████▌                                                | 128/324 [00:10<00:15, 12.30it/s]

{'audio': tensor([[-7.9205e-05, -1.9924e-04, -2.6054e-04,  ...,  1.4583e-04,
          6.5611e-05,  0.0000e+00],
        [-8.8913e-04, -1.2288e-03, -9.6443e-04,  ...,  2.8787e-03,
          3.2783e-03,  0.0000e+00],
        [-2.5094e-04,  5.6331e-04,  4.2301e-04,  ..., -1.2925e-03,
         -3.2561e-04,  0.0000e+00],
        [-2.5817e-04,  5.8412e-04,  1.1394e-03,  ...,  2.5335e-03,
          1.0816e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['forest_path', 'home', 'beach', 'beach']}
tensor([[-7.9205e-05, -1.9924e-04, -2.6054e-04,  ...,  1.4583e-04,
          6.5611e-05,  0.0000e+00],
        [-8.8913e-04, -1.2288e-03, -9.6443e-04,  ...,  2.8787e-03,
          3.2783e-03,  0.0000e+00],
        [-2.5094e-04,  5.6331e-04,  4.2301e-04,  ..., -1.2925e-03,
         -3.2561e-04,  0.0000e+00],
        [-2.5817e-04,  5.8412e-04,  1.1394e-03,  ...,  2.5335e-03,
          1.0816e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['forest_path', 'home

 40%|████████████████████████████████                                                | 130/324 [00:10<00:15, 12.86it/s]

{'audio': tensor([[-0.0171, -0.0286, -0.0248,  ..., -0.0179, -0.0192,  0.0000],
        [-0.0008, -0.0012, -0.0011,  ...,  0.0005,  0.0011,  0.0000],
        [ 0.0051,  0.0105,  0.0099,  ...,  0.0014,  0.0013,  0.0000],
        [-0.0057, -0.0101, -0.0089,  ...,  0.0066,  0.0085,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['car', 'metro_station', 'beach', 'car']}
tensor([[-0.0171, -0.0286, -0.0248,  ..., -0.0179, -0.0192,  0.0000],
        [-0.0008, -0.0012, -0.0011,  ...,  0.0005,  0.0011,  0.0000],
        [ 0.0051,  0.0105,  0.0099,  ...,  0.0014,  0.0013,  0.0000],
        [-0.0057, -0.0101, -0.0089,  ...,  0.0066,  0.0085,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['car', 'metro_station', 'beach', 'car']
{'audio': tensor([[-0.0010, -0.0019, -0.0017,  ...,  0.0100,  0.0116,  0.0000],
        [ 0.0011,  0.0020,  0.0018,  ..., -0.0007, -0.0011,  0.0000],
        [ 0.0058,  0.0090,  0.0066,  ..., -0.0114, -0.0162,  0.0000],
        [ 0.0044,  0.0087

 41%|█████████████████████████████████                                               | 134/324 [00:11<00:15, 12.45it/s]

{'audio': tensor([[-1.4102e-03, -4.6655e-03, -6.0874e-03,  ...,  2.6774e-02,
          2.2964e-02,  0.0000e+00],
        [-7.0641e-04, -5.3973e-04,  1.2927e-04,  ..., -2.6561e-03,
         -3.9117e-03,  0.0000e+00],
        [-5.5248e-04, -6.8050e-04, -3.3667e-04,  ...,  2.9443e-04,
         -4.1025e-05,  0.0000e+00],
        [-7.9357e-02, -1.3465e-01, -1.2139e-01,  ..., -7.6044e-02,
         -8.4747e-02,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['car', 'grocery_store', 'residential_area', 'train']}
tensor([[-1.4102e-03, -4.6655e-03, -6.0874e-03,  ...,  2.6774e-02,
          2.2964e-02,  0.0000e+00],
        [-7.0641e-04, -5.3973e-04,  1.2927e-04,  ..., -2.6561e-03,
         -3.9117e-03,  0.0000e+00],
        [-5.5248e-04, -6.8050e-04, -3.3667e-04,  ...,  2.9443e-04,
         -4.1025e-05,  0.0000e+00],
        [-7.9357e-02, -1.3465e-01, -1.2139e-01,  ..., -7.6044e-02,
         -8.4747e-02,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['car', '

 42%|█████████████████████████████████▌                                              | 136/324 [00:11<00:14, 12.87it/s]

{'audio': tensor([[-0.0043, -0.0073, -0.0059,  ..., -0.0098, -0.0103,  0.0000],
        [-0.0005, -0.0008, -0.0007,  ..., -0.0003,  0.0006,  0.0000],
        [-0.0002, -0.0002, -0.0001,  ..., -0.0002, -0.0002,  0.0000],
        [-0.0008, -0.0001, -0.0008,  ...,  0.0011,  0.0029,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['bus', 'home', 'metro_station', 'beach']}
tensor([[-0.0043, -0.0073, -0.0059,  ..., -0.0098, -0.0103,  0.0000],
        [-0.0005, -0.0008, -0.0007,  ..., -0.0003,  0.0006,  0.0000],
        [-0.0002, -0.0002, -0.0001,  ..., -0.0002, -0.0002,  0.0000],
        [-0.0008, -0.0001, -0.0008,  ...,  0.0011,  0.0029,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['bus', 'home', 'metro_station', 'beach']
{'audio': tensor([[ 4.8054e-04,  7.4235e-04,  1.3446e-03,  ...,  2.8756e-04,
         -3.8453e-05,  0.0000e+00],
        [-4.8053e-03, -7.1961e-03, -5.6463e-03,  ..., -4.8795e-03,
         -8.0332e-03,  0.0000e+00],
        [-2.3416e-03, -3.26

 43%|██████████████████████████████████▌                                             | 140/324 [00:11<00:14, 12.70it/s]

{'audio': tensor([[-3.4885e-04, -4.1119e-04, -3.4733e-04,  ...,  2.7650e-04,
          2.6939e-04,  0.0000e+00],
        [ 2.9241e-03,  4.1058e-03,  9.9788e-04,  ..., -1.5013e-03,
         -2.0917e-03,  0.0000e+00],
        [ 4.2889e-04,  4.5783e-04,  3.0506e-04,  ...,  3.1587e-04,
         -7.5804e-06,  0.0000e+00],
        [ 6.7885e-03,  1.2195e-02,  9.3937e-03,  ..., -1.3217e-03,
         -1.7817e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['home', 'beach', 'park', 'cafe/restaurant']}
tensor([[-3.4885e-04, -4.1119e-04, -3.4733e-04,  ...,  2.7650e-04,
          2.6939e-04,  0.0000e+00],
        [ 2.9241e-03,  4.1058e-03,  9.9788e-04,  ..., -1.5013e-03,
         -2.0917e-03,  0.0000e+00],
        [ 4.2889e-04,  4.5783e-04,  3.0506e-04,  ...,  3.1587e-04,
         -7.5804e-06,  0.0000e+00],
        [ 6.7885e-03,  1.2195e-02,  9.3937e-03,  ..., -1.3217e-03,
         -1.7817e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['home', 'beach', 

 44%|███████████████████████████████████                                             | 142/324 [00:11<00:13, 13.40it/s]

{'audio': tensor([[ 1.2295e-02,  2.0562e-02,  1.7722e-02,  ..., -4.3415e-02,
         -5.0686e-02,  0.0000e+00],
        [ 1.2250e-02,  2.0226e-02,  1.7618e-02,  ...,  5.6831e-02,
          6.5919e-02,  0.0000e+00],
        [ 3.6181e-05,  6.2088e-05,  6.3480e-05,  ...,  1.8611e-04,
          2.2464e-04,  0.0000e+00],
        [-9.5525e-04, -1.7237e-03, -1.7680e-03,  ...,  5.5970e-03,
          6.4893e-03,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['car', 'car', 'home', 'tram']}
tensor([[ 1.2295e-02,  2.0562e-02,  1.7722e-02,  ..., -4.3415e-02,
         -5.0686e-02,  0.0000e+00],
        [ 1.2250e-02,  2.0226e-02,  1.7618e-02,  ...,  5.6831e-02,
          6.5919e-02,  0.0000e+00],
        [ 3.6181e-05,  6.2088e-05,  6.3480e-05,  ...,  1.8611e-04,
          2.2464e-04,  0.0000e+00],
        [-9.5525e-04, -1.7237e-03, -1.7680e-03,  ...,  5.5970e-03,
          6.4893e-03,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['car', 'car', 'home', 'tram']
{

 45%|████████████████████████████████████                                            | 146/324 [00:12<00:13, 12.97it/s]

{'audio': tensor([[ 4.6371e-05,  7.2311e-05, -3.4880e-06,  ..., -6.2937e-04,
         -5.8368e-04,  0.0000e+00],
        [ 1.0186e-04,  1.8825e-04,  1.5617e-04,  ..., -8.8148e-05,
         -8.5813e-05,  0.0000e+00],
        [-1.2486e-04, -2.1112e-04, -2.2177e-04,  ...,  7.6994e-05,
          1.0498e-04,  0.0000e+00],
        [ 4.7646e-05,  1.1762e-05, -6.3772e-05,  ..., -2.2371e-04,
         -2.1459e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['forest_path', 'office', 'home', 'office']}
tensor([[ 4.6371e-05,  7.2311e-05, -3.4880e-06,  ..., -6.2937e-04,
         -5.8368e-04,  0.0000e+00],
        [ 1.0186e-04,  1.8825e-04,  1.5617e-04,  ..., -8.8148e-05,
         -8.5813e-05,  0.0000e+00],
        [-1.2486e-04, -2.1112e-04, -2.2177e-04,  ...,  7.6994e-05,
          1.0498e-04,  0.0000e+00],
        [ 4.7646e-05,  1.1762e-05, -6.3772e-05,  ..., -2.2371e-04,
         -2.1459e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['forest_path', 'of

 46%|████████████████████████████████████▌                                           | 148/324 [00:12<00:15, 11.38it/s]

{'audio': tensor([[-0.0018, -0.0029, -0.0018,  ...,  0.0005,  0.0013,  0.0000],
        [ 0.0037,  0.0053,  0.0035,  ...,  0.0108,  0.0121,  0.0000],
        [-0.0107, -0.0187, -0.0173,  ...,  0.0031,  0.0016,  0.0000],
        [-0.0002, -0.0003, -0.0003,  ..., -0.0001, -0.0001,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['residential_area', 'tram', 'train', 'library']}
tensor([[-0.0018, -0.0029, -0.0018,  ...,  0.0005,  0.0013,  0.0000],
        [ 0.0037,  0.0053,  0.0035,  ...,  0.0108,  0.0121,  0.0000],
        [-0.0107, -0.0187, -0.0173,  ...,  0.0031,  0.0016,  0.0000],
        [-0.0002, -0.0003, -0.0003,  ..., -0.0001, -0.0001,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['residential_area', 'tram', 'train', 'library']
{'audio': tensor([[-8.4248e-05, -1.4780e-04, -1.3109e-04,  ...,  3.2339e-05,
          6.3317e-06,  0.0000e+00],
        [-7.0589e-02, -1.3090e-01, -1.2641e-01,  ..., -3.8475e-02,
         -2.7368e-02,  0.0000e+00],
        [-1.6

 47%|█████████████████████████████████████▌                                          | 152/324 [00:12<00:14, 11.71it/s]

{'audio': tensor([[-0.0013, -0.0024, -0.0043,  ..., -0.0030, -0.0019,  0.0000],
        [-0.0039, -0.0064, -0.0063,  ..., -0.0079, -0.0097,  0.0000],
        [ 0.0008,  0.0014,  0.0011,  ..., -0.0004, -0.0007,  0.0000],
        [-0.0002, -0.0003,  0.0005,  ...,  0.0020,  0.0026,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['park', 'tram', 'park', 'city_center']}
tensor([[-0.0013, -0.0024, -0.0043,  ..., -0.0030, -0.0019,  0.0000],
        [-0.0039, -0.0064, -0.0063,  ..., -0.0079, -0.0097,  0.0000],
        [ 0.0008,  0.0014,  0.0011,  ..., -0.0004, -0.0007,  0.0000],
        [-0.0002, -0.0003,  0.0005,  ...,  0.0020,  0.0026,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['park', 'tram', 'park', 'city_center']
{'audio': tensor([[-4.9931e-05, -2.6516e-04, -5.4160e-04,  ..., -6.5709e-04,
         -5.7380e-04,  0.0000e+00],
        [-9.9667e-04, -1.0815e-03,  1.9757e-04,  ...,  1.7687e-04,
          1.5668e-03,  0.0000e+00],
        [ 1.0241e-02,  1.8736e-

 48%|██████████████████████████████████████                                          | 154/324 [00:12<00:15, 11.14it/s]

{'audio': tensor([[ 2.0935e-03,  3.2623e-03,  2.5961e-03,  ..., -7.7205e-03,
         -8.3146e-03,  0.0000e+00],
        [-1.8831e-03, -5.8271e-04, -6.2520e-03,  ..., -9.2437e-04,
         -1.7414e-03,  0.0000e+00],
        [ 1.3552e-03,  2.0564e-03,  1.5588e-03,  ..., -1.5756e-03,
         -1.4639e-03,  0.0000e+00],
        [ 1.5600e-03,  2.2021e-03,  1.5062e-03,  ..., -3.0864e-06,
         -1.7209e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['tram', 'beach', 'residential_area', 'city_center']}
tensor([[ 2.0935e-03,  3.2623e-03,  2.5961e-03,  ..., -7.7205e-03,
         -8.3146e-03,  0.0000e+00],
        [-1.8831e-03, -5.8271e-04, -6.2520e-03,  ..., -9.2437e-04,
         -1.7414e-03,  0.0000e+00],
        [ 1.3552e-03,  2.0564e-03,  1.5588e-03,  ..., -1.5756e-03,
         -1.4639e-03,  0.0000e+00],
        [ 1.5600e-03,  2.2021e-03,  1.5062e-03,  ..., -3.0864e-06,
         -1.7209e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['tram', '

 49%|███████████████████████████████████████                                         | 158/324 [00:13<00:14, 11.62it/s]

{'audio': tensor([[-1.7568e-02, -1.8058e-02, -2.7027e-02,  ...,  4.0097e-02,
          4.6585e-02,  0.0000e+00],
        [ 4.5016e-05,  1.0741e-04,  5.1401e-05,  ..., -1.1363e-04,
         -1.5320e-04,  0.0000e+00],
        [-1.5823e-04, -5.3969e-04, -1.0100e-03,  ..., -2.6626e-03,
         -5.8239e-04,  0.0000e+00],
        [ 1.5203e-04,  3.2060e-04,  3.9877e-04,  ..., -6.3333e-04,
         -7.5161e-04,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['city_center', 'office', 'residential_area', 'forest_path']}
tensor([[-1.7568e-02, -1.8058e-02, -2.7027e-02,  ...,  4.0097e-02,
          4.6585e-02,  0.0000e+00],
        [ 4.5016e-05,  1.0741e-04,  5.1401e-05,  ..., -1.1363e-04,
         -1.5320e-04,  0.0000e+00],
        [-1.5823e-04, -5.3969e-04, -1.0100e-03,  ..., -2.6626e-03,
         -5.8239e-04,  0.0000e+00],
        [ 1.5203e-04,  3.2060e-04,  3.9877e-04,  ..., -6.3333e-04,
         -7.5161e-04,  0.0000e+00]])
tensor([16000, 16000, 16000, 16000])
['

 49%|███████████████████████████████████████▌                                        | 160/324 [00:13<00:13, 11.89it/s]

{'audio': tensor([[-1.4241e-03, -1.8594e-03, -1.5779e-03,  ..., -4.5728e-04,
         -3.9253e-04,  0.0000e+00],
        [ 4.3958e-03,  2.7914e-03, -2.9251e-03,  ...,  2.0364e-02,
          2.0503e-02,  0.0000e+00],
        [ 9.0581e-04,  1.4918e-03,  1.3124e-03,  ...,  5.2064e-04,
          1.1568e-03,  0.0000e+00],
        [-4.3392e-03, -6.9566e-03, -4.7360e-03,  ..., -8.2851e-04,
          3.1826e-06,  0.0000e+00]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['grocery_store', 'city_center', 'cafe/restaurant', 'cafe/restaurant']}
tensor([[-1.4241e-03, -1.8594e-03, -1.5779e-03,  ..., -4.5728e-04,
         -3.9253e-04,  0.0000e+00],
        [ 4.3958e-03,  2.7914e-03, -2.9251e-03,  ...,  2.0364e-02,
          2.0503e-02,  0.0000e+00],
        [ 9.0581e-04,  1.4918e-03,  1.3124e-03,  ...,  5.2064e-04,
          1.1568e-03,  0.0000e+00],
        [-4.3392e-03, -6.9566e-03, -4.7360e-03,  ..., -8.2851e-04,
          3.1826e-06,  0.0000e+00]])
tensor([16000, 16000, 16000, 

 51%|████████████████████████████████████████▍                                       | 164/324 [00:13<00:13, 12.09it/s]

{'audio': tensor([[ 0.0023,  0.0035,  0.0012,  ..., -0.0035, -0.0045,  0.0000],
        [-0.0003, -0.0015,  0.0002,  ...,  0.0011,  0.0025,  0.0000],
        [ 0.0061,  0.0097,  0.0072,  ...,  0.0091,  0.0093,  0.0000],
        [ 0.0001,  0.0004,  0.0003,  ..., -0.0001, -0.0008,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['beach', 'metro_station', 'bus', 'residential_area']}
tensor([[ 0.0023,  0.0035,  0.0012,  ..., -0.0035, -0.0045,  0.0000],
        [-0.0003, -0.0015,  0.0002,  ...,  0.0011,  0.0025,  0.0000],
        [ 0.0061,  0.0097,  0.0072,  ...,  0.0091,  0.0093,  0.0000],
        [ 0.0001,  0.0004,  0.0003,  ..., -0.0001, -0.0008,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['beach', 'metro_station', 'bus', 'residential_area']
{'audio': tensor([[-0.0202, -0.0342, -0.0302,  ..., -0.0355, -0.0411,  0.0000],
        [-0.0015, -0.0031, -0.0030,  ...,  0.0002, -0.0008,  0.0000],
        [-0.0002, -0.0004, -0.0003,  ...,  0.0004,  0.0005,  0.0000],

 51%|████████████████████████████████████████▉                                       | 166/324 [00:13<00:12, 12.30it/s]

{'audio': tensor([[-0.0007, -0.0012, -0.0010,  ...,  0.0023,  0.0024,  0.0000],
        [ 0.0288,  0.0466,  0.0373,  ..., -0.0008, -0.0028,  0.0000],
        [ 0.0003,  0.0002, -0.0004,  ..., -0.0013, -0.0015,  0.0000],
        [-0.0017, -0.0031, -0.0031,  ...,  0.0060,  0.0073,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['forest_path', 'city_center', 'forest_path', 'beach']}
tensor([[-0.0007, -0.0012, -0.0010,  ...,  0.0023,  0.0024,  0.0000],
        [ 0.0288,  0.0466,  0.0373,  ..., -0.0008, -0.0028,  0.0000],
        [ 0.0003,  0.0002, -0.0004,  ..., -0.0013, -0.0015,  0.0000],
        [-0.0017, -0.0031, -0.0031,  ...,  0.0060,  0.0073,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['forest_path', 'city_center', 'forest_path', 'beach']
{'audio': tensor([[ 6.5762e-04, -1.2394e-03, -2.0203e-03,  ...,  2.5427e-03,
          5.6356e-04,  0.0000e+00],
        [-1.3286e-04, -1.7477e-04, -1.4000e-04,  ..., -7.1883e-04,
         -1.1705e-03,  0.0000e+00],
 

 52%|█████████████████████████████████████████▉                                      | 170/324 [00:14<00:11, 12.86it/s]

{'audio': tensor([[-0.0001, -0.0007, -0.0018,  ...,  0.0007,  0.0007,  0.0000],
        [-0.0001, -0.0002, -0.0002,  ...,  0.0002,  0.0003,  0.0000],
        [ 0.0003,  0.0006,  0.0006,  ...,  0.0007,  0.0006,  0.0000],
        [-0.0679, -0.1165, -0.1053,  ..., -0.0914, -0.0984,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['grocery_store', 'library', 'forest_path', 'car']}
tensor([[-0.0001, -0.0007, -0.0018,  ...,  0.0007,  0.0007,  0.0000],
        [-0.0001, -0.0002, -0.0002,  ...,  0.0002,  0.0003,  0.0000],
        [ 0.0003,  0.0006,  0.0006,  ...,  0.0007,  0.0006,  0.0000],
        [-0.0679, -0.1165, -0.1053,  ..., -0.0914, -0.0984,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['grocery_store', 'library', 'forest_path', 'car']
{'audio': tensor([[ 0.0009,  0.0010,  0.0005,  ..., -0.0008, -0.0007,  0.0000],
        [-0.0373, -0.0618, -0.0532,  ..., -0.0123, -0.0139,  0.0000],
        [ 0.0010,  0.0017,  0.0012,  ...,  0.0033,  0.0042,  0.0000],
     

 53%|██████████████████████████████████████████▋                                     | 173/324 [00:14<00:12, 12.10it/s]

{'audio': tensor([[ 0.0023,  0.0040,  0.0039,  ...,  0.0016,  0.0014,  0.0000],
        [ 0.0067,  0.0072,  0.0044,  ..., -0.0052, -0.0064,  0.0000],
        [ 0.0100,  0.0183,  0.0192,  ..., -0.0120, -0.0112,  0.0000],
        [-0.0065, -0.0102, -0.0079,  ..., -0.0141, -0.0162,  0.0000]]), 'sample_rate': tensor([16000, 16000, 16000, 16000]), 'label': ['grocery_store', 'metro_station', 'car', 'train']}
tensor([[ 0.0023,  0.0040,  0.0039,  ...,  0.0016,  0.0014,  0.0000],
        [ 0.0067,  0.0072,  0.0044,  ..., -0.0052, -0.0064,  0.0000],
        [ 0.0100,  0.0183,  0.0192,  ..., -0.0120, -0.0112,  0.0000],
        [-0.0065, -0.0102, -0.0079,  ..., -0.0141, -0.0162,  0.0000]])
tensor([16000, 16000, 16000, 16000])
['grocery_store', 'metro_station', 'car', 'train']
{'audio': tensor([[-2.4439e-04, -3.0662e-04, -1.1655e-03,  ...,  1.6145e-03,
          3.8238e-03,  0.0000e+00],
        [ 1.9462e-04,  1.4024e-04, -6.5956e-05,  ...,  1.6403e-04,
         -5.7581e-07,  0.0000e+00],
        [




KeyboardInterrupt: 

In [None]:
def train(model, feature_extractor, criterion, epochs, dev, lr=0.001, load_checkpoint = False, save_every = 10, save_path = 'weights'):
    try:
        if not os.path.isdir(save_path):
            os.mkdir(save_path)
            
        # Move model to CUDA
        model = model.to(dev)
        # MOVE criterior to CUDA
        criterion = criterion.to(dev)

        # create optimizer
        optimizer = optim.Adam(model.parameters(), lr = lr)

        
        labels_l = []
        predictions_l = []
        
        # load checkpoints
        if load_checkpoint:
            if os.path.isfile(os.path.join(save_path,'weights.pt')):
                print('Loading weights...')
                # it is possible to load a state dict that doesn't match the networck architecture by passing asserting the strict mode
                model.load_state_dict(torch.load(os.path.join(save_path,'weights.pt')))
            if os.path.isfile(os.path.join(save_path,'optim.pt')):
                print('Loading optimizer...')
                optimizer.load_state_dict(torch.load(os.path.join(save_path,'optim.pt')))
            print('Loading completed!')

        # Initialize history
        history_loss = {"train": [], "val": [], "test": []}
        history_accuracy = {"train": [], "val": [], "test": []}

        # Process each epoch
        for epoch in range(epochs):
            # Initialize epoch variables
            sum_loss = {"train": 0, "val": 0, "test": 0}
            sum_accuracy = {"train": 0, "val": 0, "test": 0}
            
            # Process each split
            for split in ["train", "val", "test"]:
                #Select train() or eval() mode
                if split == 'train':
                  model.train()
                else:
                  model.eval()
                    
                # Process each batch
                for batch in loaders[split]:
                    # Move to CUDA
                    input_audio = batch['audio'].to(dev)
                    sample_rate = batch['sample_rate'].to(dev)
                    target = batch['label'].squeeze(1).to(dev)

                    # Reset gradients
                    if split == 'train':
                        optimizer.zero_grad()
                    
                    # Compute output
                    ast_imput = feature_extractor(input_audio, sampling_rate=sample_rate, return_tensors="pt")
                    output = model(ast_input)

                    # Compute loss 
                    loss = criterion(output, target.long())
                    
                    # Update loss
                    sum_loss[split] += loss.item()
                    
                    # Check parameter update
                    if split == "train":
                        # Compute gradients
                        loss.backward()
                        # Optimize
                        optimizer.step()
                        
                    # Compute accuracy
                    pred = torch.argmax(output,1)
                    batch_accuracy = (pred == target).sum().item()/target.numel()
                    
                    # Update accuracy
                    sum_accuracy[split] += batch_accuracy

                # checkpoint
                if epoch%save_every == 0 and split == 'train':
                    torch.save(model.state_dict(), os.path.join(save_path, 'weights.pt'))
                    torch.save(optimizer.state_dict(), os.path.join(save_path, 'optim.pt'))
                
                
            # Compute epoch loss/accuracy
            epoch_loss = {split: sum_loss[split]/len(loaders[split]) for split in ["train", "val", "test"]}
            epoch_accuracy = {split: sum_accuracy[split]/len(loaders[split]) for split in ["train", "val", "test"]}
            # Update history
            for split in ["train", "val", "test"]:
                history_loss[split].append(epoch_loss[split])
                history_accuracy[split].append(epoch_accuracy[split])
            # Print info
            print(f"Epoch {epoch+1}:",
                  f"TrL={epoch_loss['train']:.4f},",
                  f"TrA={epoch_accuracy['train']:.4f},",
                  f"VL={epoch_loss['val']:.4f},",
                  f"VA={epoch_accuracy['val']:.4f},",
                  f"TeL={epoch_loss['test']:.4f},",
                  f"TeA={epoch_accuracy['test']:.4f},")
    except KeyboardInterrupt:
        print("Interrupted")
    finally:
        # Plot loss
        plt.title("Loss")
        for split in ["train", "val", "test"]:
            plt.plot(history_loss[split], label=split)
        plt.legend()
        plt.show()
        # Plot accuracy
        plt.title("Accuracy")
        for split in ["train", "val", "test"]:
            plt.plot(history_accuracy[split], label=split)
        plt.legend()
        plt.show()

In [327]:
def conta_istanze_classi(path):
    class_count = {}



    with open(path, 'r') as file:

        for line in file:

            nome_audio, nome_classe = line.strip().split()

            if nome_classe in class_count:
                class_count[nome_classe] += 1
            else:
                class_count[nome_classe] = 1


    return class_count

print(conta_istanze_classi("c:/Users/cerru/Desktop/TUT17/labels/evaluate.txt"))

{'bus': 108, 'residential_area': 108, 'car': 108, 'grocery_store': 108, 'train': 108, 'forest_path': 108, 'park': 108, 'library': 108, 'cafe/restaurant': 108, 'tram': 108, 'city_center': 108, 'office': 108, 'beach': 108, 'home': 108, 'metro_station': 108}
