In [1]:
from src.utils.audio_utils import compute_spectrogram, load_audio_file, padding_audio
import matplotlib.pyplot as plt
import os
import torch
nn = torch.nn
from src.utils import path_utils
import torchinfo
import torchvision
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class BasicModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.main = nn.Sequential(
      nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2, stride=2),
      nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2, stride=2),
      nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2, stride=2),
      nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2, stride=2),
    )
    self.linear = nn.Linear(128, 2)
  
  def forward(self, x):
    x = self.main(x)
    # global average pooling
    x = nn.functional.avg_pool2d(x, kernel_size=x.shape[2:]).view(x.shape[0], -1)
    x = self.linear(x)
    return x


In [7]:
class SlidingDataset(torch.utils.data.Dataset):
    def __init__(self, raw_dir, transform_audio, window_size, stride):

        self.tensor_directory = raw_dir
        self.transform_audio = transform_audio
        self.reshape_size = (129, 229)
        self.transform_image = torchvision.transforms.Compose([torchvision.transforms.ToTensor(), torchvision.transforms.Resize(self.reshape_size)])
        self.files = os.listdir(raw_dir)
        self.window_size = window_size
        self.stride = stride
        self.data_tuples = []

        for f in self.files:
            file = os.path.join(raw_dir, f)
            data, fs = load_audio_file(file)
            # pad with zeros with tensor is not of right length
            data = padding_audio(data, self.window_size)
            idxs = [i for i in range(0, data.size - self.window_size, self.stride)]
            if len(idxs) == 0:
                continue
            for j in idxs:
                data_tuple = (file, j)
                self.data_tuples.append(data_tuple)


    def __len__(self):
        return len(self.data_tuples)

    def __getitem__(self, idx):
        sample_tuple = self.data_tuples[idx]
        sample, _ = load_audio_file(sample_tuple[0])
        sample = sample[sample_tuple[1]: sample_tuple[1] + self.window_size]
        x = self.transform_audio(sample)
        print(x.shape)
        x = self.transform_image(x)
        print(x.shape)
        return {'sample': sample, 'file': sample_tuple[0], 'index': sample_tuple[1]}

In [8]:
def transform_audio(data):
    _, _, specto = compute_spectrogram(data, 24000, nperseg=256, noverlap=256/2, scale="dB")
    # freq clip
    specto = specto[:120, :]
    return specto
detection_dataloader = torch.utils.data.DataLoader(SlidingDataset(raw_dir=path_utils.get_raw_data_path(),
                                                    transform_audio=transform_audio, 
                                                    window_size=256, stride=128),
                                                    batch_size=1, 
                                                    drop_last=True)

  data=librosa.util.fix_length(data, data.size + window_size - data.size % window_size)


In [9]:
model = BasicModel()

for x in detection_dataloader:
  print(x['sample'].shape)
  print(x['file'])
  print(x['index'])
  logits=model(x['sample'])
  print(torch.argmax(logits, dim=1).numpy().tolist())

(120, 1)
torch.Size([1, 129, 229])
torch.Size([1, 256])
['./data/raw_data/SMA01214_20210809_233002.wav']
tensor([0])


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 256]