In [1]:
import numpy as np
import scipy

import torch
from torch.utils.data import DataLoader

import librosa
from hyperparameters import *

from custom_datasets import VoxCelebDataset, MelCelebDataset
from preprocessing import melspectrogram

from utils import show, random_crop
import os

from IPython.display import Audio
from matplotlib import pyplot as plt

from tqdm import tqdm_notebook as tqdm

from VGGish import vggish

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this filter is installed after the imports above have already
# run, so warnings emitted while importing those modules are not suppressed;
# it also hides deprecation notices raised by later cells.
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters


In [2]:
def transform(x, crop_size=96):
    """Randomly crop a sample and convert it to a torch tensor.

    Args:
        x: Sample accepted by ``random_crop`` (presumably a mel spectrogram
            array supplied by ``MelCelebDataset`` -- TODO confirm).
        crop_size: Size argument forwarded to ``random_crop``. Defaults to 96,
            the value this notebook previously hard-coded, so existing
            single-argument callers (e.g. a dataset ``transform`` hook) behave
            exactly as before.

    Returns:
        torch.Tensor: The cropped sample converted with ``torch.Tensor(...)``.
    """
    cropped = random_crop(x, crop_size)
    # torch.Tensor(...) is kept on purpose: it yields the default tensor type
    # and copies the data, which is what downstream cells have been receiving.
    return torch.Tensor(cropped)

# Build the dataset rooted at MEL_DATA_PATH (presumably provided by the
# `hyperparameters` star import) and hand it `transform` as its transform hook.
data = MelCelebDataset(
    MEL_DATA_PATH,
    transform=transform,
)

In [3]:
# Display the first element of the first dataset item as a sanity check.
data[0][0]

tensor([[0.4449, 0.4260, 0.4216,  ..., 0.4209, 0.4630, 0.4075],
        [0.4074, 0.4092, 0.4298,  ..., 0.4039, 0.4306, 0.3602],
        [0.5133, 0.5027, 0.4857,  ..., 0.4463, 0.5062, 0.4792],
        ...,
        [0.3996, 0.3818, 0.4205,  ..., 0.4568, 0.5388, 0.5475],
        [0.3987, 0.3377, 0.4185,  ..., 0.4276, 0.4963, 0.5194],
        [0.3514, 0.2796, 0.3348,  ..., 0.4480, 0.4815, 0.4871]])

In [5]:
# Wrap the dataset in a shuffling loader: batches of 512, one loader worker.
data_loader = DataLoader(
    dataset=data,
    batch_size=512,
    shuffle=True,
    num_workers=1,
)

In [8]:
# Display the same item again. The values shown differ from the earlier
# display of data[0][0], which is consistent with `transform` (random crop)
# being applied on each access -- NOTE(review): confirm in MelCelebDataset.
data[0][0]

tensor([[0.3587, 0.3813, 0.4072,  ..., 0.3973, 0.3110, 0.3664],
        [0.4312, 0.4355, 0.4407,  ..., 0.3514, 0.2773, 0.3734],
        [0.4275, 0.4072, 0.4194,  ..., 0.2699, 0.2689, 0.3722],
        ...,
        [0.3885, 0.3748, 0.4091,  ..., 0.3963, 0.3816, 0.3528],
        [0.2940, 0.3477, 0.3735,  ..., 0.3890, 0.3749, 0.3743],
        [0.3348, 0.3800, 0.3993,  ..., 0.3916, 0.3499, 0.3765]])

In [None]:
# Preview the first few labels of the dataset.
# The original cell was only an unfinished `for label in data.labels` header
# (no colon, no body), which is a SyntaxError and aborts Restart & Run All.
# TODO(review): the intended loop body is unknown; printing is a placeholder.
for idx, label in enumerate(data.labels):
    if idx >= 5:
        # Keep the cell output short instead of dumping every label.
        break
    print(label)

In [None]:
# Open a wide 10x3 figure and render the first dataset item's tensor as an image.
plt.figure(figsize=(10, 3))
plt.imshow(data[0][0])

In [None]:
# Display the dataset object's repr.
data

In [9]:
# Instantiate the VGGish network with the `pretrained` and
# `include_classifier` flags both enabled.
net = vggish(
    pretrained=True,
    include_classifier=True,
)

In [10]:
# Display the module tree of the network (its `features` and `classifier` stacks).
net

VGGish(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU()
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU()
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Linear(in_features=12288, out