In [1]:
pip install torchvggish

Note: you may need to restart the kernel to use updated packages.


In [8]:
import os
import torch
import torchaudio
from torchvggish import vggish, vggish_input
import numpy as np
from tqdm import tqdm

model = vggish()
model.eval()

audio_folder = "../../data/input.data"
embeddings = {}

for fname in tqdm(os.listdir(audio_folder), desc="Processing audios"):
    if fname.endswith(".wav"):
        path = os.path.join(audio_folder, fname)
        waveform, sr = torchaudio.load(path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
        if waveform.shape[1] < 16000:
            pad_len = 16000 - waveform.shape[1]
            waveform = torch.nn.functional.pad(waveform, (0, pad_len))
        torchaudio.save("tmp.wav", waveform, 16000)
        examples = vggish_input.wavfile_to_examples("tmp.wav")
        if examples.size == 0:
            continue
        with torch.no_grad():
            embs = model(torch.tensor(examples))
        embeddings[fname] = embs.mean(dim=0).cpu().numpy()


  embs = model(torch.tensor(examples))
Processing audios: 100%|█████████████████████████████████████████████████████████████| 500/500 [00:44<00:00, 11.29it/s]


In [10]:
save_folder = "input_data_embeddings"
os.makedirs(save_folder, exist_ok=True)

for fname, emb in embeddings.items():
    save_path = os.path.join(save_folder, fname.replace(".wav", ".npy"))
    np.save(save_path, emb)