In [1]:
from librosa.core import audio
from scipy.signal import waveforms
from torch.utils.data.dataset import Dataset
import librosa
from glob import glob
import cv2
import numpy as np
import torch 
import random
import pandas as pd
import time
import clip

from textaugment import EDA
import nltk
import pickle
from PIL import Image
import os

from tqdm import tqdm
import matplotlib.pyplot as plt

from test_datasets import AudioSetTestDataset, VGGSoundTestDataset

In [None]:
"""

INFO: Success: the dataset is complete and all files are valid.
INFO: --------------------
Clip(
  audio_path="/home/hjp/sound_datasets/urbansound8k/audio/fold4/55728-9-0-67.wav",
  clip_id="55728-9-0-67",
  audio: The clip's audio
            * np.ndarray - audio signal
            * float - sample rate,
  class_id: The clip's class id.
            * int - integer representation of the class label (0-9). See Dataset Info in the documentation for mapping,
  class_label: The clip's class label.
            * str - string class name: air_conditioner, car_horn, children_playing, dog_bark, drilling, engine_idling, gun_shot, jackhammer, siren, street_music,
  fold: The clip's fold.
            * int - fold number (1-10) to which this clip is allocated. Use these folds for cross validation,
  freesound_end_time: The clip's end time in Freesound.
            * float - end time in seconds of the clip in the original freesound recording,
  freesound_id: The clip's Freesound ID.
            * str - ID of the freesound.org recording from which this clip was taken,
  freesound_start_time: The clip's start time in Freesound.
            * float - start time in seconds of the clip in the original freesound recording,
  salience: The clip's salience.
            * int - annotator estimate of class salience in the clip: 1 = foreground, 2 = background,
  slice_file_name: The clip's slice filename.
            * str - The name of the audio file. The name takes the following format: [fsID]-[classID]-[occurrenceID]-[sliceID].wav,
  tags: The clip's tags.
            * annotations.Tags - tag (label) of the clip + confidence. In UrbanSound8K every clip has one tag,
)


"""

In [3]:
#nltk.download("stopwords")
#nltk.download("wordnet")

In [74]:
# Directory holding the VGGSound .wav files (same value as the default
# `dataset_path` of VggSoundTestDataset below).
# NOTE(review): no visible cell reads this constant, and the name has a doubled
# "S" -- confirm nothing else depends on it before renaming or removing.
VGGSSOUND_DATA_PATH = "./vggsound"

In [117]:
# Scan the whole AudioSet test split once and record every index whose sample
# cannot be loaded, so that the offending files can be deleted afterwards.
audioset_test_dataset = AudioSetTestDataset()

audioset_erronous_idx_list = []
for idx in tqdm(range(len(audioset_test_dataset))) :
    try :
        data = audioset_test_dataset[idx]
    except Exception as e :
        # Report the path and the cause, not just the index: the follow-up
        # delete cell needs the file path, and the cause tells whether the file
        # is really broken. tqdm.write keeps the progress bar intact.
        tqdm.write(f"{idx} : {audioset_test_dataset.audio_path_list[idx]} ({e!r})")
        audioset_erronous_idx_list.append(idx)

print("idx of erronous file :")
print(audioset_erronous_idx_list)

100%|██████████| 17045/17045 [04:59<00:00, 56.85it/s]

idx of erronous file :
[]





In [112]:
# Spot-check one sample: set `idx` to an entry of audioset_erronous_idx_list
# and re-load it to see whether it (still) raises.

idx = 5791
print(audioset_test_dataset.audio_path_list[idx])
print()
audioset_test_dataset[idx]

./audioset/Fire_9300.wav



(tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.2836, 0.2836, 0.2836],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]]),
 tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.2836, 0.2836, 0.2836],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]]),
 'elicit provoke')

In [116]:
# Some clips raise errors when loaded, for various reasons.
# Iterate through every .wav file and delete the erroneous ones.

# 



# Files found to be unloadable by the scan above.
audioset_delete_list = [
    './audioset/Fire_9300.wav'
]
# delete
# A plain loop with an existence check keeps this cell re-runnable: a bare
# os.remove raises FileNotFoundError once the file has already been deleted.
for file_path in audioset_delete_list :
    if os.path.exists(file_path) :
        os.remove(file_path)
    else :
        print("already removed :", file_path)

[None]

In [7]:
import torch
import clip
import torch 
from collections import OrderedDict
import math
import timm

def copyStateDict(state_dict):
    """Return a copy of ``state_dict`` with a leading ``"module."`` removed from keys.

    Checkpoints saved from a model wrapped in ``torch.nn.DataParallel`` prefix
    every parameter name with ``"module."``; stripping it lets the weights be
    loaded into the bare model.

    Args:
        state_dict: mapping of parameter name -> value. May be empty.

    Returns:
        OrderedDict with the same values in the same order. Keys that start
        with ``"module."`` lose that prefix; all other keys are unchanged.
    """
    prefix = "module."
    new_state_dict = OrderedDict()
    for key, value in state_dict.items():
        # Decide per key and require the trailing dot, so names such as
        # "modules.0.weight" are not truncated and an empty dict is handled.
        name = key[len(prefix):] if key.startswith(prefix) else key
        new_state_dict[name] = value
    return new_state_dict

class AudioEncoder(torch.nn.Module):
    """Audio embedding network: a 1->3 channel convolution followed by a timm backbone.

    The single-channel input is first mapped to three channels by a 3x3
    convolution, then passed to a timm model created with ``num_classes=512``
    and ``pretrained=True``.

    Args:
        backbone_name: timm model name passed to ``timm.create_model``.
    """

    def __init__(self, backbone_name="resnet18"):
        super().__init__()
        self.backbone_name = backbone_name
        # Attribute names (`conv`, `feature_extractor`) are part of the
        # checkpoint's state-dict keys, so they must not be renamed.
        self.conv = torch.nn.Conv2d(in_channels=1, out_channels=3, kernel_size=(3, 3))
        self.feature_extractor = timm.create_model(
            backbone_name,
            num_classes=512,
            pretrained=True,
        )

    def forward(self, x):
        """Apply the channel-expanding convolution, then the backbone."""
        three_channel = self.conv(x)
        return self.feature_extractor(three_channel)
    

class SoundCLIPLoss(torch.nn.Module):
    """Loss that is low when an image's CLIP embedding aligns with audio embeddings.

    The image is resized (upsample x7, then average-pool by
    ``opts.stylegan_size // 32``), encoded with CLIP ViT-B/32, and compared by
    cosine similarity with the output of the pretrained ``AudioEncoder``.

    Args:
        opts: object exposing an integer ``stylegan_size`` attribute.
    """

    def __init__(self, opts):
        super(SoundCLIPLoss, self).__init__()
        # Use the GPU when there is one, but do not crash on CPU-only hosts
        # (the original hard-coded "cuda").
        device = "cuda" if torch.cuda.is_available() else "cpu"

        self.model, self.preprocess = clip.load("ViT-B/32", device=device)
        self.upsample = torch.nn.Upsample(scale_factor=7)
        self.avg_pool = torch.nn.AvgPool2d(kernel_size=opts.stylegan_size // 32)

        self.audio_encoder = AudioEncoder()

        # map_location lets a checkpoint saved on a GPU be loaded on any device.
        # NOTE(review): the inference cell further down loads
        # "../pretrained_models/resnet18.pth" (one directory up) -- confirm
        # which relative path is the intended one.
        state_dict = torch.load("./pretrained_models/resnet18.pth", map_location=device)
        self.audio_encoder.load_state_dict(copyStateDict(state_dict))

        self.audio_encoder = self.audio_encoder.to(device)
        self.audio_encoder.eval()

    def forward(self, image, audio):
        """Return ``1 - scaled cosine similarity`` between the first image and each audio.

        Args:
            image: image batch accepted by ``self.model.encode_image`` after resizing.
            audio: batch accepted by ``self.audio_encoder``.

        Returns:
            1-D tensor with one loss value per audio sample; only row 0 of the
            image/audio similarity matrix (the first image) is used.
        """
        image = self.avg_pool(self.upsample(image))
        image_features = self.model.encode_image(image).float()
        audio_features = self.audio_encoder(audio).float()

        # L2-normalise so the matrix product below is a cosine similarity.
        audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # NOTE(review): the scale is exp(0.07) (~1.07). If a CLIP-style
        # temperature was intended, that would be 1 / 0.07 -- confirm.
        sim = (image_features @ audio_features.T)[0] * math.exp(0.07)
        loss = 1 - sim
        return loss


In [10]:
from torch.utils.data.dataloader import DataLoader

# set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device :", device)

# initialize dataset
audioset_test_dataset = AudioSetTestDataset(
    prompt_generator = lambda prompt : "sound of " + prompt
)

# initialize dataloader
audioset_test_loader = DataLoader(
    audioset_test_dataset,
    16,
    num_workers=10
)

# initialize encoder
# map_location makes the checkpoint loadable on the CPU fallback as well.
# NOTE(review): SoundCLIPLoss loads "./pretrained_models/resnet18.pth" (no "..")
# -- confirm which relative path is the intended one.
audio_encoder = AudioEncoder()
audio_encoder.load_state_dict(
    copyStateDict(torch.load("../pretrained_models/resnet18.pth", map_location=device))
)
audio_encoder.to(device=device)
# Inference mode: eval() switches off training-only layer behaviour (as
# SoundCLIPLoss does for the same encoder).
audio_encoder.eval()

# run inference and accumulate result
# no_grad() avoids building autograd graphs for the whole pass.
result_list = []
with torch.no_grad(), tqdm(total=len(audioset_test_dataset)) as pbar :
    for X, label in audioset_test_loader :
        X = X.to(device)
        result = audio_encoder(X)
        result_list.append(result.cpu())
        # advance the bar by the actual batch size (the last batch may be smaller)
        pbar.update(X.shape[0])

device : cuda:0


100%|██████████| 17045/17045 [08:41<00:00, 32.67it/s]


In [12]:
# Sanity check: shape of the encoder output collected for the first batch.
result_list[0].shape

torch.Size([16, 512])

# VggSoundTestDataset

In [186]:

class VggSoundTestDataset(Dataset) :
    """VGGSound ``.wav`` clips served as ``(mel-spectrogram tensor, text prompt)`` pairs.

    Files are expected to be named ``[label]_[index].wav`` inside
    ``dataset_path``; the label part becomes the text prompt.

    Args:
        seed: value passed to ``random.seed``. This seeds the *global*
            ``random`` module, which drives the random crop / pad offset.
        dataset_path: directory that is globbed for ``*.wav`` files.
        prompt_generator: callable mapping the raw label to the returned prompt.
    """

    def __init__(
        self,
        seed = 1,
        dataset_path = "./vggsound",
        prompt_generator = lambda x : x
    ) :
        super(VggSoundTestDataset, self).__init__()
        self.dataset_path = dataset_path
        # sorted() makes the index -> file mapping independent of glob order
        self.audio_path_list = sorted(glob(os.path.join(self.dataset_path, "*.wav")))
        self.label_list = [
            self._label_from_path(file_path) for file_path in self.audio_path_list
        ]

        random.seed(seed)

        self.prompt_generator = prompt_generator
        self.time_length = 864
        self.n_mels = 128
        self.text_aug = EDA()
        self.width_resolution = 512

    @staticmethod
    def _label_from_path(file_path) :
        """Return the ``[label]`` part of a ``[label]_[index].wav`` file path.

        Splits on the LAST underscore so that labels which themselves contain
        underscores are kept whole.
        """
        return os.path.basename(file_path).rsplit('_', 1)[0]

    def __getitem__(self, idx) :
        """Load clip ``idx`` and return ``(audio_inputs, text_prompt)``.

        Returns:
            audio_inputs: float tensor reshaped to
                ``(-1, n_mels, width_resolution)``.
            text_prompt: ``prompt_generator(label)`` for this clip.
        """
        audio_inputs, sr = librosa.load(self.audio_path_list[idx])
        audio_inputs = librosa.feature.melspectrogram(y=audio_inputs, sr=sr, n_mels=self.n_mels)
        # scale the dB spectrogram by 1/80 and shift by +1
        audio_inputs = librosa.power_to_db(audio_inputs, ref=np.max) / 80.0 + 1
        audio_inputs = np.array([audio_inputs])

        text_prompt = self.label_list[idx]
        c, h, w = audio_inputs.shape

        if w >= self.time_length:
            # long clip: random crop of `time_length` frames
            j = random.randint(0, w-self.time_length)
            audio_inputs = audio_inputs[:,:,j:j+self.time_length]
        else:
            # short clip: paste at a random offset into a zero canvas
            zero = np.zeros((1, self.n_mels, self.time_length))
            j = random.randint(0, self.time_length - w - 1)
            zero[:,:,j:j+w] = audio_inputs[:,:,:w]
            audio_inputs = zero

        # NOTE(review): the resize runs twice. The second call receives
        # `audio_inputs[0]`, i.e. only the first row of the already resized 2-D
        # array, so the result appears to be built from that single row.
        # VggsoundCurationDataset contains the same pair of lines, so it is
        # kept unchanged here to stay consistent with that preprocessing --
        # confirm against the checkpoint's training code before removing it.
        audio_inputs = cv2.resize(audio_inputs[0], (self.n_mels, self.width_resolution))
        audio_inputs = cv2.resize(audio_inputs[0], (self.n_mels, self.width_resolution))

        audio_inputs = audio_inputs.reshape(-1, self.n_mels, self.width_resolution)
        audio_inputs = torch.from_numpy(audio_inputs).float()

        # edit text prompt by given function
        text_prompt = self.prompt_generator(text_prompt)

        return audio_inputs, text_prompt


    def spec_augment(self, spec, num_mask=2, freq_masking_max_percentage=0.15, time_masking_max_percentage=0.3):
        """Return a copy of 2-D ``spec`` with random column and row bands zeroed.

        Not called by ``__getitem__`` of this class.

        For each of ``num_mask`` rounds, one band of columns (axis 1) and one
        band of rows (axis 0) is set to 0. Band widths are drawn with
        ``random.uniform`` and band offsets with ``np.random.uniform``; the
        latter is not affected by ``random.seed``.

        Args:
            spec: 2-D array; it is not modified.
            num_mask: number of masking rounds.
            freq_masking_max_percentage: upper bound of the axis-1 band width,
                as a fraction of ``spec.shape[1]``.
            time_masking_max_percentage: upper bound of the axis-0 band width,
                as a fraction of ``spec.shape[0]``.
        """
        spec = spec.copy()
        for i in range(num_mask):
            all_frames_num, all_freqs_num = spec.shape
            freq_percentage = random.uniform(0.0, freq_masking_max_percentage)

            num_freqs_to_mask = int(freq_percentage * all_freqs_num)
            f0 = np.random.uniform(low=0.0, high=all_freqs_num - num_freqs_to_mask)
            f0 = int(f0)
            spec[:, f0:f0 + num_freqs_to_mask] = 0

            time_percentage = random.uniform(0.0, time_masking_max_percentage)

            num_frames_to_mask = int(time_percentage * all_frames_num)
            t0 = np.random.uniform(low=0.0, high=all_frames_num - num_frames_to_mask)
            t0 = int(t0)
            spec[t0:t0 + num_frames_to_mask, :] = 0
        return spec

    def __len__(self):
        """Number of ``.wav`` files found in ``dataset_path``."""
        return len(self.audio_path_list)

In [107]:
# Some clips raise errors when loaded, for various reasons.
# Iterate through every .wav file and delete the erroneous ones.


# Scan the whole VGGSound test split once and record every index whose sample
# cannot be loaded.
vggsound_erronous_idx_list = []

vggsound_test_dataset = VggSoundTestDataset()
for idx in tqdm(range(len(vggsound_test_dataset))) :
    try :
        data = vggsound_test_dataset[idx]
    except Exception as e :
        # Report the path and the cause as well as the index; tqdm.write keeps
        # the progress bar intact.
        tqdm.write(f"{idx} : {vggsound_test_dataset.audio_path_list[idx]} ({e!r})")
        # .append(idx), not .append[idx]: subscripting the bound method raised
        # TypeError inside the handler the first time a bad file was found.
        vggsound_erronous_idx_list.append(idx)

print(vggsound_erronous_idx_list)

100%|██████████| 13540/13540 [04:05<00:00, 55.19it/s]

[]





In [108]:
# Spot-check one sample: set `idx` to an entry of vggsound_erronous_idx_list
# and re-load it to see whether it (still) raises.

idx = 0
print(vggsound_test_dataset.audio_path_list[idx])
data = vggsound_test_dataset[idx]

./vggsound/air conditioning noise_102097.wav


In [106]:

# 
# Files found to be unloadable by the scan above.
vggsound_delete_file_path_list = [
    './vggsound/hail_179314.wav',
    './vggsound/police car (siren)_5568.wav',
    './vggsound/squishing water_94705.wav',
]
# delete
# A plain loop with an existence check keeps this cell re-runnable: a bare
# os.remove raises FileNotFoundError once a file has already been deleted.
for file_path in vggsound_delete_file_path_list :
    if os.path.exists(file_path) :
        os.remove(file_path)
    else :
        print("already removed :", file_path)

[None, None, None]

In [127]:
# Rebuild the dataset with a prompt template and eyeball the generated prompts
# next to the file they came from.
vggsound_test_dataset = VggSoundTestDataset(
    prompt_generator = lambda prompt : f"sound of {prompt}"
)

# Preview at most 100 samples; min() avoids an IndexError on smaller datasets.
for i in range(min(100, len(vggsound_test_dataset))) :
    print(vggsound_test_dataset[i][-1])
    print(vggsound_test_dataset.audio_path_list[i])
    print()

sound of air conditioning noise
./vggsound/air conditioning noise_102097.wav

sound of air conditioning noise
./vggsound/air conditioning noise_103820.wav

sound of air conditioning noise
./vggsound/air conditioning noise_108938.wav

sound of air conditioning noise
./vggsound/air conditioning noise_110135.wav

sound of air conditioning noise
./vggsound/air conditioning noise_116249.wav

sound of air conditioning noise
./vggsound/air conditioning noise_116275.wav

sound of air conditioning noise
./vggsound/air conditioning noise_11715.wav

sound of air conditioning noise
./vggsound/air conditioning noise_12594.wav

sound of air conditioning noise
./vggsound/air conditioning noise_132581.wav

sound of air conditioning noise
./vggsound/air conditioning noise_133492.wav

sound of air conditioning noise
./vggsound/air conditioning noise_139073.wav

sound of air conditioning noise
./vggsound/air conditioning noise_150965.wav

sound of air conditioning noise
./vggsound/air conditioning noise_

In [None]:
class VggsoundCurationDataset(Dataset):
    """Pre-computed VGGSound spectrograms served with an augmented copy and a text prompt.

    Reads ``./vggsound_curation/*.npy`` files named ``[label]_[index].npy`` and
    returns, per item, the spectrogram tensor, a masked (spec-augmented) copy
    and a text prompt derived from the label via EDA text augmentation.
    """

    def __init__(self):
        # sorted() makes the index -> file mapping independent of glob order
        # (same as VggSoundTestDataset).
        self.audio_lists = sorted(glob("./vggsound_curation/*.npy"))
        self.time_length = 864
        self.n_mels = 128
        self.text_aug = EDA()
        self.width_resolution = 512

    @staticmethod
    def _label_from_path(file_path):
        """Return the ``[label]`` part of a ``[label]_[index].npy`` file path.

        Uses ``os.path.basename`` rather than splitting on "/" and splits on
        the LAST underscore so labels containing underscores are kept whole.
        """
        return os.path.basename(file_path).rsplit("_", 1)[0]

    def __getitem__(self, idx):
        """Return ``(audio_inputs, audio_aug, text_prompt)`` for item ``idx``.

        ``audio_inputs`` and ``audio_aug`` are float tensors reshaped to
        ``(-1, n_mels, width_resolution)``; ``text_prompt`` is the label after
        synonym replacement, random swap and random insertion.
        """
        wav_name = self.audio_lists[idx]
        # NOTE(review): allow_pickle=True can execute arbitrary code when
        # loading untrusted files -- only use with trusted data.
        audio_inputs = np.load(wav_name, allow_pickle=True)

        text_prompt = self._label_from_path(wav_name)
        c, h, w = audio_inputs.shape

        if w >= self.time_length:
            # long clip: random crop of `time_length` frames
            j = random.randint(0, w-self.time_length)
            audio_inputs = audio_inputs[:,:,j:j+self.time_length]
        else:
            # short clip: paste at a random offset into a zero canvas
            zero = np.zeros((1, self.n_mels, self.time_length))
            j = random.randint(0, self.time_length - w - 1)
            zero[:,:,j:j+w] = audio_inputs[:,:,:w]
            audio_inputs = zero

        # NOTE(review): the resize runs twice. The second call receives
        # `audio_inputs[0]`, i.e. only the first row of the already resized 2-D
        # array, so the result appears to be built from that single row. Left
        # unchanged because existing checkpoints may have been trained on this
        # exact preprocessing -- confirm before removing the duplicate.
        audio_inputs = cv2.resize(audio_inputs[0], (self.n_mels, self.width_resolution))
        audio_inputs = cv2.resize(audio_inputs[0], (self.n_mels, self.width_resolution))

        audio_aug = self.spec_augment(audio_inputs)
        audio_inputs = audio_inputs.reshape(-1, self.n_mels, self.width_resolution)
        audio_aug = audio_aug.reshape(-1, self.n_mels, self.width_resolution)

        audio_inputs = torch.from_numpy(audio_inputs).float()
        audio_aug = torch.from_numpy(audio_aug).float()

        text_prompt = self.text_aug.synonym_replacement(text_prompt)
        text_prompt = self.text_aug.random_swap(text_prompt)
        text_prompt = self.text_aug.random_insertion(text_prompt)

        return audio_inputs, audio_aug, text_prompt

    def spec_augment(self, spec, num_mask=2, freq_masking_max_percentage=0.15, time_masking_max_percentage=0.3):
        """Return a copy of 2-D ``spec`` with random column and row bands zeroed.

        For each of ``num_mask`` rounds, one band of columns (axis 1) and one
        band of rows (axis 0) is set to 0. Band widths are drawn with
        ``random.uniform`` and band offsets with ``np.random.uniform``, so both
        RNGs must be seeded for reproducible masks.

        Args:
            spec: 2-D array; it is not modified.
            num_mask: number of masking rounds.
            freq_masking_max_percentage: upper bound of the axis-1 band width,
                as a fraction of ``spec.shape[1]``.
            time_masking_max_percentage: upper bound of the axis-0 band width,
                as a fraction of ``spec.shape[0]``.
        """
        spec = spec.copy()
        for i in range(num_mask):
            all_frames_num, all_freqs_num = spec.shape
            freq_percentage = random.uniform(0.0, freq_masking_max_percentage)

            num_freqs_to_mask = int(freq_percentage * all_freqs_num)
            f0 = np.random.uniform(low=0.0, high=all_freqs_num - num_freqs_to_mask)
            f0 = int(f0)
            spec[:, f0:f0 + num_freqs_to_mask] = 0

            time_percentage = random.uniform(0.0, time_masking_max_percentage)

            num_frames_to_mask = int(time_percentage * all_frames_num)
            t0 = np.random.uniform(low=0.0, high=all_frames_num - num_frames_to_mask)
            t0 = int(t0)
            spec[t0:t0 + num_frames_to_mask, :] = 0
        return spec

    def __len__(self):
        """Number of ``.npy`` files found."""
        return len(self.audio_lists)


Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A
...,...,...,...,...,...,...,...
1995,5-263831-B-6.wav,5,6,hen,False,263831,B
1996,5-263902-A-36.wav,5,36,vacuum_cleaner,False,263902,A
1997,5-51149-A-25.wav,5,25,footsteps,False,51149,A
1998,5-61635-A-8.wav,5,8,sheep,False,61635,A
