In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
#from utils import transforms
from utils_dir import transforms 
import torchvision

import os
import numpy as np
import imageio
import random
import collections
import csv
import librosa
import os
import torch
import torchaudio
import torchaudio.transforms as T
import random
import config
torch.cuda.empty_cache()

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")





In [19]:


class ContrastiveESCDataset(torch.utils.data.Dataset):
    
    def __init__(self, train=True, root='./data/ESC50/ESC-50-master/audio/'):
        self.root = root
        self.train = train
        
        temp = os.listdir(self.root)
        temp.sort()
        self.file_names = []
        if train:
            for i in range(len(temp)):
                if int(temp[i].split('-')[0]) in config.train_folds:
                    self.file_names.append(temp[i])
        else:
            for i in range(len(temp)):
                if int(temp[i].split('-')[0]) in config.test_fold:
                    self.file_names.append(temp[i])
        
        self.mel_transform = T.MelSpectrogram(sample_rate=44100, n_mels=32, n_fft=1024, hop_length=512)
        
        if self.train:
            self.wave_transforms = torchvision.transforms.Compose([
                transforms.ToTensor1D(), 
                transforms.RandomScale(max_scale = 1.25), 
                transforms.RandomPadding(out_len = 220500),
                transforms.RandomCrop(out_len = 220500)
            ])
            self.spec_transforms = torchvision.transforms.Compose([
                transforms.FrequencyMask(max_width=config.freq_masks_width, numbers=config.freq_masks), 
                transforms.TimeMask(max_width=config.time_masks_width, numbers=config.time_masks)
            ])
        else:
            self.wave_transforms = torchvision.transforms.Compose([
                transforms.ToTensor1D(),
                transforms.RandomPadding(out_len = 220500),
                transforms.RandomCrop(out_len = 220500)
            ])
            # Only the ToTensor transform is removed here.
            self.spec_transforms = torchvision.transforms.Compose([])

    def __len__(self):
        return len(self.file_names)
    
    def __getitem__(self, index):
        file_name = self.file_names[index]
        
        # Create a positive pair and a negative sample
        aug_wave1, aug_wave2, neg_wave = self.load_and_augment(file_name, index)
        
        # Create spectrograms
        spec1 = self.generate_spectrogram(aug_wave1)
        spec2 = self.generate_spectrogram(aug_wave2)
        neg_spec = self.generate_spectrogram(neg_wave)
        
        return (spec1, spec2, neg_spec)

    def load_and_augment(self, file_name, index):
        path = os.path.join(self.root, file_name)
        wave, _ = torchaudio.load(path, num_frames=44100)
        wave = wave.squeeze(0)

        # Augment aug_wave2 and neg_wave
        aug_wave2 = self.process_wave(wave)
        neg_index = random.choice([x for x in range(len(self.file_names)) if x != index])
        neg_file_name = self.file_names[neg_index]
        neg_path = os.path.join(self.root, neg_file_name)
        neg_wave, _ = torchaudio.load(neg_path, num_frames=44100)
        neg_wave = neg_wave.squeeze(0)
        neg_wave = self.process_wave(neg_wave)

        return wave, aug_wave2, neg_wave

    def process_wave(self, wave):
        if wave.numel() == 0:
            # Handle empty tensor (e.g., by returning a zero tensor)
            return torch.zeros(1, self.out_len)  # Adjust the shape as needed

        # Normalize, remove silent sections, and apply wave transforms
        if wave.ndim == 1:
            wave = wave.unsqueeze(1)

        if np.abs(wave.max()) > 1.0:
            wave = transforms.scale(wave, wave.min(), wave.max(), -1.0, 1.0)
        
        non_zero_indices = wave.nonzero()
        
        if non_zero_indices.numel() == 0:
            # Handle the case where there are no non-zero elements in the waveform
            return torch.zeros(1, self.out_len)  # Adjust the shape as needed
        
        start = non_zero_indices[:, 1].min()
        end = non_zero_indices[:, 1].max()
        
        wave = wave[:, start:end+1]

        wave_copy = np.copy(wave)
        wave_copy = self.wave_transforms(wave_copy)
        wave_copy.squeeze_(0)

        return wave_copy


    def generate_spectrogram(self, wave):
        # Generating mel-spectrogram and apply spec transforms
        s = self.mel_transform(wave)
        log_s = torchaudio.transforms.AmplitudeToDB()(s)
        
        # Add batch dimension if it's not already there
        if log_s.ndim == 2:
            log_s = log_s.unsqueeze(0)
        
        log_s = self.spec_transforms(log_s)
        
        spec = torch.cat((log_s, log_s, log_s), dim=0)
        return spec

        




def create_generators():
    train_dataset = ContrastiveESCDataset(train=True)
    test_dataset = ContrastiveESCDataset(train=False)
    
    train_loader = data.DataLoader(train_dataset, 
                                   batch_size=4, 
                                   shuffle=True, 
                                   num_workers=0, 
                                   drop_last=True, 
                                   collate_fn=contrastive_collate_fn)
    
    test_loader = data.DataLoader(test_dataset, 
                                  batch_size=4, 
                                  shuffle=False, 
                                  num_workers=0, 
                                  drop_last=True, 
                                  collate_fn=contrastive_collate_fn)
    
    return train_loader, test_loader



In [20]:
def contrastive_collate_fn(batch):
    anchors, positives, negatives = [], [], []

    for item in batch:
        spec1, spec2, neg_spec = item
        anchors.append(spec1)
        positives.append(spec2)
        negatives.append(neg_spec)

    return {
        "anchors": torch.stack(anchors),
        "positives": torch.stack(positives),
        "negatives": torch.stack(negatives)
    }


In [21]:
class SmallCNN(nn.Module):
    def __init__(self):
        super(SmallCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(16 * 64 * 216, 128)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return x

In [22]:
import torch.optim as optim
import torch.nn.functional as F

# Define the contrastive loss
def contrastive_loss(anchor, positive, negative, margin=0.5):
    pos_distance = F.pairwise_distance(anchor, positive)
    neg_distance = F.pairwise_distance(anchor, negative)
    loss = torch.mean(torch.clamp(pos_distance - neg_distance + margin, min=0.0))
    return loss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SmallCNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_loader, test_loader = create_generators()




In [26]:
accumulation_steps = 4  # Accumulate gradients over 4 small batches before updating
optimizer.zero_grad()

for i, batch in enumerate(train_loader):
    print("I dua wos !!!")
    anchors, positives, negatives = batch
    anchors = anchors.to(device)
    positives = positives.to(device)
    negatives = negatives.to(device)
    
    optimizer.zero_grad()
    
    anchor_out = model(anchors)
    positive_out = model(positives)
    negative_out = model(negatives)
    
    loss = contrastive_loss(anchor_out, positive_out, negative_out)
    loss = loss / accumulation_steps  # Divide the loss by the accumulation steps
    
    loss.backward()
    print("I dua wos !!!")
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

# Make sure to update the model one more time if there are remaining accumulated gradients.
if i % accumulation_steps != 0:
    optimizer.step()


RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 38896200000 bytes.

In [None]:
train_dataset = ContrastiveESCDataset(train=True)

import matplotlib.pyplot as plt
import librosa.display

def plot_spec(spec1, spec2, neg_spec):
    plt.figure(figsize=(12, 4))
    
    # Plot the first spectrogram
    plt.subplot(131)
    librosa.display.specshow(spec1.numpy(), cmap='viridis', y_axis='mel')
    plt.title('Spectrogram 1')
    
    # Plot the second spectrogram
    plt.subplot(132)
    librosa.display.specshow(spec2.numpy(), cmap='viridis', y_axis='mel')
    plt.title('Spectrogram 2')
    
    # Plot the negative spectrogram
    plt.subplot(133)
    librosa.display.specshow(neg_spec.numpy(), cmap='viridis', y_axis='mel')
    plt.title('Negative Spectrogram')
    
    plt.tight_layout()
    plt.show()

# Assuming you have already created the train_dataset
spec1, spec2, neg_spec = train_dataset[0]
plot_spec(spec1, spec2, neg_spec)
