In [205]:
import torch
import torch.nn as nn
from torchvggish import vggish, vggish_input
import torch.nn.functional as F

import mir_eval

import os
import pandas as pd
import librosa
import numpy as np
import pickle

In [206]:
# train csv path
train_csv_path = 'openmic-2018/partitions/split01_train.csv'
# test csv path
test_csv_path = 'openmic-2018/partitions/split01_test.csv'

# open csvs
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# convert to numpy arrays
train_df = train_df.to_numpy()
test_df = test_df.to_numpy()

# make each a single list

train_df = train_df.flatten()
test_df = test_df.flatten()

# print the first 5 rows of the train and test dataframes
print(train_df[:5])
print(test_df[:5])

# only use the first 10% of each csv
train_df = train_df[:int(len(train_df) * 1)]
test_df = test_df[:int(len(test_df) * 1)]

['000135_483840' '000139_119040' '000141_153600' '000144_30720'
 '000145_172800']
['000308_61440' '000312_184320' '000319_145920' '000321_218880'
 '000327_88320']


In [207]:
dataset_path = 'spectrograms'
labels_path = 'labels.csv'

# Read the labels CSV file
# ['filename' 'clarinet' 'flute' 'trumpet' 'saxophone' 'voice' 'accordion' 'ukulele' 'mallet_percussion' 'piano' 'guitar' 'mandolin' 'banjo' 'synthesizer' 'trombone' 'organ' 'drums' 'bass' 'cymbals' 'cello' 'violin']
labels_df = pd.read_csv(labels_path)

# Get the list of all thae filenames
filenames = labels_df['filename'].values.tolist()

# load the spectrograms and labels
spectrograms_train = []
labels_train = []

spectrograms_test = []
labels_test = []

#check if pickle file exists
if not os.path.isfile('pickle/spectrograms_train.pkl'):
    
    for filename in filenames:
        # if the filename is not in the train or test dataframe, skip it
        if filename not in train_df and filename not in test_df:
            continue

        # load the spectrogram
        spectrogram = np.load(os.path.join(dataset_path, filename + '.npy'))

        # the fist index is the filename, the next 20 are the labels and the last 20 are the masks
        label = labels_df[labels_df['filename'] == filename].values.tolist()[0][1:21]
        mask = labels_df[labels_df['filename'] == filename].values.tolist()[0][21:]

        # threshold the labels
        label = np.array(label) > 0.5

        # make a pair of the spectrogram and the label
        combined = list(zip(label, mask))

        # append each second seperatly
        if filename in train_df:
            spectrograms_train.append(spectrogram)
            labels_train.append(combined)
        elif filename in test_df:
            spectrograms_test.append(spectrogram)
            labels_test.append(combined)
        else:
            continue
            # print(f"Filename {filename} not found in train or test dataframes")

    # convert the lists to numpy arrays
    spectrograms_train = np.array(spectrograms_train)
    labels_train = np.array(labels_train)

    spectrograms_test = np.array(spectrograms_test)
    labels_test = np.array(labels_test)


    #pickle the spectrogram test and train data
    pickle.dump(spectrograms_test, open('pickle/spectrograms_test.pkl', 'wb'))
    pickle.dump(spectrograms_train, open('pickle/spectrograms_train.pkl', 'wb'))
    #pickle the labels
    pickle.dump(labels_test, open('pickle/labels_test.pkl', 'wb'))
    pickle.dump(labels_train, open('pickle/labels_train.pkl', 'wb'))
else:
    #load the spectrogram test and train data
    spectrograms_test = pickle.load(open('pickle/spectrograms_test.pkl', 'rb'))
    spectrograms_train = pickle.load(open('pickle/spectrograms_train.pkl', 'rb'))
    #load the labels
    labels_test = pickle.load(open('pickle/labels_test.pkl', 'rb'))
    labels_train = pickle.load(open('pickle/labels_train.pkl', 'rb'))



print(f"Spectrograms shape: {spectrograms_train.shape}")
print(f"Labels shape: {labels_train.shape}")

Spectrograms shape: (14914, 10, 96, 64)
Labels shape: (14914, 20, 2)


In [232]:
#get the 2nd set of 10 speectrograms
spec = spectrograms_train[0]

SPEC_SHAPE = spec.shape
print(SPEC_SHAPE)

spec = spec.reshape(spec.shape[0], 1, spec.shape[1], spec.shape[2])
print(spec.shape)

#convert to tensor
spec = torch.tensor(spec).float()

embedding_model = vggish()
embedding_model.eval()
ex = embedding_model.forward(spec)


(10, 96, 64)
(10, 1, 96, 64)


In [209]:
#function for reconstructing the log mel spectrogram without using griffin lim
def reconstruct_log_mel_spectrogram(log_mel_spectrogram):
    # transpose
    log_mel_spectrogram = log_mel_spectrogram.T

    # de-noramlize
    log_mel_spectrogram = (log_mel_spectrogram * 255) + 255

    # apply the inv log mel spectrogram
    mel_spectrogram = librosa.db_to_amplitude(log_mel_spectrogram)

    # apply the mel to audio
    audio = librosa.feature.inverse.mel_to_audio(mel_spectrogram, sr=16000, n_fft=4000, hop_length=160, win_length=4000)

    return audio


In [210]:
#import classification model with torch LOAD
class_model = torch.load('model_2.pt')
#make the class_model use cuda
class_model = class_model.cuda()

In [211]:
class_model

DecisionLevelSingleAttention(
  (emb): EmbeddingLayers(
    (conv1x1): ModuleList(
      (0-2): 3 x Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
    )
    (batchnorm): ModuleList(
      (0-3): 4 x BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (attention): Attention(
    (att): Conv2d(128, 20, kernel_size=(1, 1), stride=(1, 1))
    (cla): Conv2d(128, 20, kernel_size=(1, 1), stride=(1, 1))
  )
)

In [212]:
ex = ex/255
ex = ex.reshape((int(ex.shape[0]/10),10, ex.shape[1])).cuda()

print(ex.shape)
print(ex)

torch.Size([1, 10, 128])
tensor([[[0.6863, 0.2000, 0.8000,  ..., 0.6902, 0.6000, 1.0000],
         [0.7647, 0.2471, 0.8275,  ..., 0.6627, 0.0000, 1.0000],
         [0.8078, 0.2510, 1.0000,  ..., 0.0000, 0.0157, 1.0000],
         ...,
         [0.8039, 0.2627, 1.0000,  ..., 0.0000, 0.1804, 1.0000],
         [0.7451, 0.1961, 0.8157,  ..., 0.7176, 0.0000, 1.0000],
         [0.7294, 0.2118, 0.7843,  ..., 0.5765, 0.0000, 1.0000]]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)


In [213]:
#classify the embedding
class_model.eval()
pred = class_model.forward(ex)

print(pred)

tensor([[0.0818, 0.0898, 0.2421, 0.0501, 0.0278, 0.9153, 0.8577, 0.0453, 0.3478,
         0.0891, 0.0157, 0.0112, 0.0658, 0.0335, 0.3674, 0.0303, 0.0297, 0.0068,
         0.0596, 0.9633]], device='cuda:0', grad_fn=<HardtanhBackward0>)


cymbal, drum, syth, voice

{"accordion": 0, "banjo": 1, "bass": 2, "cello": 3, "clarinet": 4, "cymbals": 5, "drums": 6, 
 "flute": 7, "guitar": 8, "mallet_percussion": 9, "mandolin": 10, "organ": 11, "piano": 12, "saxophone": 13, 
 "synthesizer": 14, "trombone": 15, "trumpet": 16, "ukulele": 17, "violin": 18, "voice": 19}

In [214]:
def conv_block(input, num_filters):
    x = nn.Conv2d(input.shape[1], num_filters, 3, padding=1)(input)
    x = nn.BatchNorm2d(num_filters)(x)
    x = nn.ReLU()(x)
    x = nn.Conv2d(num_filters, num_filters, 3, padding=1)(x)
    x = nn.BatchNorm2d(num_filters)(x)
    x = nn.ReLU()(x)
    return x

def encoder_block(input, num_filters):
    x = conv_block(input, num_filters)
    p = nn.MaxPool2d(2)(x)
    return x, p

def decoder_block(input, skip_features, num_filters):
    x = nn.ConvTranspose2d(input.shape[1], num_filters, 2, stride=2)(input)
    x = torch.cat([x, skip_features], dim=1)
    x = conv_block(x, num_filters)
    return x

In [215]:
# num_filters = 32
# inst_models = {}


# for i in range(20):
    
#     #make the input layer (1, 10, 96, 64)
#     inputs = nn.Input((1, 10, 96, 64))

#     # Encoder
#     e1, p1 = encoder_block(inputs, num_filters)
#     e2, p2 = encoder_block(p1, num_filters*2)
#     e3, p3 = encoder_block(p2, num_filters*4)
#     e4, p4 = encoder_block(p3, num_filters*8)

#     # Bridge
#     b1 = conv_block(p4, num_filters*16)

#     # Decoder
#     d1 = decoder_block(b1, e4, num_filters*8)
#     d2 = decoder_block(d1, e3, num_filters*4)
#     d3 = decoder_block(d2, e2, num_filters*2)
#     d4 = decoder_block(d3, e1, num_filters)

#     #unet output with sigmoid activation named 'unet_out' and the same shape as the input
#     unet_out = nn.Conv2d(d4.shape[1], 1, 1, activation='sigmoid', name='unet_out')(d4)

#     out_shaped = out_shaped.reshape(out_shaped.shape[0], 1, out_shaped.shape[1], out_shaped.shape[2])
#     #run through vggish
#     embedding_model = vggish()(out_shaped)

#     outputs = class_model(embedding_model)



In [216]:
class UNetSource(nn.Module):
    def __init__(self,input_shape, num_filters):
        super().__init__()

        # VGGish model
        self.vggish = vggish()
        #set trainable to false
        for param in self.vggish.parameters():
            param.requires_grad = False
        
        # Classifier model
        self.class_model = torch.load('model_2.pt')
        #set trainable to false
        for param in self.class_model.parameters():
            param.requires_grad = False

        # Encoder
        self.input_shape = input_shape
        self.e1, self.p1 = encoder_block(self.input_shape, num_filters)
        self.e2, self.p2 = encoder_block(self.p1, num_filters*2)
        self.e3, self.p3 = encoder_block(self.p2, num_filters*4)
        self.e4, self.p4 = encoder_block(self.p3, num_filters*8)

        # Bridge
        self.b1 = conv_block(self.p4, num_filters*16)

        # Decoder
        self.d1 = decoder_block(self.b1, self.e4, num_filters*8)
        self.d2 = decoder_block(self.d1, self.e3, num_filters*4)
        self.d3 = decoder_block(self.d2, self.e2, num_filters*2)
        self.d4 = decoder_block(self.d3, self.e1, num_filters)

    def forward(self, x):
        # UNet
        e1 = self.e1(x)
        p1 = self.p1(e1)
        e2 = self.e2(p1)
        p2 = self.p2(e2)
        e3 = self.e3(p2)
        p3 = self.p3(e3)
        e4 = self.e4(p3)
        p4 = self.p4(e4)

        b1 = self.b1(p4)

        d1 = self.d1(b1, e4)
        d2 = self.d2(d1, e3)
        d3 = self.d3(d2, e2)
        d4 = self.d4(d3, e1)

        unet_out = nn.Conv2d(d4.shape[1], 1, 1, activation='sigmoid', name='unet_out')(d4)

        masked_input = x * unet_out
        
        # VGGish
        vgg_unet = unet_out.reshape(masked_input.shape[0], 1, masked_input.shape[1], masked_input.shape[2])

        embedding_model = self.vggish(vgg_unet)

        # Classifier
        out = self.class_model(embedding_model)
        
        return masked_input, out


In [217]:
p =input_example=torch.randn(1, 10, 96, 64)
print(p.shape[1])

10


In [240]:
import torch
import torch.nn as nn
import torch.optim as optim

class ClassificationAndTraining(nn.Module):
    def __init__(self, num_filters=32, num_classes=2, num_inst_models=20, input_example=torch.randn(1, 10, 96, 64)):
        super().__init__()
        
        self.input_example = input_example
        self.num_filters = num_filters
        self.num_classes = num_classes
        self.num_inst_models = num_inst_models

        self.vggish = vggish()
        #set trainable to false
        for param in self.vggish.parameters():
            param.requires_grad = False
        

        # Classification network
        self.classification = torch.load('model_2.pt')
        #set trainable to false
        for param in self.classification.parameters():
            param.requires_grad = False
        
        # Instantiate UNetSource models
        self.inst_models = nn.ModuleList([UNetSource(self.input_example, num_filters) for _ in range(num_inst_models)])
        
        # Define loss as categorical cross-entropy
        self.loss = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
        
    def forward(self, x):
        
        embed = self.vggish(x)
        # Classify the input
        classification_outputs = self.classification(embed)

        # threshold the classification output
        classification_outputs = torch.where(classification_outputs > 0.5, 1, 0)
      
        inst_outputs = {}

        for inst in classification_outputs:
            if inst == 1:
                # Run through the corresponding instance model
                unet_out, out = self.inst_models[inst](x)
                #save both outputs and the instance model index
                inst_outputs[inst] = (unet_out, out)

        return inst_outputs, x
   

In [241]:
# Define your training and validation data loaders
batch_size = 256

train_loader = torch.utils.data.DataLoader(spectrograms_train, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(spectrograms_test, batch_size=batch_size)

In [242]:
# Define the number of epochs
num_epochs = 10

source_sep_model = ClassificationAndTraining(num_filters=32)
optimizer = optim.Adam(source_sep_model.parameters(), lr=0.001)
source_sep_model

ClassificationAndTraining(
  (vggish): VGG(
    (features): Sequential(
      (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): ReLU(inplace=True)
      (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (7): ReLU(inplace=True)
      (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (9): ReLU(inplace=True)
      (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (12): ReLU(inplace=True)
      (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (14): ReLU(inplace=True)
      (15): Max

In [243]:
num_params = sum(p.numel() for p in source_sep_model.parameters())
print(f"Number of parameters: {num_params}")

trainable_params = sum(p.numel() for p in source_sep_model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {trainable_params}")

Number of parameters: 1516126920
Number of trainable parameters: 0


In [244]:
unet_example = UNetSource(torch.randn(1, 10, 96, 64), num_filters=32)
unet_example

UNetSource(
  (vggish): VGG(
    (features): Sequential(
      (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): ReLU(inplace=True)
      (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (7): ReLU(inplace=True)
      (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (9): ReLU(inplace=True)
      (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (12): ReLU(inplace=True)
      (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (14): ReLU(inplace=True)
      (15): MaxPool2d(kernel_s

In [245]:
num_params = sum(p.numel() for p in unet_example.parameters())
print(f"Number of parameters: {num_params}")

trainable_params = sum(p.numel() for p in unet_example.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {trainable_params}")

Number of parameters: 72196520
Number of trainable parameters: 0


In [246]:
# Loop over the epochs
for epoch in range(num_epochs):
    # Set the model to train mode
    source_sep_model.train()
    
    # Loop over the batches of data
    for batch_idx, inputs in enumerate(train_loader):
        # Zero out the gradients
        optimizer.zero_grad()
        
        # Forward pass the inputs through the model
        outputs, in_spec = source_sep_model(inputs)

        total_loss = 0

        sources = []

        for inst in outputs.keys():

            sources.append(outputs[inst][0])

            #create a numpy array of the same size as the output with zeros everywhere except for the index of the instance
            inst_tensor = torch.from_numpy(np.zeros(outputs[inst][1].shape))
            inst_tensor[inst] = 1
            labels_tensor = inst_tensor.type(torch.FloatTensor)
            
            # Calculate the loss
            loss = source_sep_model.loss(outputs[inst][1], labels_tensor)
            
            # Backpropagate the loss and update the weights
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        mixture = torch.sum(torch.stack(sources), dim=0)

        # reconstruct the log mel spectrogram mixture
        mix_wav = reconstruct_log_mel_spectrogram(mixture.detach().numpy())
        in_wav = reconstruct_log_mel_spectrogram(in_spec.detach().numpy())
        
        #calculate the signal to distortion ratio
        sdr, _, _, _ = mir_eval.separation.bss_eval_sources(in_wav, mix_wav)
  
        # Print the loss and accuracy after each batch
        print('Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\SDR: {:.6f} dB'.format(
            epoch + 1, batch_idx * len(inputs), len(train_loader.dataset),
            100. * batch_idx / len(train_loader), total_loss, sdr))
        
    # Set the model to eval mode
    source_sep_model.eval()
    
    # Calculate the accuracy on the validation set
    val_loss = 0
    val_sdr = 0
    with torch.no_grad():
        for val_inputs in val_loader:
            # Forward pass the inputs through the model
            val_outputs, _ = source_sep_model(val_inputs)
            
            # Calculate the loss
            val_sources = []

            for val_inst in val_outputs.keys():

                #create a numpy array of the same size as the output with zeros everywhere except for the index of the instance
                val_inst_tensor = torch.from_numpy(np.zeros(val_outputs[val_inst][1].shape))
                val_inst_tensor[val_inst] = 1
                val_labels_tensor = val_inst_tensor.type(torch.FloatTensor)

                val_loss += source_sep_model.loss(val_outputs[val_inst][1], val_labels_tensor).item()

                val_sources.append(val_outputs[val_inst][0])


            val_mixture = torch.sum(torch.stack(val_sources), dim=0)

            # reconstruct the log mel spectrogram mixture
            val_mix_wav = reconstruct_log_mel_spectrogram(val_mixture.detach().numpy())
            val_in_wav = reconstruct_log_mel_spectrogram(val_inputs.detach().numpy())
            
            #calculate the signal to distortion ratio
            curr_val_sdr, _, _, _ = mir_eval.separation.bss_eval_sources(val_in_wav, val_mix_wav)
            val_sdr += curr_val_sdr
            
            
    
    # Print the validation loss and accuracy
    print('Validation set: Average loss: {:.4f}, SDR: {}/{} ({:.0f}dB)\n'.format(
        val_loss / len(val_loader.dataset), val_sdr, len(val_loader.dataset),
        val_sdr / len(val_loader.dataset)))


AttributeError: 'Tensor' object has no attribute 'astype'