In [4]:
import torch
import torch.nn as nn
import torchaudio
import torch.fft
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
import seaborn as sns
import pandas as pd
import os
from torch.nn.utils.rnn import pad_sequence
import torchvision.transforms as transforms
import torchaudio.transforms as transformsaudio
import datetime
from torch.utils.tensorboard import SummaryWriter
import soundfile as sf
import time
import torch.nn.functional as F
import math
import random
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import tensorflow as tf
import io
from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    


2023-10-21 18:45:11.358534: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-21 18:45:11.358580: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-21 18:45:11.358605: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-21 18:45:11.365954: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
device = "cuda"
directoryBase = "/home/afridman"

In [6]:

def pad_to_max_length(tensor1, tensor2):
    max_length = max(tensor1.size(1), tensor2.size(1))

    pad_tensor1 = torch.nn.functional.pad(tensor1, (0, max_length - tensor1.size(1)))
    pad_tensor2 = torch.nn.functional.pad(tensor2, (0, max_length - tensor2.size(1)))

    return pad_tensor1, pad_tensor2

In [7]:
inputSize = 32000
class AudioCleaningDataset(Dataset):
    def __init__(self, csv_file, audio_dir, noise_dir, reverb_dir, target_length=inputSize, maxRuido=0.001, fixedInterval=False):
        self.dataframe = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.noise_dir = noise_dir
        self.reverb_dir = reverb_dir
        self.target_length = target_length
        self.resampleo = transformsaudio.Resample(orig_freq=48000, new_freq=16000)  # Resampling
        self.resampleoIR = transformsaudio.Resample(orig_freq=32000, new_freq=16000)  # Resampling

        self.noise_files = os.listdir(noise_dir) # List all noise files
        self.reverb_files = os.listdir(reverb_dir)
        self.maxRuido = maxRuido
        self.conv= transformsaudio.Convolve(mode="same")
        self.fixedInterval = fixedInterval

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        audio_file_name = os.path.join(self.audio_dir, self.dataframe.iloc[idx]['audio_file_name']+".wav")
        waveform, sample_rate = torchaudio.load(audio_file_name)

        waveformOriginal = self.resampleo(waveform)

        #aca seleccionar
        if(self.fixedInterval):
            maximoPosible = 0
        else:
            maximoPosible = len(waveformOriginal[0])-self.target_length
        if(maximoPosible>=0):
          comienzoAleatorio = (random.randint(0,maximoPosible))
          waveformOriginal = waveformOriginal[:,comienzoAleatorio:comienzoAleatorio+self.target_length]


        waveformSucia = waveformOriginal.clone()
        padding = torch.zeros((1, max(self.target_length - waveformSucia.size(1),1)))

        waveformOriginal = torch.cat((waveformOriginal, padding), dim=1)
        waveformSucia = torch.cat((waveformSucia, padding), dim=1)

        waveformOriginal = waveformOriginal[:,:self.target_length]
        waveformSucia = waveformSucia[:,:self.target_length]

        # Load a random noise file
        noise_file_name = random.choice(self.noise_files)
        noise_waveform, sample_Rate_ruido = torchaudio.load(os.path.join(self.noise_dir, noise_file_name))
        # Repeat the noise waveform until it's at least as long as the audio waveform
        while noise_waveform.size(1) < waveformSucia.size(1):
            noise_waveform = torch.cat((noise_waveform, noise_waveform), dim=1)

        # Trim the noise waveform to match the length of the audio waveform
        noise_waveform = noise_waveform[:,:waveformSucia.size(1)]

        # Add noise with a random signal-to-noise ratio between 0.01 and 0.1
        snr = random.uniform(0.0, self.maxRuido) #random.uniform(0.01, 0.1)

        # LE SACO EL RUIDO SUCIO PARA ACELERAR APRENDIZAJE !!!!!!!!!!!!!!!!!!!!!!!!!!!!
        waveformSucia = waveformSucia + noise_waveform * snr
        whitenoise = random.uniform(0.0, self.maxRuido)
        waveformSucia = waveformSucia + torch.randn_like(waveformOriginal) * whitenoise
        return 1*waveformSucia, 1*waveformOriginal

        """
        IR_file_name = random.choice(self.reverb_files)
        IR_waveform, sample_Rate_IR = torchaudio.load(os.path.join(self.reverb_dir, IR_file_name))
        IR_waveform = self.resampleoIR(IR_waveform)

        # Normalize impulse response
        normalized_ir = IR_waveform / (IR_waveform.abs().max())

        # Perform convolution
        padded_signal, padded_filter = pad_to_max_length(waveformSucia, normalized_ir)

        # Perform the FFT
        fft_signal = torch.fft.fft(padded_signal)
        fft_filter = torch.fft.fft(padded_filter)

        # Perform the convolution in the frequency domain
        fft_result = fft_signal * fft_filter

        # Perform the inverse FFT to get the result in the time domain
        result = torch.fft.ifft(fft_result)

        # The result is complex, take the real part

        # LE SACO EL REVERB PARA ACELERAR APRENDIZAJE !!!!!!!!!!!!!!!!!!!!!!!!!!!!
        #waveformSucia = result.real

        padding = torch.zeros((1, max(self.target_length - waveformSucia.size(1),1)))

        waveformOriginal = torch.cat((waveformOriginal, padding), dim=1)
        waveformSucia = torch.cat((waveformSucia, padding), dim=1)

        waveformOriginal = waveformOriginal[:,:self.target_length]
        waveformSucia = waveformSucia[:,:self.target_length]

       # waveformOriginal = (waveformOriginal - waveformOriginal.mean()) / waveformOriginal.std()
       # waveformSucia = (waveformSucia - waveformSucia.mean()) / waveformSucia.std()
       # Optional: Normalize convolved audio
       # waveformSucia = waveformSucia / (waveformSucia.abs().max())
       # waveformOriginal = waveformOriginal / (waveformOriginal.abs().max())

        return 1*waveformSucia, 1*waveformOriginal"""

In [8]:

class DilatedCausalConv1d(torch.nn.Module):
    """Dilated Causal Convolution for WaveNet"""
    def __init__(self, channels, dilation=1):
        super(DilatedCausalConv1d, self).__init__()

        self.conv = torch.nn.Conv1d(channels, channels,
                                    kernel_size=2, stride=1,  # Fixed for WaveNet
                                    dilation=dilation,
                                    padding=dilation,  # Fixed for WaveNet dilation
                                    bias=False)  # Fixed for WaveNet but not sure
        if torch.cuda.is_available():
            self.conv = self.conv.to(device)

    def init_weights_for_test(self):
        for m in self.modules():
            if isinstance(m, torch.nn.Conv1d):
                m.weight.data.fill_(1)

    def forward(self, x):
        output = self.conv(x)

        return output


class CausalConv1d(torch.nn.Module):
    """Causal Convolution for WaveNet"""
    def __init__(self, in_channels, out_channels):
        super(CausalConv1d, self).__init__()

        # padding=1 for same size(length) between input and output for causal convolution
        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=2, stride=1, padding=1,
                                    bias=False)  # Fixed for WaveNet but not sure
        if torch.cuda.is_available():
            self.conv = self.conv.to(device)

    def init_weights_for_test(self):
        for m in self.modules():
            if isinstance(m, torch.nn.Conv1d):
                m.weight.data.fill_(1)

    def forward(self, x):
        output = self.conv(x)

        # remove last value for causal convolution
        return output[:, :, :-1]


class ResidualBlock(torch.nn.Module):
    def __init__(self, res_channels, skip_channels, dilation):
        """
        Residual block
        :param res_channels: number of residual channel for input, output
        :param skip_channels: number of skip channel for output
        :param dilation:
        """
        super(ResidualBlock, self).__init__()

        self.dilated = DilatedCausalConv1d(res_channels, dilation=dilation)
        self.conv_res = torch.nn.Conv1d(res_channels, res_channels, 1)
        self.conv_skip = torch.nn.Conv1d(res_channels, skip_channels, 1)

        self.gate_tanh = torch.nn.Tanh()
        self.gate_sigmoid = torch.nn.Sigmoid()

        if torch.cuda.is_available():
            self.conv_skip = self.conv_skip.to(device)
            self.conv_res = self.conv_res.to(device)

    def forward(self, x, skip_size):
        """
        :param x:
        :param skip_size: The last output size for loss and prediction
        :return:
        """
        output = self.dilated(x)

        # PixelCNN gate
        gated_tanh = self.gate_tanh(output)
        gated_sigmoid = self.gate_sigmoid(output)
        gated = gated_tanh * gated_sigmoid

        # Residual network
        output = self.conv_res(gated)
        output = output[:, :, 0:inputSize]

        input_cut = x#[:, :, -output.size(2):]

        output += input_cut

        # Skip connection
        skip = self.conv_skip(gated)
        skip = skip[:, :, -skip_size:]

        return output, skip


class ResidualStack(torch.nn.Module):
    def __init__(self, layer_size, stack_size, res_channels, skip_channels):
        """
        Stack residual blocks by layer and stack size
        :param layer_size: integer, 10 = layer[dilation=1, dilation=2, 4, 8, 16, 32, 64, 128, 256, 512]
        :param stack_size: integer, 5 = stack[layer1, layer2, layer3, layer4, layer5]
        :param res_channels: number of residual channel for input, output
        :param skip_channels: number of skip channel for output
        :return:
        """
        super(ResidualStack, self).__init__()

        self.layer_size = layer_size
        self.stack_size = stack_size

        self.res_blocks = self.stack_res_block(res_channels, skip_channels)

    @staticmethod
    def _residual_block(res_channels, skip_channels, dilation):
        block = ResidualBlock(res_channels, skip_channels, dilation)

        if torch.cuda.device_count() > 1:
            block = torch.nn.DataParallel(block)

        if torch.cuda.is_available():
            block.cuda()

        return block

    def build_dilations(self):
        dilations = []

        # 5 = stack[layer1, layer2, layer3, layer4, layer5]
        for s in range(0, self.stack_size):
            # 10 = layer[dilation=1, dilation=2, 4, 8, 16, 32, 64, 128, 256, 512]
            for l in range(0, self.layer_size):
                dilations.append(2 ** l)

        return dilations

    def stack_res_block(self, res_channels, skip_channels):
        """
        Prepare dilated convolution blocks by layer and stack size
        :return:
        """
        res_blocks = []
        dilations = self.build_dilations()

        for dilation in dilations:
            block = self._residual_block(res_channels, skip_channels, dilation)
            res_blocks.append(block)

        return res_blocks

    def forward(self, x, skip_size):
        """
        :param x:
        :param skip_size: The last output size for loss and prediction
        :return:
        """
        output = x
        skip_connections = []

        for res_block in self.res_blocks:
            # output is the next input
            output, skip = res_block(output, skip_size)

            skip_connections.append(skip)


        return torch.stack(skip_connections)


class DensNet(torch.nn.Module):
    def __init__(self, channels):
        """
        The last network of WaveNet
        :param channels: number of channels for input and output
        :return:
        """
        super(DensNet, self).__init__()

        self.conv1 = torch.nn.Conv1d(channels, channels, kernel_size=3, stride=1, padding=1)
        self.conv2 = torch.nn.Conv1d(channels, channels, kernel_size=3, stride=1, padding=1)

        self.tan = torch.nn.Tanh()
        self.relu = torch.nn.ReLU()

        if torch.cuda.is_available():
            self.conv1 = self.conv1.to(device)
            self.conv2 = self.conv2.to(device)

    def forward(self, x):
        output = self.conv1(x)
        output1 = self.relu(output)
        output = self.conv2(output)

        return output





In [9]:
pip install librosa

Note: you may need to restart the kernel to use updated packages.


In [10]:
import librosa
import numpy as np

def find_most_energetic_interval(audio_file):
    # Load the audio file
    y, sr = librosa.load(audio_file, sr=48000)

    
    # Calculate energy
    energy = np.square(y)

    # If audio is longer than 2 seconds, split into 2-second frames
    if len(y) > 32000:
        frame_size = 2 * sr  # 2 seconds
        num_frames = len(y) // frame_size
        frames = np.array_split(energy, num_frames)

        # Calculate energy for each frame
        frame_energies = [np.sum(frame) for frame in frames]

        # Find the frame with highest energy
        most_energetic_frame_idx = np.argmax(frame_energies)

        # Calculate start and end time of the most energetic interval
        start_time = most_energetic_frame_idx * 2
        end_time = start_time + 2
    else:
        start_time = 0
        end_time = len(y) / sr

    return start_time, end_time

# Example usage
audio_file = '/home/afridman/extra/audiosDivididos/audioPocos/arf_06592_02124244736.wav'
start_time, end_time = find_most_energetic_interval(audio_file)
print(f"The most energetic interval is from {start_time:.2f} seconds to {end_time:.2f} seconds.")


The most energetic interval is from 2.00 seconds to 4.00 seconds.


In [11]:
#loss_fn = nn.L1Loss()
#learning_rate = 0.0015
#optimizer = torch.optim.AdamW(params=wavenetModel.parameters(), lr=learning_rate)
#optimizerPost = torch.optim.AdamW(params=postnetModel.parameters(), lr=learning_rate)

mel_transform1 = transformsaudio.MelSpectrogram(sample_rate = 16000,
                                               n_fft = 2048,
                                                n_mels = 120,
                                                hop_length = 512).to('cuda')

mel_transform2 = transformsaudio.MelSpectrogram(sample_rate = 16000,
                                               n_fft = 512,
                                                n_mels = 80,
                                                hop_length = 128).to('cuda')


In [12]:
class CustomLoss(nn.Module):
    def __init__(self):
        super(CustomLoss, self).__init__()

    def forward(self, predicted, true):
        loss = 0

        for p, t in zip(predicted, true):
            print(t[0].shape)
            print(t[0])
            if(abs(t)<0.001):
                t = 0.001
            loss += abs((p / t) - 1)

        return loss

# Example usage
"""
predicted = torch.tensor([1, 2, 30, 1], dtype=torch.float32)
true = torch.tensor([2, 8, 9, 0.01], dtype=torch.float32)

criterion = CustomLoss()
loss = criterion(predicted, true)
print(loss.item())"""

'\npredicted = torch.tensor([1, 2, 30, 1], dtype=torch.float32)\ntrue = torch.tensor([2, 8, 9, 0.01], dtype=torch.float32)\n\ncriterion = CustomLoss()\nloss = criterion(predicted, true)\nprint(loss.item())'

In [13]:
def prepareSpectogram(melspect):
    melspect = (melspect - melspect.min() + 0.00000001) #/ (melspect.max()-melspect.min()) + 0.00000001)
    melspect=melspect.log10()
    return melspect

In [14]:

class MelspectogramLoss(nn.Module):
    def __init__(self):
        super(MelspectogramLoss, self).__init__()

    def forward(self, y_pred, true_y):
        mel1true_y = (mel_transform1(true_y.to('cuda')))
        mel1y_pred = (mel_transform1(y_pred.to('cuda')))
        mel2true_y = (mel_transform2(true_y.to('cuda')))
        mel2y_pred = (mel_transform2(y_pred.to('cuda')))

        mel1true_y = prepareSpectogram(mel1true_y)
        mel1y_pred = prepareSpectogram(mel1y_pred)
        mel2true_y = prepareSpectogram(mel2true_y)
        mel2y_pred = prepareSpectogram(mel2y_pred)

        min_db = 1

        dif1 = (mel1true_y - mel1y_pred)**2
        dif2 = (mel2true_y - mel2y_pred)**2
        dif1 = dif1.clamp(min=min_db) - min_db
        dif2 = dif2.clamp(min=min_db) - min_db

        dif1 = (dif1**2).mean() * 3
        dif2 = (dif2**2).mean() * 3
        return dif1 + dif2

In [15]:
class CombinedLoss(nn.Module):
    def __init__(self):
        super(CombinedLoss, self).__init__()
        self.custom_loss = CustomLoss()
        self.mel_loss = MelspectogramLoss()
        self.l1_loss = nn.L1Loss()

    def forward(self, y_pred, true_y):
        #customLoss = self.custom_loss(y_pred, true_y)*1
        prop = (((y_pred**2) / ((true_y**2)+0.00001)) - 1)**2
        max_limit = 10000
        prop = torch.clamp(prop, max=max_limit) 
        customLoss = prop.mean() * 0.001
        melLoss = self.mel_loss(y_pred, true_y)*0.02
        l1Loss =  self.l1_loss(y_pred, true_y)*100
        return  melLoss, customLoss, l1Loss

In [11]:
torch.tensor([1., -5., 4., 5., 5., 5.])**2


tensor([ 1., 25., 16., 25., 25., 25.])

In [12]:
class WaveNet(pl.LightningModule):
    def __init__(self, layer_size, stack_size, in_channels, res_channels, learning_rate):
        """
        Stack residual blocks by layer and stack size
        :param layer_size: integer, 10 = layer[dilation=1, dilation=2, 4, 8, 16, 32, 64, 128, 256, 512]
        :param stack_size: integer, 5 = stack[layer1, layer2, layer3, layer4, layer5]
        :param in_channels: number of channels for input data. skip channel is same as input channel
        :param res_channels: number of residual channel for input, output
        :return:
        """
        super(WaveNet, self).__init__()

        self.receptive_fields = self.calc_receptive_fields(layer_size, stack_size)

        self.causal = CausalConv1d(in_channels, res_channels)

        self.res_stack = ResidualStack(layer_size, stack_size, res_channels, in_channels)

        self.densnet = DensNet(in_channels)

        self.loss_fun = CombinedLoss()
        self.learning_rate = learning_rate
        
        self.ValLoss = 0
        self.TrainLoss = 0
        self.epochNumberVal = 0


    @staticmethod
    def calc_receptive_fields(layer_size, stack_size):
        layers = [2 ** i for i in range(0, layer_size)] * stack_size
        num_receptive_fields = np.sum(layers)

        return int(num_receptive_fields)
    
    def change_loss_function(self, loss_fun):
        self.loss_fun = loss_fun

    def calc_output_size(self, x):
        output_size = int(x.size(2)) - self.receptive_fields

        #self.check_input_size(x, output_size)

        return inputSize

    def check_input_size(self, x, output_size):
        if output_size < 1:
            raise InputSizeError(int(x.size(2)), self.receptive_fields, output_size)

    def forward(self, x):
        """
        The size of timestep(3rd dimention) has to be bigger than receptive fields
        :param x: Tensor[batch, timestep, channels]
        :return: Tensor[batch, timestep, channels]
        """
        output = x#.transpose(1, 2)
        
        output_size = self.calc_output_size(output)

        output = self.causal(output)

        skip_connections = self.res_stack(output, output_size)

        output = torch.sum(skip_connections, dim=0)


        output = self.densnet(output)
        return output#.transpose(1, 2).contiguous()

    
    def configure_optimizers(self, lr=0.001):
        learning_rate = self.learning_rate
        optimizer = torch.optim.AdamW(self.parameters(), lr=learning_rate)
        return optimizer
    
    def training_step(self, train_batch, batch_idx):
        X, y = train_batch
        X = X.to(device)
        y = y.to(device)

        # forward pass
        y_pred = self.forward(X)

        # compute loss
        lossMel, customLoss, lossAud = self.loss_fun(y_pred, y)
        loss = lossMel + lossAud + customLoss
        writer.add_scalar("Loss audio", lossAud, batch_idx)
        writer.add_scalar("Loss proportion", customLoss, batch_idx)
        writer.add_scalar("Loss melspect", lossMel, batch_idx)
        writer.add_scalars("Loss total", {'train':loss,
                                'validation':self.ValLoss
                                }, batch_idx)
        self.log('train_loss', loss, prog_bar=True)
        self.log('wave_loss', lossAud, prog_bar=True)
        self.log('mel_loss', lossMel, prog_bar=True)
        self.log('prop_loss', customLoss, prog_bar=True)
        self.log('val_loss', self.ValLoss, prog_bar=True)
        self.log('total_loss', loss, prog_bar=True)
        self.epochNumberVal =  self.epochNumberVal +1
        rand = random.random()
        if(rand<0.05):
            torch.save(model, modeloNombre)
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        X, y = val_batch
        X = X.to(device)
        y = y.to(device)


        # forward pass
        y_pred = self.forward(X)
        
        # Save audio clips to TensorBoard
        for i in range(y_pred.shape[0]):
            audio_clip = y_pred[i].cpu().numpy()
            writer.add_audio(f'audio_clip_{batch_idx}_{self.epochNumberVal}', audio_clip, global_step=batch_idx, sample_rate=16000)
        # compute loss
        lossMel, customLoss, lossAud = self.loss_fun(y_pred, y)
        loss = lossMel + lossAud + customLoss
        self.ValLoss = loss
        return loss

In [13]:
writer = SummaryWriter(comment="pruebaAudios")
%load_ext tensorboard


In [14]:
modeloNombre = directoryBase+'/wavenet/modelos/wavenetReal/wavenetSacaRuidoBlanco19_2_1_128Layers.pth'


In [15]:
#waveform, label = next(iter(dataloader))

In [16]:
#Audio(data=waveform.cpu()[0], rate=16000)

In [17]:
#Audio(data=label.cpu()[0], rate=16000)

In [18]:
import sys
def size_of_tensor(a):
    return sys.getsizeof(a) + torch.numel(a)*a.element_size()

In [19]:
#dict(wavenetModel.named_parameters()).keys()

In [20]:
model = WaveNet(layer_size=18, stack_size=2, in_channels=1, res_channels=128, learning_rate=0.01)
#model = torch.load(modeloNombre, map_location=torch.device('cuda'))
model = model.to(device)


In [21]:
model.configure_optimizers(lr=0.0005)
loss_fun = CombinedLoss()
model.change_loss_function(loss_fun)

In [22]:
batch_size = 1
traindataset = AudioCleaningDataset(directoryBase+'/wavenet/CSV/audio_train.csv', directoryBase+'/extra/audiosDivididos', directoryBase+'/extra/audiosDivididos/ruidosPocos/', directoryBase+"/extra/audiosDivididos/reverbPocos", maxRuido=0.005, fixedInterval=False)
traindataloader = DataLoader(traindataset, batch_size=batch_size, shuffle=True, num_workers=16)

valdataset = AudioCleaningDataset(directoryBase+'/wavenet/CSV/audio_valMicro.csv', directoryBase+'/extra/audiosDivididos', directoryBase+'/extra/audiosDivididos/ruidosPocos/', directoryBase+"/extra/audiosDivididos/reverbPocos", maxRuido=0.005, fixedInterval=True)
valdataloader = DataLoader(valdataset, batch_size=batch_size, shuffle=False, num_workers=16)

In [23]:

# training
early_stop_callback = EarlyStopping(monitor="train_loss", min_delta=0.01, patience=1000, verbose=True, mode="min")

trainer = pl.Trainer(accelerator='gpu', devices=1,
                     max_epochs=1000,
                     log_every_n_steps=1,
                     callbacks=[early_stop_callback],
                    accumulate_grad_batches=6
                     ,val_check_interval=10  # Perform validation every 10 training steps
)
trainer.fit(model=model, train_dataloaders=traindataloader
            , val_dataloaders=valdataloader
           )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type          | Params
--------------------------------------------
0 | causal    | CausalConv1d  | 256   
1 | res_stack | ResidualStack | 0     
2 | densnet   | DensNet       | 8     
3 | loss_fun  | CombinedLoss  | 0     
--------------------------------------------
264       Trainable params
0         Non-trainable params
264       Total params
0.001     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric train_loss improved. New best score: 16.850


Validation: 0it [00:00, ?it/s]

Metric train_loss improved by 5.814 >= min_delta = 0.01. New best score: 11.036


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric train_loss improved by 2.678 >= min_delta = 0.01. New best score: 8.358


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric train_loss improved by 1.659 >= min_delta = 0.01. New best score: 6.699


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric train_loss improved by 2.756 >= min_delta = 0.01. New best score: 3.942


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric train_loss improved by 0.510 >= min_delta = 0.01. New best score: 3.432


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric train_loss improved by 0.557 >= min_delta = 0.01. New best score: 2.875


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [24]:
torch.save(model, modeloNombre)


In [25]:
writer.close()


In [26]:
X, true_y = next(iter(dataloader))
X = X.to(device)
true_y = true_y.to(device)
model = model.to(device)
wdet = model(X).detach()
predicho = wdet.cpu()
X = X.cpu()
true_y = true_y.cpu()

NameError: name 'dataloader' is not defined

In [None]:
audioAnalizar = 0

In [None]:
loss_fn = CombinedLoss()
loss1, loss2, loss3 = loss_fn(predicho[audioAnalizar], true_y[audioAnalizar])
print("error de prediccion " + str(loss1 +  loss2+ loss3))


loss4, loss5, loss6 = loss_fn(X[audioAnalizar], true_y[audioAnalizar])
print("error sin hacer nada " + str(  loss4+loss5+loss6))

In [None]:
Audio(data=X.cpu()[audioAnalizar], rate=16000)

In [None]:
Audio(data=predicho.cpu()[audioAnalizar], rate=16000)

In [None]:
Audio(data=true_y.cpu()[audioAnalizar], rate=16000)

In [None]:
audioAnalizar = 2

In [None]:
plt.plot(range(0,len(onda2)), (onda2.numpy()-onda.numpy()))

In [None]:
pos = 0
delta = 32000
onda = predicho[audioAnalizar][0][pos:pos+delta]
onda2 = true_y[audioAnalizar][0][pos:pos+delta]
onda3 = X[audioAnalizar][0][pos:pos+delta]

sns.lineplot(x=range(0,len(onda2)), y=onda2.numpy(), label="Real")
#sns.lineplot(x=range(0,len(onda3)), y=(onda3), label="Sucia")
sns.lineplot(x=range(0,len(onda)), y=onda.numpy(), label="Predicha")


In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('/home/afridman/wavenet/CSV/newCSV/audiosTest.csv')

# Take a sample (in this case, let's take 10 random rows)
sample_df = df.sample(n=100)

# Save the sample to a new CSV file
sample_df.to_csv('/home/afridman/wavenet/CSV/newCSV/audiosMicroTest.csv', index=False)


In [None]:
onda2.std()

In [None]:
onda3.std()

In [None]:
onda.std()

In [None]:
abs(onda2.numpy()-onda.numpy()).mean()

In [None]:
  mel1true_y = mel_transform1(true_y.to('cuda'))
  mel1y_pred = mel_transform1(predicho.to('cuda'))
  mel1sucio = mel_transform1(X.to('cuda'))


In [None]:
mel_spectrogram_db = transformsaudio.AmplitudeToDB()(mel1true_y).cpu()


# Display the mel spectrogram
plt.figure(figsize=(10, 4))
plt.imshow(mel_spectrogram_db[0][0].detach().numpy(), cmap='viridis', origin='lower', aspect='auto')
plt.colorbar(format="%+2.0f dB")
plt.title("Mel Spectrogram verdadero")
plt.xlabel("Time")
plt.ylabel("Mel Frequency Bin")
plt.show()

In [None]:
mel_spectrogram_db2 = transformsaudio.AmplitudeToDB()(mel1y_pred).cpu()


# Display the mel spectrogram
plt.figure(figsize=(10, 4))
plt.imshow(mel_spectrogram_db2[0][0].detach().numpy(), cmap='viridis', origin='lower', aspect='auto')
plt.colorbar(format="%+2.0f dB")
plt.title("Mel Spectrogram predicho")
plt.xlabel("Time")
plt.ylabel("Mel Frequency Bin")
plt.show()

In [None]:
mel_spectrogram_db3 = transformsaudio.AmplitudeToDB()(mel1sucio).cpu()


# Display the mel spectrogram
plt.figure(figsize=(10, 4))
plt.imshow(mel_spectrogram_db3[0][0].detach().numpy(), cmap='viridis', origin='lower', aspect='auto')
plt.colorbar(format="%+2.0f dB")
plt.title("Mel Spectrogram sucio")
plt.xlabel("Time")
plt.ylabel("Mel Frequency Bin")
plt.show()

In [None]:
melsp = MelspectogramLoss()

In [None]:
mel1true_y, mel1y_pred, dif1, dif2 = melsp(predicho, true_y)

In [None]:
# Display the mel spectrogram
plt.figure(figsize=(10, 4))
plt.imshow(mel1true_y[0][0].cpu().detach().numpy(), cmap='viridis', origin='lower', aspect='auto')
plt.colorbar(format="%+2.0f dB")
plt.title("Mel Spectrogram verdadero")
plt.xlabel("Time")
plt.ylabel("Mel Frequency Bin")
plt.show()

In [None]:
# Display the mel spectrogram
plt.figure(figsize=(10, 4))
plt.imshow(mel1y_pred[0][0].cpu().detach().numpy(), cmap='viridis', origin='lower', aspect='auto')
plt.colorbar(format="%+2.0f dB")
plt.title("Mel Spectrogram predicho")
plt.xlabel("Time")
plt.ylabel("Mel Frequency Bin")
plt.show()

In [None]:
# Display the mel spectrogram
plt.figure(figsize=(10, 4))
plt.imshow(dif1[0][0].cpu().detach().numpy(), cmap='viridis', origin='lower', aspect='auto')
plt.colorbar(format="%+2.0f dB")
plt.title("Mel Spectrogram diferencia")
plt.xlabel("Time")
plt.ylabel("Mel Frequency Bin")
plt.show()

In [None]:
audioAnalizar = 0

In [None]:
import soundfile as sf


In [None]:
for audioAnalizar in opin:
  sf.write('sucio'+str(audioAnalizar)+'.wav', np.ravel(X[audioAnalizar]), 16000, 'PCM_24')
  sf.write('original'+str(audioAnalizar)+'.wav', np.ravel(true_y[audioAnalizar]), 16000, 'PCM_24')
  sf.write('limpiado'+str(audioAnalizar)+'.wav', np.ravel(predicho[audioAnalizar][0]), 16000, 'PCM_24')

In [None]:
sf.write('sucio2'+str(audioAnalizar)+'.wav', np.ravel(X[audioAnalizar]), 16000, 'PCM_24')

In [None]:
sf.write('limpiadoEco2'+str(audioAnalizar)+'.wav', np.ravel(predicho[audioAnalizar][0]), 16000, 'PCM_24')

In [None]:
sf.write('originalSinTocarWavenetReplica2'+str(audioAnalizar)+'.wav', np.ravel(true_y[audioAnalizar]), 16000, 'PCM_24')

In [None]:
#dataloader give 1 example
ini = time.time()
for waveform, label in dataloader:
    print(waveform.shape)
    print(label.shape)
    break
fin = time.time()

In [None]:
out = modelPanoramico(waveform)

In [None]:
nuevow = WaveNet( layer_size=18, stack_size=2, in_channels=1, res_channels=128)

In [None]:
#nuevow.load_state_dict(torch.load(modeloNombre), map_location=torch.device('cpu'))
nuevow = torch.load(modeloNombre, map_location=torch.device('cpu'))


In [None]:
predicho = nuevow(waveform).detach().cpu()
waveform = waveform.cpu().detach()
true_y = label.cpu()
X = waveform

In [None]:
lossSpect1, lossSpect2 = melspectogramLoss(predicho[audioAnalizar], true_y[audioAnalizar])
loss1 = loss_fn(predicho[audioAnalizar], true_y[audioAnalizar])
loss = loss1 + lossSpect1 + lossSpect2
print("error de prediccion " + str(loss.item()))

lossSpect1t, lossSpect2t = melspectogramLoss(waveform[audioAnalizar], true_y[audioAnalizar])
loss2 = loss_fn(waveform[audioAnalizar], true_y[audioAnalizar])
loss2t = loss2 + lossSpect1t + lossSpect2t

print("error sin hacer nada " + str(loss2t.item()))

In [None]:
class WaveNetPanoramico(pl.LightningModule):
    def __init__(self, layer_size, stack_size, in_channels, res_channels):
        """
        Stack residual blocks by layer and stack size
        :param layer_size: integer, 10 = layer[dilation=1, dilation=2, 4, 8, 16, 32, 64, 128, 256, 512]
        :param stack_size: integer, 5 = stack[layer1, layer2, layer3, layer4, layer5]
        :param in_channels: number of channels for input data. skip channel is same as input channel
        :param res_channels: number of residual channel for input, output
        :return:
        """
        super(WaveNetPanoramico, self).__init__()

        self.receptive_fields = self.calc_receptive_fields(layer_size, stack_size)

        self.causal = CausalConv1d(in_channels, res_channels)

        self.res_stack = ResidualStack(layer_size, stack_size, res_channels, in_channels)

        self.densnet = DensNet(in_channels)

        self.loss_fun = nn.MSELoss()
        inp = layer_size*stack_size+res_channels+1
        inp2 = math.floor(inp/2)
        self.convFinal1 = nn.Conv1d(in_channels=layer_size*stack_size+res_channels+1,
                                   out_channels=inp2,
                                   kernel_size=15, stride=1,
                                   dilation=1, padding=7, bias=True).to(device)
        self.convFinal2 = nn.Conv1d(in_channels=inp2,
                                   out_channels=1,
                                   kernel_size=15, stride=1,
                                   dilation=1, padding=7, bias=True).to(device)

        self.convFinal3 = nn.Conv1d(in_channels=1,
                                   out_channels=1,
                                   kernel_size=101, stride=1,
                                   dilation=1, padding=50, bias=False).to(device)


        self.relu = nn.ReLU()

    @staticmethod
    def calc_receptive_fields(layer_size, stack_size):
        layers = [2 ** i for i in range(0, layer_size)] * stack_size
        num_receptive_fields = np.sum(layers)

        return int(num_receptive_fields)

    def calc_output_size(self, x):
        output_size = int(x.size(2)) - self.receptive_fields

        #self.check_input_size(x, output_size)

        return inputSize

    def check_input_size(self, x, output_size):
        if output_size < 1:
            raise InputSizeError(int(x.size(2)), self.receptive_fields, output_size)

    def forward(self, x):
        """
        The size of timestep(3rd dimention) has to be bigger than receptive fields
        :param x: Tensor[batch, timestep, channels]
        :return: Tensor[batch, timestep, channels]
        """
        output = x#.transpose(1, 2)
        copiaConvolucionFinal = x.clone()

        output_size = self.calc_output_size(output)

        output = self.causal(output)

        skip_connections = self.res_stack(output, output_size)



        #en vez de sumar skip connections hago convolucion
        #print(skip_connections.shape)
        #output = torch.sum(skip_connections, dim=0)
        skip_connections_squeezed = skip_connections.squeeze()
        # Check the shape
        # Swap the first two dimensions
        skip_connections_squeezed = skip_connections_squeezed.transpose(0, 1)
        output = torch.cat((output, copiaConvolucionFinal), dim=1)

        output = torch.cat((output, skip_connections_squeezed), dim=1)

        output = self.convFinal1(output)
        output = self.relu(output)
        output = self.convFinal2(output)
        #output = self.relu(output)
        #output = self.convFinal3(output)

        #output = self.densnet(output)

        return output#.transpose(1, 2).contiguous()

In [None]:
class PostNetSimple(pl.LightningModule):
  def __init__(self, layers=12):
    super(PostNetSimple, self).__init__()
    self.convInicial = nn.Conv1d(in_channels=1,
                            out_channels=128,
                            kernel_size=33, stride=1,
                            dilation=1, padding=16, bias=True).to("cuda")
    self.totalLayers = layers
    self.convs = []
    for conv in range(0, layers):
      self.convs.append(
          nn.Conv1d(in_channels=128,
                            out_channels=128,
                            kernel_size=33, stride=1,
                            dilation=1, padding=16, bias=True).to("cuda")
      )

    self.tan = nn.Tanh()

  def forward(self, x):
    x = self.convInicial(x)
    x = self.tan(x)
    for conv in self.convs:
      x = conv(x)
      x = self.tan(x)
    return x

In [1]:
from zipfile import ZipFile
directoryBase="/home/afridman"
# loading the temp.zip and creating a zip object
with ZipFile(directoryBase+ '/extra/audiosPaises2.zip', 'r') as zObject:
  
    # Extracting all the members of the zip 
    # into a specific location.
    zObject.extractall(
        path=directoryBase+ '/extra')

In [20]:
import torch
import torch.nn.functional as F

def add_impulse_response(audio_signal, impulse_response):
    # Ensure both audio and impulse response have the same device (CPU or GPU)
    impulse_response = impulse_response.to(audio_signal.device)
    
    # Apply the impulse response using 1D convolution
    output_audio = F.conv1d(audio_signal.unsqueeze(0).unsqueeze(0), impulse_response.unsqueeze(0).unsqueeze(0))
    
    return output_audio.squeeze(0).squeeze(0)

# Example usage
audio_signal = torch.randn(1000)  # Replace with your actual audio data
impulse_response = torch.randn(200)  # Replace with your actual impulse response data

# Apply the impulse response
output_audio = add_impulse_response(audio_signal, impulse_response)


In [21]:
output_audio

tensor([ 7.5687e+00, -9.3573e+00, -7.3352e-01, -6.8507e+00,  9.0835e+00,
         9.5852e+00, -3.5008e+00, -4.0582e+00,  1.5354e+00, -1.5502e+01,
        -1.2079e+01,  7.7278e-02,  1.5648e+00,  6.2965e-01,  1.8557e+01,
        -1.6554e+00, -1.2635e+01, -5.6840e+00,  1.4760e+01, -3.0606e+00,
        -1.7563e+01, -3.2267e+00,  2.6477e+01, -1.8612e+00, -1.2754e+01,
        -2.8657e+00, -9.3973e+00,  8.3042e+00, -2.3713e+00, -1.0660e+01,
        -1.8093e+01, -3.9224e+00,  5.6275e+00,  1.2004e+01, -8.2134e+00,
        -1.2713e+01,  1.3887e+01,  1.9645e+01,  1.3015e+01,  1.6071e+01,
        -8.6562e+00, -2.9002e+01, -2.4760e+01,  1.0437e+01, -3.3004e+01,
        -3.7859e+00,  2.2630e+00,  1.2608e+01,  5.4650e+00,  1.8472e+01,
         2.8007e+01,  1.8272e+01, -1.4786e+00,  1.8029e+01, -1.8204e+01,
        -7.2283e+00, -6.4111e+00, -1.4654e+01, -1.0531e+01,  2.9071e+00,
        -4.4362e+00,  2.6543e+00,  2.7406e+00,  2.1918e+01,  1.7892e+01,
         1.6038e+01, -9.4173e+00,  5.1944e+00, -1.5

In [20]:
print("Ejecutando archivo entrenamiento")
import torch
import torch.nn as nn
import torchaudio
import torch.fft
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
import seaborn as sns
import pandas as pd
import os
from torch.nn.utils.rnn import pad_sequence
import torchvision.transforms as transforms
import torchaudio.transforms as transformsaudio
import datetime
from torch.utils.tensorboard import SummaryWriter
import soundfile as sf
import time
import torch.nn.functional as F
import torchaudio.functional as Fa

import math
import random
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import Callback

from torchmetrics.audio import PerceptualEvaluationSpeechQuality
from torchmetrics.audio import ShortTimeObjectiveIntelligibility
from torchmetrics.audio import SpeechReverberationModulationEnergyRatio
from torchmetrics.audio import SignalNoiseRatio

import librosa
import tensorflow as tf
import io
from PIL import Image
from prettytable import PrettyTable
import json


Ejecutando archivo entrenamiento


In [35]:
snr = 0.2
add_impulse_response = True
learning_rate = 0.001

In [36]:

class AudioCleaningDataset(Dataset):
    def __init__(self, csv_file, audio_dir, noise_dir, noise_csv, reverb_dir, target_length=inputSize, maxRuido=0.001, fixedInterval=False, snr=snr, ir_on = add_impulse_response, nivelDificultadReverb =1):
        self.dataframe = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.noise_df = pd.read_csv(noise_csv)
        self.noise_dir = noise_dir
        self.reverb_dir = reverb_dir
        self.target_length = target_length
        self.resampleo = transformsaudio.Resample(orig_freq=48000, new_freq=16000)  # Resampling
        self.resampleoIR = transformsaudio.Resample(orig_freq=32000, new_freq=16000)  # Resampling
        self.ir = ir_on
        self.maxRuido = maxRuido
        self.snr = snr
        
        self.reverb_files = os.listdir(reverb_dir)
        self.nivelDificultadReverb = nivelDificultadReverb

        self.conv= transformsaudio.Convolve(mode="same")
        self.fixedInterval = fixedInterval
        
    def activate_ir(self):
        self.ir=True
    def change_max_ruido(self, maxRuidopar):
        self.maxRuido = maxRuidopar
    def change_snr(self, snrpar):
        self.snr = snrpar

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        audio_file_name = os.path.join(self.audio_dir, self.dataframe.iloc[idx]['audio_file_name']+".wav")
        waveform, sample_rate = torchaudio.load(audio_file_name)
        #start, end = find_most_energetic_interval(audio_file_name)
        waveformOriginal = self.resampleo(waveform)
        waveformSucia = waveformOriginal.clone()

        if(self.ir):
            IR_file_name = random.choice(self.reverb_files)
            IR_waveform, sample_Rate_IR = torchaudio.load(os.path.join(self.reverb_dir, IR_file_name))
            IR_waveform = self.resampleoIR(IR_waveform)#[:inputSize]
            convd = self.nivelDificultadReverb * Fa.fftconvolve(waveformSucia, IR_waveform, mode="full")


            waveformSucia = waveformSucia + convd[:,:waveformSucia[0].shape[0]]
            waveformSuciaMaxAbs = waveformSucia.abs().max()
            waveformSucia = waveformSucia / (waveformSuciaMaxAbs*1.5)
        
        #Me quedo con los primeros 2 seg
        if(self.fixedInterval):
            waveformOriginal = waveformOriginal[:,:self.target_length]
            waveformSucia = waveformSucia[:,:self.target_length]
        
        padding0 = torch.zeros((1, max(self.target_length - waveformOriginal.size(1),1)))
        padding1 = torch.zeros((1, max(self.target_length - waveformSucia.size(1),1)))

        waveformOriginal = torch.cat((waveformOriginal, padding0), dim=1)
        waveformSucia = torch.cat((waveformSucia, padding1), dim=1)

        waveformOriginal = waveformOriginal[:,:self.target_length]
        waveformSucia = waveformSucia[:,:self.target_length]

        # Load a random noise file
        noise_file_name = random.choice(self.noise_df["file_name_with_directory"])
        
        noise_waveform, sample_Rate_ruido = torchaudio.load(os.path.join(self.noise_dir, noise_file_name))
        # Repeat the noise waveform until it's at least as long as the audio waveform
        while noise_waveform.size(1) < waveformSucia.size(1):
            noise_waveform = torch.cat((noise_waveform, noise_waveform), dim=1)

        # Trim the noise waveform to match the length of the audio waveform
        noise_waveform = noise_waveform[:,:waveformSucia.size(1)]

        waveformSucia = waveformSucia + (noise_waveform * self.snr)
        whitenoise = random.uniform(self.maxRuido / 6, self.maxRuido)
        waveformSucia = waveformSucia + torch.randn_like(waveformOriginal) * whitenoise
        

        return 1*waveformSucia, 1*waveformOriginal



In [37]:
class AudioCleanerNet(pl.LightningModule):
    def __init__(self, inputSize, lr) -> None:
        super().__init__()
        self.layers = 12

        kernelSize = 5
        paddingSize = math.floor(kernelSize/2)

        self.convs = []
        for i in range(0,self.layers,1):
          if(i<(self.layers/2 )):
            self.convs.append(nn.Conv1d(in_channels=(2**i), out_channels=(2**i), kernel_size=kernelSize, stride=1, dilation=1, padding=2).to(device))
            self.convs.append(nn.Conv1d(in_channels=(2**i), out_channels=2**(i+1), kernel_size=kernelSize, stride=1, dilation=2**(i), padding=2*2**(i)).to(device))
          else:
            self.convs.append(nn.Conv1d(in_channels=1+(2**(self.layers-i)), out_channels=1+(2**(self.layers-i)), kernel_size=kernelSize, stride=1, dilation=1, padding=2).to(device))
            self.convs.append(nn.Conv1d(in_channels=1+(2**(self.layers-i)), out_channels=2**(self.layers-i-1), kernel_size=kernelSize, stride=1, dilation=(2**(self.layers-i)), padding=2*(2**(self.layers-i))).to(device))

        self.convFinal = nn.Conv1d(in_channels=2, out_channels=1, kernel_size=3, stride=1, dilation=1, padding=1).to(device)

        self.relu = nn.ReLU()
        self.tan = nn.Tanh()
        self.learning_rate = lr




    def forward(self, x):
        original_x = x.view(x.size(0), -1)  # Flatten the original input
        original_x = original_x.unsqueeze(1)

        for i in range(0,self.layers,1):
          if(i>=(self.layers/2 )):
            combined_tensor = torch.cat((original_x, x), dim=1)
          else:
            combined_tensor = x
          convActual = self.convs[2*i]
          convSig = self.convs[2*i+1]

          x = convActual(combined_tensor)
          x = self.tan(x)
          x = convSig(x)
          x = self.tan(x)


        combined_tensor = torch.cat((original_x, x), dim=1)

        x = self.convFinal(combined_tensor)
        #x = self.tan(x)


        return x

    def configure_optimizers(self, lr=learning_rate):
        learning_rate = self.learning_rate
        optimizer = torch.optim.SGD(self.parameters(), lr=learning_rate)
        return optimizer
        
    def training_step(self, train_batch, batch_idx):
        X, y = train_batch
        X = X.to(device)
        y = y.to(device)

        # forward pass
        y_pred = self.forward(X)

        # compute loss
        lossMel, lossAud = self.loss_fun(y_pred, y)
        loss = lossMel + lossAud
        writer.add_scalar("Loss audio", lossAud, batch_idx)
        writer.add_scalar("Loss melspect", lossMel, batch_idx)
        writer.add_scalar("Loss total", loss, batch_idx)
        self.log('wave_loss', lossAud, prog_bar=True)
        self.log('mel_loss', lossMel, prog_bar=True)
        self.log('train_loss', loss, prog_bar=True)

        rand = random.random()
        if(rand<0.05):
            torch.save(model, modeloNombre)
        return loss
    
    def val_step(self, val_batch, batch_idx):
        X, y = val_batch
        X = X.to(device)
        y = y.to(device)
        # forward pass
        y_pred = model(X)

        # compute loss
        loss = self.loss_fun(y_pred, y)

        return loss

In [38]:
class DilatedCausalConv1d(torch.nn.Module):
    """Dilated Causal Convolution for WaveNet"""
    def __init__(self, channels, dilation=1):
        super(DilatedCausalConv1d, self).__init__()

        self.conv = torch.nn.Conv1d(channels, channels,
                                    kernel_size=3, stride=1,  # Fixed for WaveNet
                                    dilation=dilation,
                                    padding=dilation,  # Fixed for WaveNet dilation
                                    bias=True)  
        if torch.cuda.is_available():
            self.conv = self.conv.to(device)

    def init_weights_for_test(self):
        for m in self.modules():
            if isinstance(m, torch.nn.Conv1d):
                m.weight.data.fill_(1)

    def forward(self, x):
        output = self.conv(x)

        return output


class CausalConv1d(torch.nn.Module):
    """Causal Convolution for WaveNet"""
    def __init__(self, in_channels, out_channels):
        super(CausalConv1d, self).__init__()

        # padding=1 for same size(length) between input and output for causal convolution
        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=3, stride=1, padding=1,
                                    bias=True)  # Fixed for WaveNet but not sure
        if torch.cuda.is_available():
            self.conv = self.conv.to(device)

    def init_weights_for_test(self):
        for m in self.modules():
            if isinstance(m, torch.nn.Conv1d):
                m.weight.data.fill_(1)

    def forward(self, x):
        output = self.conv(x)

        # remove last value for causal convolution
        return output  #[:, :, :-1]


class ResidualBlock(torch.nn.Module):
    def __init__(self, res_channels, skip_channels, dilation):
        """
        Residual block
        :param res_channels: number of residual channel for input, output
        :param skip_channels: number of skip channel for output
        :param dilation:
        """
        super(ResidualBlock, self).__init__()

        self.dilated = DilatedCausalConv1d(res_channels, dilation=dilation)
        self.conv_res = torch.nn.Conv1d(res_channels, res_channels, 1)
        self.conv_skip = torch.nn.Conv1d(res_channels, skip_channels, 1)

        self.gate_tanh = torch.nn.Tanh()
        self.gate_sigmoid = torch.nn.Sigmoid()

        if torch.cuda.is_available():
            self.conv_skip = self.conv_skip.to(device)
            self.conv_res = self.conv_res.to(device)

    def forward(self, x, skip_size):
        """
        :param x:
        :param skip_size: The last output size for loss and prediction
        :return:
        """
        output = self.dilated(x)

        # PixelCNN gate
        gated_tanh = self.gate_tanh(output)
        gated_sigmoid = self.gate_sigmoid(output)
        gated = gated_tanh * gated_sigmoid

        # Residual network
        output = self.conv_res(gated)
        #output = output[:, :, 0:inputSize]

        input_cut = x#[:, :, -output.size(2):]

        output += input_cut

        # Skip connection
        skip = self.conv_skip(gated)
        skip = skip[:, :, -skip_size:]

        return output, skip


class ResidualStack(torch.nn.Module):
    def __init__(self, layer_size, stack_size, res_channels, skip_channels):
        """
        Stack residual blocks by layer and stack size
        :param layer_size: integer, 10 = layer[dilation=1, dilation=2, 4, 8, 16, 32, 64, 128, 256, 512]
        :param stack_size: integer, 5 = stack[layer1, layer2, layer3, layer4, layer5]
        :param res_channels: number of residual channel for input, output
        :param skip_channels: number of skip channel for output
        :return:
        """
        super(ResidualStack, self).__init__()

        self.layer_size = layer_size
        self.stack_size = stack_size

        self.res_blocks = self.stack_res_block(res_channels, skip_channels)

    @staticmethod
    def _residual_block(res_channels, skip_channels, dilation):
        block = ResidualBlock(res_channels, skip_channels, dilation)

        if torch.cuda.device_count() > 1:
            block = torch.nn.DataParallel(block)

        if torch.cuda.is_available():
            block.cuda()

        return block

    def build_dilations(self):
        dilations = []

        # 5 = stack[layer1, layer2, layer3, layer4, layer5]
        for s in range(0, self.stack_size):
            # 10 = layer[dilation=1, dilation=2, 4, 8, 16, 32, 64, 128, 256, 512]
            for l in range(0, self.layer_size):
                dilations.append(2 ** l)

        return dilations

    def stack_res_block(self, res_channels, skip_channels):
        """
        Prepare dilated convolution blocks by layer and stack size
        :return:
        """
        res_blocks = []
        dilations = self.build_dilations()

        for dilation in dilations:
            block = self._residual_block(res_channels, skip_channels, dilation)
            res_blocks.append(block)

        return res_blocks

    def forward(self, x, skip_size):
        """
        :param x:
        :param skip_size: The last output size for loss and prediction
        :return:
        """
        output = x
        skip_connections = []

        for res_block in self.res_blocks:
            # output is the next input
            output, skip = res_block(output, skip_size)

            skip_connections.append(skip)


        return torch.stack(skip_connections)


class DensNet(torch.nn.Module):
    def __init__(self, channels):
        """
        The last network of WaveNet
        :param channels: number of channels for input and output
        :return:
        """
        super(DensNet, self).__init__()

        self.conv1 = torch.nn.Conv1d(channels, channels, kernel_size=3, stride=1, padding=1)
        self.conv2 = torch.nn.Conv1d(channels, channels, kernel_size=3, stride=1, padding=1)

        self.tan = torch.nn.Tanh()
        self.relu = torch.nn.ReLU()

        if torch.cuda.is_available():
            self.conv1 = self.conv1.to(device)
            self.conv2 = self.conv2.to(device)

    def forward(self, x):
        output = self.conv1(x)
        output1 = self.relu(output)
        output = self.conv2(output)

        return output




    
class WaveNet(pl.LightningModule):
    def __init__(self, layer_size, stack_size, in_channels, res_channels, learning_rate):
        """
        Stack residual blocks by layer and stack size
        :param layer_size: integer, 10 = layer[dilation=1, dilation=2, 4, 8, 16, 32, 64, 128, 256, 512]
        :param stack_size: integer, 5 = stack[layer1, layer2, layer3, layer4, layer5]
        :param in_channels: number of channels for input data. skip channel is same as input channel
        :param res_channels: number of residual channel for input, output
        :return:
        """
        super(WaveNet, self).__init__()

        self.receptive_fields = self.calc_receptive_fields(layer_size, stack_size)

        self.causal = CausalConv1d(in_channels, res_channels)

        self.res_stack = ResidualStack(layer_size, stack_size, res_channels, in_channels)

        self.densnet = DensNet(in_channels)

        self.loss_fun = CombinedLoss()
        
        self.learning_rate = learning_rate
        
        self.srmr = SpeechReverberationModulationEnergyRatio(16000)
        self.wb_pesq = PerceptualEvaluationSpeechQuality(16000, 'wb')
        self.stoi = ShortTimeObjectiveIntelligibility(16000, False)
        self.snr = SignalNoiseRatio(16000)

        self.ValLosses = []
        self.ValLoss = 0
        self.valMel1Loss = 0
        self.valMel2Loss = 0
        self.valL1Loss = 0
        self.valPropLoss = 0

        self.TrainLoss = 0
        
        
        self.epochNumberVal = 0

        self.PESQValLoss =0
        self.STOIValLoss = 0
        self.SRMRValLoss = 0
        self.FWSSNRValLoss = 0
        
        self.PESQValLosses =[]
        self.STOIValLosses = []
        self.SRMRValLosses = []
        self.FWSSNRValLosses = []


    @staticmethod
    def calc_receptive_fields(layer_size, stack_size):
        layers = [2 ** i for i in range(0, layer_size)] * stack_size
        num_receptive_fields = np.sum(layers)

        return int(num_receptive_fields)
    
    def change_loss_function(self, loss_fun):
        self.loss_fun = loss_fun

    def calc_output_size(self, x):
        output_size = int(x.size(2)) - self.receptive_fields

        #self.check_input_size(x, output_size)

        return inputSize

    def check_input_size(self, x, output_size):
        if output_size < 1:
            raise InputSizeError(int(x.size(2)), self.receptive_fields, output_size)

    def forward(self, x):
        """
        The size of timestep(3rd dimention) has to be bigger than receptive fields
        :param x: Tensor[batch, timestep, channels]
        :return: Tensor[batch, timestep, channels]
        """
        output = x#.transpose(1, 2)
        
        output_size = self.calc_output_size(output)

        output = self.causal(output)

        skip_connections = self.res_stack(output, output_size)

        output = torch.sum(skip_connections, dim=0)


        output = self.densnet(output)
        return output#.transpose(1, 2).contiguous()

    
    def configure_optimizers(self, lr=0.001):
        learning_rate = self.learning_rate
        optimizer = torch.optim.AdamW(self.parameters(), lr=learning_rate)
        return optimizer
    
    def training_step(self, train_batch, batch_idx):
        batch_idx = self.epochNumberVal
        
        if(len(self.ValLosses)>0):
            self.ValLoss = np.array(self.ValLosses).mean()
            self.ValLosses = []
           
        
        X, y = train_batch
        X = X.to(device)
        y = y.to(device)

        # forward pass
        y_pred = self.forward(X)

        # compute loss
        lossMel1, lossMel2, customLoss, lossAud = self.loss_fun(y_pred, y)
        loss = lossMel1 * weightOfMelspecLoss1 + lossMel2 * weightOfMelspecLoss2 + lossAud *weightOfL1Loss + customLoss *weightOfCustomLoss
        #writer.add_scalar("Loss audio", lossAud, batch_idx)
        #writer.add_scalar("Loss proportion", customLoss, batch_idx)
        #writer.add_scalar("Loss melspect1", lossMel1, batch_idx)
        #writer.add_scalar("Loss melspect2", lossMel2, batch_idx)


        trainLosssrmr = self.srmr(y_pred)
        trainLossstoi = self.stoi(y_pred, y)
        #trainLosspesq = self.wb_pesq(y_pred, y)
        trainLosssnr = self.snr(y_pred, y)

        

        writer.add_scalars("STOI", {'train':trainLossstoi,
                        'validation':trainLossstoi
                        }, batch_idx)
        writer.add_scalars("SRMR", {'train':trainLosssrmr,
                        'validation':trainLosssrmr
                        }, batch_idx)
        writer.add_scalars("SNR", {'train':trainLosssnr,
                        'validation':trainLosssnr
                        }, batch_idx)
        
        #writer.add_scalars("PESQ", {'train':trainLosspesq,
        #                                'validation':trainLosspesq
        #                               }, batch_idx)
        
        
        writer.add_scalars("Loss L1", {'train':lossAud,
                                'validation':self.valL1Loss
                                }, batch_idx)
        writer.add_scalars("Loss prop", {'train':customLoss,
                        'validation':self.valPropLoss
                        }, batch_idx)
        writer.add_scalars("Loss melspect1", {'train':lossMel1,
                        'validation':self.valMel1Loss
                        }, batch_idx)
        writer.add_scalars("Loss melspect2", {'train':lossMel2,
                        'validation':self.valMel2Loss
                        }, batch_idx)
        writer.add_scalars("Loss total", {'train':loss,
                                'validation':self.ValLoss
                                }, batch_idx)
        self.log('val_loss', self.ValLoss, prog_bar=True)
        self.log('train_loss', loss, prog_bar=True)
        #self.log('train_loss', loss, prog_bar=True)
        #self.log('wave_loss', lossAud, prog_bar=True)
        #self.log('mel_loss1', lossMel1, prog_bar=True)
        #self.log('mel_loss2', lossMel2, prog_bar=True)
        #self.log('prop_loss', customLoss, prog_bar=True)
        #self.log('val_loss', self.ValLoss, prog_bar=True)
        #self.log('total_loss', loss, prog_bar=True)
        self.epochNumberVal =  self.epochNumberVal +1
        rand = random.random()
        if((self.epochNumberVal % saveModelIntervalEpochs) == 0):
            torch.save(model, modeloNombre)
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        X, y = val_batch
        X = X.to(device)
        y = y.to(device)


        # forward pass
        y_pred = self.forward(X)
        
        if(batch_idx<howManyAudiosValidationsSave):
            #for i in range(X.shape[0]):
            audio_clip = X[0].cpu().numpy()
            writer.add_audio(f'audio_clip_{batch_idx}_Sucio', audio_clip, global_step=batch_idx, sample_rate=16000)
            audio_clip = y[0].cpu().numpy()
            writer.add_audio(f'audio_clip_{batch_idx}_Original', audio_clip, global_step=batch_idx, sample_rate=16000)
            audio_clip = y_pred[0].cpu()#.numpy()
            writer.add_audio(f'audio_clip_{batch_idx}_{self.epochNumberVal}', audio_clip, global_step=batch_idx,sample_rate=16000)
            generatePlots(audio_clip[0], y[0].cpu(), X[0].cpu(), batch_idx, self.epochNumberVal)
                
        # compute loss
        lossMel1, lossMel2, customLoss, lossAud = self.loss_fun(y_pred, y)
        loss = lossMel1*weightOfMelspecLoss1 + lossMel2*weightOfMelspecLoss2 + lossAud*weightOfL1Loss + customLoss*weightOfCustomLoss
        self.log('val_loss', loss) 
        self.ValLosses.append(loss.item())
        self.valMel1Loss = lossMel1
        self.valMel2Loss = lossMel2
        self.valL1Loss = lossAud
        self.valPropLoss = customLoss
        return loss

In [41]:
directoryBase = "/home/afridman"
locationTrainFile = "/wavenet/CSV/newCSV/audiosTrain.csv"
locationValidationFile = "/wavenet/CSV/newCSV/audiosVal.csv"
maxRuido = 0.005
batch_size = 1

In [42]:
traindataset = AudioCleaningDataset(directoryBase+locationTrainFile,
                                    directoryBase+'/extra/audiosPaises',
                                    directoryBase+'/extra/ruidosDivididos',
                                    directoryBase+"/wavenet/CSV/ruido_train.csv",
                                    directoryBase+"/extra/irDivididos/irtrain", 
                                    maxRuido=maxRuido, fixedInterval=False)


valdataset = AudioCleaningDataset(directoryBase+locationValidationFile, 
                                  directoryBase+'/extra/audiosPaises', 
                                 directoryBase+'/extra/ruidosDivididos',
                                  directoryBase+"/wavenet/CSV/ruido_validation.csv",
                                  directoryBase+"/extra/irDivididos/irval",
                                  maxRuido=maxRuido, fixedInterval=True)



traindataloader = DataLoader(traindataset, batch_size=batch_size, shuffle=True, num_workers=16)


In [44]:
model = WaveNet(layer_size=10, stack_size=2, in_channels=1, res_channels=128, learning_rate=0.01)
#model = torch.load(modeloNombre, map_location=torch.device('cuda'))
model = model.to(device)
waveform, label = next(iter(traindataloader))

In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('/home/afridman/wavenet/CSV/newCSV/audiosTest.csv')

# Take a sample (in this case, let's take 10 random rows)
sample_df = df.sample(n=100)

# Save the sample to a new CSV file
sample_df.to_csv('/home/afridman/wavenet/CSV/newCSV/audiosMicroTest.csv', index=False)


In [24]:
import torch
import torch.nn as nn

class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        
        # Define the layers with specified configurations
        self.layers = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=15, stride=1, groups=1),
            nn.LeakyReLU(0.2),
            
            nn.Conv1d(in_channels=16, out_channels=64, kernel_size=41, stride=4, groups=4),
            nn.LeakyReLU(0.2),
            
            nn.Conv1d(in_channels=64, out_channels=256, kernel_size=41, stride=4, groups=16),
            nn.LeakyReLU(0.2),
            
            nn.Conv1d(in_channels=256, out_channels=1024, kernel_size=41, stride=4, groups=64),
            nn.LeakyReLU(0.2),
            
            nn.Conv1d(in_channels=1024, out_channels=1024, kernel_size=41, stride=4, groups=256),
            nn.LeakyReLU(0.2),
            
            nn.Conv1d(in_channels=1024, out_channels=1024, kernel_size=5, stride=1, groups=1),
            nn.LeakyReLU(0.2),
            
            nn.Conv1d(in_channels=1024, out_channels=1, kernel_size=3, stride=1, groups=1),
            
            nn.Linear(106, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)

# Instantiate the Discriminator
discriminator = Discriminator()

# Test with a random input
input_tensor = torch.randn(1, 1, 32000)  # Assuming input size is (batch_size, channels, sequence_length)
output_tensor = discriminator(input_tensor)
print(output_tensor.shape)

torch.Size([1, 1, 1])


In [40]:
(output_tensor[0][0][0])

tensor(0.5056, grad_fn=<SelectBackward0>)