# Tratar datos

Tratar los datos ajustándolos al formato legible por el modelo

In [3]:
import os
import json
import glob
import numpy as np
import scipy.io.wavfile
import torch
import torch.utils.data as data
import torchvision.transforms as transforms
import torchaudio
import sys
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from IPython.display import Audio, display

# ESPECTROGRAMA NUEVO

In [21]:
import torch
import torchaudio
import torch.utils.data as data
import torchvision.transforms as transforms
import numpy as np
#from nsynth import NSynth, SignalTransformation
import sys
import librosa.display
import json
import matplotlib.pyplot as plt
torch.cuda.is_available()

False

In [22]:
class NSynth(data.Dataset):

    """PyTorch dataset for the NSynth dataset.

    Args:
        root: root dir containing examples.json and an ``audio`` directory
            with wav files.
        transform (callable, optional): A function/transform that takes in
            a sample and returns a transformed version.
        target_transform (callable, optional): A function/transform that
            takes in the target and transforms it.
        blacklist_pattern: list of strings used to blacklist dataset
            elements. If one of the strings is present in the audio file
            name, this sample together with its metadata is removed from
            the dataset.
        categorical_field_list: list of strings. Each string is a metadata
            key like instrument_family that will be used as a
            classification target. Each field value is encoded as an
            integer using sklearn's LabelEncoder.
    """

    def __init__(self, root, transform=None, target_transform=None,
                 blacklist_pattern=[],
                 categorical_field_list=["instrument_family"]):
        """Index the wav files, load the metadata and fit the encoders."""
        assert(isinstance(root, str))
        assert(isinstance(blacklist_pattern, list))
        assert(isinstance(categorical_field_list, list))
        self.root = root
        # glob yields files in an arbitrary, OS-dependent order; sorting
        # makes a given index refer to the same file on every run.
        self.filenames = sorted(glob.glob(os.path.join(root, "audio/*.wav")))
        with open(os.path.join(root, "examples.json"), "r") as f:
            self.json_data = json.load(f)
        for pattern in blacklist_pattern:
            self.filenames, self.json_data = self.blacklist(
                self.filenames, self.json_data, pattern)
        # Copy the list: the default argument is one object shared by every
        # instance, so it must not be stored by reference.
        self.categorical_field_list = list(categorical_field_list)
        # One fitted encoder per categorical field, in the same order.
        self.le = []
        for field in self.categorical_field_list:
            encoder = LabelEncoder()
            encoder.fit([value[field] for value in self.json_data.values()])
            self.le.append(encoder)
        self.transform = transform
        self.target_transform = target_transform

    def blacklist(self, filenames, json_data, pattern):
        """Drop every file and metadata entry whose name contains ``pattern``.

        Files are matched on their base name only, so a pattern that happens
        to occur in ``root`` or another directory component does not discard
        the whole dataset while leaving the metadata untouched.

        Returns:
            tuple: (filtered filenames list, filtered metadata dict)
        """
        filenames = [filename for filename in filenames
                     if pattern not in os.path.basename(filename)]
        json_data = {
            key: value for key, value in json_data.items()
            if pattern not in key
        }
        return filenames, json_data

    def __len__(self):
        """Number of wav files kept after blacklisting."""
        return len(self.filenames)

    def _metadata(self, index):
        """Return the examples.json entry for the file at ``index``.

        The metadata key is the wav file name without directory or extension.
        """
        name = self.filenames[index]
        return self.json_data[os.path.splitext(os.path.basename(name))[0]]

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            list: [audio sample, *categorical targets, json_data]
        """
        name = self.filenames[index]
        _, sample = scipy.io.wavfile.read(name)
        target = self._metadata(index)
        categorical_target = [
            le.transform([target[field]])[0]
            for field, le in zip(self.categorical_field_list, self.le)]
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return [sample, *categorical_target, target]

    def getAudioCharacteristics(self, index):
        """Return the wav file name and source label of the item at ``index``.

        The metadata is read directly instead of going through
        ``__getitem__``: the position of the metadata in the returned list
        depends on how many categorical fields are configured (a fixed index
        of 3 is out of range with the default single field), and decoding the
        audio is unnecessary here.

        Returns:
            dict: ``{"fichero": "<note_str>.wav", "familia": <instrument_source>}``
        """
        target = self._metadata(index)
        # NOTE(review): "familia" is filled from instrument_source, as before;
        # confirm whether instrument_family was intended.
        return {"fichero": target["note_str"]+".wav","familia":target["instrument_source"]}

    # def getSignal(self,index):
    #     #La ruta va desde el fichero donde se encuentra el jupyter notebook
    #     return torchaudio.load("nsynth-valid/audio/"+self.getAudioCharacteristics(index)["fichero"])

    # def getLabel(self,index):
    #     return self.getAudioCharacteristics(index)["familia"]

    # def samplingTransform(self,signal,sample_rate):
    #     #Usamos sample_rate 16000 por defecto
    #     if sample_rate != self.TARGET_SAMPLE_RATE:
    #         signal = torchaudio.transforms.signalResample(sample_rate,self.TARGET_SAMPLE_RATE)(signal)
    #     return signal
            

    # def monoTransform(self,signal):
    #     if signal.shape[0] > 1:
    #         signal = torch.mean(signal, dim=0,keepdim=True)
    #     return signal
    
    # def setTargetSampleRate(self,sample_rate):
    #     self.TARGET_SAMPLE_RATE=sample_rate
    #     return 0

    # def setTransformation(self,transformation):
    #     self.TRANSFORMATION=transformation
    #     return 0

In [23]:
class SignalTransformation():
    """Load an audio clip and turn it into a spectrogram.

    An instance wraps one file name (relative to ``AUDIO_DIR``) and its label.
    The ``...FromSignal`` classmethods work on an already loaded waveform
    instead. Dimension 0 of a waveform is treated as the channel axis.

    Waveforms are moved to ``device`` before a transform is applied, because
    the default transform lives there; a transform installed with
    ``setTransformation`` is moved to ``device`` as well when it supports
    ``.to``.
    """

    # Device the transforms run on: first CUDA GPU when available, else CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Default target sample rate.
    TARGET_SAMPLE_RATE=16000
    # Directory (relative to the notebook) that getSignal loads files from.
    AUDIO_DIR="nsynth-valid/audio/"
    # Mel spectrograms by default: the material is musical instruments, and the
    # mel scale captures melodic frequencies in a form that is understandable
    # both for humans and for the model.
    TRANSFORMATION=torchaudio.transforms.MelSpectrogram(sample_rate=TARGET_SAMPLE_RATE,
                                                      n_fft=1024,
                                                      hop_length=512,
                                                      n_mels=64).to(device)

    def __init__(self, fichero, label):
        """Store the audio file name (relative to ``AUDIO_DIR``) and its label."""
        self.fichero=fichero
        self.label=label

    def generarSpectrograma(self):
        """Return the spectrogram of this instance's file.

        The file is loaded, down-mixed to mono, resampled to the target rate
        and passed through ``TRANSFORMATION``.
        """
        signal = self.getSignalTuned().to(self.device)
        return self.TRANSFORMATION(signal)

    @classmethod
    def generarSpectrogramaFromSignal(cls, signal, sample_rate=None):
        """Return the ``TRANSFORMATION`` spectrogram of a loaded waveform.

        Args:
            signal: waveform tensor, channels on dimension 0.
            sample_rate: sample rate of ``signal``. When omitted the signal
                is assumed to already be at ``TARGET_SAMPLE_RATE`` and is not
                resampled (the previous behaviour).
        """
        if sample_rate is None:
            sample_rate = cls.TARGET_SAMPLE_RATE
        signal = cls.monoTransformClass(signal)
        signal = cls.samplingTransformClass(signal, sample_rate)
        return cls.TRANSFORMATION(signal.to(cls.device))

    @classmethod
    def generarSTFTFromSignal(cls, signal, sample_rate=None):
        """Return the default ``torchaudio`` Spectrogram of a loaded waveform.

        Args:
            signal: waveform tensor, channels on dimension 0.
            sample_rate: sample rate of ``signal``. When omitted the signal
                is assumed to already be at ``TARGET_SAMPLE_RATE``.
        """
        if sample_rate is None:
            sample_rate = cls.TARGET_SAMPLE_RATE
        signal = cls.monoTransformClass(signal)
        signal = cls.samplingTransformClass(signal, sample_rate)
        stft = torchaudio.transforms.Spectrogram().to(cls.device)
        return stft(signal.to(cls.device))

    def getSignal(self):
        """Load the file; returns what ``torchaudio.load`` returns.

        The path is ``AUDIO_DIR`` followed by the file name, relative to the
        directory the notebook runs in.
        """
        return torchaudio.load(self.AUDIO_DIR+self.fichero)

    def getSignalTuned(self):
        """Load the file, down-mix it to mono and resample it to the target rate."""
        signal, sr  = self.getSignal()
        signal = self.monoTransform(signal)
        # Pass the file's real sample rate; passing the target rate here would
        # make the comparison in samplingTransform always equal, so the signal
        # would never be resampled.
        signal = self.samplingTransform(signal, sr)
        return signal

    def getLabel(self):
        """Return the label given at construction."""
        return self.label

    def getSignalAndLabel(self):
        """Return ``(getSignal(), getLabel())``."""
        return self.getSignal(), self.getLabel()

    @staticmethod
    def _resample(signal, orig_rate, target_rate):
        """Resample ``signal`` from ``orig_rate`` to ``target_rate`` if they differ."""
        if orig_rate != target_rate:
            resampler = torchaudio.transforms.Resample(orig_rate, target_rate)
            # Keep the resampler on the same device as the signal.
            signal = resampler.to(signal.device)(signal)
        return signal

    def samplingTransform(self,signal,sample_rate):
        """Resample ``signal`` from ``sample_rate`` to this instance's target rate."""
        return self._resample(signal, sample_rate, self.TARGET_SAMPLE_RATE)

    @classmethod
    def samplingTransformClass(cls,signal,sample_rate):
        """Resample ``signal`` from ``sample_rate`` to the class target rate."""
        return cls._resample(signal, sample_rate, cls.TARGET_SAMPLE_RATE)

    @classmethod
    def monoTransformClass(cls, signal):
        """Average all channels (dimension 0) into one, keeping the dimension."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0,keepdim=True)
        return signal

    def monoTransform(self,signal):
        """Instance-level alias of ``monoTransformClass``."""
        return self.monoTransformClass(signal)

    def setTargetSampleRate(self,sample_rate):
        """Override the target sample rate for this instance only; returns 0.

        NOTE(review): ``TRANSFORMATION`` is not rebuilt, so the default mel
        transform keeps the sample rate it was created with — confirm whether
        callers are expected to also call ``setTransformation``.
        """
        self.TARGET_SAMPLE_RATE=sample_rate
        return 0

    def setTransformation(self,transformation):
        """Override the transform used by ``generarSpectrograma``; returns 0.

        The override applies to this instance only. Transforms exposing
        ``.to`` are moved to ``device`` so they match the waveform, which is
        moved there before the transform is applied.
        """
        if hasattr(transformation, "to"):
            transformation = transformation.to(self.device)
        self.TRANSFORMATION=transformation
        return 0


In [24]:
# Prefer the first CUDA GPU; fall back to the CPU when none is available.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


In [38]:
def separar(audio, output_dir="./nsynth-test/guitarra6", window=64000, hop=32000):
    """Split an audio file into overlapping windows and save one image per window.

    Each window is converted with
    ``SignalTransformation.generarSpectrogramaFromSignal``, scaled to decibels
    and written as ``<output_dir>/<i>.png``, where ``i`` counts windows from 0.
    The total number of samples and the length of every window are printed.

    Args:
        audio: path of the audio file to load with ``torchaudio.load``.
        output_dir: directory the images are written to; created if missing.
        window: window length in samples.
        hop: distance in samples between the starts of consecutive windows.

    Raises:
        ValueError: if ``window`` or ``hop`` is not positive.

    NOTE(review): the file's own sample rate is not used; windows are cut in
    samples of the waveform as loaded — confirm inputs are at the rate the
    spectrogram transform expects.
    """
    if window <= 0 or hop <= 0:
        # A non-positive hop would never advance through the signal.
        raise ValueError("window and hop must be positive")

    waveform, _ = torchaudio.load(audio)
    dimension = waveform.size(dim=1)
    print(dimension)

    os.makedirs(output_dir, exist_ok=True)
    to_db = torchaudio.transforms.AmplitudeToDB()

    # Start at sample 0 so that the first sample is not skipped and a clip of
    # exactly `window` samples still yields one window. The range stops at the
    # last start for which a full window fits inside the signal.
    for i, start in enumerate(range(0, dimension - window + 1, hop)):
        prueba = torch.narrow(waveform, dim=1, start=start, length=window)
        print(prueba.size(dim=1))
        prueba = SignalTransformation.generarSpectrogramaFromSignal(prueba)
        prueba = to_db(prueba).detach().cpu().numpy()

        fig, ax = plt.subplots()
        try:
            librosa.display.specshow(prueba[0], cmap='magma', ax=ax)
            nombre = os.path.join(output_dir, "{}.png".format(i))
            fig.savefig(nombre, bbox_inches='tight')
        finally:
            # Always release the figure, even if drawing or saving fails.
            plt.close(fig)


#separar("nsynth-test/audio/bass_electronic_018-026-025.wav")
#separar("nsynth-test/Prueba-1.wav")
#separar("nsynth-test/Prueba-2.wav")
separar("nsynth-test/Guitarra6.wav")

452608
64000
64000
64000
64000
64000
64000
64000


Crear el espectrograma entero de la muestra para comprobar que los espectrogramas generados sirven.

In [37]:
# Render the spectrogram of the whole recording (no windowing) and save it as
# image number 1000 in the output folder. The global names used by the
# original cell (prueba, sample_rate, shape, i, nombre) are kept in case
# other cells read them.
prueba, sample_rate = torchaudio.load("nsynth-test/Guitarra6.wav")
prueba = SignalTransformation.generarSpectrogramaFromSignal(prueba)
shape = prueba.shape
prueba = torchaudio.transforms.AmplitudeToDB()(prueba)
# detach() replaces the deprecated .data attribute; the resulting array is the same.
prueba = prueba.detach().cpu().numpy()
i=1000
nombre = "./nsynth-test/guitarra6/{}.png".format(i)
# savefig fails when the target folder does not exist, so create it first.
os.makedirs(os.path.dirname(nombre), exist_ok=True)
librosa.display.specshow(prueba[0],cmap='magma')
plt.savefig(nombre, bbox_inches='tight')
plt.close()