In [1]:
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
def create_spectrogram(file_path):
    audio_array, sample_rate= librosa.load(file_path)
    spec = librosa.feature.melspectrogram(y=audio_array,
                                    sr=sample_rate, 
                                        n_fft=2048, 
                                        hop_length=512, 
                                        win_length=None, 
                                        window='hann', 
                                        center=True, 
                                        pad_mode='reflect', 
                                        power=1.0,
                                    n_mels=128)
    log_spec = librosa.power_to_db(spec, ref=np.max)
    return spec,sample_rate
 

def reverse_spectrogram(log_spec,sample_rate, output_path):
    #reversed_log=librosa.db_to_power(log_spec)
    # step3 converting mel-spectrogrma back to wav file
    res = librosa.feature.inverse.mel_to_audio(log_spec, 
                                        sr=sample_rate, 
                                        n_fft=2048, 
                                        hop_length=512, 
                                        win_length=None, 
                                        window='hann', 
                                        center=True, 
                                        pad_mode='reflect', 
                                        power=1.0, 
                                        n_iter=32)

    # step4 - save it as a wav file
    import soundfile as sf
    sf.write(output_path, res, sample_rate)

# Probando con un archivo limpio y uno con overlay de ruido

In [3]:
clean_file=r'wavs\clean\clnsp0.wav'
noise_file=r'wavs\noisy\clnsp0.wav'
test_clean_spec,test_clean_sr =create_spectrogram(clean_file)
test_noisy_spec,test_noisy_sr=create_spectrogram(noise_file)

In [39]:
type(test_clean_spec)

numpy.ndarray

In [4]:
test_clean_spec.shape,test_noisy_spec.shape

((128, 475), (128, 475))

In [5]:
test_clean_sr,test_noisy_sr

(22050, 22050)

In [6]:
# devolviendo a .wav
reverse_spectrogram(test_clean_spec,test_clean_sr,'test_clean_spec.wav')
reverse_spectrogram(test_noisy_spec,test_noisy_sr,'test_noisy_spec.wav')

# Generador de archivos en masa usando batcheador

In [7]:
from math import ceil
def create_spec_from_dir(dir_path,top_x=200):
    #dir_path directorio o folder donde estan los wavs
    # top_x opcional cuantos archivos maximo desea usar, dejar vacio para usarlos todos
    dir = os.listdir(dir_path)
    spec_list=[]
    s_rates=[]
    for i, file in enumerate(dir):
        try:
            if i<=top_x:
                input_file = os.path.join(dir_path, file)
                ms,sr=create_spectrogram(input_file)
                num_batches=ceil(ms.shape[1]/128) if ms.shape[1]>128 else 1
                #print(ms.shape)
                ms=np.resize(ms,(ms.shape[0],128*num_batches))
                batches=np.hsplit(ms,num_batches)
                for batch in batches:
                    spec_list.append(batch)
                    s_rates.append(sr)
        except:
            print(file," file skipped")
    
    return spec_list,s_rates

In [8]:
clean_specs,clean_s_rates=create_spec_from_dir(r'wavs\clean',24000)

In [9]:
noisy_specs,noisy_s_rates=create_spec_from_dir(r'wavs\noisy',24000)

In [10]:
len(clean_specs),len(noisy_specs)

(37301, 37297)

In [11]:
clean_specs[0].shape

(128, 128)

In [12]:
min(clean_s_rates),max(clean_s_rates)

(22050, 22050)

In [13]:
min(noisy_s_rates),max(noisy_s_rates)

(22050, 22050)

In [14]:
def standardize_specs(clean_specs,noisy_specs):
    
    #getting max lenght of all audios
    max_y=0
    
    for i,j in zip(clean_specs,noisy_specs):
        if i.shape[1]>max_y: max_y=i.shape[1]
        if j.shape[1]>max_y: max_y=j.shape[1]
    print(max_y)
    # reshapping all spectrogram

    for index,s in enumerate(clean_specs):
        try:
            clean_specs[index]=np.resize(s,(s.shape[0],max_y))
        except Exception as e:
            print(f"skipping clean {index} {e}")
    
    clean_specs=np.array(clean_specs)
    clean_specs=clean_specs.reshape(-1,s.shape[0],max_y,1)

    for index,s in enumerate(noisy_specs):
        try:
            noisy_specs[index]=np.resize(s,(s.shape[0],max_y))
        except Exception as e:
            print(f"skipping noise {index} {e}")
            
    noisy_specs=np.array(noisy_specs)
    noisy_specs=noisy_specs.reshape(-1,s.shape[0],max_y,1)

    return clean_specs,noisy_specs

In [15]:
s_clean_specs,s_noisy_specs=standardize_specs(clean_specs,noisy_specs)

128


In [16]:
type(s_clean_specs)

numpy.ndarray

In [17]:
s_clean_specs.shape

(37301, 128, 128, 1)

In [18]:
samples=min(s_clean_specs.shape[0],s_noisy_specs.shape[0])
s_clean_specs=s_clean_specs[:samples]
s_noisy_specs=s_noisy_specs[:samples]

In [19]:
from tensorflow.keras.utils import Sequence
import numpy as np   

class DataGenerator(Sequence):
    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        return batch_x, batch_y

train_gen = DataGenerator(s_clean_specs, s_clean_specs, 32)
#test_gen = DataGenerator(X_test, y_test, 32)

In [20]:
# step1 - converting a wav file to numpy array and then converting that to mel-spectrogram
scale_file=r'wavs\clean\clnsp0.wav'
my_audio_as_np_array, my_sample_rate= librosa.load(scale_file)

# step2 - converting audio np array to spectrogram
spec = librosa.feature.melspectrogram(y=my_audio_as_np_array,
                                        sr=my_sample_rate, 
                                            n_fft=2048, 
                                            hop_length=512, 
                                            win_length=None, 
                                            window='hann', 
                                            center=True, 
                                            pad_mode='reflect', 
                                            power=2.0,
                                     n_mels=128)
log_spec = librosa.power_to_db(spec)
reversed_log=librosa.db_to_power(log_spec)
# step3 converting mel-spectrogrma back to wav file
res = librosa.feature.inverse.mel_to_audio(reversed_log, 
                                           sr=my_sample_rate, 
                                           n_fft=2048, 
                                           hop_length=512, 
                                           win_length=None, 
                                           window='hann', 
                                           center=True, 
                                           pad_mode='reflect', 
                                           power=2.0, 
                                           n_iter=32)

# step4 - save it as a wav file
import soundfile as sf
sf.write("test1.wav", res, my_sample_rate)

# Autoencoder Naive

In [21]:
import os, shutil
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator
from keras import layers
from keras import models
from keras.layers import Dense,Flatten,Reshape,InputLayer
from keras.models import Sequential
from keras import optimizers
from tensorflow.keras.optimizers import Adam
from keras.preprocessing import image
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
#import mlflow
#import mlflow.tensorflow
from PIL import Image
import re

## Autoencoder 0

solo 2 capas densas y regularización

In [22]:
drop_out=0.1

img_shape=(s_clean_specs.shape[1],s_clean_specs.shape[2])

auto_encoder0=models.Sequential()
auto_encoder0.add(layers.Input(shape=img_shape))
auto_encoder0.add(layers.Flatten())
auto_encoder0.add(layers.Dense(64))
auto_encoder0.add(layers.Dropout(drop_out))
auto_encoder0.add(layers.Dense(64))
auto_encoder0.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder0.add(layers.Reshape(img_shape))
auto_encoder0.compile(optimizer='adamax', loss='mse')
auto_encoder0.summary()

history0 = auto_encoder0.fit(train_gen,
                    epochs=20,
                    )

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 16384)             0         
                                                                 
 dense (Dense)               (None, 64)                1048640   
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 16384)             1064960   
                                                                 
 reshape (Reshape)           (None, 128, 128)          0         
                                                                 
Total params: 2,117,760
Trainable params: 2,117,760
Non-

In [23]:
preds0=auto_encoder0.predict(s_noisy_specs)
reverse_spectrogram(preds0[0],clean_s_rates[0],'test_auto_0.wav')



In [24]:
clean_s_rates[0]

22050

## Autoencoder 1

capas más complejas

In [25]:
drop_out=0.1

img_shape=(s_clean_specs.shape[1],s_clean_specs.shape[2])

auto_encoder=models.Sequential()
auto_encoder.add(layers.Input(shape=img_shape))
auto_encoder.add(layers.Flatten())
auto_encoder.add(layers.Dense(128))
auto_encoder.add(layers.Dropout(drop_out))
auto_encoder.add(layers.Dense(256))
auto_encoder.add(layers.Dropout(drop_out))
auto_encoder.add(layers.Dense(512))
auto_encoder.add(layers.Dropout(drop_out))
auto_encoder.add(layers.Dense(256))
auto_encoder.add(layers.Dense(128))
auto_encoder.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder.add(layers.Reshape(img_shape))


auto_encoder.compile(optimizer='adamax', loss='mse')
auto_encoder.summary()

history1 = auto_encoder.fit(train_gen,
                    epochs=20,
                    )

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 16384)             0         
                                                                 
 dense_3 (Dense)             (None, 128)               2097280   
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 256)               33024     
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_5 (Dense)             (None, 512)               131584    
                                                                 
 dropout_3 (Dropout)         (None, 512)              

## autoencoder 2

In [26]:
drop_out=0.1

img_shape=(s_clean_specs.shape[1],s_clean_specs.shape[2])

auto_encoder2=models.Sequential()
auto_encoder2.add(layers.Input(shape=img_shape))
auto_encoder2.add(layers.Flatten())
auto_encoder2.add(layers.Dense(64))
auto_encoder2.add(layers.Dropout(drop_out))
auto_encoder2.add(layers.Dense(64))
auto_encoder2.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder2.add(layers.Reshape(img_shape))
auto_encoder2.add(layers.Flatten())
auto_encoder2.add(layers.Dense(128))
auto_encoder2.add(layers.Dropout(drop_out))
auto_encoder2.add(layers.Dense(128))
auto_encoder2.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder2.add(layers.Reshape(img_shape))
auto_encoder2.add(layers.Flatten())
auto_encoder2.add(layers.Dense(256))
auto_encoder2.add(layers.Dropout(drop_out))
auto_encoder2.add(layers.Dense(256))
auto_encoder2.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder2.add(layers.Reshape(img_shape))
auto_encoder2.compile(optimizer='adamax', loss='mse')
auto_encoder2.summary()

history2 = auto_encoder2.fit(train_gen,
                    epochs=20,
                    )

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_2 (Flatten)         (None, 16384)             0         
                                                                 
 dense_9 (Dense)             (None, 64)                1048640   
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_10 (Dense)            (None, 64)                4160      
                                                                 
 dense_11 (Dense)            (None, 16384)             1064960   
                                                                 
 reshape_2 (Reshape)         (None, 128, 128)          0         
                                                                 
 flatten_3 (Flatten)         (None, 16384)            

## Autoencoder 3

In [27]:
drop_out=0.1

img_shape=(s_clean_specs.shape[1],s_clean_specs.shape[2])

auto_encoder3=models.Sequential()
auto_encoder3.add(layers.Input(shape=img_shape))
auto_encoder3.add(layers.Flatten())
auto_encoder3.add(layers.Dense(32))
auto_encoder3.add(layers.Dropout(drop_out))
auto_encoder3.add(layers.Dense(32))
auto_encoder3.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder3.add(layers.Reshape(img_shape))
auto_encoder3.compile(optimizer='adamax', loss='mse')
auto_encoder3.summary()

history3 = auto_encoder3.fit(train_gen,
                    epochs=20,
                    batch_size=32
                    )

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_5 (Flatten)         (None, 16384)             0         
                                                                 
 dense_18 (Dense)            (None, 32)                524320    
                                                                 
 dropout_7 (Dropout)         (None, 32)                0         
                                                                 
 dense_19 (Dense)            (None, 32)                1056      
                                                                 
 dense_20 (Dense)            (None, 16384)             540672    
                                                                 
 reshape_5 (Reshape)         (None, 128, 128)          0         
                                                                 
Total params: 1,066,048
Trainable params: 1,066,048
No

# Autoencoder CNN

In [28]:
img_shape_cnn=(s_clean_specs.shape[1],s_clean_specs.shape[2],1)
img_shape_cnn

(128, 128, 1)

In [29]:
np.prod((128, 702))

89856

In [30]:
s_clean_specs.shape,s_noisy_specs.shape

((37297, 128, 128, 1), (37297, 128, 128, 1))

In [31]:
s_noisy_specs.shape

(37297, 128, 128, 1)

In [32]:
img_shape_cnn=(s_noisy_specs.shape[1],s_noisy_specs.shape[2],s_noisy_specs.shape[3])

cnn_auto_encoder0=models.Sequential()
cnn_auto_encoder0.add(layers.Conv2D(filters = 32, kernel_size = (3,3),padding = 'Same', 
                activation ='relu', input_shape = img_shape_cnn))
cnn_auto_encoder0.add(layers.MaxPooling2D(2,strides=2))
cnn_auto_encoder0.add(layers.Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same', 
                activation ='relu'))
cnn_auto_encoder0.add(layers.MaxPooling2D(2,strides=2))
cnn_auto_encoder0.add(layers.Conv2D(filters = 128, kernel_size = (3,3),padding = 'Same', 
                activation ='relu'))
cnn_auto_encoder0.add(layers.MaxPooling2D(2,strides=2))
cnn_auto_encoder0.add(layers.Conv2D(filters = 128, kernel_size = (3,3),padding = 'Same', 
                activation ='relu'))
cnn_auto_encoder0.add(layers.UpSampling2D(2))
cnn_auto_encoder0.add(layers.Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same', 
                activation ='relu'))
cnn_auto_encoder0.add(layers.UpSampling2D(2))
cnn_auto_encoder0.add(layers.Conv2D(filters = 32, kernel_size = (3,3),padding = 'Same', 
                activation ='relu'))
cnn_auto_encoder0.add(layers.UpSampling2D(2))
cnn_auto_encoder0.add(layers.Conv2D(filters = 1, kernel_size = (3,3), padding = 'Same',
                activation ='relu'))
#cnn_auto_encoder0.add(layers.BatchNormalization())




# #cnn_auto_encoder0.add(layers.MaxPool2D(pool_size=(2,2)))
# cnn_auto_encoder0.add(layers.BatchNormalization())
# cnn_auto_encoder0.add(layers.Dropout(drop_out))
# cnn_auto_encoder0.add(layers.Flatten())
# cnn_auto_encoder0.add(layers.Dense(512, activation = "relu"))
# cnn_auto_encoder0.add(layers.Dropout(drop_out))
# cnn_auto_encoder0.add(layers.Dense(64, activation = "softmax"))
# cnn_auto_encoder0.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
# cnn_auto_encoder0.add(layers.Reshape(img_shape))





cnn_auto_encoder0.compile(optimizer='adam', loss='mse')
cnn_auto_encoder0.summary()




history_cnn0 = cnn_auto_encoder0.fit(train_gen,
                    epochs=20,
                    batch_size=32
                    )

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 128, 128, 32)      320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 64, 64, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 64, 64, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 32, 32, 64)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 32, 32, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 16, 16, 128)     

In [33]:
preds_cnn0=cnn_auto_encoder0.predict(s_noisy_specs)



In [34]:
preds_cnn0[0]

array([[[2.0227897 ],
        [2.9176667 ],
        [3.0597537 ],
        ...,
        [6.6908145 ],
        [5.706136  ],
        [3.8917763 ]],

       [[2.6861646 ],
        [2.6189604 ],
        [2.8840673 ],
        ...,
        [4.7674985 ],
        [4.137945  ],
        [4.9098125 ]],

       [[2.2454064 ],
        [3.7180912 ],
        [3.764681  ],
        ...,
        [0.        ],
        [0.37897155],
        [2.6594043 ]],

       ...,

       [[1.2313939 ],
        [1.9505287 ],
        [1.3169881 ],
        ...,
        [0.6854678 ],
        [0.22386508],
        [0.27422246]],

       [[1.9386911 ],
        [2.1741734 ],
        [2.3559606 ],
        ...,
        [0.70895904],
        [1.0993992 ],
        [1.3418329 ]],

       [[1.027333  ],
        [1.187958  ],
        [1.1102605 ],
        ...,
        [0.9725395 ],
        [0.26324183],
        [0.20087756]]], dtype=float32)

In [35]:
s_noisy_specs[0]

array([[[3.6116675e-03],
        [4.2330422e-02],
        [2.9994888e+00],
        ...,
        [1.0766939e+01],
        [1.0084717e+01],
        [8.7430801e+00]],

       [[2.0240490e+00],
        [1.5824387e+00],
        [2.2952890e+00],
        ...,
        [4.0004206e+00],
        [4.6528821e+00],
        [3.4321313e+00]],

       [[3.6204121e+00],
        [3.9337368e+00],
        [4.4218721e+00],
        ...,
        [2.2771921e+00],
        [1.7581837e+00],
        [2.3866873e+00]],

       ...,

       [[2.5758204e+00],
        [2.7024705e+00],
        [2.0233986e+00],
        ...,
        [1.5009434e-03],
        [1.5728220e-02],
        [1.0320277e+00]],

       [[1.9026399e+00],
        [1.8726494e+00],
        [1.3823597e+00],
        ...,
        [1.3814447e+00],
        [1.6964116e+00],
        [1.9088768e+00]],

       [[1.1812829e+00],
        [1.3547335e+00],
        [1.5787264e+00],
        ...,
        [7.7776515e-01],
        [8.7209022e-01],
        [5.3810471e-01]]

In [38]:
res = librosa.feature.inverse.mel_to_audio(preds_cnn0[0], 
                                        sr=22050, 
                                         n_fft=2048, 
                                         hop_length=512, 
                                        win_length=None, 
                                         window='hann', 
                                        center=True, 
                                         pad_mode='reflect', 
                                        power=1.0, 
                                        n_iter=32)

ValueError: can't extend empty axis 1 using modes other than 'constant' or 'empty'

In [36]:
reverse_spectrogram(preds_cnn0[0],clean_s_rates[0],'test_cnn0.wav')



ValueError: can't extend empty axis 1 using modes other than 'constant' or 'empty'

In [None]:
drop_out=0.5

cnn_auto_encoder1=models.Sequential()
cnn_auto_encoder1.add(layers.Conv2D(filters = 32, kernel_size = (3,3),padding = 'Same', 
                activation ='relu', input_shape = img_shape_cnn))
cnn_auto_encoder1.add(layers.MaxPooling2D(2,strides=2))
cnn_auto_encoder1.add(layers.Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same', 
                activation ='relu'))
cnn_auto_encoder1.add(layers.MaxPooling2D(2,strides=2))
cnn_auto_encoder1.add(layers.BatchNormalization())
cnn_auto_encoder0.add(layers.Dropout(drop_out))
cnn_auto_encoder1.add(layers.Conv2D(filters = 128, kernel_size = (3,3),padding = 'Same', 
                activation ='relu'))
cnn_auto_encoder1.add(layers.MaxPooling2D(2,strides=2))
cnn_auto_encoder1.add(layers.Conv2D(filters = 128, kernel_size = (3,3),padding = 'Same', 
                activation ='relu'))
cnn_auto_encoder1.add(layers.UpSampling2D(2))
cnn_auto_encoder1.add(layers.Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same', 
                activation ='relu'))
cnn_auto_encoder1.add(layers.UpSampling2D(2))
cnn_auto_encoder1.add(layers.BatchNormalization())
cnn_auto_encoder0.add(layers.Dropout(drop_out))
cnn_auto_encoder1.add(layers.Conv2D(filters = 32, kernel_size = (3,3),padding = 'Same', 
                activation ='relu'))
cnn_auto_encoder1.add(layers.UpSampling2D(2))
cnn_auto_encoder1.add(layers.Conv2D(filters = 1, kernel_size = (3,3), padding = 'Same',
                activation ='relu'))
cnn_auto_encoder1.compile(optimizer='adamax', loss='mse')
cnn_auto_encoder1.summary()



history_cnn1 = cnn_auto_encoder1.fit(train_gen,
                    epochs=20,
                    batch_size=8
                    )

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_111 (Conv2D)         (None, 128, 128, 32)      320       
                                                                 
 max_pooling2d_50 (MaxPoolin  (None, 64, 64, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_112 (Conv2D)         (None, 64, 64, 64)        18496     
                                                                 
 max_pooling2d_51 (MaxPoolin  (None, 32, 32, 64)       0         
 g2D)                                                            
                                                                 
 batch_normalization_2 (Batc  (None, 32, 32, 64)       256       
 hNormalization)                                                 
                                                     

# Predicciones

In [None]:
preds=auto_encoder.predict(s_noisy_specs)



In [None]:
preds.shape

(2001, 128, 702)

In [None]:
preds[0].shape

(128, 702)

In [None]:
clean_specs[0]

array([[1.88221908e-04, 2.46463984e-04, 2.61851761e-04, ...,
        2.57317000e-03, 8.99837411e-04, 2.44650152e-03],
       [1.21181156e-03, 9.17276135e-04, 9.90588916e-04, ...,
        5.54336328e-03, 2.85331719e-02, 2.60114968e-01],
       [1.14793324e+00, 1.52739763e+00, 1.82428646e+00, ...,
        2.50480145e-01, 8.19973946e-02, 4.72248858e-03],
       ...,
       [9.29996677e-05, 9.84362341e-05, 1.06841406e-04, ...,
        2.54283252e-04, 6.28159833e-05, 1.36140082e-03],
       [2.84687756e-03, 9.32404399e-03, 1.19536798e-02, ...,
        1.18909981e-02, 1.26775932e-02, 8.28386191e-03],
       [6.48943149e-03, 3.44618829e-03, 1.06449577e-03, ...,
        2.61219740e-02, 3.30388226e-04, 1.67555991e-04]], dtype=float32)

In [None]:
len(preds)

2001

In [None]:
reversed_log=librosa.db_to_power(preds[0])
# step3 converting mel-spectrogrma back to wav file
res = librosa.feature.inverse.mel_to_audio(reversed_log, 
                                           sr=noisy_s_rates[0], 
                                           n_fft=2048, 
                                           hop_length=512, 
                                           win_length=None, 
                                           window='hann', 
                                           center=True, 
                                           pad_mode='reflect', 
                                           power=2.0, 
                                           n_iter=32)

import soundfile as sf
sf.write("test2.wav", res, noisy_s_rates[0])

# Convertidor de espectrograma a audio

In [None]:
import soundfile as sf
sf.write('scale.wav', scale, sr, format='ogg', subtype='vorbis')
sf.write('audio.wav', audio, sr, format='ogg', subtype='vorbis')