In [62]:
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import os

In [63]:
def create_spectrogram(file_path):
    audio_array, sample_rate= librosa.load(file_path)
    spec = librosa.feature.melspectrogram(y=audio_array,
                                    sr=sample_rate, 
                                        n_fft=2048, 
                                        hop_length=512, 
                                        win_length=None, 
                                        window='hann', 
                                        center=True, 
                                        pad_mode='reflect', 
                                        power=2.0,
                                    n_mels=128)
    log_spec = librosa.power_to_db(spec, ref=np.max)
    return spec,sample_rate

# Probando con un archivo limpio y uno con overlay de ruido

In [64]:
clean_file=r'wavs\clean\clnsp0.wav'
noise_file=r'wavs\noisy\output1.wav'
test_clean_spec,test_clean_sr =create_spectrogram(clean_file)
test_noisy_spec,test_noisy_sr=create_spectrogram(noise_file)

In [65]:
test_clean_spec.shape,test_noisy_spec.shape

((128, 475), (128, 475))

In [66]:
test_clean_sr

22050

In [67]:
test_noisy_sr

22050

# Generador de archivos en masa usando batcheador

In [68]:
def create_spec_from_dir(dir_path,top_x=200):
    #dir_path directorio o folder donde estan los wavs
    # top_x opcional cuantos archivos maximo desea usar, dejar vacio para usarlos todos
    dir = os.listdir(dir_path)
    spec_list=[]
    s_rates=[]
    for i, file in enumerate(dir):
        try:
            if i<=top_x:
                input_file = os.path.join(dir_path, file)
                ms,sr=create_spectrogram(input_file)
                spec_list.append(ms)
                s_rates.append(sr)
        except:
            print(file," file skipped")
    
    return spec_list,s_rates

In [69]:
clean_specs,clean_s_rates=create_spec_from_dir(r'wavs\clean',10000)
noisy_specs,noisy_s_rates=create_spec_from_dir(r'wavs\noisy',10000)

In [70]:
min(clean_s_rates),max(clean_s_rates)

(22050, 22050)

In [71]:
min(noisy_s_rates),max(noisy_s_rates)

(22050, 22050)

In [72]:
def standardize_specs(clean_specs,noisy_specs):
    
    #getting max lenght of all audios
    max_y=0
    
    for i,j in zip(clean_specs,noisy_specs):
        if i.shape[1]>max_y: max_y=i.shape[1]
        if j.shape[1]>max_y: max_y=j.shape[1]
    print(max_y)
    # reshapping all spectrogram

    for index,s in enumerate(clean_specs):
        try:
            clean_specs[index]=np.resize(s,(s.shape[0],max_y))
        except Exception as e:
            print(f"skipping clean {index} {e}")
    
    clean_specs=np.array(clean_specs)
    clean_specs=clean_specs.reshape(-1,s.shape[0],max_y,1)

    for index,s in enumerate(noisy_specs):
        try:
            noisy_specs[index]=np.resize(s,(s.shape[0],max_y))
        except Exception as e:
            print(f"skipping noise {index} {e}")
            
    noisy_specs=np.array(noisy_specs)
    noisy_specs=noisy_specs.reshape(-1,s.shape[0],max_y,1)

    return clean_specs,noisy_specs

In [73]:
s_clean_specs,s_noisy_specs=standardize_specs(clean_specs,noisy_specs)

702


In [74]:
s_clean_specs.shape,s_noisy_specs.shape

((10001, 128, 702, 1), (10001, 128, 702, 1))

In [75]:
from tensorflow.keras.utils import Sequence
import numpy as np   

class DataGenerator(Sequence):
    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        return batch_x, batch_y

train_gen = DataGenerator(s_noisy_specs, s_clean_specs, 32)
#test_gen = DataGenerator(X_test, y_test, 32)

# Autoencoder Naive

In [76]:
import os, shutil
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator
from keras import layers
from keras import models
from keras.layers import Dense,Flatten,Reshape,InputLayer
from keras.models import Sequential
from keras import optimizers
from tensorflow.keras.optimizers import Adam
from keras.preprocessing import image
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
#import mlflow
#import mlflow.tensorflow
from PIL import Image
import re

## Autoencoder 0

solo 2 capas densas y regularización

In [97]:
drop_out=0.1

img_shape=(s_clean_specs.shape[1],s_clean_specs.shape[2])

auto_encoder0=models.Sequential()
auto_encoder0.add(layers.Input(shape=img_shape))
auto_encoder0.add(layers.Flatten())
auto_encoder0.add(layers.Dense(64))
auto_encoder0.add(layers.Dropout(drop_out))
auto_encoder0.add(layers.Dense(64))
auto_encoder0.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder0.add(layers.Reshape(img_shape))
auto_encoder0.compile(optimizer='adamax', loss='mse')
auto_encoder0.summary()

history0 = auto_encoder0.fit(train_gen,
                    epochs=20,
                    )

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_21 (Flatten)        (None, 89856)             0         
                                                                 
 dense_97 (Dense)            (None, 64)                5750848   
                                                                 
 dropout_45 (Dropout)        (None, 64)                0         
                                                                 
 dense_98 (Dense)            (None, 64)                4160      
                                                                 
 dense_99 (Dense)            (None, 89856)             5840640   
                                                                 
 reshape_19 (Reshape)        (None, 128, 702)          0         
                                                                 
Total params: 11,595,648
Trainable params: 11,595,648

## Autoencoder 1

capas más complejas

In [98]:
drop_out=0.1

img_shape=(s_clean_specs.shape[1],s_clean_specs.shape[2])

auto_encoder=models.Sequential()
auto_encoder.add(layers.Input(shape=img_shape))
auto_encoder.add(layers.Flatten())
auto_encoder.add(layers.Dense(128))
auto_encoder.add(layers.Dropout(drop_out))
auto_encoder.add(layers.Dense(256))
auto_encoder.add(layers.Dropout(drop_out))
auto_encoder.add(layers.Dense(512))
auto_encoder.add(layers.Dropout(drop_out))
auto_encoder.add(layers.Dense(256))
auto_encoder.add(layers.Dense(128))
auto_encoder.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder.add(layers.Reshape(img_shape))


auto_encoder.compile(optimizer='adamax', loss='mse')
auto_encoder.summary()

history1 = auto_encoder.fit(train_gen,
                    epochs=20,
                    )

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_22 (Flatten)        (None, 89856)             0         
                                                                 
 dense_100 (Dense)           (None, 128)               11501696  
                                                                 
 dropout_46 (Dropout)        (None, 128)               0         
                                                                 
 dense_101 (Dense)           (None, 256)               33024     
                                                                 
 dropout_47 (Dropout)        (None, 256)               0         
                                                                 
 dense_102 (Dense)           (None, 512)               131584    
                                                                 
 dropout_48 (Dropout)        (None, 512)             

## autoencoder 2

In [100]:
drop_out=0.1

img_shape=(s_clean_specs.shape[1],s_clean_specs.shape[2])

auto_encoder2=models.Sequential()
auto_encoder2.add(layers.Input(shape=img_shape))
auto_encoder2.add(layers.Flatten())
auto_encoder2.add(layers.Dense(64))
auto_encoder2.add(layers.Dropout(drop_out))
auto_encoder2.add(layers.Dense(64))
auto_encoder2.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder2.add(layers.Reshape(img_shape))
auto_encoder2.add(layers.Flatten())
auto_encoder2.add(layers.Dense(128))
auto_encoder2.add(layers.Dropout(drop_out))
auto_encoder2.add(layers.Dense(128))
auto_encoder2.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder2.add(layers.Reshape(img_shape))
auto_encoder2.compile(optimizer='adamax', loss='mse')
auto_encoder2.summary()

history2 = auto_encoder2.fit(train_gen,
                    epochs=20,
                    )

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_25 (Flatten)        (None, 89856)             0         
                                                                 
 dense_112 (Dense)           (None, 64)                5750848   
                                                                 
 dropout_51 (Dropout)        (None, 64)                0         
                                                                 
 dense_113 (Dense)           (None, 64)                4160      
                                                                 
 dense_114 (Dense)           (None, 89856)             5840640   
                                                                 
 reshape_23 (Reshape)        (None, 128, 702)          0         
                                                                 
 flatten_26 (Flatten)        (None, 89856)           

## Autoencoder 3

In [105]:
drop_out=0.1

img_shape=(s_clean_specs.shape[1],s_clean_specs.shape[2])

auto_encoder3=models.Sequential()
auto_encoder3.add(layers.Input(shape=img_shape))
auto_encoder3.add(layers.Flatten())
auto_encoder3.add(layers.Dense(32))
auto_encoder3.add(layers.Dropout(drop_out))
auto_encoder3.add(layers.Dense(32))
auto_encoder3.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder3.add(layers.Reshape(img_shape))
auto_encoder3.compile(optimizer='adamax', loss='mse')
auto_encoder3.summary()

history3 = auto_encoder3.fit(train_gen,
                    epochs=20,
                    batch_size=32
                    )

Model: "sequential_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_31 (Flatten)        (None, 89856)             0         
                                                                 
 dense_128 (Dense)           (None, 32)                2875424   
                                                                 
 dropout_56 (Dropout)        (None, 32)                0         
                                                                 
 dense_129 (Dense)           (None, 32)                1056      
                                                                 
 dense_130 (Dense)           (None, 89856)             2965248   
                                                                 
 reshape_28 (Reshape)        (None, 128, 702)          0         
                                                                 
Total params: 5,841,728
Trainable params: 5,841,728
N

# Autoencoder CNN

In [114]:
img_shape_cnn=(s_clean_specs.shape[1],s_clean_specs.shape[2],1)
img_shape_cnn

(128, 702, 1)

In [117]:
np.prod((128, 702))

89856

In [122]:
img_shape_cnn=(s_clean_specs.shape[1],s_clean_specs.shape[2],1)

cnn_auto_encoder0=models.Sequential()
cnn_auto_encoder0.add(layers.Conv2D(filters = 3, kernel_size = (3,3),padding = 'Same', 
                activation ='relu', input_shape = img_shape_cnn))

cnn_auto_encoder0.add(layers.Flatten())
cnn_auto_encoder0.add(layers.Dense(128)) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
cnn_auto_encoder0.add(layers.Reshape(img_shape_cnn))
cnn_auto_encoder0.summary()

history_cnn0 = cnn_auto_encoder0.fit(train_gen,
                    epochs=20,
                    batch_size=8
                    )

ResourceExhaustedError: failed to allocate memory [Op:Mul]

In [85]:
img_shape=(s_clean_specs.shape[1],s_clean_specs.shape[2],1)
auto_encoder2=models.Sequential()
auto_encoder2.add(layers.Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', 
                activation ='relu', input_shape = img_shape))
auto_encoder2.add(layers.Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', 
                activation ='relu'))
auto_encoder2.add(layers.MaxPool2D(pool_size=(2,2)))
auto_encoder2.add(layers.BatchNormalization())
auto_encoder2.add(layers.Dropout(drop_out))
auto_encoder2.add(layers.Flatten())
auto_encoder2.add(layers.Dense(512, activation = "relu"))
auto_encoder2.add(layers.Dropout(drop_out))
auto_encoder2.add(layers.Dense(64, activation = "softmax"))
auto_encoder2.add(Dense(np.prod(img_shape))) # np.prod(img_shape) is the same as 32*32*3, it's more generic than saying 3072
auto_encoder2.add(layers.Reshape(img_shape))

auto_encoder2.compile(optimizer='adamax', loss='mse')
auto_encoder2.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_21 (Conv2D)          (None, 128, 702, 32)      832       
                                                                 
 conv2d_22 (Conv2D)          (None, 128, 702, 32)      25632     
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 64, 351, 32)      0         
 g2D)                                                            
                                                                 
 batch_normalization_10 (Bat  (None, 64, 351, 32)      128       
 chNormalization)                                                
                                                                 
 dropout_16 (Dropout)        (None, 64, 351, 32)       0         
                                                                 
 flatten_11 (Flatten)        (None, 718848)          

In [86]:
#history2 = auto_encoder2.fit(x=s_noisy_specs, y=s_clean_specs, epochs=100)
history2 = auto_encoder2.fit(train_gen,
                    epochs=20,
                    )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Predicciones

In [172]:
preds=auto_encoder.predict(s_noisy_specs)



In [173]:
preds.shape

(2001, 128, 702)

In [174]:
preds[0].shape

(128, 702)

In [175]:
clean_specs[0]

array([[1.88221908e-04, 2.46463984e-04, 2.61851761e-04, ...,
        2.57317000e-03, 8.99837411e-04, 2.44650152e-03],
       [1.21181156e-03, 9.17276135e-04, 9.90588916e-04, ...,
        5.54336328e-03, 2.85331719e-02, 2.60114968e-01],
       [1.14793324e+00, 1.52739763e+00, 1.82428646e+00, ...,
        2.50480145e-01, 8.19973946e-02, 4.72248858e-03],
       ...,
       [9.29996677e-05, 9.84362341e-05, 1.06841406e-04, ...,
        2.54283252e-04, 6.28159833e-05, 1.36140082e-03],
       [2.84687756e-03, 9.32404399e-03, 1.19536798e-02, ...,
        1.18909981e-02, 1.26775932e-02, 8.28386191e-03],
       [6.48943149e-03, 3.44618829e-03, 1.06449577e-03, ...,
        2.61219740e-02, 3.30388226e-04, 1.67555991e-04]], dtype=float32)

In [176]:
len(preds)

2001

In [177]:
reversed_log=librosa.db_to_power(preds[0])
# step3 converting mel-spectrogrma back to wav file
res = librosa.feature.inverse.mel_to_audio(reversed_log, 
                                           sr=noisy_s_rates[0], 
                                           n_fft=2048, 
                                           hop_length=512, 
                                           win_length=None, 
                                           window='hann', 
                                           center=True, 
                                           pad_mode='reflect', 
                                           power=2.0, 
                                           n_iter=32)

import soundfile as sf
sf.write("test2.wav", res, noisy_s_rates[0])

# Convertidor de espectrograma a audio

In [41]:
# step1 - converting a wav file to numpy array and then converting that to mel-spectrogram
scale_file=r'wavs\clean\clnsp0.wav'
my_audio_as_np_array, my_sample_rate= librosa.load(scale_file)

# step2 - converting audio np array to spectrogram
spec = librosa.feature.melspectrogram(y=my_audio_as_np_array,
                                        sr=my_sample_rate, 
                                            n_fft=2048, 
                                            hop_length=512, 
                                            win_length=None, 
                                            window='hann', 
                                            center=True, 
                                            pad_mode='reflect', 
                                            power=2.0,
                                     n_mels=128)
log_spec = librosa.power_to_db(spec)
reversed_log=librosa.db_to_power(log_spec)
# step3 converting mel-spectrogrma back to wav file
res = librosa.feature.inverse.mel_to_audio(reversed_log, 
                                           sr=my_sample_rate, 
                                           n_fft=2048, 
                                           hop_length=512, 
                                           win_length=None, 
                                           window='hann', 
                                           center=True, 
                                           pad_mode='reflect', 
                                           power=2.0, 
                                           n_iter=32)

# step4 - save it as a wav file
import soundfile as sf
sf.write("test1.wav", res, my_sample_rate)

In [39]:
import soundfile as sf
sf.write('scale.wav', scale, sr, format='ogg', subtype='vorbis')
sf.write('audio.wav', audio, sr, format='ogg', subtype='vorbis')