In [1]:
import zipfile2
from zipfile2 import ZipFile
import os
import string
import json
from IPython.display import Audio

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import librosa

import sklearn
from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras


In [2]:
# extracting the files
# base_path = '../dataset/NOIZEUS_dataset/'
# for file in os.listdir(base_path):
#     if file.endswith('.zip'):
#         path = f'{base_path}/{file}'
#         filename = file.split('.')[0]
#         if not os.path.exists(f'{base_path}/{filename}'):
#             os.makedirs(f'{base_path}/{filename}')
#         with ZipFile(path , 'r') as zip:
#             zip.extractall(path = f'{base_path}/{filename}')
#         if os.path.exists(path):
#             os.remove(path)

In [3]:
# preparing the aurora-4 dataset
base_path = '../dataset/NOIZEUS_dataset/'
audio_file_paths = []
db_array = ['0dB' , '5dB' , '10dB' , '15dB']
place_array = ['airport' , 'babble' , 'car' , 'exhibition' , 'restaurant' , 'station' , 'street' , 'train']
for place in place_array:
    for dB in db_array:
        path = f'{base_path}/{place}_{dB}/{dB}'
        audio_file_paths.append(path)

In [4]:
noizeus_dataset = pd.DataFrame({'path' : audio_file_paths})
noizeus_dataset.to_csv('datasets/noizeus_dataset_audio_file_paths.csv')

In [5]:
filenames = []
mel_spectograms = []
try:
    for folder_path in noizeus_dataset['path']:
        for filename in os.listdir(folder_path):
            file_path = f'{folder_path}/{filename}'
            filenames.append(filename)
            y, sr = librosa.load(file_path,sr=22050)
            mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128)
            mel_spectograms.append(mel_spec)
            # Convert to decibels for better visualization
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

            # # Plot the Mel Spectrogram
            # plt.figure(figsize=(10, 4))
            # librosa.display.specshow(mel_spec_db, sr=sr, hop_length=512, x_axis='time', y_axis='mel', cmap='viridis')
            # plt.colorbar(format='%+2.0f dB')
            # plt.title('Mel Spectrogram')
            # plt.tight_layout()
            # filename = filename.split('.')[0]
            # plt.savefig(f'datasets/mel_spectogram_noizeus_dataset/{filename}.png' , bbox_inches = 'tight')
            # # plt.show()
            # plt.close()
except Exception as e:
    print(e)

[WinError 3] The system cannot find the path specified: '../dataset/NOIZEUS_dataset//train_15dB/15dB'


In [6]:
noisy_audiofiles_spectogram_dataset = pd.DataFrame({'noisy_file' : filenames , 'mel_spectorgrams' : mel_spectograms})
noisy_audiofiles_spectogram_dataset.to_csv('datasets/noisy_audiofiles_melspectograms_noizeus.csv',index=False)

In [7]:
filenames = []
mel_spectograms = []
clean_audio_path = '../dataset/NOIZEUS_dataset/clean/clean'
for file in os.listdir(clean_audio_path):
    filenames.append(file)
    path = f'{clean_audio_path}/{file}'
    y , sr = librosa.load(path , sr=22050)
    mel_spec = librosa.feature.melspectrogram(y = y , n_fft=2048 , hop_length = 512 , n_mels = 128)
    mel_spectograms.append(mel_spec)

    # # plotting spectrum
    # mel_spec_db = librosa.power_to_db(mel_spec , ref=np.max)
    # plt.figure(figsize=(10,4))
    # librosa.display.specshow(mel_spec_db , sr = 22050, hop_length=512,n_fft=2048,x_axis='time' , y_axis='mel')
    # plt.colorbar(format = '%+2.0f dB')
    # plt.title('Mel Spectogram')
    # plt.tight_layout()
    # filename = file.split('.')[0]
    # plt.savefig(f'datasets/mel_spectogram_noizeus_dataset/{filename}.png' , bbox_inches = 'tight')
    # plt.close()

In [8]:
clean_audiofiles_spectogram_dataset = pd.DataFrame({'clean_file' : filenames , 'mel_spectorgrams' : mel_spectograms})
clean_audiofiles_spectogram_dataset.to_csv('datasets/clean_audiofiles_melspectograms_noizeus.csv',index=False)

In [9]:
noisy_audio_frames = []
clean_audio_frames = []
for mel_spec in noisy_audiofiles_spectogram_dataset['mel_spectorgrams']:
    shape = mel_spec.shape[1]
    noisy_audio_frames.append(shape)
for mel_spec in noisy_audiofiles_spectogram_dataset['mel_spectorgrams']:
    shape = mel_spec.shape[1]
    clean_audio_frames.append(shape)
print(max(noisy_audio_frames))
print(max(clean_audio_frames))

152
152


In [10]:
noisy_audiofiles_spectogram_dataset['mel_spectorgrams'][0].shape

(128, 122)

In [11]:
# creating padded mel_spectograms:
def padding_spectrograms(mel_spec , max_frames):
    pad_width = max_frames - mel_spec.shape[1]
    return np.pad(mel_spec , [(0,0),(0,pad_width)] , mode='constant')

In [12]:
padded_noisy_spectograms = []
padded_clean_spectograms = []
for mel_spec in noisy_audiofiles_spectogram_dataset['mel_spectorgrams']:
    padded_spec = padding_spectrograms(mel_spec=mel_spec , max_frames=155)
    padded_noisy_spectograms.append(padded_spec)    
for mel_spec in clean_audiofiles_spectogram_dataset['mel_spectorgrams']:
    padded_spec = padding_spectrograms(mel_spec=mel_spec , max_frames=155)
    padded_clean_spectograms.append(padded_spec)

noisy_audiofiles_spectogram_dataset['padded_mel_spectogram'] = padded_noisy_spectograms
clean_audiofiles_spectogram_dataset['padded_mel_spectogram'] = padded_clean_spectograms

In [13]:
noisy_audiofiles_spectogram_dataset.head(15)

Unnamed: 0,noisy_file,mel_spectorgrams,padded_mel_spectogram
0,sp01_airport_sn0.wav,"[[0.0052139517, 0.0012600948, 1.3862577e-06, 1...","[[0.0052139517, 0.0012600948, 1.3862577e-06, 1..."
1,sp02_airport_sn0.wav,"[[0.00036863316, 8.2071834e-05, 1.0572604e-06,...","[[0.00036863316, 8.2071834e-05, 1.0572604e-06,..."
2,sp03_airport_sn0.wav,"[[0.012967354, 0.0031852573, 6.649581e-06, 6.5...","[[0.012967354, 0.0031852573, 6.649581e-06, 6.5..."
3,sp04_airport_sn0.wav,"[[0.00070778327, 0.00014989886, 2.8866025e-06,...","[[0.00070778327, 0.00014989886, 2.8866025e-06,..."
4,sp05_airport_sn0.wav,"[[0.0042353547, 0.0010432785, 6.5330046e-06, 3...","[[0.0042353547, 0.0010432785, 6.5330046e-06, 3..."
5,sp06_airport_sn0.wav,"[[0.016041888, 0.0038738293, 5.3317297e-07, 5....","[[0.016041888, 0.0038738293, 5.3317297e-07, 5...."
6,sp07_airport_sn0.wav,"[[7.423591e-05, 2.0347663e-05, 6.978971e-06, 5...","[[7.423591e-05, 2.0347663e-05, 6.978971e-06, 5..."
7,sp08_airport_sn0.wav,"[[0.0051079295, 0.0013462999, 2.6548855e-06, 4...","[[0.0051079295, 0.0013462999, 2.6548855e-06, 4..."
8,sp09_airport_sn0.wav,"[[0.001449844, 0.00034780998, 3.0090414e-06, 3...","[[0.001449844, 0.00034780998, 3.0090414e-06, 3..."
9,sp10_airport_sn0.wav,"[[0.028069697, 0.006719678, 2.6908883e-06, 4.7...","[[0.028069697, 0.006719678, 2.6908883e-06, 4.7..."


In [14]:
clean_audiofiles_spectogram_dataset.head(15)

Unnamed: 0,clean_file,mel_spectorgrams,padded_mel_spectogram
0,sp01.wav,"[[1.2776202e-06, 3.4964222e-07, 9.804121e-09, ...","[[1.2776202e-06, 3.4964222e-07, 9.804121e-09, ..."
1,sp02.wav,"[[1.2347035e-05, 3.0399951e-06, 1.4890509e-08,...","[[1.2347035e-05, 3.0399951e-06, 1.4890509e-08,..."
2,sp03.wav,"[[2.0339501e-05, 5.1593984e-06, 3.4205748e-08,...","[[2.0339501e-05, 5.1593984e-06, 3.4205748e-08,..."
3,sp04.wav,"[[3.6101492e-06, 9.116466e-07, 1.5388899e-08, ...","[[3.6101492e-06, 9.116466e-07, 1.5388899e-08, ..."
4,sp05.wav,"[[2.0704912e-05, 5.106193e-06, 9.217254e-09, 1...","[[2.0704912e-05, 5.106193e-06, 9.217254e-09, 1..."
5,sp06.wav,"[[6.298274e-06, 1.575383e-06, 6.9832686e-09, 3...","[[6.298274e-06, 1.575383e-06, 6.9832686e-09, 3..."
6,sp07.wav,"[[3.5274652e-06, 9.192975e-07, 4.9976454e-09, ...","[[3.5274652e-06, 9.192975e-07, 4.9976454e-09, ..."
7,sp08.wav,"[[1.3399462e-06, 3.7522113e-07, 1.8843824e-08,...","[[1.3399462e-06, 3.7522113e-07, 1.8843824e-08,..."
8,sp09.wav,"[[1.1531466e-06, 3.034936e-07, 2.186049e-08, 1...","[[1.1531466e-06, 3.034936e-07, 2.186049e-08, 1..."
9,sp10.wav,"[[6.1617684e-06, 1.572037e-06, 1.8210462e-08, ...","[[6.1617684e-06, 1.572037e-06, 1.8210462e-08, ..."


In [15]:
clean_audio_filenames = []
targeted_spectograms = []
for filename in noisy_audiofiles_spectogram_dataset['noisy_file']:
    filename = filename.split('_')[0]
    clean_audio_filename = f'{filename}.wav'
    clean_audio_filenames.append(clean_audio_filename)
    # targeted_spectogram = clean_audiofiles_spectogram_dataset[clean_audiofiles_spectogram_dataset['noisy_file'] == clean_audio_filename]
    # targeted_spectogram = targeted_spectogram['padded_mel_spectogram']
    # print(targeted_spectogram)
    # targeted_spectogram = targeted_spectogram.array()
    # targeted_spectograms.append(targeted_spectogram)
noisy_audiofiles_spectogram_dataset['clean_file'] = clean_audio_filenames


In [16]:
final_dataset = pd.merge(noisy_audiofiles_spectogram_dataset,clean_audiofiles_spectogram_dataset , on = 'clean_file' , how='outer')

In [17]:
xtrain,xtest,ytrain,ytest = train_test_split(final_dataset['padded_mel_spectogram_x'].to_numpy() , final_dataset['padded_mel_spectogram_y'].to_numpy() , test_size=0.2)
# defining a neural network
encoder = tf.keras.Sequential(
    [
        tf.keras.layers.Conv2D(filters=8 ,padding='same' , strides=1 , kernel_size = (3,3) , activation = 'relu'),
        tf.keras.layers.Conv2D(filters=8 ,padding='same' , strides=1 , kernel_size = (3,3) , activation = 'relu'),
        tf.keras.layers.Conv2D(filters=16 ,padding='same' , strides=1 , kernel_size = (3,3) , activation = 'relu'),
        tf.keras.layers.Conv2D(filters=16 ,padding='same' , strides=1 , kernel_size = (3,3) , activation = 'relu'),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Conv2D(filters=64 ,padding='same' , strides=1 , kernel_size = (3,3) , activation = 'relu'),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Conv2D(filters=128 ,padding='same' , strides=1 , kernel_size = (3,3) , activation = 'relu'),
        tf.keras.layers.MaxPool2D()
    ]
)
decoder = tf.keras.Sequential([
    tf.keras.layers.Conv2D(256, (3, 3), padding='same', activation='relu'),
    tf.keras.layers.Conv2D(128, (3, 3), padding='same', activation='relu'),
    tf.keras.layers.UpSampling2D(size=(2, 2)),  # 16x19 → 32x38
    tf.keras.layers.Conv2D(64, (5, 5), padding='same', activation='relu'),
    tf.keras.layers.Conv2D(64, (5, 5), padding='same', activation='relu'),
    tf.keras.layers.UpSampling2D(size=(2, 2)),  # 32x38 → 64x76
    tf.keras.layers.Conv2D(32, (5, 5), padding='same', activation='relu'),
    tf.keras.layers.Conv2D(32, (5, 5), padding='same', activation='relu'),
    tf.keras.layers.UpSampling2D(size=(2, 2)),  # 64x76 → 128x152
    tf.keras.layers.Conv2D(16, (5, 5), padding='same', activation='relu'),
    tf.keras.layers.Conv2D(8, (5, 5), padding='same', activation='relu'),
    tf.keras.layers.Conv2D(1, (3, 3), padding='same', activation='sigmoid'),
    # tf.keras.layers.ZeroPadding2D(padding=((0, 0), (0, 3)))
])

conv_ae = tf.keras.Sequential([encoder,decoder])
conv_ae.compile(loss='mse' , optimizer='adam')


xtrain = [np.asarray(element).astype(np.float32) for element in xtrain]
xtest = [np.asarray(element).astype(np.float32) for element in xtest]
ytrain = [np.asarray(element).astype(np.float32) for element in ytrain]
ytest = [np.asarray(element).astype(np.float32) for element in ytest]

xtrain = tf.convert_to_tensor(xtrain)
ytrain = tf.convert_to_tensor(ytrain)
xtest = tf.convert_to_tensor(xtest)
ytest = tf.convert_to_tensor(ytest)

xtrain = tf.expand_dims(xtrain , axis = -1)
ytrain = tf.expand_dims(ytrain , axis = -1)
xtest = tf.expand_dims(xtest , axis = -1)
ytest = tf.expand_dims(ytest , axis = -1)

history = conv_ae.fit(xtrain , ytrain , batch_size=64 , epochs = 1000 , validation_data=(xtest , ytest))

Epoch 1/1000


ValueError: in user code:

    File "d:\IDEs\anaconda\envs\audio_processing\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "d:\IDEs\anaconda\envs\audio_processing\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "d:\IDEs\anaconda\envs\audio_processing\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "d:\IDEs\anaconda\envs\audio_processing\lib\site-packages\keras\engine\training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "d:\IDEs\anaconda\envs\audio_processing\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "d:\IDEs\anaconda\envs\audio_processing\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "d:\IDEs\anaconda\envs\audio_processing\lib\site-packages\keras\losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "d:\IDEs\anaconda\envs\audio_processing\lib\site-packages\keras\losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "d:\IDEs\anaconda\envs\audio_processing\lib\site-packages\keras\losses.py", line 1486, in mean_squared_error
        return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1)

    ValueError: Dimensions must be equal, but are 152 and 155 for '{{node mean_squared_error/SquaredDifference}} = SquaredDifference[T=DT_FLOAT](sequential_2/sequential_1/conv2d_14/Sigmoid, IteratorGetNext:1)' with input shapes: [?,128,152,1], [?,128,155,1].
