In [139]:
import os
import librosa
import numpy as np
import soundfile as sf

In [140]:
def padd_audio(audio, max_duration=15, sample_rate=16000):
    """Bring ``audio`` to exactly ``max_duration`` seconds by centred zero-padding.

    Parameters
    ----------
    audio : array-like
        1-D sequence of samples.
    max_duration : int or float, default 15
        Target duration in seconds.
    sample_rate : int, default 16000
        Number of samples per second, used to convert ``max_duration``
        into a sample count.

    Returns
    -------
    np.ndarray
        Array of ``round(max_duration * sample_rate)`` samples. Shorter
        inputs are surrounded by zeros (the extra sample goes on the right
        when the padding is odd). Longer inputs are centre-cropped instead
        of raising, so every output has the same length.
    """
    audio = np.asarray(audio)

    # Coerce to int so a fractional duration (e.g. 1.5 s) still yields
    # integer pad widths, which np.pad requires.
    max_length = int(round(max_duration * sample_rate))
    padding_needed = max_length - len(audio)

    if padding_needed < 0:
        # Input is longer than the target: np.pad cannot take negative widths,
        # so keep the central ``max_length`` samples (mirrors the centred padding).
        crop_start = (-padding_needed) // 2
        return audio[crop_start:crop_start + max_length]

    pad_left = padding_needed // 2
    pad_right = padding_needed - pad_left

    return np.pad(audio, (pad_left, pad_right), 'constant')

In [141]:
def mix_audio(original_audio_path, noise_audio_path, sample_rate=16000):
    """Load a recording and a noise clip and add them sample by sample.

    The recording is loaded at ``sample_rate`` and passed through
    ``padd_audio`` (with its default ``max_duration``). The noise clip is
    loaded at the same rate, repeated end to end until it is at least as
    long as the padded recording, then trimmed to the same length.

    Parameters
    ----------
    original_audio_path : str
        Path of the recording to which noise is added.
    noise_audio_path : str
        Path of the noise clip.
    sample_rate : int, default 16000
        Rate passed to ``librosa.load`` for both files and to ``padd_audio``.

    Returns
    -------
    np.ndarray
        Element-wise sum of the padded recording and the looped noise.
        No gain or clipping is applied to the sum.

    Raises
    ------
    ValueError
        If the noise file contains no samples (it cannot be looped).
    """
    # librosa.load returns (samples, rate); the rate is the one we asked for,
    # so it is discarded.
    original_audio, _ = librosa.load(original_audio_path, sr=sample_rate)
    original_audio = padd_audio(original_audio, sample_rate=sample_rate)

    noise_audio, _ = librosa.load(noise_audio_path, sr=sample_rate)
    if len(noise_audio) == 0:
        # Previously surfaced as an opaque ZeroDivisionError in the repeat count.
        raise ValueError(f"Noise file contains no samples: {noise_audio_path!r}")

    # Loop the noise enough times to cover the recording, then cut the excess.
    target_length = len(original_audio)
    repeats = int(np.ceil(target_length / len(noise_audio)))
    noise_audio = np.tile(noise_audio, repeats)[:target_length]

    return original_audio + noise_audio

In [142]:
def audio_to_audio_frame_stack(sound_data, frame_length, hop_length_frame):
    """Split a 1-D signal into overlapping frames stacked as matrix rows.

    Parameters
    ----------
    sound_data : np.ndarray
        Signal to split; frames are taken along its first axis.
    frame_length : int
        Number of samples in each frame.
    hop_length_frame : int
        Number of samples between the starts of consecutive frames.

    Returns
    -------
    np.ndarray
        Matrix of shape ``(nb_frame, frame_length)``. Trailing samples that
        do not fill a whole frame are dropped. When the signal is shorter
        than one frame an empty ``(0, frame_length)`` matrix is returned.

    Raises
    ------
    ValueError
        If ``hop_length_frame`` is not strictly positive.
    """
    if hop_length_frame <= 0:
        raise ValueError(
            f"hop_length_frame must be a positive integer, got {hop_length_frame!r}")

    sequence_sample_length = sound_data.shape[0]

    # Start offsets of every window that fits entirely inside the signal.
    frame_starts = range(
        0, sequence_sample_length - frame_length + 1, hop_length_frame)
    sound_data_list = [
        sound_data[start:start + frame_length] for start in frame_starts]

    if not sound_data_list:
        # np.vstack rejects an empty list; return a well-formed empty matrix
        # so callers can still rely on ``.shape[1] == frame_length``.
        return np.empty((0, frame_length), dtype=sound_data.dtype)

    # Combine all the frames into a single matrix.
    return np.vstack(sound_data_list)

In [143]:
# Framing configuration shared by the cells below.
sr = 16000                                        # samples per second
frame_length_ms = 25                              # duration of one frame, in milliseconds
frame_length = int(sr * frame_length_ms / 1000)   # the same duration expressed in samples
hop_length_frame = frame_length // 2              # step between frames: half a frame (50% overlap)

In [144]:
# Example usage: mix two recordings with the same noise clip, frame each
# mixture, and print the shape of the resulting frame matrices.
sound1 = mix_audio(
    original_audio_path=os.path.join(os.getcwd(), 'Dataset', '89-218-0001.flac'),
    noise_audio_path=os.path.join(os.getcwd(), 'Noise', '5-202898-A-10.wav'),
)
clean1 = audio_to_audio_frame_stack(
    sound_data=sound1,
    frame_length=frame_length,
    hop_length_frame=hop_length_frame,
)
print(clean1.shape)

sound2 = mix_audio(
    original_audio_path=os.path.join(os.getcwd(), 'Dataset', '89-218-0003.flac'),
    noise_audio_path=os.path.join(os.getcwd(), 'Noise', '5-202898-A-10.wav'),
)
clean2 = audio_to_audio_frame_stack(
    sound_data=sound2,
    frame_length=frame_length,
    hop_length_frame=hop_length_frame,
)
print(clean2.shape)

(1199, 400)
(1199, 400)


In [145]:
# Save the first noisy mixture to disk so it can be listened to.
sf.write(file='noisy_audio.wav', data=sound1, samplerate=sr)

In [154]:
def combine_audio_with_noise(original_audio_dir, noise_audio_dir, seed=None):
    """Pair every recording with one randomly chosen noise clip.

    Parameters
    ----------
    original_audio_dir : str
        Directory holding the recordings.
    noise_audio_dir : str
        Directory holding the noise clips.
    seed : int, optional
        When given, the draw uses a dedicated generator seeded with this
        value, so the same directories always produce the same pairing.
        When omitted, the global ``np.random`` state is used.

    Returns
    -------
    dict
        Maps the full path of each recording to the full path of the noise
        clip drawn for it. Keys are in sorted file-name order and values
        are plain ``str`` objects.

    Raises
    ------
    ValueError
        If there are recordings to pair but no noise clip to draw from.
    """
    def _list_files(directory):
        # os.listdir returns entries in arbitrary order and includes
        # sub-directories; sort for reproducibility and keep regular files only.
        return sorted(
            entry for entry in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, entry)))

    noise_audios = _list_files(noise_audio_dir)
    original_audios = _list_files(original_audio_dir)

    if original_audios and not noise_audios:
        raise ValueError(f"No noise files found in {noise_audio_dir!r}")

    # Both the np.random module and a Generator expose ``choice``.
    rng = np.random if seed is None else np.random.default_rng(seed)

    combination_dict = {}
    for original_audio in original_audios:
        # ``choice`` returns a numpy string scalar; convert it back to ``str``.
        noise_audio = str(rng.choice(noise_audios))
        combination_dict[os.path.join(original_audio_dir, original_audio)] = (
            os.path.join(noise_audio_dir, noise_audio))

    return combination_dict

In [155]:
# Draw one noise clip for every recording found under ./Dataset.
audio_noise_pairs = combine_audio_with_noise(
    original_audio_dir=os.path.join(os.getcwd(), 'Dataset'),
    noise_audio_dir=os.path.join(os.getcwd(), 'Noise'),
)

In [163]:
# Build the framed noisy mixture for every (recording, noise) pair.
# An object array is used so that each slot holds one frame matrix.
mixed_audios = np.zeros(len(audio_noise_pairs), dtype=object)
for index, (audio_path, noise_path) in enumerate(audio_noise_pairs.items()):
    # Use the configured rate so the mixtures stay consistent with frame_length,
    # which is derived from the same ``sr``.
    sound = mix_audio(audio_path, noise_path, sample_rate=sr)
    # Frame each mixture exactly once (it used to be framed twice, with the
    # first result thrown away).
    mixed_audios[index] = audio_to_audio_frame_stack(sound, frame_length, hop_length_frame)

In [164]:
# Shape (nb_frame, frame_length) of the first framed mixture.
np.shape(mixed_audios[0])

(1199, 400)

In [165]:
# Number of framed mixtures that were produced.
mixed_audios.shape[0]

5037