In [5]:
import librosa, librosa.display
import numpy as np
from pydub import AudioSegment
import my_utils

In [6]:
def split_entire_audio(female, male, mix, n_fft=2048,
                       female_out='../recordings/recover-female.wav',
                       male_out='../recordings/recover-male.wav'):
    """Separate a two-speaker mixture using masks computed over the whole signal.

    The STFT of each clean source and of the mixture is computed once for the
    full recording. One mask per speaker is derived from the two clean
    spectrograms, applied to the mixture spectrogram, and each masked
    spectrogram is handed to ``my_utils.write_new_audio_file``.

    Parameters
    ----------
    female, male, mix : array-like
        Audio signals as accepted by ``librosa.to_mono``.
        Presumably all three have the same length and sample rate -- this
        function does not check; verify against the caller.
    n_fft : int, optional
        FFT window size. The hop length is always ``n_fft // 4``.
    female_out, male_out : str, optional
        Destination paths for the recovered female / male recordings.

    Returns
    -------
    None
        The results are written to ``female_out`` and ``male_out``.
    """
    # A hop of a quarter window gives 75% overlap between consecutive frames
    # (librosa's window length defaults to n_fft).
    hop_length = n_fft // 4

    # Force every signal down to mono before the STFT; this worked better than
    # feeding the raw arrays to librosa.stft directly.
    stft_1, stft_2, stft_mix = (
        librosa.stft(librosa.to_mono(signal), window='hann',
                     n_fft=n_fft, hop_length=hop_length)
        for signal in (female, male, mix)
    )

    # One mask per speaker, computed over the entire recording. The argument
    # order selects which source the mask is meant to keep.
    mask_1 = my_utils.compute_mask(stft_1, stft_2)
    mask_2 = my_utils.compute_mask(stft_2, stft_1)

    # Apply each mask to the mixture to estimate the corresponding source.
    first_sound_stft = my_utils.get_stft_matrix_from_mixture(mask_1, stft_mix)
    second_sound_stft = my_utils.get_stft_matrix_from_mixture(mask_2, stft_mix)

    my_utils.write_new_audio_file(first_sound_stft, female_out)
    my_utils.write_new_audio_file(second_sound_stft, male_out)

In [14]:
def split_audio_using_small_segments(female_filename, male_filename,
                                     frame_size_ms=20, sample_rate=16000):
    """Separate a two-speaker mixture frame by frame and write both estimates.

    The mixture is created with ``my_utils.mix_audios``. It is then cut into
    consecutive, non-overlapping frames of ``frame_size_ms`` milliseconds
    (the last frame is shortened to end at the mixture length). For each frame
    a mask per speaker is computed from the clean sources, applied to the
    mixture STFT and inverted back to the time domain. The per-frame results
    are concatenated and written as ``../recordings/rec_<name>_female.wav``
    and ``../recordings/rec_<name>_male.wav``.

    Parameters
    ----------
    female_filename, male_filename : str
        Paths of the clean WAV recordings of each speaker.
    frame_size_ms : int, optional
        Length of each processing frame in milliseconds; must be positive.
    sample_rate : int, optional
        Sample rate passed to the WAV writer.

    Raises
    ------
    ValueError
        If ``frame_size_ms`` is not positive.
    """
    import os

    if frame_size_ms <= 0:
        raise ValueError("frame_size_ms must be a positive number of milliseconds")

    n_fft = 1024
    # NOTE(review): the hop is derived from 8000 Hz (1 ms -> 8 samples) while
    # the output below is written at 16000 Hz by default -- confirm which rate
    # is intended.
    hop_length = int(0.001 * 8000)

    mix_filename = my_utils.mix_audios(male_filename, female_filename)

    male = AudioSegment.from_wav(male_filename)
    female = AudioSegment.from_wav(female_filename)
    mix = AudioSegment.from_wav(mix_filename)

    def _mono_stft(samples):
        # STFT of one frame after forcing it down to mono.
        return librosa.stft(librosa.to_mono(samples), window='hann',
                            n_fft=n_fft, hop_length=hop_length)

    # Collect per-frame results and concatenate once at the end instead of
    # re-copying the growing signal on every iteration. The empty float64 seed
    # keeps the dtype (and the no-frame result) the same as when the signal
    # was grown from np.empty([0,]).
    chunks_1 = [np.empty([0,])]
    chunks_2 = [np.empty([0,])]

    total_ms = len(mix)
    for start in range(0, total_ms, frame_size_ms):
        # Clip the last frame so it never runs past the end of the mixture.
        stop = min(start + frame_size_ms, total_ms)

        # Cut the same time window out of all three signals (AudioSegments).
        frame_1 = my_utils.get_specific_frame_in_ms(female, start, stop)
        frame_2 = my_utils.get_specific_frame_in_ms(male, start, stop)
        frame_mix = my_utils.get_specific_frame_in_ms(mix, start, stop)

        # Convert from AudioSegment to the ndarray form librosa works with;
        # the second return value is not needed here.
        frame_1, _ = my_utils.audiosegment_to_ndarray(frame_1)
        frame_2, _ = my_utils.audiosegment_to_ndarray(frame_2)
        frame_mix, _ = my_utils.audiosegment_to_ndarray(frame_mix)

        frame_1, frame_mix = my_utils.make_wav_files_same_size(frame_1, frame_mix)
        frame_2, frame_mix = my_utils.make_wav_files_same_size(frame_2, frame_mix)

        frame_1_stft = _mono_stft(frame_1)
        frame_2_stft = _mono_stft(frame_2)
        frame_mix_stft = _mono_stft(frame_mix)

        # Masks for the current frame only.
        mask_1 = my_utils.compute_mask(frame_1_stft, frame_2_stft)
        mask_2 = my_utils.compute_mask(frame_2_stft, frame_1_stft)

        # Estimate each source for this frame and return to the time domain.
        y_frame_1_stft = my_utils.get_stft_matrix_from_mixture(mask_1, frame_mix_stft)
        y_frame_2_stft = my_utils.get_stft_matrix_from_mixture(mask_2, frame_mix_stft)

        chunks_1.append(librosa.istft(y_frame_1_stft, hop_length=hop_length, window='hann'))
        chunks_2.append(librosa.istft(y_frame_2_stft, hop_length=hop_length, window='hann'))

    sound1 = np.concatenate(chunks_1)
    sound2 = np.concatenate(chunks_2)

    print("sound len final: ", sound1.shape)

    # Output name: keep the 'arctic...' part of the mixture path when present;
    # otherwise fall back to the file stem instead of raising ValueError.
    if 'arctic' in mix_filename and '.wav' in mix_filename:
        stem = mix_filename[mix_filename.index('arctic'):mix_filename.index('.wav')]
    else:
        stem = os.path.splitext(os.path.basename(mix_filename))[0]
    name = "../recordings/rec_" + stem

    sound1 = my_utils.delete_final_zeros_for_silence(sound1)
    sound2 = my_utils.delete_final_zeros_for_silence(sound2)

    # NOTE(review): librosa.output was removed in librosa 0.8; this call needs
    # an older librosa (or a switch to another WAV writer such as soundfile).
    librosa.output.write_wav(name + '_female.wav', librosa.to_mono(sound1), sr=sample_rate)
    librosa.output.write_wav(name + '_male.wav', sound2, sr=sample_rate)

In [16]:
# Input recordings: the same utterance (arctic_a0111) from a male and a
# female speaker.
male_filename = '../recordings/male1/arctic_a0111.wav'
female_filename = '../recordings/female1/arctic_a0111.wav'

# Approach 1: masks computed over the entire recording.
# The sources and their mixture are loaded here and passed in as signals.
female, male, mix = my_utils.load_and_mix_files(female_filename, male_filename)
split_entire_audio(female, male, mix)

# ---------------------------------------
# Approach 2: masks computed per short segment.
# Only the file names are passed; the mixture is created inside the function.
split_audio_using_small_segments(female_filename, male_filename)

4054.9999999999995 4054.9999999999995 4054.9999999999995
sound len final:  (64880,)
