In [69]:
import librosa, librosa.display
import numpy as np
from pydub import AudioSegment
import my_utils

In [70]:
def split_entire_audio(female, male, mix, n_fft=1024,
                       female_out='../recordings/recover-female.wav',
                       male_out='../recordings/recover-male.wav'):
    """Separate a two-speaker mix in one pass over the whole recording.

    One STFT is computed per input signal, a mask is derived for each speaker
    from the two clean spectrograms, each mask is applied to the mixture
    spectrogram, and both results are handed to
    ``my_utils.write_new_audio_file``.

    Parameters
    ----------
    female, male, mix
        Audio signals accepted by ``librosa.to_mono``.
        NOTE(review): presumably these are the three arrays returned by
        ``my_utils.load_and_mix_files`` and already have equal length --
        confirm against that helper.
    n_fft : int, optional
        FFT size used for all three STFTs (default 1024). The hop length is
        always ``n_fft // 4``.
    female_out, male_out : str, optional
        Destination paths for the two recovered signals. The defaults are the
        paths this function has always written to.

    Returns
    -------
    None
        The results are written to ``female_out`` and ``male_out``.
    """
    hop_length = n_fft // 4

    def mono_stft(signal):
        # Every input is down-mixed with librosa.to_mono before the transform,
        # so all three spectrograms are computed from single-channel audio
        # with identical window / FFT / hop settings.
        return librosa.stft(librosa.to_mono(signal), window='hann',
                            n_fft=n_fft, hop_length=hop_length)

    stft_female = mono_stft(female)
    stft_male = mono_stft(male)
    stft_mix = mono_stft(mix)

    # Diagnostics only: the magnitude / phase split is not used by the
    # separation below, it is computed just to report its shapes.
    female_mag, female_phase = librosa.magphase(stft_female)
    print(stft_female.shape, female_mag.shape, female_phase.shape)

    # One mask per speaker; the spectrogram of the speaker to keep is passed
    # first. The mask definition itself lives in my_utils.compute_mask.
    mask_female = my_utils.compute_mask(stft_female, stft_male)
    mask_male = my_utils.compute_mask(stft_male, stft_female)
    print(stft_female.shape, stft_male.shape, stft_mix.shape, mask_female.shape)

    # Apply each mask to the mixture spectrogram.
    recovered_female = my_utils.get_stft_matrix_from_mixture(mask_female, stft_mix)
    recovered_male = my_utils.get_stft_matrix_from_mixture(mask_male, stft_mix)

    my_utils.write_new_audio_file(recovered_female, female_out)
    my_utils.write_new_audio_file(recovered_male, male_out)

In [83]:
def split_audio_using_small_segments(female_filename, male_filename):
    """Separate a two-speaker mix segment by segment and write both estimates.

    The two recordings are mixed with ``my_utils.mix_audios``; the mix and both
    sources are then cut into consecutive 1000 ms segments. For every segment
    an STFT is computed for each of the three signals, one mask per speaker is
    derived from the two clean spectrograms and applied to the mixture
    spectrogram. The per-segment results are stacked column-wise and passed to
    ``my_utils.write_new_audio_file``.

    Parameters
    ----------
    female_filename, male_filename : str
        Paths of the two WAV files (read with ``AudioSegment.from_wav``).

    Returns
    -------
    None
        Two files are written: ``../recordings/RECOVER_<stem>_female.wav`` and
        ``../recordings/RECOVER_<stem>_male.wav``. ``<stem>`` is the part of
        the mix file name from ``'arctic'`` up to ``'.wav'``; when the name
        does not contain both markers, the base name without extension is
        used instead of raising ``ValueError``.

    Notes
    -----
    NOTE(review): every segment is transformed on its own and the resulting
    columns are simply concatenated. In the recorded run this produced 224
    columns while the STFT of the whole mix has 219, so the written signal is
    not frame-aligned with the original mix -- confirm this is acceptable.
    """
    import os  # function-scope import: only needed for the output-name fallback

    n_fft = 2048
    len_hop = n_fft // 4
    frame_size_ms = 1000

    def mono_stft(samples):
        # Single place that fixes the analysis settings, so the row count taken
        # from the whole mix always matches the per-segment spectrograms.
        return librosa.stft(librosa.to_mono(samples), window='hann',
                            n_fft=n_fft, hop_length=len_hop)

    mix_filename = my_utils.mix_audios(male_filename, female_filename)

    male = AudioSegment.from_wav(male_filename)
    female = AudioSegment.from_wav(female_filename)
    mix = AudioSegment.from_wav(mix_filename)
    # len() of a pydub AudioSegment is a duration in milliseconds; the label
    # text is kept as it was so previously recorded output stays comparable.
    print("total seconds: ", len(female), len(male), len(mix))

    # Only the number of rows (frequency bins) of the mix spectrogram is
    # needed: segments are appended as new columns along the time axis.
    mix_ndarray, _ = my_utils.audiosegment_to_ndarray(mix)
    lines, cols = mono_stft(mix_ndarray).shape
    print(lines, cols)

    # Zero-column seeds fix the row count so the final stack is well defined;
    # np.hstack promotes the result to the complex dtype of the STFT frames.
    female_columns = [np.empty([lines, 0], dtype=np.float32)]
    male_columns = [np.empty([lines, 0], dtype=np.float32)]

    total_ms = len(mix)
    n_segments = 0

    for start in range(0, total_ms, frame_size_ms):
        # The last segment is clamped to the end of the mix.
        stop = min(start + frame_size_ms, total_ms)

        # Cut the same time window out of all three signals (AudioSegments).
        segment_female = my_utils.get_specific_frame_in_ms(female, start, stop)
        segment_male = my_utils.get_specific_frame_in_ms(male, start, stop)
        segment_mix = my_utils.get_specific_frame_in_ms(mix, start, stop)

        # Value printed is the segment duration in milliseconds.
        print("length in seconds of frame ", len(segment_female))

        # Convert the AudioSegments to the ndarray form librosa works with.
        frame_1, _ = my_utils.audiosegment_to_ndarray(segment_female)
        frame_2, _ = my_utils.audiosegment_to_ndarray(segment_male)
        frame_mix, _ = my_utils.audiosegment_to_ndarray(segment_mix)

        print("length in array of frame ", len(frame_1), len(female))

        # A source can be shorter than the mix, so its last segment may be
        # shorter too. NOTE(review): make_wav_files_same_size presumably
        # equalises the two array lengths -- confirm in my_utils.
        frame_1, frame_mix = my_utils.make_wav_files_same_size(frame_1, frame_mix)
        frame_2, frame_mix = my_utils.make_wav_files_same_size(frame_2, frame_mix)

        # Spectrogram of each signal for the current segment.
        frame_1_stft = mono_stft(frame_1)
        frame_2_stft = mono_stft(frame_2)
        frame_mix_stft = mono_stft(frame_mix)

        print("stft: ", frame_1_stft.shape, frame_2_stft.shape, frame_mix_stft.shape)

        # One mask per speaker for this segment, applied to the mix spectrogram.
        mask_1 = my_utils.compute_mask(frame_1_stft, frame_2_stft)
        mask_2 = my_utils.compute_mask(frame_2_stft, frame_1_stft)

        female_columns.append(my_utils.get_stft_matrix_from_mixture(mask_1, frame_mix_stft))
        male_columns.append(my_utils.get_stft_matrix_from_mixture(mask_2, frame_mix_stft))
        n_segments += 1

    # Stack once at the end instead of re-copying the growing matrix per segment.
    sound_1_stft = np.hstack(female_columns)
    sound_2_stft = np.hstack(male_columns)

    print("i: ", n_segments)
    print("len final: ", sound_1_stft.shape)

    # Output stem: '<arctic...>' part of the mix name when present, otherwise
    # the plain base name so non-ARCTIC inputs do not fail after all the work.
    stem_start = mix_filename.find('arctic')
    stem_end = mix_filename.find('.wav')
    if stem_start == -1 or stem_end == -1:
        stem = os.path.splitext(os.path.basename(mix_filename))[0]
    else:
        stem = mix_filename[stem_start:stem_end]
    name = "../recordings/RECOVER_" + stem

    my_utils.write_new_audio_file(sound_1_stft, name + '_female.wav')
    my_utils.write_new_audio_file(sound_2_stft, name + '_male.wav')

In [84]:
# Input recordings: one utterance per speaker.
female_filename = '../recordings/female1/arctic_a0407.wav'
male_filename = '../recordings/male1/arctic_a0023.wav'

# Alternative: separate using the whole recording at once.
# female, male, mix = my_utils.load_and_mix_files(female_filename, male_filename)
# split_entire_audio(female, male, mix)

# Segment-wise separation; the mix file is created inside the function.
split_audio_using_small_segments(
    female_filename=female_filename,
    male_filename=male_filename,
)

total seconds:  6255 7000 7000
1025 219
length in seconds of frame  1000
length in array of frame  16000 6255
stft:  (1025, 32) (1025, 32) (1025, 32)
length in seconds of frame  1000
length in array of frame  16000 6255
stft:  (1025, 32) (1025, 32) (1025, 32)
length in seconds of frame  1000
length in array of frame  16000 6255
stft:  (1025, 32) (1025, 32) (1025, 32)
length in seconds of frame  1000
length in array of frame  16000 6255
stft:  (1025, 32) (1025, 32) (1025, 32)
length in seconds of frame  1000
length in array of frame  16000 6255
stft:  (1025, 32) (1025, 32) (1025, 32)
length in seconds of frame  1000
length in array of frame  16000 6255
stft:  (1025, 32) (1025, 32) (1025, 32)
length in seconds of frame  255
length in array of frame  4080 6255
stft:  (1025, 32) (1025, 32) (1025, 32)
i:  7
len final:  (1025, 224)
