In [None]:
import os

import librosa, librosa.display
import numpy as np
from pydub import AudioSegment

In [None]:
def split_after_getting_stft_done(female_filename, male_filename):
    """Separate a two-speaker mixture using ratio masks built in the STFT domain.

    Both recordings are loaded at 16 kHz and passed through
    ``my_utils.make_wav_files_same_size``; ``my_utils.mix_audios`` produces the
    mixture file, which is loaded as well. One mask per speaker is computed
    from the two clean STFTs, applied to the STFT of the mixture, and the two
    masked spectrograms are inverted to audio and written to
    '../recordings/test-female.wav' and '../recordings/test-male.wav'.

    Parameters
    ----------
    female_filename, male_filename : str
        Paths of the two source recordings.

    Returns
    -------
    None

    Notes
    -----
    The previous implementation concatenated per-frame masks onto an
    uninitialised ``np.empty_like`` buffer along the frequency axis (a shape
    error for any input longer than two frames) and handed masks, not audio,
    to ``write_wav``.

    NOTE(review): ``my_utils`` is not imported in the visible cells; it must
    already be in scope. ``my_utils.compute_mask`` is assumed to be the
    element-wise ratio mask defined elsewhere in this notebook, which is what
    makes a single whole-spectrogram computation equivalent to the former
    frame-by-frame loop -- confirm.
    NOTE(review): ``my_utils.delete_final_zeros_for_silence`` is not visible
    here; presumably it trims trailing silence from a 1-D signal -- confirm.
    NOTE(review): ``librosa.output.write_wav`` no longer exists in
    librosa >= 0.8; on such versions the two final calls need a replacement
    writer.
    """
    sr = 16000
    n_fft = 1024
    hop_length = int(0.001 * 8000)  # hop between successive STFT columns, in samples

    male, _ = librosa.load(male_filename, sr=sr)
    female, _ = librosa.load(female_filename, sr=sr)

    # pad smaller array with zeros, so both audio files have the same length
    female, male = my_utils.make_wav_files_same_size(female, male)

    # create the mixture on disk and load it back
    mix_filename = my_utils.mix_audios(male_filename, female_filename)
    mix, _ = librosa.load(mix_filename, sr=sr)

    print("duration window: ", librosa.get_duration(y=mix, sr=sr))

    def to_stft(signal):
        # Same analysis parameters for all three signals so their bins line up.
        return librosa.stft(librosa.to_mono(signal), window='hann',
                            n_fft=n_fft, hop_length=hop_length)

    def stft_duration(spectrogram):
        # get_duration falls back to its own default hop/n_fft unless told
        # otherwise, which would misreport the length of these STFTs.
        return librosa.get_duration(S=spectrogram, sr=sr,
                                    n_fft=n_fft, hop_length=hop_length)

    female_stft = to_stft(female)
    male_stft = to_stft(male)
    mix_stft = to_stft(mix)

    print(female_stft.shape)
    print(stft_duration(female_stft[:, :2]))
    print("duration window: ", stft_duration(female_stft))
    print("duration window: ", stft_duration(male_stft))
    print("duration window: ", stft_duration(mix_stft))

    # The mixture is decoded from its own file, so its frame count is not
    # guaranteed to match the sources; keep only the frames all three share.
    n_frames = min(female_stft.shape[1], male_stft.shape[1], mix_stft.shape[1])
    female_stft = female_stft[:, :n_frames]
    male_stft = male_stft[:, :n_frames]
    mix_stft = mix_stft[:, :n_frames]

    # one mask per speaker, computed over the whole spectrogram at once
    mask_1 = my_utils.compute_mask(female_stft, male_stft)
    mask_2 = my_utils.compute_mask(male_stft, female_stft)

    # apply each mask to the mixture and go back to the time domain
    sound1 = librosa.istft(mask_1 * mix_stft, window='hann', hop_length=hop_length)
    sound2 = librosa.istft(mask_2 * mix_stft, window='hann', hop_length=hop_length)

    sound1 = my_utils.delete_final_zeros_for_silence(sound1)
    sound2 = my_utils.delete_final_zeros_for_silence(sound2)

    librosa.output.write_wav('../recordings/test-female.wav', sound1, sr=sr)
    librosa.output.write_wav('../recordings/test-male.wav', sound2, sr=sr)

In [None]:
def _mix_name_part(filename):
    """Return the part of a recording's file name used in the mixture's name."""
    # Work on the bare file name so directories never leak into the result.
    stem = os.path.splitext(os.path.basename(filename))[0]
    marker = stem.find('arctic')
    # Keep the historical "from 'arctic' onwards" naming when the marker is
    # present; otherwise fall back to the whole stem instead of raising.
    return stem[marker:] if marker >= 0 else stem


def mix_audios(filename1, filename2):
    """Overlay two recordings and export the mixture as a wav file.

    Parameters
    ----------
    filename1, filename2 : str
        Paths of the recordings to mix (anything ``AudioSegment.from_file``
        can read).

    Returns
    -------
    str
        Path of the exported mixture:
        '../recordings/mixes/<part1>_<part2>.wav', where each part is derived
        from the corresponding input file name.
    """
    sound1 = AudioSegment.from_file(filename1)
    sound2 = AudioSegment.from_file(filename2)

    # overlay over the longest audio source; compare real durations rather
    # than raw sample counts, which also depend on channel count and rate
    if sound1.duration_seconds > sound2.duration_seconds:
        combined = sound1.overlay(sound2)
    else:
        combined = sound2.overlay(sound1)

    name = ("../recordings/mixes/" + _mix_name_part(filename1) + '_'
            + _mix_name_part(filename2) + ".wav")
    combined.export(name, format='wav')

    return name

In [None]:
def get_specific_frame_in_ms(audio_array, start, stop):
    """Return the slice ``audio_array[start:stop]``.

    NOTE(review): the function name says the bounds are milliseconds, but the
    code only forwards them to the slice operator, so the unit is whatever
    ``audio_array`` slices by -- confirm against callers.
    """
    return audio_array[start:stop]
#     print("aici: ", len(audio_array), start, stop)
    
#     if stop <= len(audio_array):
#     print("1: ", len(audio_array), start, stop)
#     newAudio = audio_array[start:stop]
#     return newAudio
#     elif start >= len(audio_array):
#         print("2: ", len(audio_array), start, stop)
#         newAudio = np.zeros(stop-start)
#         return ndarray_to_audiosegment(newAudio, 16000)
#     elif start < len(audio_array) and stop > len(audio_array):
#         print("3: ", len(audio_array), start, stop)
#         oldAudio = audio_array[start:len(audio_array)-1]
#         newAudio = np.zeros(stop-start)
#         oldAudio.resize(newAudio.shape)
#         newAudio = newAudio + oldAudio
#         newAudio = ndarray_to_audiosegment(newAudio,16000)
#         return newAudio

In [None]:
def make_wav_files_same_size(arr1, arr2):
    """Zero-pad the shorter of two arrays at its end so both have equal length.

    Lengths are compared on the first axis. The longer array is returned
    untouched; when the lengths already match, ``arr2`` is passed through
    ``np.pad`` with a zero-width pad.

    Returns
    -------
    tuple
        ``(arr1, arr2)`` in the same order as the arguments.
    """
    len1, len2 = arr1.shape[0], arr2.shape[0]

    if len1 < len2:
        padded = np.pad(arr1, (0, len2 - len1), 'constant', constant_values=(0))
        return padded, arr2

    padded = np.pad(arr2, (0, len1 - len2), 'constant', constant_values=(0))
    return arr1, padded

In [None]:
def compute_mask(stft_1, stft_2):
    """Ratio mask of the first spectrogram against the sum of both.

    Computes ``|stft_1| / (eps + |stft_1| + |stft_2|)`` element-wise.

    Parameters
    ----------
    stft_1, stft_2 : array_like
        Spectrograms of the two sources (real or complex); they must
        broadcast against each other.

    Returns
    -------
    numpy.ndarray
        Non-negative values below 1; exactly 0 where ``stft_1`` is 0.
    """
    # small epsilon to avoid dividing by zero
    # (builtin float: the ``np.float`` alias no longer exists in current NumPy)
    eps = np.finfo(float).eps

    magnitude_1 = np.abs(stft_1)

    # compute model as the sum of spectrograms
    mix = eps + magnitude_1 + np.abs(stft_2)

    return np.divide(magnitude_1, mix)

In [None]:
def get_stft_matrix_from_mixture(mask, mixture):
    """Apply ``mask`` to ``mixture`` by element-wise multiplication.

    Both arguments are handed to ``np.multiply``, so they follow NumPy's
    broadcasting rules.
    """
    masked = np.multiply(mask, mixture)
    return masked

In [None]:
def write_new_audio_file(sound, filename, sr=16000, hop_length=None):
    """Invert an STFT matrix to a time-domain signal and write it as a wav file.

    Parameters
    ----------
    sound : array_like
        STFT matrix handed to ``librosa.istft``.
    filename : str
        Destination path.
    sr : int, optional
        Sample rate written to the file. Defaults to 16000, the value that
        used to be hard-coded.
    hop_length : int or None, optional
        Hop length forwarded to ``librosa.istft``. ``None`` (the default)
        keeps librosa's own default, as before; pass the hop used for the
        forward STFT when it was not the default.

    NOTE(review): ``librosa.output.write_wav`` no longer exists in
    librosa >= 0.8; on such versions this call needs a replacement writer.
    """
    inverse_sound_stft = librosa.istft(sound, hop_length=hop_length)
    librosa.output.write_wav(filename, inverse_sound_stft, sr)

In [None]:
def audiosegment_to_ndarray(audiosegment):
    """Convert a pydub AudioSegment into float32 samples plus its frame rate.

    Parameters
    ----------
    audiosegment : pydub.AudioSegment
        Segment to convert.

    Returns
    -------
    list
        ``[samples, frame_rate]``. For a two-channel segment ``samples`` is a
        ``(2, n)`` array holding the even-indexed and odd-indexed samples as
        separate rows; for any other channel count it is the 1-D array
        returned by ``librosa.util.buf_to_float``.
    """
    samples = audiosegment.get_array_of_samples()
    # Decode with the segment's own sample width; a fixed 2 bytes would
    # misread anything that is not 16-bit audio.
    samples_float = librosa.util.buf_to_float(samples,
                                              n_bytes=audiosegment.sample_width,
                                              dtype=np.float32)
    if audiosegment.channels == 2:
        # de-interleave: even positions form one channel, odd positions the other
        sample_all = np.array([samples_float[::2], samples_float[1::2]])
    else:
        sample_all = samples_float

    return [sample_all, audiosegment.frame_rate]

In [None]:
def ndarray_to_audiosegment(y, frame_rate):
    """Build a pydub AudioSegment from a sample array.

    Parameters
    ----------
    y : numpy.ndarray
        1-D samples, or a 2-D array whose first two rows are interleaved into
        a two-channel stream. Floating-point samples are converted to 16-bit
        integer PCM; integer samples are used as they are.
    frame_rate : int
        Frame rate given to the AudioSegment.

    Returns
    -------
    pydub.AudioSegment

    Notes
    -----
    Floating-point input is assumed to be normalised to [-1, 1]; values
    outside that range are clipped. Previously float bytes were passed to
    AudioSegment unchanged (and the 2-D path always produced float64), so the
    segment's sample width did not describe integer PCM data.
    """
    if len(y.shape) == 2:
        # interleave the two rows sample by sample, keeping the input dtype
        new_array = np.empty(y.shape[1] * 2, dtype=y.dtype)
        new_array[::2] = y[0]
        new_array[1::2] = y[1]
    else:
        new_array = y

    if np.issubdtype(new_array.dtype, np.floating):
        # 32767 rather than 32768 so that +1.0 does not overflow int16
        new_array = (np.clip(new_array, -1.0, 1.0) * 32767).astype(np.int16)

    audio_segment = AudioSegment(
        new_array.tobytes(),
        frame_rate=frame_rate,
        sample_width=new_array.dtype.itemsize,
        channels=len(y.shape),
    )
    return audio_segment

In [None]:
def load_and_mix_files(female_filename, male_filename):
    """Load two recordings at 16 kHz, equalise their lengths and load their mixture.

    The mixture file is produced by ``mix_audios`` and loaded at 16 kHz too.
    The durations of the three signals, in milliseconds, are printed on one
    line in the order male, female, mixture.

    Returns
    -------
    tuple
        ``(female, male, mix)``.
    """
    def to_ms(n_samples, rate):
        # sample count -> duration in milliseconds
        return float(n_samples) / rate * 1000

    # read both sources
    male, sr_male = librosa.load(male_filename, sr=16000)
    female, sr_female = librosa.load(female_filename, sr=16000)

    # zero-pad the shorter signal so both have the same length
    female, male = make_wav_files_same_size(female, male)

    # build the mixture on disk, then read it back
    mix_filename = mix_audios(male_filename, female_filename)
    mix, _ = librosa.load(mix_filename, sr=16000)

    # total duration of each recording
    male_rec_ms = to_ms(len(male), sr_male)
    female_rec_ms = to_ms(len(female), sr_female)
    mixed_audio_rec_ms = to_ms(len(mix), 16000)
    print(male_rec_ms, female_rec_ms, mixed_audio_rec_ms)

    return female, male, mix