In [2]:
import numpy as np
from scipy import signal
import os
import string
import librosa
import time
import soundfile as sf
from threading import Thread

import gpuRIR

from multiprocessing import Process
import IPython.display as ipd

import matplotlib.pyplot as plt
%matplotlib inline

import soundfile as sf

In [21]:
# Load noise 
def load_all_noise(basedir='./', sr=16000):
    """Read every ``.wav`` file found directly inside ``basedir``.

    Args:
        basedir: Directory to scan. A trailing path separator is optional.
        sr: Unused; kept so existing callers keep working. The sample rate
            reported by ``sf.read`` is discarded, so no resampling happens.

    Returns:
        A list with one array per file (as returned by ``sf.read``), ordered
        by file name so that the order is the same on every run.
    """
    # Match on the real extension (case-insensitive) instead of a substring,
    # so names such as "wavelet.txt" are not handed to the audio reader.
    wav_names = sorted(
        name for name in os.listdir(basedir)
        if os.path.splitext(name)[1].lower() == '.wav'
    )

    wavs = []
    for name in wav_names:
        # os.path.join works whether or not basedir ends with a separator.
        s, _ = sf.read(os.path.join(basedir, name))
        wavs.append(s)
    return wavs
    
# Read the 8k NoiseX recordings and report how many clips were loaded.
noise_root = '/Data/DATASETS/NoiseX/8k/'
noises = load_all_noise(noise_root)
print(len(noises))


15


In [25]:
# Show the shape of the first noise clip, then draw one clip index at random.
print(np.shape(noises[0]))
a = np.random.choice(np.arange(len(noises)))
print(a)

(1881844,)
7


In [None]:
def make_WSJ_noise(idx, num, dataset='tr', sr=16000, seed=1):
    """Simulate reverberant single-speaker WSJ utterances for a 6-mic ad-hoc array.

    For every sample index ``i`` in ``[idx, idx + num)`` this draws a random
    room, one source position, six microphone positions and an RT60,
    simulates the room impulse responses with gpuRIR, and convolves the
    utterance listed on line ``i`` of the mixture text file with them.

    Written under ``/Data/DATASETS/WSJ/mud_noise/<dataset>/``:
      * ``sample<i+1>/info.txt`` - room size, positions and RT60.
      * ``sample<i+1>/echoic_spk1_mic<m>.wav`` - reverberant signal per mic.
      * ``sample<i+1>/echoic_rir_spk1_mic<m>.npy`` - impulse response per mic.
      * ``DRR<idx>.npy`` - direct-to-reverberant ratio (dB), one value per
        (sample, mic) in generation order.
      * ``dist_src<idx>.npy`` / ``dist_mic<idx>.npy`` - nothing is ever
        appended to these lists, so they are saved empty; the files are kept
        so the set of outputs is unchanged.

    Args:
        idx: First line index (0-based) of the mixture list to process.
        num: Number of consecutive samples to generate.
        dataset: ``'tr'``, ``'cv'`` or ``'tt'``.
        sr: Sample rate used for loading, simulation and writing.
        seed: Base seed. The generator is seeded with ``seed + idx`` so that
            chunks started at different offsets draw different rooms.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
        FileNotFoundError: If an utterance is missing and its speaker folder
            holds no replacement file.
    """
    n_mic = 6  # ad hoc array of 6 mics

    if dataset == 'tr' or dataset == 'cv':
        wsj_path = '/Data/DATASETS/WSJ/tr_whole/'
    elif dataset == 'tt':
        wsj_path = '/Data/DATASETS/WSJ/dt_05+et_05/'
    else:
        raise ValueError('Dataset can only be tr, cv or tt.')

    # Private generator instead of np.random.seed(): the global RNG is shared
    # by every thread, so reseeding it here would reset and interleave the
    # streams of concurrently running workers. With the default seed and
    # idx == 0 this yields the same stream as np.random.seed(1).
    rng = np.random.RandomState(seed + idx)

    # Each line of the mixture list starts with a relative utterance path;
    # only its third and fourth '/'-separated components are used.
    text_path = '/Data/DATASETS/WSJ/WSJ0-creation/mix_2_spk_' + dataset + '.txt'
    utterance_paths = []
    with open(text_path, 'r') as f:
        for line in f:
            rel = line.rstrip('\n').split(' ')[0].split('/')
            utterance_paths.append(wsj_path + rel[2] + '/' + rel[3])

    # Defined before the loop so the final np.save calls work even if num == 0.
    directory = '/Data/DATASETS/WSJ/mud_noise/' + dataset
    os.makedirs(directory, exist_ok=True)

    dist_src = []
    dist_mic = []
    adhoc_DRR = []

    # Report roughly five times per call; never 0, which would divide by zero.
    report_every = max(1, num // 5)
    average_time = 0.
    for i in range(idx, idx + num):
        start_time = time.time()

        # sample room size and speaker position (kept 0.5 away from each wall)
        room = np.asarray([rng.uniform(2., 6.), rng.uniform(2., 6.), rng.uniform(2.5, 3.)])
        length, width, height = room
        source_pos = np.asarray([[rng.uniform(0.5, length - 0.5),
                                  rng.uniform(0.5, width - 0.5),
                                  rng.uniform(0.5, height - 0.5)]])

        # sample the microphone positions independently of each other
        mic_pos = np.asarray([[rng.uniform(0.5, length - 0.5),
                               rng.uniform(0.5, width - 0.5),
                               rng.uniform(0.5, height - 0.5)]
                              for _ in range(n_mic)])

        # sample RT60
        rt = rng.uniform(0.1, 0.6)

        # echoic RIR; squeeze() drops the single-source axis -> (n_mic, taps)
        beta = gpuRIR.beta_SabineEstimation(room, rt)
        nb_img = gpuRIR.t2n(rt, room)
        echoic_rir = gpuRIR.simulateRIR(room, beta, source_pos, mic_pos, nb_img, rt, sr).squeeze()

        # The "anechoic" filter keeps +-6 ms around the strongest tap found
        # in the first sr/20 samples of each echoic RIR and zeroes the rest.
        context = int(6 * sr // 1000)
        echoic_rir_peak = np.argmax(echoic_rir[:, :int(sr / 20)], 1)
        mask = np.zeros_like(echoic_rir)
        for mic in range(n_mic):
            lo = max(0, echoic_rir_peak[mic] - context)
            hi = min(echoic_rir.shape[1], echoic_rir_peak[mic] + context + 1)
            mask[mic, lo:hi] = 1.
        anechoic_rir = mask * echoic_rir

        # load waveform
        wav_path = utterance_paths[i]
        try:
            s, _ = librosa.load(wav_path, sr=sr)
        except FileNotFoundError:
            # randomly select another one from the same speaker; sorted so
            # the pick does not depend on directory listing order
            speaker_folder = os.path.dirname(wav_path)
            candidates = sorted(
                name for name in os.listdir(speaker_folder)
                if os.path.isfile(os.path.join(speaker_folder, name))
            )
            if not candidates:
                raise FileNotFoundError(
                    'No replacement utterance found in {}'.format(speaker_folder))
            select_spk = rng.choice(candidates)
            s, _ = librosa.load(speaker_folder + '/' + select_spk, sr=sr)

        # normalise the utterance to unit energy before convolution
        s = s.astype(np.float32)
        s = s / np.sqrt(np.sum(s ** 2))

        adhoc_e_source = []
        for mic in range(n_mic):
            echoic_sig = signal.fftconvolve(s, echoic_rir[mic])
            anechoic_sig = signal.fftconvolve(s, anechoic_rir[mic])
            adhoc_e_source.append(echoic_sig)

            # direct-to-reverb ratio
            adhoc_DRR.append(10 * np.log10(np.sum(anechoic_sig ** 2) /
                                           np.sum((echoic_sig - anechoic_sig) ** 2)))

        # create the per-sample folder; exist_ok avoids the check-then-create race
        save_dir = directory + '/sample' + str(i + 1)
        os.makedirs(save_dir, exist_ok=True)

        # save info
        with open(save_dir + '/info.txt', 'w') as f:
            f.write("room size: {}\n".format(room))
            f.write("spk position: {}\n".format(source_pos))
            f.write("mic position: {}\n".format(mic_pos))
            f.write("RT60: {:.2f}\n".format(rt))

        # save source and RIR for each channel
        for mic in range(n_mic):
            sf.write(save_dir + '/echoic_spk' + str(1) + '_mic' + str(mic + 1) + '.wav',
                     adhoc_e_source[mic], sr)
            np.save(save_dir + '/echoic_rir_spk' + str(1) + '_mic' + str(mic + 1), echoic_rir[mic])

        average_time += time.time() - start_time
        done = i - idx + 1
        if done % report_every == 0:
            percentage = 100. * done / num
            print("{}% finished. Average time for generating each utterance is {:.2f}".format(
                percentage, average_time / done))

    np.save(directory + '/dist_src' + str(idx), dist_src)
    np.save(directory + '/dist_mic' + str(idx), dist_mic)
    np.save(directory + '/DRR' + str(idx), adhoc_DRR)


In [None]:
# Generate the 'tr' subset with several worker threads, one contiguous chunk each.
sr = 8000

num_thread = 4
total = 20000
step = total // num_thread
idx = np.arange(0, total, step)

threads = []
for i in range(num_thread):
    # The last worker also takes the remainder when total is not a multiple
    # of num_thread, so no trailing samples are skipped.
    count = step if i < num_thread - 1 else int(total - idx[i])
    t = Thread(target=make_WSJ_noise, args=(idx[i], count, 'tr', sr))
    t.start()
    threads.append(t)

# Wait for every worker: without this the cell returns immediately and later
# cells can run before the files they read have been written.
for t in threads:
    t.join()

In [None]:
# Generate the 'cv' subset. `sr` is the value set in the 'tr' generation cell.
num_thread = 1
total = 5000
step = total // num_thread
idx = np.arange(0, total, step)

threads = []
for i in range(num_thread):
    # The last worker also takes the remainder when total is not a multiple
    # of num_thread, so no trailing samples are skipped.
    count = step if i < num_thread - 1 else int(total - idx[i])
    t = Thread(target=make_WSJ_noise, args=(idx[i], count, 'cv', sr))
    t.start()
    threads.append(t)

# Wait for completion so the cell only finishes once the data is on disk.
for t in threads:
    t.join()

In [None]:
# Generate the 'tt' subset. `sr` is the value set in the 'tr' generation cell.
num_thread = 1
total = 3000
step = total // num_thread
idx = np.arange(0, total, step)

threads = []
for i in range(num_thread):
    # The last worker also takes the remainder when total is not a multiple
    # of num_thread, so no trailing samples are skipped.
    count = step if i < num_thread - 1 else int(total - idx[i])
    t = Thread(target=make_WSJ_noise, args=(idx[i], count, 'tt', sr))
    t.start()
    threads.append(t)

# Wait for completion so the cell only finishes once the data is on disk.
for t in threads:
    t.join()

In [27]:
import IPython.display as ipd
# Sanity check: play back one generated channel. The file is read once with
# soundfile, and the sample rate it reports is printed and used for playback.
s, fs = sf.read('/Data/DATASETS/WSJ/mud_noise/tr/sample10/echoic_spk1_mic2.wav')
print(fs)
ipd.display(ipd.Audio(s, rate=fs))

8000


In [3]:
from tqdm import tqdm
subset = 'tr'
# load all wavs to a maximum of 8 seconds
basedir = '/Data/DATASETS/WSJ/mud_noise/{}/'.format(subset)
max_len = 8 * 8000   # samples kept per channel before trimming (8 s, assuming 8 kHz audio)
offset = 1 * 8000    # leading samples dropped from every channel
n_mics = 6           # channels stored per sample folder
samples_per_split = 2000
splits = 10


def _sample_order(name):
    """Sort key: 'sample<N>' folders by numeric N, any other name afterwards."""
    suffix = name.replace('sample', '', 1)
    if suffix.isdecimal():
        return (0, int(suffix), name)
    return (1, 0, name)


# os.listdir returns entries in arbitrary order; sort them so that the
# content of each split is the same on every run and on every machine.
all_samples = sorted((a for a in os.listdir(basedir) if 'sample' in a), key=_sample_order)

for k in range(splits):
    all_traces = []
    for sample_name in tqdm(all_samples[k * samples_per_split: (k + 1) * samples_per_split]):
        all_mics = []
        for i in range(n_mics):
            s, _ = sf.read(basedir + sample_name + '/' + 'echoic_spk1_mic{}.wav'.format(i + 1))
            # zero-pad short recordings so every channel has the same length
            if len(s) < max_len:
                s = np.concatenate([s, np.zeros((max_len - len(s),))], 0)
            s = s[offset:max_len]
            all_mics.append(s)
        all_traces.append(all_mics)

    all_traces = np.array(all_traces)
    print(all_traces.shape)
    np.save(basedir + 'mud_split_{}_{}'.format(subset, k), all_traces)

100%|██████████| 2000/2000 [02:16<00:00, 16.65it/s]


(2000, 6, 56000)


100%|██████████| 2000/2000 [02:09<00:00, 15.55it/s]


(2000, 6, 56000)


100%|██████████| 2000/2000 [02:08<00:00, 15.54it/s]


(2000, 6, 56000)


100%|██████████| 2000/2000 [02:08<00:00, 15.53it/s]


(2000, 6, 56000)


100%|██████████| 2000/2000 [02:08<00:00, 15.61it/s]


(2000, 6, 56000)


100%|██████████| 2000/2000 [02:14<00:00, 16.02it/s]


(2000, 6, 56000)


100%|██████████| 2000/2000 [02:18<00:00, 14.95it/s]


(2000, 6, 56000)


100%|██████████| 2000/2000 [02:15<00:00, 14.75it/s]


(2000, 6, 56000)


100%|██████████| 2000/2000 [02:16<00:00, 14.65it/s]


(2000, 6, 56000)


100%|██████████| 2000/2000 [02:16<00:00, 12.54it/s]


(2000, 6, 56000)


In [22]:
import h5py
subset = 'tr'  # must be the string 'tr'; a bare `tr` is an undefined name
basedir = '/Data/DATASETS/WSJ/mud_noise/{}/'.format(subset)
# Requires debug_mud_v1.h5 to exist already (it is written by the HDF5 export
# cell of this notebook). The context manager closes the file even if the
# lookup or the display call raises.
with h5py.File(basedir + 'debug_mud_v1.h5', 'r') as data:
    ipd.display(ipd.Audio(data['tr9'][220], rate=8000))

In [None]:
import h5py
subset = 'tr'
basedir = '/Data/DATASETS/WSJ/mud_noise/{}/'.format(subset)
# Copy the ten saved .npy splits into one HDF5 file, one dataset per split.
with h5py.File(basedir + 'debug_mud_v1.h5', 'a') as data:
    for i in range(10):
        key = '{}{}'.format(subset, i)
        _a = np.load(basedir + 'mud_split_{}_{}.npy'.format(subset, i))
        # The file is opened in append mode, so a previous run may have left
        # a dataset under this name; assigning to an existing name fails.
        # Drop the old entry first so the cell can be re-run safely.
        if key in data:
            del data[key]
        data[key] = _a



