In [25]:
import os
from scipy.signal import stft
import librosa
import numpy as np
import h5py
import collections
import json
import sklearn

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:70% !important; }</style>"))

In [16]:
config = {"length_segments_s": 3.3,
          
          "audioset_data_folder": '/home/vincent/Audio_blind_source_separation/Datadir/cloud_factorized_audioset/',
          "output_folder": "/home/vincent/Audio_blind_source_separation/Datadir/audioset_segments",
    
          "sampling_rate": 16000,
          
          "STFT_frame_width_ms": 64,
          "STFT_frame_shift_ms": 32,
          "STFT_window_function": "hamming",
          "detrend": False,
          "boundary": None,
          "padded": False,
          
          "n_Mel_filters": 64,
          "Mel_min_freq": 0,
          "Mel_max_freq": 8000}

In [4]:
STFT_frame_n_samples = int(config["sampling_rate"] / (1000.0 / config["STFT_frame_width_ms"]))
STFT_frame_n_samples_shift = int(config["sampling_rate"] / (1000.0 / config["STFT_frame_shift_ms"]))

In [5]:
n_stft_frames_in_segments = (config["length_segments_s"] * config["sampling_rate"] - STFT_frame_n_samples) / STFT_frame_n_samples_shift
round_segments_length_s = np.round(config["length_segments_s"] * config["sampling_rate"] / STFT_frame_n_samples) \
                          * STFT_frame_n_samples / config["sampling_rate"]
segment_n_samples = int(round_segments_length_s * config["sampling_rate"])
new_n_stft_frames_in_segments = int((round_segments_length_s * config["sampling_rate"] - STFT_frame_n_samples) / STFT_frame_n_samples_shift)
print("Number of STFT frames in a segment with the input parameters: {}".format(n_stft_frames_in_segments))
print("Rounding the length of the segments to {} seconds"
      ", in order to have exactly {} stft frames in each segment.".format(round_segments_length_s, new_n_stft_frames_in_segments))
print("There are {} audio samples in a segment.".format(segment_n_samples))

Number of STFT frames in a segment with the input parameters: 101.125
Rounding the length of the segments to 3.328 seconds, in order to have exactly 102 stft frames in each segment.
There are 53248 audio samples in a segment.


In [6]:
all_wavs_filenames = [os.path.join(dp, f) 
                     for dp, dn, fn in os.walk(os.path.expanduser(config["audioset_data_folder"]))
                     for f in fn
                     if f.endswith('.wav')]

In [7]:
all_labels_filenames = [os.path.splitext(wav_filename)[0] + '.txt' for wav_filename in all_wavs_filenames]

In [18]:
classes = [directory for directory in os.listdir(config["audioset_data_folder"])
                     if os.path.isdir(os.path.join(config["audioset_data_folder"], directory))] + ['Human Speech']
config["classes"] = classes
n_classes = len(classes)

In [9]:
def label_duration_in_segments(segment_start, segment_end, label_start_times, label_end_times):
    
    smaller_start_times = label_start_times[label_start_times <= segment_end]
    greater_end_times = label_end_times[label_end_times >= segment_start]
    if smaller_start_times.size == 0 or greater_end_times.size == 0:
        return 0.0
    in_seg_start_times = smaller_start_times[smaller_start_times >= segment_start]
    in_seg_end_times = greater_end_times[greater_end_times <= segment_end]
    if in_seg_end_times.size == 0:
        in_seg_end_times = np.array([segment_end])
    if in_seg_start_times.size == 0:
        in_seg_start_times = np.array([segment_start])
    
    duration = 0.0
    if in_seg_end_times[0] <= in_seg_start_times[0]:
        duration += in_seg_end_times[0] - segment_start
        in_seg_end_times = in_seg_end_times[1:]
    if in_seg_end_times.size == 0 or in_seg_start_times[-1] >= in_seg_end_times[-1]:
        duration += segment_end - in_seg_start_times[-1]
        in_seg_start_times = in_seg_start_times[:-1]
    duration += np.sum(in_seg_end_times - in_seg_start_times)
    
    return duration

In [10]:
def parse_label_file(filename, classes_list):
    # read label file content
    with open(filename, 'r') as file:
        lines = file.readlines()
    lines = [x.strip() for x in lines]
    # Parse the time stamps per classes if any
    timestamps = []
    for a_class in classes_list:
        # find idx of line with the class name
        class_line_idx = [idx for idx, line in enumerate(lines)
                          if line.find(a_class) > - 1
                          and line.find('.eaf') == -1
                          and line.find('AM') == -1
                          and line.find('PM') == -1]
        for line_idx_idx, line_idx in enumerate(class_line_idx):
            # time stamps should be the line after the class name
            segments_lines_idx = line_idx + 1

            if segments_lines_idx < len(lines):
                if line_idx_idx == 0:
                    timestamps.append(np.array(list(
                        map(float, filter(None,
                                          lines[segments_lines_idx].replace('TC', '')
                                          .replace('-', '\t')
                                          .replace('00:00:', '').split('\t'))))))
                else:
                    additional_line_timestamps = np.array(list(
                        map(float, filter(None,
                                          lines[segments_lines_idx].replace('TC', '')
                                          .replace('-', '\t')
                                          .replace('00:00:', '').split('\t')))))
                    timestamps[-1] = np.array(timestamps[-1].tolist() + additional_line_timestamps.tolist())
            else:
                timestamps.append(np.array([]))  # Sometimes the class name is in the label file but there are no timestamps
        # class name is not present in the label file
        if not class_line_idx:
            timestamps.append(np.array([]))  # Sometines there is no class name and no timestamps

    return timestamps

In [11]:
if not os.path.exists(config["output_folder"]):
    os.makedirs(config["output_folder"])
else:
    if os.listdir(config["output_folder"]):  # if folder is not empty
        raise ValueError('Output folder already exist !')

In [12]:
labels = collections.deque()
durations = collections.deque()
mel_spectrograms = collections.deque()
stft_magnitudes = collections.deque()
stft_phases = collections.deque()
segment_names = collections.deque()

mel_filterbank = librosa.filters.mel(config["sampling_rate"],
                                     n_fft=STFT_frame_n_samples,
                                     n_mels=config["n_Mel_filters"],
                                     fmin=config["Mel_min_freq"],
                                     fmax=config["Mel_max_freq"])

for idx, (audio_file, label_file) in enumerate(zip(all_wavs_filenames, all_labels_filenames)):
    try:
        audio, _ = librosa.core.load(audio_file, sr=config["sampling_rate"], mono=True)
        labels_segment = parse_label_file(label_file, classes)
        if len(labels_segment) != len(classes):
            raise ValueError('Length of labels_segment is ' + str(len(labels_segment)) + ' while there are only ' + str(len(classes)) + ' classes.')
    except Exception as e:
        print(e)
        print(audio_file)
        continue
        
    n_seg_in_audio = audio.shape[0] // segment_n_samples
    audio = audio[:n_seg_in_audio*segment_n_samples]
    segments = np.split(audio, n_seg_in_audio)
    
    for seg_idx, segment in enumerate(segments):
        try:
            durations.append(np.array([label_duration_in_segments(seg_idx*round_segments_length_s,
                                                                  (seg_idx+1)*round_segments_length_s,
                                                                  class_label_segments[::2], class_label_segments[1::2])
                                       for class_label_segments in labels_segment]))
        except:
            print(labels_segment)
        # The segment get a label for a class if at least 1 frame in the segment has the label
        seg_labels = (np.array(durations[-1]) >= config["STFT_frame_width_ms"] / 1000.0)
        labels.append(seg_labels)
        
        _, _, seg_stft = stft(segment,
                              window=config["STFT_window_function"],
                              nperseg=STFT_frame_n_samples,
                              noverlap=STFT_frame_n_samples - STFT_frame_n_samples_shift,
                              detrend=config["detrend"],
                              boundary=config["boundary"],
                              padded=config["padded"])
        
        stft_magnitudes.append(np.abs(seg_stft))
        stft_phases.append(seg_stft / (stft_magnitudes[-1] + 1e-15))
        
        mel_spectrograms.append(mel_filterbank @ stft_magnitudes[-1])
        
        name = os.path.join(config["output_folder"],
                            os.path.splitext(os.path.basename(audio_file))[0]
                            + '_seg{}'.format(seg_idx) + '.wav')
        segment_names.append(os.path.basename(name))
        librosa.output.write_wav(name, segment, sr=config["sampling_rate"], norm=False)

could not convert string to float: ' 0.390                0.610 '
/home/vincent/Audio_blind_source_separation/Datadir/cloud_factorized_audioset/Fire Alarm/xFqA0foiO7g_30000_40000.wav
[Errno 2] No such file or directory: '/home/vincent/Audio_blind_source_separation/Datadir/cloud_factorized_audioset/Car Honking/Aqow6daDNYc_10000_20000.txt'
/home/vincent/Audio_blind_source_separation/Datadir/cloud_factorized_audioset/Car Honking/Aqow6daDNYc_10000_20000.wav
[Errno 2] No such file or directory: '/home/vincent/Audio_blind_source_separation/Datadir/cloud_factorized_audioset/Car Honking/5BtgtrX7OuQ_70000_80000(1).txt'
/home/vincent/Audio_blind_source_separation/Datadir/cloud_factorized_audioset/Car Honking/5BtgtrX7OuQ_70000_80000(1).wav
could not convert string to float: ' 3.760  8.162 '
/home/vincent/Audio_blind_source_separation/Datadir/cloud_factorized_audioset/Glass Breaking/xfzunkprFeI_280000_290000.wav
could not convert string to float: ' 8.576 8.716 '
/home/vincent/Audio_blind_source_se

In [55]:
stft_magnitudes = np.array(stft_magnitudes).astype(np.float32)
stft_phases = np.array(stft_phases)
mel_spectrograms = np.array(mel_spectrograms).astype(np.float32)
labels = np.array(labels).astype(np.float32)
segment_names = np.array(segment_names)
durations = np.array(durations).astype(np.float32)

In [13]:
with h5py.File(os.path.join(config["output_folder"], 'all_data.hdf5'), 'w') as hdf5_file:
    hdf5_file.create_dataset('stft_magnitudes', data=stft_magnitudes)
    hdf5_file.create_dataset('stft_phases', data=stft_phases)
    hdf5_file.create_dataset('mel_spectrograms', data=mel_spectrograms)
    hdf5_file.create_dataset('labels', data=labels)
    hdf5_file.create_dataset('filenames', data=[filename.encode("ascii", "ignore") for filename in segment_names])
    hdf5_file.create_dataset('durations', data=durations)

In [20]:
with open(os.path.join(config["output_folder"], 'config.json'), 'w') as config_file:
    json.dump(config, config_file)

### train, dev, test split: 0.8, 0.1, 0.1

In [56]:
tr_sss = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [57]:
train_index, dev_test_index = next(tr_sss.split(np.zeros(stft_magnitudes.shape[0]), labels))

In [58]:
dev_test_sss = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

In [59]:
dev_index, test_index = next(dev_test_sss.split(np.zeros(stft_magnitudes[dev_test_index].shape[0]), labels[dev_test_index]))

In [60]:
with h5py.File(os.path.join(config["output_folder"], 'train_data.hdf5'), 'w') as hdf5_file:
    hdf5_file.create_dataset('stft_magnitudes', data=stft_magnitudes[train_index])
    hdf5_file.create_dataset('stft_phases', data=stft_phases[train_index])
    hdf5_file.create_dataset('mel_spectrograms', data=mel_spectrograms[train_index])
    hdf5_file.create_dataset('labels', data=labels[train_index])
    hdf5_file.create_dataset('filenames', data=[filename.encode("ascii", "ignore") for filename in segment_names[train_index]])
    hdf5_file.create_dataset('durations', data=durations[train_index])

In [61]:
with h5py.File(os.path.join(config["output_folder"], 'dev_data.hdf5'), 'w') as hdf5_file:
    hdf5_file.create_dataset('stft_magnitudes', data=stft_magnitudes[dev_test_index][dev_index])
    hdf5_file.create_dataset('stft_phases', data=stft_phases[dev_test_index][dev_index])
    hdf5_file.create_dataset('mel_spectrograms', data=mel_spectrograms[dev_test_index][dev_index])
    hdf5_file.create_dataset('labels', data=labels[dev_test_index][dev_index])
    hdf5_file.create_dataset('filenames', data=[filename.encode("ascii", "ignore") for filename in segment_names[dev_test_index][dev_index]])
    hdf5_file.create_dataset('durations', data=durations[dev_test_index][dev_index])

In [62]:
with h5py.File(os.path.join(config["output_folder"], 'test_data.hdf5'), 'w') as hdf5_file:
    hdf5_file.create_dataset('stft_magnitudes', data=stft_magnitudes[dev_test_index][test_index])
    hdf5_file.create_dataset('stft_phases', data=stft_phases[dev_test_index][test_index])
    hdf5_file.create_dataset('mel_spectrograms', data=mel_spectrograms[dev_test_index][test_index])
    hdf5_file.create_dataset('labels', data=labels[dev_test_index][test_index])
    hdf5_file.create_dataset('filenames', data=[filename.encode("ascii", "ignore") for filename in segment_names[dev_test_index][test_index]])
    hdf5_file.create_dataset('durations', data=durations[dev_test_index][test_index])