In [27]:
from pydub import AudioSegment
import librosa
import os
import re

# Directory holding the individual source MP3 clips; passed as `root_dir` to the helpers in this notebook.
DATA_ROOT_DIR = './data_full/michigan/tone_perfect_all_mp3/tone_perfect/'
# Directory that receives the generated audio and label files; passed as `export_dir`.
EXPORT_DIR = './data_synthesized/'

def remove_extension(file_path):
    """Return `file_path` without its last extension.

    The split is delegated to `os.path.splitext`, so only the final suffix is
    dropped ('clip.tar.gz' -> 'clip.tar') and names without a suffix are
    returned unchanged.
    """
    return os.path.splitext(file_path)[0]

def merge_audio_and_label(root_dir, export_dir, path1, path2):
    """returns label and path to merged audio
    label of type dictionary
    key: path to merged audio
    value: list of tuples. tuple(tone, start time in seconds, end time in seconds)
    """
    if not os.path.exists(export_dir):
        os.makedirs(export_dir)

    label = {}

    audio1 = AudioSegment.from_mp3(root_dir + path1)
    audio2 = AudioSegment.from_mp3(root_dir + path2)
    merged_audio = audio1 + audio2
    new_path = remove_extension(path1) + '_' + remove_extension(path2) + '.mp3'
    merged_audio.export(export_dir + new_path, format='mp3')

    tone1 = str(re.search(r'\d', path1).group())
    tone2 = str(re.search(r'\d', path2).group())
    label[new_path] =[]
    label[new_path].append((tone1, 0, audio1.duration_seconds))
    label[new_path].append((tone2, audio1.duration_seconds, audio1.duration_seconds + audio2.duration_seconds))

    return label

#    duration1 = librosa.get_duration(filename=path1)
#    duration2 = librosa.get_duration(filename=path2)

In [28]:
# Smoke test: merge two sample clips from DATA_ROOT_DIR and display the returned label dict.
path1 = "a1_FV1_MP3.mp3"
path2 = "a4_FV2_MP3.mp3"

merge_audio_and_label(DATA_ROOT_DIR, EXPORT_DIR, path1, path2)

{'a1_FV1_MP3_a4_FV2_MP3.mp3': [('1', 0, 0.6802494331065759),
  ('4', 0.6802494331065759, 1.177641723356009)]}

In [29]:
import math
import random
import json

def prepare_audio_files(root_dir):
    """Decode every MP3 clip found directly inside `root_dir`.

    Entries that do not end in '.mp3' (for example '.DS_Store' or label files)
    are ignored instead of being handed to the MP3 decoder.

    Args:
        root_dir: directory containing the source clips.

    Returns:
        A list of tuples `(name, audio, tone, duration_seconds)`, ordered by
        file name, where `name` is the file name without its extension,
        `audio` is the decoded `AudioSegment`, and `tone` is the first digit
        in the file name (as a string).

    Raises:
        ValueError: if an MP3 file name contains no digit to use as the tone.
    """
    audio_files = []
    # os.listdir returns entries in arbitrary order; sorting keeps the list
    # stable across runs and machines so seeded sampling over it is repeatable.
    for file_name in sorted(os.listdir(root_dir)):
        if not file_name.lower().endswith('.mp3'):
            continue
        tone_match = re.search(r'\d', file_name)
        if tone_match is None:
            raise ValueError(f'no tone digit found in file name: {file_name!r}')
        audio = AudioSegment.from_mp3(os.path.join(root_dir, file_name))
        audio_files.append(
            (remove_extension(file_name), audio, tone_match.group(), audio.duration_seconds)
        )
    return audio_files

def sythesize_data(export_dir, audio_files, duration=10, min_num_clips=5, max_num_clips=10, num_total=100, train_split=0.8, seed=None):
    """Synthesize labelled multi-clip audio files, split into train and test sets.

    Each output file is a random selection of source clips concatenated with
    random stretches of silence between them. The first clip always starts at
    time 0 and no silence is appended after the last clip, so an output file
    is at most `duration` seconds long but may be shorter.

    Written to `<export_dir>/train/` and `<export_dir>/test/`:
        * `<mode>_<n>.mp3`: the synthesized audio (n starts at 1).
        * `<mode>_labels.json`: maps each file name to a list of
          `[tone, start_seconds, end_seconds]` entries, one per clip.
        * `<mode>_merged_filenames.json`: maps each file name to the list of
          source clip names, in playback order.

    Args:
        export_dir: parent directory for the `train` and `test` folders.
        audio_files: sequence of `(name, audio, tone, duration_seconds)` tuples.
        duration: upper bound, in seconds, on the length of each output file.
        min_num_clips: minimum number of clips per output file.
        max_num_clips: maximum number of clips per output file (capped at
            `len(audio_files)`).
        num_total: total number of files to generate across both splits.
        train_split: fraction of `num_total` assigned to the train set
            (rounded up).
        seed: seed for the `random` module; None leaves the run unseeded.

    Raises:
        AssertionError: if the split leaves no file for the test set.
        ValueError: if the clip-count bounds cannot be satisfied, or if even
            the shortest `min_num_clips` clips do not fit within `duration`.
    """
    INTERVAL_FACTOR = 10

    random.seed(a=seed)
    num_total_train = math.ceil(num_total * train_split)
    num_total_test = num_total - num_total_train

    assert num_total_train < num_total
    assert num_total_test > 0

    # random.sample cannot draw more clips than exist, so cap the upper bound
    # instead of failing only on the draws that happen to exceed it.
    clip_cap = min(max_num_clips, len(audio_files))
    if min_num_clips < 1 or min_num_clips > clip_cap:
        raise ValueError(
            f'cannot draw between {min_num_clips} and {max_num_clips} clips '
            f'from {len(audio_files)} audio files'
        )

    # If even the shortest admissible selection is too long, the rejection
    # sampling below could never succeed and would loop forever.
    shortest_total = sum(sorted(clip[3] for clip in audio_files)[:min_num_clips])
    if shortest_total >= duration:
        raise ValueError(
            f'the {min_num_clips} shortest clips already last {shortest_total} s, '
            f'which does not fit in duration={duration} s'
        )

    # The trailing '' keeps a separator at the end of each directory path.
    train_dir = os.path.join(export_dir, 'train', '')
    test_dir = os.path.join(export_dir, 'test', '')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    def _pick_clips():
        """Draw random clip selections until one is shorter than `duration`."""
        while True:
            num_to_sample = random.randint(min_num_clips, clip_cap)
            picked = random.sample(population=audio_files, k=num_to_sample)
            clip_seconds = sum(clip[3] for clip in picked)
            if clip_seconds < duration:
                return picked, clip_seconds

    def _build_one(file_name, out_dir):
        """Assemble and export one file; return (label tuples, source names)."""
        picked, clip_seconds = _pick_clips()
        spare_seconds = duration - clip_seconds
        gap_count = len(picked) - 1

        merged_audio = AudioSegment.empty()
        segments = []
        source_names = []
        onset_time = 0
        start_idx = 0
        for j, clip in enumerate(picked):
            name, audio, tone, clip_len = clip[0], clip[1], clip[2], clip[3]
            segments.append((tone, onset_time, onset_time + clip_len))
            source_names.append(name)
            merged_audio += audio
            if j == gap_count:
                break  # no silence after the final clip

            # random_idx never decreases and is bounded by
            # gap_count * INTERVAL_FACTOR, so the gaps' shares of the spare
            # time add up to at most 1.
            random_idx = random.randint(start_idx, (j + 1) * INTERVAL_FACTOR)
            share = (random_idx - start_idx) / (gap_count * INTERVAL_FACTOR)
            # AudioSegment.silent expects milliseconds. Truncating to whole
            # milliseconds keeps the total within `duration`, and the label
            # clock advances by exactly the silence that was inserted.
            padding_ms = int(share * spare_seconds * 1000)
            merged_audio += AudioSegment.silent(duration=padding_ms)

            start_idx = random_idx
            onset_time += clip_len + padding_ms / 1000

        merged_audio.export(os.path.join(out_dir, file_name), format='mp3')
        return segments, source_names

    def _build_split(mode, count, out_dir):
        """Generate `count` files for one split and write its two JSON indexes."""
        labels = {}
        merged_filenames = {}
        for i in range(count):
            file_name = f'{mode}_{i + 1}.mp3'
            labels[file_name], merged_filenames[file_name] = _build_one(file_name, out_dir)

        for suffix, payload in (('labels', labels), ('merged_filenames', merged_filenames)):
            with open(os.path.join(out_dir, f'{mode}_{suffix}.json'), 'w', encoding='utf8') as f:
                json.dump(payload, f, indent=4, sort_keys=False, ensure_ascii=False)
        print(f'Done synthesizing {count} audio files in {out_dir}')

    _build_split('train', num_total_train, train_dir)
    _build_split('test', num_total_test, test_dir)

In [25]:
# Decode every source clip once up front; the resulting list is passed to sythesize_data.
# NOTE(review): this cell's execution count (25) is lower than that of the cell defining
# prepare_audio_files (29) -- re-run the notebook top to bottom to confirm it works in order.
audio_files = prepare_audio_files(DATA_ROOT_DIR)

In [30]:
# Generate the dataset from the preloaded clips using the default 0.8 train split.
# NOTE(review): the output saved below reports 8 train / 2 test files, which matches
# num_total=10 rather than num_total=1000 -- the recorded output looks stale; re-run to refresh.
sythesize_data(EXPORT_DIR, audio_files, num_total=1000)

Done synthesizing 8 audio files in ./data_synthesized/train/
Done synthesizing 2 audio files in ./data_synthesized/test/
