# Generate training set

Real-time Deep Neural Networks for Microphone Array Direction of Arrival Estimation

This research uses the CMU_ARCTIC database, one of the CMU_ARCTIC speech synthesis databases established by the Language Technologies Institute at Carnegie Mellon University, USA. The database is mainly used in speech synthesis research. The corpus consists of about 1150 sentences selected from out-of-copyright texts from Project Gutenberg. Recordings of two male and two female speakers with American English accents were collected. The recording format is 16 bits, the sampling rate is 32 kHz, and each sentence is about 3 seconds long. The database contains a total of 4528 audio files.

## Imports

In [13]:
import math
import sys
import os
from collections import defaultdict
from itertools import combinations

import numpy as np
from scipy.io import wavfile
from scipy import signal
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from matplotlib import rc
from pandas.plotting import register_matplotlib_converters

import pyroomacoustics as pra
from pyroomacoustics.utilities import normalize
from pyroomacoustics.transform import stft

## Constants

In [None]:
# Label resolution of angles
RESOLUTION = 15

# Number of samples to include while creating one ML feature
SAMPLES = 2048

# Determines the overlap of samples between consecutive features
STEP = 1024

# Training rooms dimensions
ROOMS = {
    'small' : np.array([4, 4, 3]),
    'medium' : np.array([6, 6, 3]),
    'large' : np.array([8, 8, 3])
}

# Testing rooms dimensions
TEST_ROOMS = {
    'small' : np.array([5, 5, 2]),
    'medium' : np.array([7, 7, 2]),
    'large' : np.array([9, 9, 2])
}

AUDIO_PATH = 'C:\\Users\\Alex\\source\\repos\\sound-localization\\sounds\\training'

# Number of microphones
MICS_NUMBER = 2

## Classes to store signals

In [14]:
class RIR():
    def __init__(self, room, source, angle, distance, mic_nr, src_nr):
        self.name = source + '-' + str(distance) + '-' + str(angle * 180. * np.pi)
        self.room = room
        self.source = source
        self.angle = angle
        self.distance = distance

        # Check validity of the room
        self.room.simulate()
        
        # Include simulated RIR to dataset
        self.rir = room.rir[mic_nr][src_nr]
        

    def getSignals(self):
        # Extract simulated signal for each microphone
        data = self.room.mic_array.signals.T
        data = np.array(normalize(data, bits=16), dtype=np.int16)
        wav_signals[dist].extend(data)

In [11]:
# Dataset

class SimulatedRIR():
    def __init__(self, dataset):
        self.dataset = dict()

    def addRIR(self, RIR):
        self.dataset[RIR.name] = RIR

    def removeRIR(self, RIR):
        self.dataset.pop(RIR.name)

    def data(self):
        return self.dataset

## Definition for rooms, microphones and sources

In [1]:
def create_simulation_room(room_dim=[4, 4, 3], mic_pos=[2, 2, 1.5], room_fs=16000):    
    # Initialize room
    room = pra.ShoeBox(room_dim, fs=room_fs)
    w = room_dim[0]
    l = room_dim[1]
    h = room_dim[2]

    # Generate the microphones
    mic_loc = np.c_[[w/2+0.2, l/2, h/2],[w/2-0.2, l/2, h/2]] 
    room.add_microphone_array(mic_loc)
    
    return room


def create_sound_sources(room_dim=[4, 4, 3], resolution=15):        
    # Specify angle in distance ranges
    angle_range = range(0, 361, resolution)
    dist_range = [1., 2.]
    height_range = [room_dim[-1]/2]
    sources = defaultdict(list)

    for angle in angle_range:
        for R in dist_range:
            for h in height_range:
                source = [R * math.cos(math.radians(angle)) + room_dim[0] / 2, 
                            R * math.sin(math.radians(angle)) + room_dim[0] / 2, h]
                sources[angle].append((R, h, source))
    return sources

In [None]:
rooms_dim = [[5.,5.]]
sound_files = ["C:\\Users\\Alex\\source\\repos\\Data\\Sound\\arctic_a0010.wav"] # size 100
azimuth = np.arange(0, 181, 15, dtype=float) / 180. * np.pi # 15 degrees distance
distances = np.array([1., 2.]) # source distance of 1 and 2 meters

dataset = SimulatedRIR(dataset=None)

for room_dim in rooms_dim:   
    w = room_dim[0]
    l = room_dim[1]
    h = 2.5

    # add sources
    for sound_file in sound_files:
        fs, signal = wavfile.read(sound_file)

        for angle in azimuth:
            for distance in distances:
                # create room
                room = pra.ShoeBox(room_dim, fs=fs)
                room.extrude(h)

                # add microphones
                mic_loc = np.c_[[(w/2) + 0.3, 1., 1.],[(w/2) - 0.3, 1., 1.]]
                room.add_microphone_array(mic_loc)

                source_location = [(w/2) + distance * np.cos(angle), 1. + distance * np.sin(angle), 1.]
                room.add_source(source_location, signal=signal)

                rir = RIR(room=room, source=os.path.basename(sound_file), angle=angle, distance=distance)
                dataset.addRIR(rir)
