In [1]:
import numpy as np
import librosa
from IPython.display import Audio
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from scipy.signal import fftconvolve
import rir_generator as rir

In [2]:

def gcc_phat(sig, refsig, fs=1, interp=16):
    '''
    This function computes the offset between the signal sig and the reference signal refsig
    using the Generalized Cross Correlation - Phase Transform (GCC-PHAT) method.
    '''
    # make sure the length for the FFT is larger or equal than len(sig) + len(refsig)
    n = sig.shape[0] + refsig.shape[0]

    # Generalized Cross Correlation Phase Transform
    SIG = np.fft.rfft(sig, n=n)
    REFSIG = np.fft.rfft(refsig, n=n)
    R = SIG * np.conj(REFSIG)

    cc = np.fft.irfft(R / (np.abs(R) + 1e-10), n=interp * n)  
    

    max_shift = int(interp * n / 2)

    cc = np.concatenate((cc[-max_shift:], cc[:max_shift+1]))
    # # find max cross correlation index
    
    # shift = np.argmax(np.abs(cc)) - max_shift

    # tau = shift / float(interp * fs)
    return  cc


In [3]:
import itertools
def srp_phat(signals, mic_positions, grid, fs, interp = 16):
    """Perform SRP-PHAT localization."""
    num_mics = len(mic_positions)
    srp_map = np.zeros(grid.shape[0], dtype=np.float32)

    sig_len = signals.shape[1]

    delay_axs = np.linspace(-sig_len/ fs, sig_len/ fs, interp*sig_len * 2 +1).astype(np.float32) # mapping between index and delay in seconds

    n_channels = signals.shape[0]
    # taus = np.zeros((n_channels, n_channels))
    gccs = {}
    for (i, j) in itertools.combinations(range(num_mics), 2):
        if i == j:
            continue
        cc = gcc_phat(signals[i], signals[j], fs=fs, interp=interp)
        # print(i,j,tau)
        gccs[(i,j)] = cc


    for idx,candidate in enumerate(grid):
        power = 0
        for (i, j) in itertools.combinations(range(num_mics), 2):
            if i == j:
                continue
            dist_i = np.linalg.norm(candidate - mic_positions[i])
            dist_j = np.linalg.norm(candidate - mic_positions[j])
            tau = (dist_i - dist_j) / 343.0  # Speed of sound (m/s)
            
            gcc = gccs[(i,j)]
            
            closest_idx = np.argmin(np.abs(delay_axs - tau))
            power += gcc[closest_idx]

        srp_map[idx] = power

    return srp_map

def MUSIC(signals, mic_positions, grid, fs):
    dists_from_mics = np.linalg.norm(grid[:, None] - mic_positions, axis=2)
    tau = dists_from_mics / 343.0

    nfft = 512
    hoplength = 256
    stft_signals = librosa.stft(signals, n_fft=nfft, hop_length=hoplength)


    stft_sig = stft_signals
    H = stft_sig.transpose(1,0,2)  @ stft_sig.transpose(1,2,0).conj() / stft_sig.shape[2]
    val,vec = np.linalg.eigh(H)
    noise_space = vec[:,:,1:]
    omega_f =  fs * np.arange(nfft // 2 + 1) / nfft
    steering_vec = np.exp(-1j * 2 * np.pi * omega_f * tau[:, :, None])
    Pmu = []
    for f in range(nfft // 2 + 1):
        Pmu_f = (np.abs(steering_vec[:,:,f].conj() @ noise_space[f])**2).sum(-1)
        Pmu.append(Pmu_f)
    Pmu = np.array(Pmu)
    spec = Pmu.mean(0)
    return spec


In [4]:
d = 0.2
fs = 16000

mic_pos = np.array([
          [2.6 - d / 2, 3, 1.5], [2.6 + d / 2, 3, 1.5], [2.6, 3-d/2, 1.5], [2.6 , 3+d/2, 1.5]

      ])
room_dim = [5.2,6.2,3.5]


In [5]:



def rnd_speaker_pos():
  x = np.random.uniform(low=1, high=4, size=(1,))[0]
  y = np.random.uniform(low=1, high=5, size=(1,))[0]
  speaker_pos =  [x, y, 1.5]
  return speaker_pos

def gen_rir(t60,pos):

  h = rir.generate(
      c=340,                  # Sound velocity (m/s)
      fs=fs,                  # Sample frequency (samples/s)
      r=mic_pos,

      s=pos,          # Source position [x y z] (m)
      L=room_dim,            # Room dimensions [x y z] (m)
      reverberation_time=t60, # Reverberation time (s)
      nsample=int(t60 * fs),           # Number of output samples
  )
      # [1.617, 2.45, 1.7]
  return h

def add_white_noise(signal, snr_db=10):
    if len(signal.shape) == 1 :
      signal = signal.reshape(1,-1)
    signal_power = np.mean(signal**2,axis=1, keepdims=True)
    noise_power = signal_power / (10**(snr_db / 10))
    noise = np.sqrt(noise_power) * np.random.normal(0, 1, signal.shape) # spectral and spatial white noise - uncorollated sensors
    noisy_signal = signal + noise
    return noisy_signal, noise


def convolve_rir(signal, rirs):
     rirs = rirs.T
     return np.stack([np.convolve(signal, rir, mode='full') for rir in rirs])


In [6]:
def plot_map(srp_map, grid, mic_pos, room_dim, best_pos, speaker_pos, title):

    fig = px.imshow(srp_map, x=np.linspace(0, room_dim[0], 20), y=np.linspace(0, room_dim[1], 20))
    #  add the microphones
    fig.add_trace(go.Scatter(x=mic_pos[:, 0], y=mic_pos[:, 1], mode='markers', marker=dict(size=10, color='red')))
    # add the estimated source position
    fig.add_trace(go.Scatter(x=[grid[best_pos][0]], y=[grid[best_pos][1]], mode='markers', marker=dict(size=10, color='blue')))
    fig.add_trace(go.Scatter(x=[speaker_pos[0]], y=[speaker_pos[1]], mode='markers', marker=dict(size=10, color='green')))
    fig.update_layout(title=title,
                    xaxis_title='X (m)',
                    yaxis_title='Y (m)')
    # add names to traces
    fig.data[1].name = 'Microphones'
    fig.data[2].name = 'Estimated Source Position'
    fig.data[3].name = 'True Source Position'
    # place colorbar to the left
    fig.update_layout(coloraxis_colorbar=dict(
        title='Likehood',
        tickvals=[srp_map.min(), srp_map.max()],
        ticktext=['Low', 'High'],
        orientation = 'h',
        
    ))
    
    fig.show()

In [7]:
wav, sr = librosa.load('12.wav', sr=None, mono=False)
speaker_pos = rnd_speaker_pos()
imp03_main = gen_rir(0.3,speaker_pos)

main_speaker_room = convolve_rir(wav, imp03_main)

main_speaker_room_noisy_white, white_noise = add_white_noise(main_speaker_room, snr_db=15)

grid = np.array([[x, y, 1.5] for x in np.linspace(0, room_dim[0], 20) for y in np.linspace(0, room_dim[1], 20)])

srp = srp_phat(main_speaker_room_noisy_white, mic_pos, grid, sr, interp=1)
best_pos = srp.argmax()

In [8]:
srp = srp_phat(main_speaker_room_noisy_white, mic_pos, grid, sr, interp=1)
best_pos = srp.argmax()
print(f"Estimated Source Position: {grid[best_pos]}")
print(f"True Source Position: {speaker_pos}")
srp_map = srp
srp_map = srp_map.reshape(20, 20)

plot_map(srp_map.T, grid, mic_pos, room_dim, best_pos, speaker_pos, title='SRP-PHAT Map')


Estimated Source Position: [3.55789474 2.28421053 1.5       ]
True Source Position: [3.08116711940001, 2.6293023658996235, 1.5]


In [9]:

spec = MUSIC(main_speaker_room_noisy_white, mic_pos, grid, sr)

spec = spec / spec.max()

music_map = spec.reshape(20, 20)

best_pos = spec.argmax()

print(f"Estimated Source Position: {grid[best_pos]}")
print(f"True Source Position: {speaker_pos}")

plot_map(music_map.T, grid, mic_pos, room_dim, best_pos, speaker_pos, title='MUSIC Map')



Estimated Source Position: [4.37894737 1.63157895 1.5       ]
True Source Position: [3.08116711940001, 2.6293023658996235, 1.5]
