## Data input

In [1]:
import torch
import torchaudio
import matplotlib.pyplot as plt
from scipy.fft import fft, ifft
from scipy.signal import correlate
import numpy as np
from tqdm import tqdm
import math

In [13]:
path = "/project/data_asr/CHiME5/data/CHiME5/audio/dev/"

# File naming: U = mic array with 4 channels, P = person (2 channels in wav).
# Load CH1 of array U01 and CH1 of array U02; each load yields (waveform, sample_rate).
(data_1, sample_rate_1), (data_2, sample_rate_2) = (
    torchaudio.load(path+wav_name)
    for wav_name in ("S02_U01.CH1.wav", "S02_U02.CH1.wav")
)
print(data_1.size(),data_2.size(), sample_rate_1, sample_rate_2)

torch.Size([1, 142464640]) torch.Size([1, 142464640]) 16000 16000


In [14]:
# Keep only the first 10% of the samples (dim 1) of each recording.
# NOTE(review): this cell rebinds data_1/data_2, so re-running it narrows the
# already-narrowed tensors again (10% of 10%); reload the data before re-running.
keep_1 = int(data_1.size(dim=1)*0.1)
keep_2 = int(data_2.size(dim=1)*0.1)
data_1 = data_1.narrow(1, 0, keep_1)
data_2 = data_2.narrow(1, 0, keep_2)
inputs = [data_1, data_2]
print(data_1.shape[1])

14246464


In [19]:
# Show the shapes of the two tensors that will be fed to the beamformer.
print(*(inputs[k].size() for k in (0, 1)))

torch.Size([1, 1424646]) torch.Size([1, 1424646])


## Beamforming

In [16]:
def same_size_tensors(tensors: list):
    """Zero-pad every tensor along dim 1 so that all share the longest length.

    Each tensor is expected to be 2-D; dim 1 is the axis that gets padded.
    Tensors that already have the maximum length are left untouched, so the
    content of every channel is preserved.

    Args:
        tensors: list of 2-D tensors. The list is modified in place: entries
            that need padding are replaced by new, padded tensors.

    Returns:
        The same list object, with all entries having equal size along dim 1.
    """
    if not tensors:
        return tensors
    target_len = max(tensor.size(dim=1) for tensor in tensors)
    for idx, tensor in enumerate(tensors):
        missing = target_len - tensor.size(dim=1)
        if missing > 0:
            # Pad with zeros of the tensor's own row count, dtype and device so
            # the concatenation neither fails nor changes the data type.
            padding = torch.zeros(
                [tensor.size(dim=0), missing], dtype=tensor.dtype, device=tensor.device
            )
            tensors[idx] = torch.concat((tensor, padding), 1)
    return tensors

def calculate_delays_and_weights(inputs):
    """Return the estimated per-channel delays together with fixed equal weights.

    The delays come from calc_best_delay(inputs); the weights are always
    [0.5, 0.5], i.e. two equally weighted channels.
    """
    equal_weights = [0.5, 0.5]
    return calc_best_delay(inputs), equal_weights

In [18]:
# Better: Cross-Correlation -> Choose Delay that maximizes this!
def calculate_GCCPHAT(signal_1: torch.Tensor,signal_2: torch.Tensor):
    """Generalized cross-correlation with phase transform (GCC-PHAT).

    The cross-power spectrum X1 * conj(X2) is divided by its own magnitude so
    that only phase information remains, then transformed back with an inverse
    FFT. For two signals that are circular shifts of each other the result is
    a unit peak at the shift of signal_1 relative to signal_2.

    Args:
        signal_1: tensor x_i; converted with .detach().numpy().
        signal_2: reference tensor x_ref of the same shape.

    Returns:
        Complex numpy array with the same shape as the inputs (FFT and inverse
        FFT are taken along the last axis).
    """
    # torch Tensor to numpy array
    signal_1 = signal_1.detach().numpy()
    signal_2 = signal_2.detach().numpy()
    cross_spectrum = fft(signal_1)*np.conj(fft(signal_2))
    # PHAT weighting: normalise by |X1 * conj(X2)|. Passing the second spectrum
    # as a second positional argument to np.absolute would use it as the `out`
    # buffer and normalise by |X1| only.
    magnitude = np.absolute(cross_spectrum)
    # Bins with zero magnitude carry no phase information; leave them at zero
    # instead of producing NaN from 0/0.
    whitened = np.divide(
        cross_spectrum, magnitude, out=np.zeros_like(cross_spectrum), where=magnitude > 0
    )
    return ifft(whitened)

def four_maxsum(array):
    """Sum the four largest positive values of the first row of `array`.

    Only array[0] is inspected. Complex values are reduced to their real part
    before comparison. Values that are not greater than zero (including NaN)
    contribute nothing, so the result is 0 when no positive value exists and
    fewer than four values are summed when fewer than four are positive.

    Args:
        array: 2-D array-like; row 0 holds the values to scan.

    Returns:
        The sum of up to four largest positive values (0 if there are none).
    """
    values = np.real(np.asarray(array[0])).ravel()
    positive = values[values > 0]
    if positive.size == 0:
        return 0
    # Sorting keeps every candidate, so a new maximum never discards the
    # previous one (the defect of tracking four running maxima without shifting).
    return np.sort(positive)[-4:].sum()

def calc_best_delay(inputs, delay_range=100, window=1000):
    """Estimate the delay of the second channel relative to the first.

    The first `window` samples of inputs[0] are compared, via GCC-PHAT, with a
    window of equal length taken from inputs[1] at every offset in
    range(delay_range). The offset whose GCC-PHAT yields the largest
    four_maxsum score is chosen. Only non-negative delays are searched.

    Args:
        inputs: list whose first two entries are 2-D tensors; dim 1 is sliced.
        delay_range: number of candidate offsets to test (0 .. delay_range-1).
        window: number of samples in each comparison window.

    Returns:
        [0, best_delay]: the delay for channel 0 is fixed at 0; best_delay is
        0 when no offset scores above 0.
    """
    reference = inputs[0][:, :window]
    best_delay = 0
    maxgcc = 0
    for i in range(0, delay_range):
        candidate = inputs[1][:, i:i+window]
        # Near the end of a short signal the shifted window is truncated;
        # windows of different length cannot be compared, so stop searching.
        if candidate.size(dim=1) != reference.size(dim=1):
            break
        gcc = calculate_GCCPHAT(reference, candidate)
        # score = sum of the top 4 maxima of the GCC-PHAT
        maxsum = four_maxsum(gcc)
        if maxsum > maxgcc:
            maxgcc = maxsum
            best_delay = i
    return [0, best_delay]

# Display the delay estimate for the two loaded channels as the cell output.
calc_best_delay(inputs)

[0, 24]

In [20]:
# Weights like DAS Beamformer in Anguera et al. (2007)
def calculate_init_weights(inputs: list):
    """Build the initial weight history: one row of uniform weights.

    Returns a list containing a single list with one entry per element of
    `inputs`, each equal to 1/len(inputs). For an empty `inputs` the result
    is [[]].
    """
    channels = len(inputs)
    # The division is only evaluated per element, so an empty list is safe.
    return [[1/channels for _ in inputs]]

def calculate_weight_update(inputs, w):
    """Append the next pair of channel weights to the weight history `w`.

    The new first weight blends the previous first weight with the value
    returned by xcorr(inputs, len(w)), using an adaptation ratio of 0.05; the
    second weight is its complement so the pair sums to 1. The update is
    written for exactly two channels.

    Args:
        inputs: channel tensors, forwarded to xcorr.
        w: list of weight rows; modified in place by appending one row.

    Returns:
        The same list `w`, now one row longer.
    """
    adapt_ratio = 0.05
    step = len(w)
    corr = xcorr(inputs, step)
    previous_first = w[step-1][0]
    first = (1-adapt_ratio)*previous_first+adapt_ratio*corr
    w.append([first, 1-first])
    return w

def xcorr(inputs, t, block_size=16000):
    """Scaled sum of the cross-correlation of block `t` of the first two channels.

    Block t covers samples (t-1)*block_size up to t*block_size along dim 1.
    The full cross-correlation of the two blocks is summed and scaled by
    1 / (block_size * len(inputs)). The value is not normalised by signal
    energy, so it is not confined to any fixed range.

    Args:
        inputs: list of 2-D tensors; only inputs[0] and inputs[1] are read.
        t: 1-based block index.
        block_size: samples per block (default 16000, i.e. 1 second blocks
            for 16 kHz audio).

    Returns:
        The scaled correlation sum as a Python float, or the neutral fallback
        0.5 when block t does not lie completely inside both channels.
    """
    start = (t-1)*block_size
    stop = start + block_size
    # Explicit range check instead of a catch-all `except`: a blanket handler
    # would also hide programming errors such as an undefined name and turn
    # every call into the 0.5 fallback.
    if start < 0 or stop > inputs[0].size(dim=1) or stop > inputs[1].size(dim=1):
        return 0.5
    ch1 = torch.narrow(inputs[0], 1, start, block_size).detach().numpy()
    ch2 = torch.narrow(inputs[1], 1, start, block_size).detach().numpy()
    num_channels = len(inputs)
    return float((1/(block_size*(num_channels))) * np.sum(correlate(ch1,ch2)))

In [21]:
# Generate new sound waveform

def DAS_bf(inputs: list, delays: list, weights_updated: bool):
    """Delay-and-sum beamformer over a list of channel tensors.

    For every sample index t the output is the weighted sum of
    inputs[i][:, t + delays[i]] over all channels i. Channels whose delayed
    index falls outside the signal contribute nothing for that sample.
    Weights start uniform (calculate_init_weights); when `weights_updated` is
    true they are refreshed through calculate_weight_update after every
    sample whose index is a multiple of 16000.

    Args:
        inputs: list of 2-D tensors, one per channel. The caller's list is
            not modified; padding to a common length happens on a copy.
        delays: per-channel sample offsets, at least one per input.
        weights_updated: enable the periodic weight update.

    Returns:
        Tensor of shape [1, N] holding the beamformed signal, where N is the
        padded length times the number of rows of inputs[0].

    Raises:
        ValueError: if fewer delays than inputs are given.
    """
    # same_size_tensors replaces list entries in place; hand it a shallow copy
    # so the caller's list keeps its original tensors.
    inputs = same_size_tensors(list(inputs))
    w = calculate_init_weights(inputs)

    num_channels = len(inputs)
    if len(delays) < num_channels:
        raise ValueError(
            f"expected at least {num_channels} delays, got {len(delays)}"
        )
    rows = inputs[0].size(dim=0)
    max_time = inputs[0].size()[1]
    lengths = [channel.size(dim=1) for channel in inputs]

    # Preallocate the result instead of concatenating one sample at a time,
    # which copies the whole output on every step (quadratic in max_time).
    out_dtype = torch.promote_types(torch.float32, inputs[0].dtype)
    output = torch.zeros(max_time, rows, dtype=out_dtype)

    for t in tqdm(range(0,max_time)):
        weights = w[-1]
        for i in range(0,num_channels):
            idx = t + delays[i]
            # Skip samples outside the signal explicitly; a negative index
            # would otherwise wrap around to the end of the recording.
            if 0 <= idx < lengths[i]:
                output[t] += torch.multiply(weights[i], inputs[i][:, idx])
        if weights_updated and t % 16000 == 0:
            w = calculate_weight_update(inputs,w)
    return output.flatten()[None,:]

In [22]:
# Estimate the per-channel delays, then beamform with the periodic weight update enabled.
delays = calc_best_delay(inputs)
output = DAS_bf(inputs=inputs, delays=delays, weights_updated=True)

100%|██████████| 1424646/1424646 [01:56<00:00, 12207.84it/s]


In [23]:
# Write the beamformed signal and both narrowed input channels to disk;
# all three files are saved with sample_rate_1.
for wav_name, waveform in (
    ("output.wav", output),
    ("data1.wav", data_1),
    ("data2.wav", data_2),
):
    torchaudio.save(wav_name, waveform, sample_rate_1)