# Basic Workflow Test

In [None]:
import librosa
import numpy as np

In [None]:
# Analysis parameters. They are declared before the audio is loaded so the
# sampling rate is written once and reused everywhere below.
sample_rate = 8000
frame_size = 200
frame_shift = 80
sub_sampling = 10
chunk_size_frame = 500

wav_path = 'mix_0000001.wav'
# librosa.load returns an (audio, sampling_rate) pair; only the samples are
# kept. Unpacking directly avoids guessing "is this a tuple?" from its length.
wav_data, _loaded_rate = librosa.load(path=wav_path, sr=sample_rate)

print("wav_data_length   = {} points".format(len(wav_data)))
print("wav_data_second   = {:.2f} s".format(len(wav_data) / sample_rate))

# FFT length: frame_size rounded up to the next power of two.
fft_size = 1 << (frame_size - 1).bit_length()
# Length of the recording counted in subsampled frames: one frame per
# frame_shift samples, then one frame kept out of every sub_sampling.
data_len = len(wav_data) // frame_shift // sub_sampling
# Duration of one chunk in whole seconds (truncated).
chunk_size_second = int(chunk_size_frame * sub_sampling * frame_shift / sample_rate)
print("\
sample_rate       = {}\n\
frame_size        = {}\n\
frame_shift       = {}\n\
fft_size          = {}\n\
sub_sampling      = {}\n\
data_len          = {} frames\n\
chunk_size_frame  = {} frames\n\
chunk_size_second = {} s\n\
chunk_size_second = chunk_size_frame * sub_sampling * frame_shift / sample_rate"
    .format(sample_rate, frame_size, frame_shift, fft_size, sub_sampling, 
            data_len, chunk_size_frame, chunk_size_second))

In [None]:
def _count_frames(data_len, chunk_size, chunk_step):
    return int((data_len - chunk_size + chunk_step) / chunk_step)

def _gen_frame_indices(data_len, chunk_size, chunk_step):
    i = -1
    frames_count = _count_frames(data_len, chunk_size, chunk_step)
    print("chunk count   = {}".format(frames_count + 1))
    
    for i in range(frames_count):
        yield i * chunk_size, i * chunk_size + chunk_step
        
    if  i * chunk_size + chunk_step < data_len:
        if data_len - (i + 1) * chunk_step > 0:
            if i == -1:
                yield (i + 1) * chunk_step, data_len
            else:
                yield data_len - chunk_size, data_len

# Chunk boundaries come back in subsampled frames; multiplying by
# sub_sampling converts them to boundaries in un-subsampled frames.
chunk_indices = [
    (first * sub_sampling, last * sub_sampling)
    for first, last in _gen_frame_indices(
        data_len, chunk_size=chunk_size_frame, chunk_step=chunk_size_frame)
]
print("chunk indices = {}".format(chunk_indices))
    

In [None]:
# Number of speaker columns used for the label matrix built further below.
num_speakers = 2

# Only the first chunk is inspected. Its boundaries are frame indices, so
# they are scaled by frame_shift to obtain sample positions in wav_data.
start_time, end_time = chunk_indices[0]
sample_span = slice(start_time * frame_shift, end_time * frame_shift)
chunk_data = wav_data[sample_span]

print("start_chunk_index = {}".format(start_time))
print("end_chunk_index   = {}".format(end_time))
print("start_time        = start_chunk_index * frame_shift = {}".format(sample_span.start))
print("end_time          = end_chunk_index   * frame_shift = {}".format(sample_span.stop))
print("chunk_data length = {}".format(len(chunk_data)))

In [None]:
def stft(
        data,
        frame_size=1024,
        frame_shift=256):
    """Compute STFT frames of an audio signal with librosa.

    Args:
        data: audio signal
            (n_samples,)-shaped np.float32 array
        frame_size: window length in samples; the FFT length is this value
            rounded up to the next power of two
        frame_shift: hop length in samples between consecutive frames

    Returns:
        stft: STFT frames, time-major (the librosa result transposed)
            (n_frames, n_bins)-shaped np.complex64 array
    """
    # Smallest power of two that is >= frame_size.
    n_fft = 1 << (frame_size - 1).bit_length()
    frames = librosa.stft(data, n_fft=n_fft, win_length=frame_size,
                          hop_length=frame_shift).T
    # HACK: when the signal length is an exact multiple of the hop, the last
    #       frame is omitted, since librosa.stft produces one excess frame.
    length_is_multiple_of_hop = len(data) % frame_shift == 0
    return frames[:-1] if length_is_multiple_of_hop else frames
        
# STFT of the selected chunk, using the frame parameters defined earlier.
Y = stft(chunk_data, frame_size, frame_shift)

# Summarise the shapes involved; the last two lines are fixed explanations.
print(
    "chunk_data length = {}\n"
    "frame_size        = {}\n"
    "frame_shift       = {}\n"
    "fft_size          = {}\n"
    "Y.shape           = {}\n"
    "Y.shape[0](Time)  = chunk_data_len/frame_shift\n"
    "Y.shape[1](Freq)  = 1 + fft_size/2"
    .format(len(chunk_data), frame_size, frame_shift, fft_size, Y.shape))

In [None]:
# All-zero float32 label matrix: one row per STFT frame, one column per speaker.
label_shape = (Y.shape[0], num_speakers)
T = np.zeros(label_shape, dtype=np.float32)
print("T.shape              = {}".format(T.shape))
print("T.shape[0](Time)     = Y.shape[0]")
print("T.shape[1](num_spks) = num_speakers")

In [None]:
def _log_mel(magnitude, sr, n_mels):
    """Return the log10 mel filterbank energies of a magnitude spectrogram.

    Args:
        magnitude: (n_frames, n_bins)-shaped non-negative array
        sr: sampling rate used to build the mel filterbank
        n_mels: number of mel bands

    Returns:
        (n_frames, n_mels)-shaped array of log10 energies, floored at 1e-10
        before the logarithm.
    """
    # Recover the FFT length from the bin count (n_bins = 1 + n_fft / 2).
    n_fft = 2 * (magnitude.shape[1] - 1)
    # Keyword arguments: recent librosa releases reject positional ones here.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel_power = np.dot(magnitude ** 2, mel_basis.T)
    return np.log10(np.maximum(mel_power, 1e-10))


def transform(
        Y,
        transform_type=None,
        dtype=np.float32):
    """ Transform STFT feature

    Args:
        Y: STFT
            (n_frames, n_bins)-shaped np.complex array
        transform_type:
            None (or any falsy value): magnitude only
            "log": natural log of the magnitude
            "logmel": log10 mel energies, 40 bands, 16 kHz filterbank
            "logmel23": log10 mel energies, 23 bands, 8 kHz filterbank
            "logmel23_mn": "logmel23" with the per-band mean over all
                frames subtracted
            "logmel23_swn": "logmel23" with the per-band mean over the
                higher-energy frames (selected by a 2-means threshold)
                subtracted
            "logmel23_mvn": "logmel23" with per-band mean and variance
                normalisation
        dtype: output data type
            np.float32 is expected
    Returns:
        Y (numpy.array): transformed feature

    Raises:
        ValueError: if transform_type is not one of the values above
    """
    Y = np.abs(Y)
    if not transform_type:
        pass
    elif transform_type == 'log':
        Y = np.log(np.maximum(Y, 1e-10))
    elif transform_type == 'logmel':
        Y = _log_mel(Y, sr=16000, n_mels=40)
    elif transform_type == 'logmel23':
        Y = _log_mel(Y, sr=8000, n_mels=23)
    elif transform_type == 'logmel23_mn':
        Y = _log_mel(Y, sr=8000, n_mels=23)
        Y = Y - np.mean(Y, axis=0)
    elif transform_type == 'logmel23_swn':
        Y = _log_mel(Y, sr=8000, n_mels=23)
        # Simple 2-means based thresholding for the mean calculation: start
        # from the midpoint of the per-frame sums, then repeatedly move the
        # threshold to the midpoint between the two cluster means.
        powers = np.sum(Y, axis=1)
        th = (np.max(powers) + np.min(powers)) / 2.0
        for _ in range(10):
            th = (np.mean(powers[powers >= th]) + np.mean(powers[powers < th])) / 2
        # Only frames above the threshold contribute to the subtracted mean.
        mean = np.mean(Y[powers > th, :], axis=0)
        Y = Y - mean
    elif transform_type == 'logmel23_mvn':
        Y = _log_mel(Y, sr=8000, n_mels=23)
        Y = Y - np.mean(Y, axis=0)
        # Floor the deviation so constant bands do not divide by zero.
        std = np.maximum(np.std(Y, axis=0), 1e-10)
        Y = Y / std
    else:
        raise ValueError('Unknown transform_type: %s' % transform_type)
    return Y.astype(dtype)

In [None]:
# Replace the complex STFT with the "logmel23_mn" feature from transform().
Y = transform(Y, "logmel23_mn")
print("Y.shape                         = {}".format(Y.shape))
print("Y.shape[0](Time)                = before Y.shape[0]")
print("Y.shape[1](Log-Mel Filterbank)  = 23")

In [None]:
def splice(Y, context_size=0):
    """Concatenate each frame with its neighbouring frames.

    Args:
        Y: feature
            (n_frames, n_featdim)-shaped numpy array
        context_size:
            number of neighbouring frames taken on each side;
            with context_size = 5, 11 frames are concatenated.

    Returns:
        Y_spliced: spliced feature, a read-only strided view
            (n_frames, n_featdim * (2 * context_size + 1))-shaped
    """
    # Zero-pad along the time axis so the first and last frames have context.
    padding = [(context_size, context_size), (0, 0)]
    padded = np.ascontiguousarray(np.pad(Y, padding, 'constant'))

    window_frames = 2 * context_size + 1
    out_shape = (Y.shape[0], Y.shape[1] * window_frames)
    # Moving down one output row advances one padded frame; moving along a
    # row advances one element, so each row spans window_frames frames.
    out_strides = (Y.itemsize * Y.shape[1], Y.itemsize)
    return np.lib.stride_tricks.as_strided(
        padded, out_shape, out_strides, writeable=False)

In [None]:
# Number of neighbouring frames appended on each side of every frame.
context_size = 7
Y_spliced = splice(Y=Y, context_size=context_size)
print("context_size                        = {}".format(context_size))
# Report the shape of the spliced feature itself, not of its input.
print("Y_spliced.shape                     = {}".format(Y_spliced.shape))
print("Y_spliced.shape[0](Time)            = Y.shape[0]")
print("Y_spliced.shape[1](append context)  = Y.shape[1] * (2 * context_size + 1)")

In [None]:
def subsample(Y, T, subsampling=1):
    """Keep every ``subsampling``-th frame of the features and the labels.

    Args:
        Y: feature sequence, indexed by frame along its first axis
        T: label sequence, indexed by frame along its first axis
        subsampling: step between kept frames (1 keeps every frame)

    Returns:
        (Y_ss, T_ss): both inputs sliced with the same step
    """
    every_nth = slice(None, None, subsampling)
    return Y[every_nth], T[every_nth]

In [None]:
# Thin out features and labels together so they stay aligned frame by frame.
Y_ss, T_ss = subsample(Y=Y_spliced, T=T, subsampling=sub_sampling)
print("sub sampling                   = {}".format(sub_sampling))
print("Y_ss.shape                     = {}".format(Y_ss.shape))
print("Y_ss.shape[0](Time)            = Y_spliced.shape[0] / sub_sampling")
print("Y_ss.shape[1](append context)  = Y_spliced.shape[1]")

print("T_ss.shape                     = {}".format(T_ss.shape))
print("T_ss.shape[0](Time)            = T.shape[0] / sub_sampling")
# The second axis of T is the speaker axis, not spliced context.
print("T_ss.shape[1](num_spks)        = T.shape[1]")


# Dataset Test

In [None]:
import sys
sys.path.append('/home/dasein/Projects/Speech-Diarization')

from src.datamodules.components.diarization_dataset import DiarizationDataset

In [None]:
# Location of the simulated data set on the author's machine.
data_dir = '/home/dasein/Projects/Speech-Diarization/data/simu/dev_clean_ns2_beta2_500'

# Feature-extraction settings handed to DiarizationDataset as keyword arguments.
dataset_options = dict(
    chunk_size=2000,
    context_size=7,
    frame_size=1024,
    frame_shift=256,
    subsampling=10,
    sample_rate=8000,
    input_transform="logmel23_mn",
    n_speakers=None,
)
dataset = DiarizationDataset(data_dir=data_dir, **dataset_options)

print("dataset length={}".format(len(dataset)))

In [None]:
# Fetch the first item and show the type and shape of both of its parts.
Y_ss, T_ss = dataset[0]
for label, item in (("Y_ss", Y_ss), ("T_ss", T_ss)):
    print(label, type(item), item.shape)