## Imports

In [1]:
import os
import sys

import tensorflow as tf
from tensorflow.keras  import backend as K
from tensorflow.keras.models import load_model, save_model

import matplotlib.pyplot as plt
import numpy as np
import librosa as lr
import soundfile as sf
from scipy import signal

from pystoi import stoi
from pesq import pesq

## Utility functions

In [29]:
def my_crossentropy(y_true, y_pred):
    return K.mean(2*K.abs(y_true-0.5) * K.binary_crossentropy(y_pred, y_true), axis=-1)

def mymask(y_true):
    return K.minimum(y_true+1., 1.)

def msse(y_true, y_pred):
    return K.mean(mymask(y_true) * K.square(K.sqrt(y_pred) - K.sqrt(y_true)), axis=-1)

def mycost(y_true, y_pred):
     return K.mean(mymask(y_true) * (10*K.square(K.square(K.sqrt(y_pred) - K.sqrt(y_true))) + K.square(K.sqrt(y_pred) - K.sqrt(y_true)) + 0.01*K.binary_crossentropy(y_pred, y_true)), axis=-1)

def my_accuracy(y_true, y_pred):
    return K.mean(2*K.abs(y_true-0.5) * K.equal(y_true, K.round(y_pred)), axis=-1)


# list of mfcc differentials, adds 0 to the beginning
def get_diff_list(data):
    L = []
    for d in data:
        L.append(np.concatenate([[d[0]], np.diff(d, axis=-2)], axis=-2))
    return np.array(L)


def normalize(data, n, quantize=True):
    limit = pow(2, n)
    data = np.clip(data, -limit, limit)/limit
    if quantize:
        data = np.round(data * 128)/ 128.0
    return data


def iir_design(band_frequency, samplerate, order=1): # the band frequency is the middle freq
    b = []
    a = []
    fre = band_frequency / (samplerate/2)
    for i in range(1, len(band_frequency)-1):
        b_, a_ = signal.iirfilter(order, [fre[i] - (fre[i]-fre[i-1])/2, fre[i]+ (fre[i+1]-fre[i])/2], btype='bandpass', output='ba')
        b.append(b_)
        a.append(a_)
    return b, a


def bandpass_filter_iir(sig, b_in, a_in, step, gains):
    x = sig
    y = np.zeros(len(x))
    state = np.zeros(len(b_in)-1)
    g=0
    for n in range(0, len(gains)):
        g = max(0.6*g, gains[n])    # r=0.6 pre RNNoise paper https://arxiv.org/pdf/1709.08243.pdf
        b = b_in*g
        a = a_in
        filtered, state = signal.lfilter(b, a, x[n*step: min((n+1)*step, len(x))], zi=state)
        y[n*step: min((n+1)*step, len(x))] = filtered
    return y


def filter_voice(sig, rate, gains, nband=22, lowfreq=20, highfreq=4000):
    # see gen_dataset.py's example for detial
    band_freq = lr.mel_frequencies(n_mels=nband, fmin=lowfreq, fmax=highfreq)
    # band_freq = lr.mel_to_hz(mel_scale)
    band_frequency = band_freq[1:-1] # the middle point of each band
    print('band frequency', band_frequency)
    b, a = iir_design(band_freq, rate, order=1)
    step = int(0.020 * rate / 2)
    filtered_signal = np.zeros(len(sig))
    for i in range(len(b)):
        filtered_signal += bandpass_filter_iir(sig, b[i].copy(), a[i].copy(), step, gains[:, i])
        print("filtering with frequency: ", band_frequency[i])
    filtered_signal = filtered_signal * 0.6
    return filtered_signal


def voice_denoise(sig, rate, model, timestamp_size=512, numcep=26, plot=False):
    # sig = sig / 32768
    num_diffs = 10
    window_length = int(np.round(0.020*rate))
    hop_length = int(np.round(0.010*rate))
    # get the mfcc of noisy voice
    mfcc_feat = lr.feature.mfcc(sig, rate, n_mfcc=numcep, n_fft=512, win_length = window_length, hop_length = hop_length, dct_type=2, lifter=0, fmin=20, fmax=4000)
    mfcc_feat = mfcc_feat.astype('float32')
    # mfcc_feat = mfcc_feat[:,:3888]
    mfcc_feat = mfcc_feat.T
    print("mfcc_feat.shape: ", mfcc_feat.shape) # (6223, 22)
    # differential of mfcc, add 0 to the beginning
    diff = np.diff(mfcc_feat, axis=0)
    diff = np.concatenate([[mfcc_feat[0]], diff], axis=0)  # first derivative
    diff1 = np.diff(diff, axis=0)
    diff1 = np.concatenate([[diff[0]], diff1], axis=0) # second derivative
    diff = diff[:, :num_diffs]
    diff1 = diff1[:, :num_diffs]
    # concat both differential and original mfcc
    print("diff.shape: ", diff.shape)
    print("diff1.shape: ", diff1.shape)
    feat = np.concatenate([mfcc_feat, diff, diff1], axis=-1)
    print("1feat.shape: ", feat.shape)
    # requantise the MFCC (same as training data)
    feat = normalize(feat, 3, quantize=False)
    print("2feat.shape: ", feat.shape)
    feat = np.reshape(feat, (feat.shape[0], 1, feat.shape[1])) # 
    print("3feat.shape: ", feat.shape)
    feat = feat[: feat.shape[0] // timestamp_size * timestamp_size]
    print("4feat.shape: ", feat.shape)
    prediction = model.predict(feat, batch_size=timestamp_size)
    if(type(prediction) is list):
        predicted_gains = prediction[0]
        predicted_vad = prediction[1]
    else:
        predicted_gains = prediction
        predicted_vad = None

    # now process the signal.
    print('predicted_gains: ', predicted_gains.shape)
    # filtered_sig = filter_voice(sig, rate=rate, gains=predicted_gains, nband=mfcc_feat.shape[-1])
    filtered_sig = filter_voice(sig, rate=rate, gains=predicted_gains, nband=24)
    if(plot):
        plt.figure(figsize=(20, 7))
        for i in range(10):
            plt.plot(predicted_gains[:, i], label='band'+str(i))
        if(predicted_vad is not None):
            plt.plot(predicted_vad, 'r', label='VAD')
        plt.ylabel("Gains")
        plt.xlabel("MFCC Sample")
        plt.legend()
        plt.show()
    return filtered_sig


def stoi_wrapper(ref, denoised, sr, extension = True):
    """
    Computes the intelligibility score based on the STOI predictor.

    Based on: C. H. Taal, R. C. Hendriks, R. Heusdens, and J. Jensen, 
    "A Short-Time Objective Intelligibility Measure for Time-Frequency Weighted Noisy
    Speech", IEEE Int. Conf. Acoust., Speech, Signal Processing, Dallas, United States,
    pp. 4214-4217, 2010.

    Signals are resampled to 10kHz by the stoi method if sr != 10000.

    Parameters:
    -----------
    ref (1D ndarray): clean signal
    denoised (1D ndarray): restored signal, i.e, denoised.
    sr (float): sampling frequency 
    extension (str): True extends the STOI score for non-linearly processed input signals
        such as time-frequency masking operations.

    Returns:
    --------
    float: estimated intelligibility score in [0, 100]


    Calls: None

    """

    d = stoi(ref, denoised, sr, extended = extension)
    return 100 / (1 + np.exp(-13.1903 * d + 6.5192))


def pesq_wrapper(ref, denoised, sr, mode = 'nb'):
    """
    Computes an estimation of the MOS-LQO score based on raw PESQ scores.

    The implementation is taken from the github repo pypesq. Sampling frequency 
    should be either 8k or 16k. It is not consistent with the results provided
    by pesq_ITU_wrapper.

    Parameters:
    -----------
    ref (1D ndarray): clean signal
    denoised (1D ndarray): restored signal, i.e, denoised.
    sr (float): sampling frequency; either 8k for narrowband or 16k for wideband
    mode (str): either 'nb' or 'wb' for narrow- and wide-band respectively.

    Returns:
    --------
    float: estimated MOS-LQO score. Rescaling of [-0.5, 4.5] raw MOS scores to 
        a range of [1.02, 4.56] for MOS-LQO scores.


    Calls: None

    """
    
    p = pesq(sr, ref, denoised, mode) # Raw scores in [-0.5, 4.5]
    # mapping to MOS-LQO
    return 0.999 + (4.999 - 0.999) / (1 + np.exp(-1.4945 * p + 4.6607))

## Load Dataset and prepare data

In [3]:
# load training dataset
try:
    dataset = np.load('dataset.npz', allow_pickle=True)
except:
    raise Exception("Given dataset not found.")

# extract mfcc, vad and gains
clnsp_mfcc = dataset['clnsp_mfcc']    # clean speech mfccs
noisy_mfcc = dataset['noisy_mfcc']    # noisy speech mfccs
vad = dataset['vad']                  # voice activation detection
gains = dataset['gains']              # gains
# get mfcc derivative from dataset.
clnsp_mfcc_diff = get_diff_list(clnsp_mfcc)
noisy_mfcc_diff = get_diff_list(noisy_mfcc)
clnsp_mfcc_diff1 = get_diff_list(clnsp_mfcc_diff)
noisy_mfcc_diff1 = get_diff_list(noisy_mfcc_diff)

# combine all pieces to one large array
clnsp_mfcc = np.concatenate(clnsp_mfcc, axis=0)
noisy_mfcc = np.concatenate(noisy_mfcc, axis=0)
clnsp_mfcc_diff = np.concatenate(clnsp_mfcc_diff, axis=0)
noisy_mfcc_diff = np.concatenate(noisy_mfcc_diff, axis=0)
clnsp_mfcc_diff1 = np.concatenate(clnsp_mfcc_diff1, axis=0)
noisy_mfcc_diff1 = np.concatenate(noisy_mfcc_diff1, axis=0)
vad = np.concatenate(vad, axis=0)
gains = np.concatenate(gains, axis=0)

# these max and min are rear
print('mfcc max:', noisy_mfcc.max(), 'mfcc min:', noisy_mfcc.min())
print('mfcc diff max:', noisy_mfcc_diff.max(), 'mfcc diff min:', noisy_mfcc_diff.min())

# preprocess data
timestamp_size = 1024 # this must be > than 1024, since we are using 1 sample as a batch, which still too small for BP
num_sequence = len(vad) // timestamp_size
print('timestamp', timestamp_size, 'num of data', num_sequence)

# prepare data
diff = np.copy(noisy_mfcc_diff[:num_sequence * timestamp_size, :10])
diff1 = np.copy(noisy_mfcc_diff1[:num_sequence * timestamp_size, :10])
feat = np.copy(noisy_mfcc[:num_sequence * timestamp_size, :])

# concat mfcc, 1st and 2nd derivative together as the training data.
x_train = np.concatenate([feat, diff, diff1], axis=-1)
# convert MFCC range to -1 to 1.0 In quantization, we will saturate them to leave more resolution in smaller numbers
# we saturate the peak to leave some more resolution in other band.
x_train = normalize(x_train, 3, quantize=False)

# reshape
x_train = np.copy(x_train[:num_sequence * timestamp_size, :])
x_train = np.reshape(x_train, (num_sequence* timestamp_size, 1, x_train.shape[-1]))
y_train = np.copy(gains[:num_sequence * timestamp_size,:])
y_train = np.reshape(y_train, (num_sequence* timestamp_size, gains.shape[-1]))
vad_train = np.copy(vad[:num_sequence * timestamp_size]).astype(np.float32)
vad_train = np.reshape(vad_train, (num_sequence * timestamp_size, 1))
print("x_train.shape: ", x_train.shape)
print("y_train.shape: ", y_train.shape)
print("vad_train.shape: ", vad_train.shape)

mfcc max: 27.341293 mfcc min: -17.184874
mfcc diff max: 22.255035 mfcc diff min: -24.103506
timestamp 1024 num of data 1025
x_train.shape:  (1049600, 1, 42)
y_train.shape:  (1049600, 22)
vad_train.shape:  (1049600, 1)


## Define Model

In [4]:
def train(x_train, y_train, vad_train, batch_size=64, epochs=10, model_name="model_new.h5"):
    """
    RNNoise-like structure with some adaption to fit NNoM's implementation.
    """
    input_feature_size = x_train.shape[-1] # 42
    output_feature_size = y_train.shape[-1] # 22
    timestamp_size = batch_size # 2048
    input = tf.keras.Input(shape=(1, input_feature_size), batch_size=timestamp_size) # (1, 42, 2048)
    
    """
        This is an RNNoise-like structure
    """
    # voice activity detection
    # x1_1 = tf.keras.layers.GRU(24, return_sequences=True, stateful=True, recurrent_dropout=0.2)(input)
    x1_1 = tf.keras.layers.Dense(24, activation="tanh")(input)
    # x1_1 = tf.keras.layers.Dropout(0.3)(x1_1)
    x1_2 = tf.keras.layers.GRU(24, activation="relu", reset_after=False, return_sequences=True)(x1_1)
    # x1_2 = tf.keras.layers.Dropout(0.3)(x1_2)
    x = tf.keras.layers.Flatten()(x1_2)
    # x = tf.keras.layers.Dropout(0.3)(x)
    vad_output = tf.keras.layers.Dense(1, activation="sigmoid")(x)
    # vad_output = tf.keras.layers.Activation("sigmoid")(x)

    # we dont concate input with layer output, because the range different will cause quite many quantisation lost.
    # x_in = tf.keras.layers.GRU(64, return_sequences=True, stateful=True, recurrent_dropout=0.3)(input)

    # Noise spectral estimation
    x2 = tf.keras.layers.concatenate([input, x1_1, x1_2], axis=-1)
    x2 = tf.keras.layers.GRU(48, activation="relu", reset_after=False, return_sequences=True)(x2)
    # x2 = tf.keras.layers.Dropout(0.3)(x2)

    #Spectral subtraction
    x3 = tf.keras.layers.concatenate([input, x2, x1_2], axis=-1)
    x3 = tf.keras.layers.GRU(96, activation="relu", reset_after=False, return_sequences=True)(x3)
    # x3 = tf.keras.layers.Dropout(0.3)(x3)
    x = tf.keras.layers.Flatten()(x3)
    x = tf.keras.layers.Dense(output_feature_size, activation="sigmoid")(x) #output_feature_size
    # x = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.Model(inputs=input, outputs=[x, vad_output])
    model.compile("adam", loss=[mycost, my_crossentropy], loss_weights=[10, 0.5], metrics=[msse])
    # model.compile("adam", loss=["MSE", "binary_crossentropy"], loss_weights=[10, 2])
    model.summary()

    history = model.fit(x_train, [y_train, vad_train], batch_size=timestamp_size, epochs=epochs, verbose=2, shuffle=False)

    # free the session to avoid nesting naming while we load the best model after.
    tf.keras.models.save_model(model, model_name)
    del model
    K.clear_session()
    return history

## Train Model

In [5]:
# training
history = train(x_train, y_train, vad_train, batch_size=timestamp_size, epochs=50, model_name="model_new.h5")

# get the best model
model = tf.keras.models.load_model("model_new.h5", custom_objects={'mycost': mycost, 'msse': msse, 'my_crossentropy': my_crossentropy, 'my_accuracy': my_accuracy})
# model = tf.keras.models.load_model("model.h5")

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(1024, 1, 42)]      0                                            
__________________________________________________________________________________________________
dense (Dense)                   (1024, 1, 24)        1032        input_1[0][0]                    
__________________________________________________________________________________________________
gru (GRU)                       (1024, 1, 24)        3528        dense[0][0]                      
__________________________________________________________________________________________________
concatenate (Concatenate)       (1024, 1, 90)        0           input_1[0][0]                    
                                                                 dense[0][0]                  

## Denoise a file

In [30]:
# Make sure the MFCC parameters inside the voice_denoise() are the same as our gen_dataset.

# Load noisy file
(sig, rate) = lr.load("_noisy_sample.wav", sr=8000)
# sig = np.asarray(sig * 32767, dtype=np.int16)
print('min: ', np.min(sig))
print('max: ', np.max(sig))
print('rate: ', rate)
# Denoising
filtered_sig = voice_denoise(sig, rate, model, timestamp_size=1, numcep=y_train.shape[-1], plot=True) # use plot=True argument to see the gains/vad
filtered_sig = np.asarray(filtered_sig * 32767, dtype=np.int16)
# Write denoised file
sf.write("_nn_filtered_sample_new.wav", filtered_sig, rate)
# sf.write("_nn_filtered_sample_inv.wav", filtered_sig[::-1], rate)
# sf.write("_clean_sample_inv.wav", sig[::-1], rate)
print('min: ', np.min(filtered_sig))
print('max: ', np.max(filtered_sig))

min:  -0.75931174
max:  0.62397695
rate:  8000
mfcc_feat.shape:  (1707, 22)
diff.shape:  (1707, 10)
diff1.shape:  (1707, 10)
1feat.shape:  (1707, 42)
2feat.shape:  (1707, 42)
3feat.shape:  (1707, 1, 42)
4feat.shape:  (1707, 1, 42)
predicted_gains:  (1707, 22)
band frequency [ 112.97002751  205.94005501  298.91008252  391.88011002  484.85013753
  577.82016503  670.79019254  763.76022005  856.73024755  949.70027506
 1044.9874619  1150.1388874  1265.87113102 1393.24888317 1533.44396826
 1687.74612504 1857.57487169 2044.49256482 2250.2187725  2476.64609362
 2725.85756906 3000.14584479 3302.03426333 3634.3000775 ]
filtering with frequency:  112.97002750564441
filtering with frequency:  205.9400550112888
filtering with frequency:  298.9100825169332
filtering with frequency:  391.8801100225776
filtering with frequency:  484.850137528222
filtering with frequency:  577.8201650338665
filtering with frequency:  670.7901925395109
filtering with frequency:  763.7602200451553
filtering with frequenc

IndexError: index 22 is out of bounds for axis 1 with size 22

## Speech Evaluation - Denoised

In [13]:
(x, rate) = lr.load("_clean_sample.wav", sr=16000)
x = np.asarray(x * 32767, dtype=np.int16)

mos_lqo = pesq_wrapper(ref=x, denoised=filtered_sig, sr=rate)
stoi = stoi_wrapper(ref=x, denoised=filtered_sig, sr=rate)
print('mos_lqo: ', mos_lqo) # 1.4281256460921785
print('stoi: ', stoi) # 62.942086539244094

TypeError: 'numpy.float64' object is not callable