## Description

##### Implementation of "Speech enhancement by LSTM-based noise suppression followed by CNN-based speech restoration" paper - https://link.springer.com/article/10.1186/s13634-020-00707-1

##### Implementation of Speech Restoration Network

### Libraries

In [None]:
import tensorflow as tf
import numpy as np
import typing

In [None]:
from ipynb.fs.full.speech_enhancement_layers import Conv1D
from ipynb.fs.full.speech_enhancement_noise_suppression_module import NoiseSuppressor

In [None]:
# Only let ERROR-level messages through TensorFlow's (v1-compat) logger.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

### Input

##### Let's assume we have an input of [batch_size, sequence_length, nr_features]

In [None]:
# Random stand-in input laid out as [batch_size, sequence_length, nr_features].
input_shape = (1, 1024, 2)
x = tf.constant(tf.random.normal(input_shape), dtype=tf.float32)

In [None]:
# Bare expression: the notebook renders the test input tensor as the cell output.
x

### Speech Restoration Network

##### The speech restoration module makes use of conv1d, conv1d transpose, max pooling and upsampling layers

###### Below is the class for the first block in figure 3 of the paper

In [None]:
class SpeechRestorationNetworkBlock1(tf.Module):
    """First speech-restoration block: conv + max-pooling down, conv + upsampling up.

    The input goes through two convolution stages, each followed by a max
    pooling of size 2, then a middle convolution, then two upsampling stages
    of size 2. The activations taken just before each pooling step are added
    back after the matching upsampling stage. A final convolution with 2
    filters followed by a leaky ReLU produces the output.

    NOTE(review): the skip additions only line up if the project's ``Conv1D``
    keeps the sequence length unchanged at ``stride=1`` — confirm in
    ``speech_enhancement_layers``.

    Args:
        nr_conv_filters: base number of convolution filters; the stages that
            sit between the two pooling/upsampling levels use twice as many.
        conv_filter_size: kernel size shared by every convolution.
        name: optional module name forwarded to ``tf.Module``.
    """

    def __init__(self,
                 nr_conv_filters: int,
                 conv_filter_size: int,
                 name: str = None
                 ):
        super().__init__(name)

        def make_conv(filters: int):
            # Every convolution in this block shares kernel size, stride 1 and a bias.
            return Conv1D(nr_filters=filters, kernel=conv_filter_size, stride=1, use_bias=True)

        base_filters = nr_conv_filters
        doubled_filters = 2 * nr_conv_filters

        # Layers are created in the same order in which __call__ applies them.
        self.conv1D_1 = make_conv(base_filters)
        self.conv1D_2 = make_conv(base_filters)

        self.max_pooling_1 = tf.keras.layers.MaxPool1D(pool_size=2)
        self.conv1D_3 = make_conv(doubled_filters)
        self.conv1D_4 = make_conv(doubled_filters)

        self.max_pooling_2 = tf.keras.layers.MaxPool1D(pool_size=2)

        self.conv1D_5 = make_conv(base_filters)
        self.upsampling_1 = tf.keras.layers.UpSampling1D(size=2)

        self.conv1D_6 = make_conv(doubled_filters)
        self.conv1D_7 = make_conv(doubled_filters)

        self.upsampling_2 = tf.keras.layers.UpSampling1D(size=2)
        self.conv1D_8 = make_conv(base_filters)
        self.conv1D_9 = make_conv(base_filters)

        # Output projection down to 2 filters.
        self.conv1D_last = make_conv(2)

    def __call__(self, x_in: tf.Tensor):
        """Apply the block to ``x_in`` and return the activated 2-filter output."""
        activate = tf.nn.leaky_relu

        # Stage before the first pooling.
        hidden = activate(self.conv1D_1(x_in))
        skip_outer = activate(self.conv1D_2(hidden))  # added back after the second upsampling

        # Stage between the first and second pooling.
        hidden = self.max_pooling_1(skip_outer)
        hidden = activate(self.conv1D_3(hidden))
        skip_inner = activate(self.conv1D_4(hidden))  # added back after the first upsampling

        # Middle convolution after the second pooling.
        hidden = self.max_pooling_2(skip_inner)
        hidden = activate(self.conv1D_5(hidden))

        # First upsampling stage plus inner skip connection.
        hidden = self.upsampling_1(hidden)
        hidden = activate(self.conv1D_6(hidden))
        hidden = activate(self.conv1D_7(hidden))
        hidden = tf.add(hidden, skip_inner)

        # Second upsampling stage plus outer skip connection.
        hidden = self.upsampling_2(hidden)
        hidden = activate(self.conv1D_8(hidden))
        hidden = activate(self.conv1D_9(hidden))
        hidden = tf.add(hidden, skip_outer)

        return activate(self.conv1D_last(hidden))

In [None]:
# Build block 1 with 88 base filters and a kernel size of 24.
speech_restoration_network_block1 = SpeechRestorationNetworkBlock1(nr_conv_filters=88, conv_filter_size=24)

In [None]:
# Run block 1 on the random test input; the result is fed to block 2 further down.
block1_output = speech_restoration_network_block1(x_in=x)

###### Below is the class for the first block in figure 4 of the paper

In [None]:
class SpeechRestorationNetworkBlock2(tf.Module):
    """Second speech-restoration block: strided convs down, transposed convs up.

    Instead of pooling and upsampling layers, this block uses convolutions
    with stride 2 on the way down and transposed convolutions with stride 2
    on the way up. The activations taken before each strided convolution are
    added back after the matching stride-2 transposed convolution. A final
    convolution with 2 filters followed by a leaky ReLU produces the output.

    NOTE(review): the skip additions only line up if the project's ``Conv1D``
    keeps the sequence length at ``stride=1`` and halves it at ``stride=2`` —
    confirm in ``speech_enhancement_layers``.

    Args:
        nr_conv_filters: base number of convolution filters; the inner stages
            use twice as many.
        conv_filter_size: kernel size shared by every (transposed) convolution.
        name: optional module name forwarded to ``tf.Module``.
    """

    def __init__(self,
                 nr_conv_filters: int,
                 conv_filter_size: int,
                 name: str = None
                 ):
        super().__init__(name)

        def make_conv(filters: int, stride: int):
            return Conv1D(nr_filters=filters, kernel=conv_filter_size, stride=stride, use_bias=True)

        def make_transposed_conv(filters: int, stride: int):
            # padding="same" makes the output length depend only on the input
            # length and the stride, which the skip additions rely on.
            return tf.keras.layers.Conv1DTranspose(
                filters=filters,
                kernel_size=conv_filter_size,
                strides=stride,
                padding="same",
            )

        base_filters = nr_conv_filters
        doubled_filters = 2 * nr_conv_filters

        # Layers are created in the same order in which __call__ applies them.
        self.conv1D_1 = make_conv(base_filters, stride=1)
        self.conv1D_2 = make_conv(base_filters, stride=2)

        self.conv1D_3 = make_conv(doubled_filters, stride=1)
        self.conv1D_4 = make_conv(doubled_filters, stride=2)

        self.conv1D_5 = make_conv(base_filters, stride=1)
        self.conv1D_transpose_1 = make_transposed_conv(doubled_filters, stride=2)

        self.conv1D_transpose_2 = make_transposed_conv(doubled_filters, stride=1)
        self.conv1D_transpose_3 = make_transposed_conv(base_filters, stride=2)

        self.conv1D_transpose_4 = make_transposed_conv(base_filters, stride=1)

        # Output projection down to 2 filters.
        self.conv1D_last = make_conv(2, stride=1)

    def __call__(self, x_in: tf.Tensor):
        """Apply the block to ``x_in`` and return the activated 2-filter output."""
        activate = tf.nn.leaky_relu

        # Downward path; keep the activations that feed each stride-2 convolution.
        skip_outer = activate(self.conv1D_1(x_in))
        hidden = activate(self.conv1D_2(skip_outer))
        skip_inner = activate(self.conv1D_3(hidden))
        hidden = activate(self.conv1D_4(skip_inner))

        # Middle convolution.
        hidden = activate(self.conv1D_5(hidden))

        # First stride-2 transposed convolution plus inner skip connection.
        hidden = activate(self.conv1D_transpose_1(hidden))
        hidden = tf.add(hidden, skip_inner)

        # Second stride-2 transposed convolution plus outer skip connection.
        hidden = activate(self.conv1D_transpose_2(hidden))
        hidden = activate(self.conv1D_transpose_3(hidden))
        hidden = tf.add(hidden, skip_outer)

        hidden = activate(self.conv1D_transpose_4(hidden))
        return activate(self.conv1D_last(hidden))

In [None]:
# Build block 2 with the same hyper-parameters used for block 1 (88 filters, kernel size 24).
speech_restoration_network_block2 = SpeechRestorationNetworkBlock2(nr_conv_filters=88, conv_filter_size=24)

In [None]:
# Smoke test of block 2 on its own; the result is only displayed, not stored.
speech_restoration_network_block2(x_in=block1_output) # input to block 2 is output from block 1

##### Bring both blocks into one

In [None]:
class SpeechRestorationNetwork(tf.Module):
    """Full speech-restoration network: block 1 followed by block 2.

    Args:
        nr_conv_filters: base number of convolution filters, forwarded to
            both sub-blocks.
        conv_filter_size: convolution kernel size, forwarded to both
            sub-blocks.
        name: optional module name forwarded to ``tf.Module``.
    """

    def __init__(self,
                 nr_conv_filters: int,
                 conv_filter_size: int,
                 name: str = None):
        super(SpeechRestorationNetwork, self).__init__(name)

        # Forward the constructor arguments instead of hard-coding 88 / 24, so
        # the hyper-parameters passed by the caller actually take effect.
        self.speech_restoration_network_block1 = SpeechRestorationNetworkBlock1(
            nr_conv_filters=nr_conv_filters,
            conv_filter_size=conv_filter_size,
        )
        self.speech_restoration_network_block2 = SpeechRestorationNetworkBlock2(
            nr_conv_filters=nr_conv_filters,
            conv_filter_size=conv_filter_size,
        )

    def __call__(self, x_in: tf.Tensor):
        """Run ``x_in`` through block 1, then feed its output to block 2."""
        x = self.speech_restoration_network_block1(x_in)
        return self.speech_restoration_network_block2(x)

In [None]:
# Build the combined two-block network, requesting 88 filters and a kernel size of 24.
speech_restoration_network = SpeechRestorationNetwork(nr_conv_filters=88, conv_filter_size=24)

In [None]:
# output_size=2 presumably matches the 2 features of the test input x — confirm in the NoiseSuppressor module.
noise_suppressor = NoiseSuppressor(output_size=2) # build noise suppressor to test with

In [None]:
# End-to-end check: the noise suppressor's output for x is fed into the restoration network.
speech_restoration_network(x_in=noise_suppressor(x_in=x))