In this notebook, we attempt to create a vanilla end-to-end ASR model with CTC loss using Keras.

In [24]:
import sys 
import os
import numpy as np
sys.path.insert(1, os.path.abspath('../models'))
import data_preparation 
import pandas as pd

In [26]:
# Load the pickled object stored at <path2features>/<filename>.
# NOTE(review): `path2features` and `filename` are assigned in the configuration
# cell further down this notebook, so on a fresh kernel that cell must be run
# before this one.
# Caution: unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(os.path.join(path2features, filename))

In [27]:
# Show the dimensions of the object loaded from the pickle.
df.shape

(591, 3)

In [41]:
# Dataset locations, resolved to absolute paths relative to the current working directory.
path2data = os.path.abspath('../../Datasets/tedlium/TEDLIUM_release1')
path2features = os.path.abspath('../../Datasets/dev_mfcc')
# Pickle file name; it is joined with path2features when the pickle is loaded.
filename = 'ids_labels.p'
# Feature settings forwarded to data_preparation.DataPrep (n_mfcc is also passed
# to data_preparation.DataGenerator as `mfcc_features`).
# NOTE(review): hop_length / frame_length are presumably given in samples -- confirm in DataPrep.
n_mfcc = 26
n_mels = 40
hop_length = 160
frame_length = 320
# Batching settings forwarded to data_preparation.DataGenerator.
# NOTE(review): what epoch_length = 0 means is decided by DataGenerator -- confirm there.
batch_size = 32
epoch_length = 0
# Also passed as the `shuffle` argument of model.fit_generator in the training cell.
shuffle = False

In [4]:
# Keyword arguments for DataPrep, gathered from the configuration values.
data_prep_params = dict(
    path2data=path2data,
    path2features=path2features,
    pickle_filename=filename,
    n_mfcc=n_mfcc,
    n_mels=n_mels,
    hop_length=hop_length,
    frame_length=frame_length
)

# Instantiate the preparation helper with those settings.
data_prep = data_preparation.DataPrep(**data_prep_params)

In [None]:
# Run the TED-LIUM processing step of DataPrep for the 'dev' category.
# NOTE(review): presumably this produces the feature files and the pickle under
# path2features -- confirm in data_preparation.DataPrep.
data_prep.process_tedelium(category='dev')

In [42]:
# Keyword arguments for DataGenerator, gathered from the configuration values.
data_generator_params = dict(
    path2features=path2features,
    pickle_filename=filename,
    batch_size=batch_size,
    mfcc_features=n_mfcc,
    epoch_length=epoch_length,
    shuffle=shuffle
)

# Batch generator that is later handed to model.fit_generator.
data_gen = data_preparation.DataGenerator(**data_generator_params)

In [19]:
from keras import backend as K
from keras.layers import Dense, SimpleRNN, LSTM, CuDNNLSTM, Bidirectional, TimeDistributed, Conv1D, ZeroPadding1D
from keras.layers import Lambda, Input, Dropout, Masking
from keras.models import Model
from keras.optimizers import Adam

In [32]:
# Lambda implementation of CTC loss, using ctc_batch_cost from TensorFlow backend
# CTC implementation from Keras example found at https://github.com/keras-team/keras/blob/master/examples/image_ocr.py
def ctc_lambda_func(args):
    """
    Compute the batch CTC cost inside a Keras Lambda layer.

    :param args: sequence of four items, in order:
                 (y_pred, labels, input_length, label_length)
    :return: result of K.ctc_batch_cost on the trimmed predictions
    """
    softmax_out, y_true, x_lengths, y_lengths = args

    # Drop the first two time steps: the first couple of RNN outputs tend to be garbage.
    # NOTE(review): input_length is passed through unchanged here -- confirm the data
    # generator already accounts for the two removed frames.
    trimmed = softmax_out[:, 2:, :]

    return K.ctc_batch_cost(y_true, trimmed, x_lengths, y_lengths)


# Returns clipped relu, clip value set to 20.
def clipped_relu(value):
    """Apply ReLU to ``value`` and cap the activation at 20."""
    ceiling = 20
    return K.relu(value, max_value=ceiling)

In [33]:
def deep_rnn(units, input_dim=26, output_dim=29, dropout=0.2, numb_of_dense=1, n_layers=1):
    """
    Build a SimpleRNN-based acoustic model whose output is the CTC loss.

    :param units: Hidden units per layer
    :param input_dim: Size of input dimension (number of features), default=26
    :param output_dim: Output dim of final layer of model (input to CTC layer), default=29
    :param dropout: Dropout rate, default=0.2
    :param numb_of_dense: Number of fully connected layers before recurrent, default=1
    :param n_layers: Number of simple RNN layers, default=1
    :return: network_model: Keras Model with inputs
             [the_input, the_labels, input_length, label_length] and the 'ctc'
             Lambda layer as its only output
    Default model contains:
     1 layer of masking
     1 layer of fully connected clipped ReLu (DNN) followed by 20 % dropout
     1 layer of SimpleRNN (ReLu) with 20% dropout
     1 layer of fully connected ReLu (DNN, not clipped) followed by 20 % dropout
     1 layer of softmax
    """

    # Input data type
    dtype = 'float32'

    # Kernel and bias initializers for fully connected dense layers
    kernel_init_dense = 'random_normal'
    bias_init_dense = 'random_normal'

    # Kernel and bias initializers for recurrent layer
    kernel_init_rnn = 'glorot_uniform'
    bias_init_rnn = 'zeros'

    # ---- Network model ----
    # x_input layer, dim: (batch_size * x_seq_size * mfcc_features)
    input_data = Input(name='the_input', shape=(None, input_dim), dtype=dtype)

    # Masking layer: time steps whose features are all 0 are masked out
    x = Masking(mask_value=0., name='masking')(input_data)

    # Leading stack of fully connected clipped-ReLu layers, each followed by dropout
    for i in range(numb_of_dense):
        x = TimeDistributed(Dense(units=units, kernel_initializer=kernel_init_dense,
                                  bias_initializer=bias_init_dense, activation=clipped_relu),
                            name='fc_' + str(i + 1))(x)
        x = TimeDistributed(Dropout(dropout), name='dropout_' + str(i + 1))(x)

    # Stack of SimpleRNN layers returning the full sequence
    for i in range(n_layers):
        x = SimpleRNN(units, activation='relu', kernel_initializer=kernel_init_rnn,
                      bias_initializer=bias_init_rnn, dropout=dropout, return_sequences=True,
                      name='deep_rnn_' + str(i + 1))(x)

    # Trailing fully connected ReLu layer with dropout.
    # It keeps its historical name 'fc_4'/'dropout_4' whenever that name is free, but
    # moves past the leading stack when numb_of_dense >= 4: otherwise two layers would
    # share a name, and Keras requires layer names within a model to be unique.
    last_fc_idx = max(numb_of_dense + 1, 4)
    x = TimeDistributed(Dense(units=units, kernel_initializer=kernel_init_dense,
                              bias_initializer=bias_init_dense, activation='relu'),
                        name='fc_' + str(last_fc_idx))(x)
    x = TimeDistributed(Dropout(dropout), name='dropout_' + str(last_fc_idx))(x)

    # Output layer with softmax
    y_pred = TimeDistributed(Dense(units=output_dim, kernel_initializer=kernel_init_dense,
                                   bias_initializer=bias_init_dense, activation='softmax'),
                             name='softmax')(x)

    # ---- CTC ----
    # y_input layers (transcription data) for CTC loss
    labels = Input(name='the_labels', shape=[None], dtype=dtype)        # transcription data (batch_size * y_seq_size)
    input_length = Input(name='input_length', shape=[1], dtype=dtype)   # unpadded len of all x_sequences in batch
    label_length = Input(name='label_length', shape=[1], dtype=dtype)   # unpadded len of all y_sequences in batch

    # Lambda layer with ctc_loss function due to Keras not supporting CTC layers
    loss_out = Lambda(function=ctc_lambda_func, name='ctc', output_shape=(1,))(
                      [y_pred, labels, input_length, label_length])

    network_model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    return network_model

In [35]:
def blstm(units, input_dim=26, output_dim=29, dropout=0.2, numb_of_dense=3, cudnn=False, n_layers=1):
    """
    Build a bidirectional-LSTM acoustic model whose output is the CTC loss.

    :param units: Hidden units per layer
    :param input_dim: Size of input dimension (number of features), default=26
    :param output_dim: Output dim of final layer of model (input to CTC layer), default=29
    :param dropout: Dropout rate, default=0.2
    :param numb_of_dense: Number of fully connected layers before recurrent, default=3
    :param cudnn: Whether to use the CuDNN optimized LSTM (GPU only), default=False
    :param n_layers: Number of stacked BLSTM layers, default=1
    :return: network_model: Keras Model with inputs
             [the_input, the_labels, input_length, label_length] and the 'ctc'
             Lambda layer as its only output
    Default model contains:
     1 layer of masking (skipped when cudnn=True)
     3 layers of fully connected clipped ReLu (DNN) with dropout 20 % between each layer
     1 layer of BLSTM
     1 layer of fully connected ReLu (DNN, not clipped) followed by 20 % dropout
     1 layer of softmax
    """

    # Input data type
    dtype = 'float32'

    # Kernel and bias initializers for fully connected dense layers
    kernel_init_dense = 'random_normal'
    bias_init_dense = 'random_normal'

    # Kernel and bias initializers for recurrent layer
    kernel_init_rnn = 'glorot_uniform'
    bias_init_rnn = 'random_normal'

    # ---- Network model ----
    # x_input layer, dim: (batch_size * x_seq_size * features)
    input_data = Input(name='the_input', shape=(None, input_dim), dtype=dtype)

    if cudnn:
        # CuDNNLSTM does not support masking
        x = input_data
    else:
        # Masking layer: time steps whose features are all 0 are masked out
        x = Masking(mask_value=0., name='masking')(input_data)

    # Leading stack of fully connected clipped-ReLu layers, each followed by dropout
    for i in range(numb_of_dense):
        x = TimeDistributed(Dense(units=units, kernel_initializer=kernel_init_dense,
                                  bias_initializer=bias_init_dense, activation=clipped_relu),
                            name='fc_' + str(i + 1))(x)
        x = TimeDistributed(Dropout(dropout), name='dropout_' + str(i + 1))(x)

    # Stacked bidirectional LSTM layers; forward and backward outputs are summed.
    # NOTE(review): the non-CuDNN path uses activation='relu' with dropout, while the
    # CuDNN path passes neither, so the two paths do not build equivalent models.
    # An unbounded ReLU inside an LSTM may also contribute to the NaN loss seen in
    # training -- worth confirming.
    for i in range(n_layers):
        if cudnn:
            # If running on GPU, use the CuDNN optimised LSTM model
            x = Bidirectional(CuDNNLSTM(units, kernel_initializer=kernel_init_rnn,
                                        bias_initializer=bias_init_rnn, unit_forget_bias=True,
                                        return_sequences=True),
                              merge_mode='sum', name='CuDNN_bi_lstm' + str(i + 1))(x)
        else:
            x = Bidirectional(LSTM(units, activation='relu', kernel_initializer=kernel_init_rnn,
                                   dropout=dropout, bias_initializer=bias_init_rnn,
                                   return_sequences=True),
                              merge_mode='sum', name='bi_lstm' + str(i + 1))(x)

    # Trailing fully connected ReLu layer with dropout.
    # It keeps its historical name 'fc_4'/'dropout_4' whenever that name is free, but
    # moves past the leading stack when numb_of_dense >= 4: otherwise two layers would
    # share a name, and Keras requires layer names within a model to be unique.
    last_fc_idx = max(numb_of_dense + 1, 4)
    x = TimeDistributed(Dense(units=units, kernel_initializer=kernel_init_dense,
                              bias_initializer=bias_init_dense, activation='relu'),
                        name='fc_' + str(last_fc_idx))(x)
    x = TimeDistributed(Dropout(dropout), name='dropout_' + str(last_fc_idx))(x)

    # Output layer with softmax
    y_pred = TimeDistributed(Dense(units=output_dim, kernel_initializer=kernel_init_dense,
                                   bias_initializer=bias_init_dense, activation='softmax'),
                             name='softmax')(x)

    # ---- CTC ----
    # y_input layers (transcription data) for CTC loss
    labels = Input(name='the_labels', shape=[None], dtype=dtype)       # transcription data (batch_size * y_seq_size)
    input_length = Input(name='input_length', shape=[1], dtype=dtype)  # unpadded len of all x_sequences in batch
    label_length = Input(name='label_length', shape=[1], dtype=dtype)  # unpadded len of all y_sequences in batch

    # Lambda layer with ctc_loss function due to Keras not supporting CTC layers
    loss_out = Lambda(function=ctc_lambda_func, name='ctc', output_shape=(1,))(
                      [y_pred, labels, input_length, label_length])

    network_model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    return network_model

In [43]:
# The model's only output (the 'ctc' Lambda layer) already carries the CTC cost,
# so the Keras-side loss simply forwards the model output and ignores y_true.
loss = {'ctc': lambda y_true, y_pred: y_pred}
training_generator = data_gen

# input_dim is taken from n_mfcc (the value the data generator was configured with
# via mfcc_features) so the model input and the generated batches cannot drift apart.
#model = deep_rnn(units=256, input_dim=n_mfcc, output_dim=29, dropout=0.2, numb_of_dense=3, n_layers=3)
model = blstm(units=256, input_dim=n_mfcc, output_dim=29, dropout=0.2, numb_of_dense=1, cudnn=False, n_layers=1)
model_train_params = {'generator': training_generator,
                      'epochs': 4,
                      'verbose': 2,
                      #'validation_data': validation_generator,
                      'workers': 1,
                      'shuffle': shuffle}

optimizer = Adam(lr=0.0001, epsilon=1e-8, clipnorm=2.0)

model.compile(loss=loss, optimizer=optimizer)
# NOTE(review): the recorded run of this cell reports `loss: nan` after epoch 1 and then
# aborts with "Not enough time for target transition sequence", i.e. at least one
# transcript needs more CTC steps than its utterance provides. Such samples presumably
# have to be filtered out (or handled) in the data generator -- confirm there.
model.fit_generator(**model_train_params)

loading and padding mfcc features: 100%|██████████| 32/32 [00:00<00:00, 255.88it/s]
loading and padding labels: 100%|██████████| 32/32 [00:00<00:00, 2182.19it/s]
loading and padding mfcc features:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 1/4


loading and padding mfcc features: 100%|██████████| 32/32 [00:00<00:00, 238.23it/s]
loading and padding labels: 100%|██████████| 32/32 [00:00<00:00, 2311.27it/s]
loading and padding mfcc features: 100%|██████████| 32/32 [00:00<00:00, 404.24it/s]
loading and padding labels: 100%|██████████| 32/32 [00:00<00:00, 2430.12it/s]
loading and padding mfcc features: 100%|██████████| 32/32 [00:00<00:00, 493.23it/s]
loading and padding labels: 100%|██████████| 32/32 [00:00<00:00, 3009.57it/s]
loading and padding mfcc features: 100%|██████████| 32/32 [00:00<00:00, 418.47it/s]
loading and padding labels: 100%|██████████| 32/32 [00:00<00:00, 3035.16it/s]
loading and padding mfcc features: 100%|██████████| 32/32 [00:00<00:00, 444.34it/s]
loading and padding labels: 100%|██████████| 32/32 [00:00<00:00, 2644.53it/s]
loading and padding mfcc features: 100%|██████████| 32/32 [00:00<00:00, 619.01it/s]
loading and padding labels: 100%|██████████| 32/32 [00:00<00:00, 3265.96it/s]
loading and padding mfcc fea

 - 90s - loss: nan


loading and padding mfcc features:  56%|█████▋    | 18/32 [00:00<00:00, 174.42it/s]

Epoch 2/4


loading and padding mfcc features: 100%|██████████| 32/32 [00:00<00:00, 152.83it/s]
loading and padding labels: 100%|██████████| 32/32 [00:00<00:00, 2544.17it/s]


InvalidArgumentError: Not enough time for target transition sequence (required: 678, available: 641)2You can turn this error into a warning by using the flag ignore_longer_outputs_than_inputs
	 [[{{node ctc_9/CTCLoss}}]]