In [None]:
from keras.layers.convolutional import Conv3D, ZeroPadding3D
from keras.layers.pooling import MaxPooling3D
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.recurrent import GRU
from keras.layers import Input
from keras.models import Model
#from lipnet.core.layers import CTC
from keras import backend as K

In [None]:
from keras.layers.core import Lambda
from keras import backend as K

# Actual loss calculation
def ctc_lambda_func(args):
    """Compute the batched CTC cost from a packed argument list.

    Args:
        args: a 4-element sequence ``(y_pred, labels, input_length,
            label_length)``, in that order.

    Returns:
        Whatever ``K.ctc_batch_cost`` returns for those arguments.
    """
    predictions, targets, prediction_lengths, target_lengths = args
    # The Keras image_ocr.py example discards the first two RNN time steps
    # here (y_pred[:, 2:, :]) because they tend to be garbage. This version
    # keeps every time step, so the slice below spans the full range.
    predictions = predictions[:, :, :]
    cost = K.ctc_batch_cost(targets, predictions, prediction_lengths, target_lengths)
    return cost

def CTC(name, args):
    """Wrap ``ctc_lambda_func`` in a ``Lambda`` layer and apply it to ``args``.

    Args:
        name: name given to the Lambda layer.
        args: the inputs forwarded to the layer (and on to ``ctc_lambda_func``).

    Returns:
        The output of the Lambda layer, declared with output shape ``(1,)``.
    """
    ctc_layer = Lambda(ctc_lambda_func, output_shape=(1,), name=name)
    return ctc_layer(args)

In [None]:
# Shape walkthrough of the convolutional front end. Every layer's output
# shape is printed so the effect of each padding/conv/pool step is visible.
# Channels-last layout is used; the channels-first form would be (3, 10, 64, 64).
input_shape = (10, 64, 64, 3)

# Three pad -> conv -> pool -> dropout stages, applied in order.
conv_stages = (
    (
        ZeroPadding3D(padding=(1, 2, 2), name='zero1'),
        Conv3D(32, (3, 5, 5), strides=(1, 2, 2), activation='relu',
               kernel_initializer='he_normal', name='conv1'),
        MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max1'),
        Dropout(0.5),
    ),
    (
        ZeroPadding3D(padding=(1, 2, 2), name='zero2'),
        Conv3D(64, (3, 5, 5), strides=(1, 1, 1), activation='relu',
               kernel_initializer='he_normal', name='conv2'),
        MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max2'),
        Dropout(0.5),
    ),
    (
        ZeroPadding3D(padding=(1, 1, 1), name='zero3'),
        Conv3D(96, (3, 3, 3), strides=(1, 1, 1), activation='relu',
               kernel_initializer='he_normal', name='conv3'),
        MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max3'),
        Dropout(0.5),
    ),
)

x = Input(name='the_input', shape=input_shape, dtype='float32')
print(x.shape)
for stage in conv_stages:
    for layer in stage:
        x = layer(x)
        print(x.shape)
    print("=====================================================")

# Flatten each frame's feature map into one vector per time step.
x = TimeDistributed(Flatten())(x)
print(x.shape)

# Softmax is applied directly to the flattened features in this walkthrough.
y_pred = Activation('softmax', name='softmax')(x)
print(y_pred.shape)

(None, 10, 64, 64, 3)
(None, 12, 68, 68, 3)
(None, 10, 32, 32, 32)
(None, 10, 16, 16, 32)
(None, 10, 16, 16, 32)
(None, 12, 20, 20, 32)
(None, 10, 16, 16, 64)
(None, 10, 8, 8, 64)
(None, 10, 8, 8, 64)
(None, 12, 10, 10, 64)
(None, 10, 8, 8, 96)
(None, 10, 4, 4, 96)
(None, 10, 4, 4, 96)
(None, 10, 1536)
(None, 10, 1536)


In [None]:
class LipNet(object):
    """3D-convolution + bidirectional-GRU network trained with a CTC loss.

    The graph is built eagerly in ``__init__`` (via ``build``). Every
    intermediate tensor is kept as an attribute (``zero1``, ``conv1``, ...,
    ``y_pred``, ``loss_out``) and the trainable graph is exposed as
    ``self.model``.

    Args:
        img_c: number of image channels per frame.
        img_w: frame width in pixels.
        img_h: frame height in pixels.
        frames_n: number of frames per input clip.
        absolute_max_len: length of the (padded) label vector fed to the
            ``the_labels`` input.
        output_size: number of units in the final Dense/softmax layer.

    The defaults for the clip geometry and label length mirror the values
    used by the exploratory cells of this notebook (10 frames of 64x64x3,
    labels of length 40).
    NOTE(review): the default ``output_size=28`` is an assumption (e.g. 26
    letters + space + CTC blank) -- confirm against the label vocabulary.
    """

    def __init__(self, img_c=3, img_w=64, img_h=64, frames_n=10,
                 absolute_max_len=40, output_size=28):
        self.img_c = img_c
        self.img_w = img_w
        self.img_h = img_h
        self.frames_n = frames_n
        # Needed by build() for the shape of the label input.
        self.absolute_max_len = absolute_max_len
        self.output_size = output_size
        self.build()

    def build(self):
        """Create all layers, the CTC loss tensor and ``self.model``."""
        # Channels-last layout only: (frames, width, height, channels).
        input_shape = (self.frames_n, self.img_w, self.img_h, self.img_c)

        self.input_data = Input(name='the_input', shape=input_shape, dtype='float32')

        # Stage 1: the only stage whose convolution strides spatially (2x2).
        self.zero1 = ZeroPadding3D(padding=(1, 2, 2), name='zero1')(self.input_data)
        self.conv1 = Conv3D(32, (3, 5, 5), strides=(1, 2, 2), activation='relu',
                            kernel_initializer='he_normal', name='conv1')(self.zero1)
        self.maxp1 = MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max1')(self.conv1)
        self.drop1 = Dropout(0.5)(self.maxp1)

        # Stage 2
        self.zero2 = ZeroPadding3D(padding=(1, 2, 2), name='zero2')(self.drop1)
        self.conv2 = Conv3D(64, (3, 5, 5), strides=(1, 1, 1), activation='relu',
                            kernel_initializer='he_normal', name='conv2')(self.zero2)
        self.maxp2 = MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max2')(self.conv2)
        self.drop2 = Dropout(0.5)(self.maxp2)

        # Stage 3
        self.zero3 = ZeroPadding3D(padding=(1, 1, 1), name='zero3')(self.drop2)
        self.conv3 = Conv3D(96, (3, 3, 3), strides=(1, 1, 1), activation='relu',
                            kernel_initializer='he_normal', name='conv3')(self.zero3)
        self.maxp3 = MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max3')(self.conv3)
        self.drop3 = Dropout(0.5)(self.maxp3)

        # Flatten each frame's feature map so the GRUs see one vector per step.
        self.resh1 = TimeDistributed(Flatten())(self.drop3)

        self.gru_1 = Bidirectional(
            GRU(256, return_sequences=True, kernel_initializer='Orthogonal', name='gru1'),
            merge_mode='concat')(self.resh1)
        self.gru_2 = Bidirectional(
            GRU(256, return_sequences=True, kernel_initializer='Orthogonal', name='gru2'),
            merge_mode='concat')(self.gru_1)

        # transforms RNN output to character activations:
        self.dense1 = Dense(self.output_size, kernel_initializer='he_normal',
                            name='dense1')(self.gru_2)

        self.y_pred = Activation('softmax', name='softmax')(self.dense1)

        # Extra inputs required by the CTC loss.
        self.labels = Input(name='the_labels', shape=[self.absolute_max_len], dtype='float32')
        self.input_length = Input(name='input_length', shape=[1], dtype='int64')
        self.label_length = Input(name='label_length', shape=[1], dtype='int64')

        # The CTC layer produces the loss tensor; the Model then ties the four
        # inputs to that loss so the network can be trained end to end.
        self.loss_out = CTC('ctc', [self.y_pred, self.labels,
                                    self.input_length, self.label_length])
        self.model = Model(
            inputs=[self.input_data, self.labels, self.input_length, self.label_length],
            outputs=self.loss_out)

In [None]:
# Auxiliary inputs for the CTC loss: the label vector (40 slots) plus one
# length value each for the prediction sequence and the label sequence.
labels = Input(shape=[40], dtype='float32', name='the_labels')
input_length = Input(shape=[1], dtype='int64', name='input_length')
label_length = Input(shape=[1], dtype='int64', name='label_length')

In [None]:
# Sanity check: the label input is (batch, 40).
labels.shape

TensorShape([None, 40])

In [None]:
# Sanity check: one prediction-sequence length per sample, i.e. (batch, 1).
input_length.shape

TensorShape([None, 1])

In [None]:
# Sanity check: one label length per sample, i.e. (batch, 1).
label_length.shape

TensorShape([None, 1])

In [None]:
# Feed the softmax output and the three auxiliary inputs to the CTC layer.
loss = CTC(
    'ctc',
    [y_pred, labels, input_length, label_length],
)

In [None]:
# Sanity check: the CTC layer yields one loss value per sample, i.e. (batch, 1).
loss.shape

TensorShape([None, 1])