In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.layers import conv2d, max_pooling2d, flatten
from tensorflow.nn import relu


In [None]:
# Alphabet of the IAM dataset: 79 characters in total, grouped below as
# lowercase letters, uppercase letters, digits and punctuation.
# Adjacent string literals are concatenated by Python into a single string.
# NOTE(review): the quote characters are typographic (curly) quotes; confirm they
# match the characters used in the label files.
charList = (
    "qwertyuiopasdfghjklzxcvbnm"
    "QWERTYUIOPLKJHGFDSAZXCVBNM"
    "1234567890"
    "“!”#&’()*+,-./:;?"
)

## Example image

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
img = np.trunc(np.random.random((32, 128)) * 255)
plt.imshow(img, cmap='gray')

# CNN part of the model

In [None]:
def buildCNN(inputs):
    """Turn a batch of images into a feature sequence via five conv + max-pool stages.

    Args:
        inputs: image batch, expected shape (None, 32, 128, 1).

    Returns:
        Tensor of shape (None, 32, 256). The stages reduce axis 1 from 32 to 1
        and axis 2 from 128 to 32 with 256 channels, i.e. (None, 1, 32, 256),
        and the size-1 axis is then squeezed away.
    """
    # One (filters, kernel_size, pool_size) entry per stage. Each pooling layer
    # uses a stride equal to its window, so windows never overlap.
    stages = [
        (32, [5, 5], [2, 2]),
        (64, [5, 5], [2, 2]),
        (128, [3, 3], [2, 1]),
        (128, [3, 3], [2, 1]),
        (256, [3, 3], [2, 1]),
    ]

    with tf.name_scope("CNN"):
        features = inputs
        for numFilters, kernel, pool in stages:
            # 'same' padding keeps the spatial size; only the pooling shrinks it.
            features = conv2d(inputs=features, filters=numFilters, kernel_size=kernel, padding='same', activation=relu)
            features = max_pooling2d(features, pool_size=pool, strides=pool, padding="valid")

    # Axis 1 has been pooled down to size 1 (for 32-row inputs); drop it so the
    # result is a 3-D sequence tensor.
    return tf.squeeze(features, axis=1)

In [None]:
def buildRNN(inputs, numHidden=256, numLayers=2):
    """Run a stacked bidirectional LSTM over a feature sequence and project to character scores.

    Args:
        inputs: feature sequence of shape BxTxF (batch, time steps, features).
        numHidden: number of units in every LSTM cell (default 256).
        numLayers: number of stacked LSTM layers per direction (default 2).

    Returns:
        Tensor of shape BxTxC with C = len(charList) + 1, i.e. one score per
        character plus one extra "blank" class, for every time step.
    """
    def makeStack():
        # Build brand-new cell objects on every call. Passing one shared cell
        # instance as both cell_fw and cell_bw either fails with a scope-reuse
        # error or silently ties the forward and backward weights together,
        # depending on the TensorFlow 1.x version.
        cells = [tf.contrib.rnn.LSTMCell(num_units=numHidden, state_is_tuple=True) for _ in range(numLayers)]
        return tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

    with tf.name_scope("RNN"):
        # bidirectional RNN with an independent cell stack per direction
        # BxTxF -> (BxTxH, BxTxH)
        ((fw, bw), _) = tf.nn.bidirectional_dynamic_rnn(cell_fw=makeStack(), cell_bw=makeStack(), inputs=inputs, dtype=inputs.dtype)

        # BxTxH + BxTxH -> BxTx2H -> BxTx1x2H
        concat = tf.expand_dims(tf.concat([fw, bw], 2), 2)

        # project output to chars (including blank): BxTx1x2H -> BxTx1xC -> BxTxC
        # A 1x1 kernel with rate=1 applies the same linear map at every time step.
        kernel = tf.Variable(tf.truncated_normal([1, 1, numHidden * 2, len(charList) + 1], stddev=0.1))
        output = tf.squeeze(tf.nn.atrous_conv2d(value=concat, filters=kernel, rate=1, padding='SAME'), axis=2)

    return output

In [None]:
# Assemble the graph: image placeholder -> CNN feature extractor -> RNN character scorer.
with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
    # Batch of single-channel 32x128 images; the batch size is left open.
    inputImgs = tf.placeholder(dtype=tf.float32, shape=[None, 32, 128, 1])
    # The CNN output is fed directly into the recurrent part.
    cnn = buildCNN(inputs=inputImgs)
    rnn = buildRNN(inputs=cnn)

In [None]:
with tf.Session() as sess:
    # Write the graph definition to ./logs so it can be inspected with TensorBoard.
    writer = tf.summary.FileWriter("logs", sess.graph)
    # FileWriter queues events and writes them asynchronously. Closing it flushes
    # the graph event to disk and releases the file handle.
    writer.close()