In [None]:
import os
import fnmatch
import cv2
import numpy as np
import string
import time
 
from keras.preprocessing.sequence import pad_sequences
 
from keras.layers import Dense, LSTM, Reshape, BatchNormalization, Input, Conv2D, MaxPool2D, Lambda, Bidirectional
from keras.models import Model
from keras.activations import relu, sigmoid, softmax
import keras.backend as K
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
 
import tensorflow as tf

In [None]:
!wget https://thor.robots.ox.ac.uk/~vgg/data/text/mjsynth.tar.gz


In [None]:
!tar -xvzf mjsynth.tar.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
mnt/ramdisk/max/90kDICT32px/2717/2/291_Dethrone_21191.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/290_cruelly_18370.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/289_works_87244.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/288_UPHOLSTERED_83520.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/287_plaited_58007.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/286_TOURISTY_79890.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/285_slavic_71550.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/284_labrador_43023.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/283_DETERMINEDLY_21171.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/282_femur_28535.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/281_HESITATE_35963.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/280_FANCYING_28069.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/279_Meadow_47378.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/278_Carters_11749.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/277_Geologically_32213.jpg
mnt/ramdisk/max/90kDICT32px/2717/2/276_Scullions_68804.jpg
mnt/ramd

In [None]:
# Alphabet recognised by the model: a-z, A-Z, 0-9. A character's position in
# this string is its class index.
char_list = string.ascii_letters+string.digits

def encode_to_labels(txt):
    """Encode a word as a list of class indices into ``char_list``.

    Characters that are not part of ``char_list`` are printed and left out of
    the result (best effort), so the returned list can be shorter than ``txt``.

    Args:
        txt: the word to encode (a string).

    Returns:
        list[int]: one index per recognised character, in order.
    """
    dig_lst = []
    for char in txt:
        try:
            dig_lst.append(char_list.index(char))
        except ValueError:
            # Only "character not in alphabet" is tolerated here; anything else
            # (e.g. KeyboardInterrupt, a non-string input) must propagate.
            print(char)

    return dig_lst

In [None]:
path = '/content/mnt/ramdisk/max/90kDICT32px'

# Every image is padded onto a fixed IMG_HEIGHT x IMG_WIDTH canvas.
IMG_HEIGHT = 32
IMG_WIDTH = 128
# Sequence length reported to the CTC loss for every sample.
CTC_INPUT_LENGTH = 31
# Stop after this many usable samples have been collected.
MAX_SAMPLES = 50000
# Every VALID_EVERY-th usable sample goes to the validation set (10%).
VALID_EVERY = 10


def _load_padded_image(img_path):
    """Read an image as grayscale, pad it to 32x128 and scale it to [0, 1].

    Padding is white (255) and is added at the bottom and on the right.

    Args:
        img_path: path of the image file.

    Returns:
        A float32 array of shape (IMG_HEIGHT, IMG_WIDTH, 1), or ``None`` when
        the file cannot be decoded or is larger than the canvas.
    """
    bgr = cv2.imread(img_path)
    if bgr is None:
        # cv2.imread signals an unreadable/corrupt file by returning None;
        # passing that to cvtColor would abort the whole loading loop.
        return None
    img = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # ndarray.shape is (rows, cols) == (height, width)
    height, width = img.shape
    if height > IMG_HEIGHT or width > IMG_WIDTH:
        return None
    if height < IMG_HEIGHT:
        bottom_pad = np.ones((IMG_HEIGHT - height, width)) * 255
        img = np.concatenate((img, bottom_pad))
    if width < IMG_WIDTH:
        right_pad = np.ones((IMG_HEIGHT, IMG_WIDTH - width)) * 255
        img = np.concatenate((img, right_pad), axis=1)
    img = np.expand_dims(img, axis=2)

    # Normalize; float32 halves the memory of the default float64 result.
    return (img / 255.).astype(np.float32)


# lists for training dataset
training_img = []
training_txt = []
train_input_length = []
train_label_length = []
orig_txt = []

# lists for validation dataset
valid_img = []
valid_txt = []
valid_input_length = []
valid_label_length = []
valid_orig_txt = []

max_label_len = 0

i = 1      # 1-based count of usable samples seen so far
flag = 0   # set to 1 once MAX_SAMPLES is reached, to leave the outer loop too

for root, dirnames, filenames in os.walk(path):

    for f_name in fnmatch.filter(filenames, '*.jpg'):
        # the ground-truth word is the second '_'-separated field of the name
        name_parts = f_name.split('_')
        if len(name_parts) < 2:
            continue
        txt = name_parts[1]

        img = _load_padded_image(os.path.join(root, f_name))
        if img is None:
            continue

        # encode_to_labels drops characters outside char_list, so the label
        # length must come from the encoded sequence, not from the raw text
        encoded_txt = encode_to_labels(txt)
        label_len = len(encoded_txt)

        # compute maximum length of the text
        if label_len > max_label_len:
            max_label_len = label_len

        # split the data into validation and training dataset as 10% and 90% respectively
        if i % VALID_EVERY == 0:
            valid_orig_txt.append(txt)
            valid_label_length.append(label_len)
            valid_input_length.append(CTC_INPUT_LENGTH)
            valid_img.append(img)
            valid_txt.append(encoded_txt)
        else:
            orig_txt.append(txt)
            train_label_length.append(label_len)
            train_input_length.append(CTC_INPUT_LENGTH)
            training_img.append(img)
            training_txt.append(encoded_txt)

        # break the loop once MAX_SAMPLES samples have been collected
        if i == MAX_SAMPLES:
            flag = 1
            break
        i += 1
    if flag == 1:
        break

In [None]:
# Right-pad every encoded label to max_label_len. The filler value is
# len(char_list), i.e. one past the last real character index.
pad_value = len(char_list)
train_padded_txt, valid_padded_txt = (
    pad_sequences(encoded_labels, maxlen=max_label_len, padding='post', value=pad_value)
    for encoded_labels in (training_txt, valid_txt)
)

In [None]:
# CRNN: a convolutional feature extractor followed by two bidirectional LSTMs
# and a per-time-step softmax over the characters plus one extra class.
# Shapes in the comments below are (height, width, channels), batch omitted.

# input with shape of height=32 and width=128 
inputs = Input(shape=(32,128,1))
 
conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(inputs)
pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1)   # -> (16, 64, 64)
 
conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
pool_2 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_2)   # -> (8, 32, 128)
 
conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)
 
conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(conv_3)
# (2, 1) pooling halves the height only, so the width (32) is preserved
pool_4 = MaxPool2D(pool_size=(2, 1))(conv_4)   # -> (4, 32, 256)
 
conv_5 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_4)
batch_norm_5 = BatchNormalization()(conv_5)
 
conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_6)
pool_6 = MaxPool2D(pool_size=(2, 1))(batch_norm_6)   # -> (2, 32, 512)
 
# no padding here: the 2x2 kernel collapses the height to 1 and trims the
# width from 32 to 31
conv_7 = Conv2D(512, (2,2), activation = 'relu')(pool_6)   # -> (1, 31, 512)
 
# drop the height axis (size 1) so the width axis becomes the sequence axis
squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv_7)   # -> (31, 512)
 
blstm_1 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(blstm_1)
 
# one softmax per time step over len(char_list) characters + 1 extra class
outputs = Dense(len(char_list)+1, activation = 'softmax')(blstm_2)

# image -> per-time-step class probabilities; no CTC loss attached
act_model = Model(inputs, outputs)

In [None]:
# Print the layer-by-layer output shapes and parameter counts of act_model.
act_model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 32, 128, 1)]      0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 32, 128, 64)       640       
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 16, 64, 64)        0         
_________________________________________________________________
conv2d_22 (Conv2D)           (None, 16, 64, 128)       73856     
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 8, 32, 128)        0         
_________________________________________________________________
conv2d_23 (Conv2D)           (None, 8, 32, 256)        295168    
_________________________________________________________________
conv2d_24 (Conv2D)           (None, 8, 32, 256)        5900

In [None]:
# Additional inputs that feed the CTC loss layer: the padded label sequence
# and, per sample, the prediction length and the label length.
labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')


def ctc_lambda_func(args):
    """Return the CTC batch cost for ``[y_pred, labels, input_length, label_length]``."""
    predictions, true_labels, prediction_lengths, true_label_lengths = args
    ctc_cost = K.ctc_batch_cost(true_labels, predictions, prediction_lengths, true_label_lengths)
    return ctc_cost


# wrap the loss computation in a layer so it becomes the model's output
ctc_layer = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')
loss_out = ctc_layer([outputs, labels, input_length, label_length])

# model to be used at training time
model = Model(
    inputs=[inputs, labels, input_length, label_length],
    outputs=loss_out,
)

In [None]:
# The 'ctc' layer already outputs the loss value, so the Keras loss function
# simply passes the model output (y_pred) through.
model.compile(
    optimizer='adam',
    loss={'ctc': lambda y_true, y_pred: y_pred},
)

# Write the model to disk only when val_loss improves on the best seen so far.
filepath = "best_model.hdf5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Turn the Python lists collected while loading into numpy arrays.
training_img, train_input_length, train_label_length = (
    np.array(values)
    for values in (training_img, train_input_length, train_label_length)
)

valid_img, valid_input_length, valid_label_length = (
    np.array(values)
    for values in (valid_img, valid_input_length, valid_label_length)
)

In [None]:
batch_size = 256
epochs = 10

# The training model takes four inputs; its output is already the CTC loss,
# so the targets handed to fit() are just zero placeholders.
train_inputs = [training_img, train_padded_txt, train_input_length, train_label_length]
valid_inputs = [valid_img, valid_padded_txt, valid_input_length, valid_label_length]

model.fit(
    x=train_inputs,
    y=np.zeros(len(training_img)),
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(valid_inputs, [np.zeros(len(valid_img))]),
    verbose=1,
    callbacks=callbacks_list,
)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: [Errno 28] No space left on device: '/tmp/tmpds01l15c.py'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: [Errno 28] No space left on device: '/tmp/tmpds01l15c.py'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: [Errno 28] No space left on device: '/tmp/tmp36syser3.py'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: [Errno 28] No space left on device: '/tmp/tmp36syser3.py'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (

OSError: ignored

In [None]:

# restore the checkpointed weights into the prediction model
act_model.load_weights('best_model.hdf5')

# run the first ten validation images through the network
prediction = act_model.predict(valid_img[:10])

# greedy CTC decoding; every sample uses the full number of time steps
sequence_lengths = np.ones(prediction.shape[0])*prediction.shape[1]
decoded = K.ctc_decode(prediction, input_length=sequence_lengths, greedy=True)
out = K.get_value(decoded[0][0])

# show ground truth next to the decoded text (-1 entries are skipped)
for sample_idx, label_ids in enumerate(out):
    print("original_text =  ", valid_orig_txt[sample_idx])
    print("predicted text = ", end = '')
    predicted_chars = [char_list[int(p)] for p in label_ids if int(p) != -1]
    print(''.join(predicted_chars), end = '')
    print('\n')

OSError: ignored