In [4]:
import torch
import torch.nn as nn
from collections import OrderedDict

class ConvReluBlock(nn.Module):
    '''
    One or two Conv2d stages, each followed by LeakyReLU.

    Stage 1 is ``Conv2d -> LeakyReLU``. Stage 2 is only built when
    ``in_channels`` has more than one entry and is
    ``Conv2d -> BatchNorm2d -> LeakyReLU``.

    With ``bottle_conv=True`` each stage becomes a bottleneck:
    ``1x1 Conv2d (to out // 2) -> LeakyReLU -> main conv at out // 2
    [-> BatchNorm2d] -> LeakyReLU -> 1x1 Conv2d (to 2 * (out // 2)) -> LeakyReLU``.

    :param in_channels: sequence of int
        input channels of stage 1 (and stage 2, if present)
    :param out_channels: sequence of int
        output channels of stage 1 (and stage 2, if present)
    :param kernel_sizes: sequence
        kernel size of the main conv of each stage
    :param padding_sizes: sequence
        padding of the main conv of each stage
    :param res_conn: boolean
        if True, the block input is added to the block output
    :param bottle_conv: boolean
        if True, wrap every stage in the 1x1 bottleneck described above

    The argument sequences are only read, never modified.
    '''
    def __init__(self, in_channels, out_channels, kernel_sizes, padding_sizes,
                 res_conn=False, bottle_conv=False, **kwargs):
        super(ConvReluBlock, self).__init__(**kwargs)
        self.res_conn = res_conn
        # Stage 1: no batch norm
        layers = self._make_stage(in_channels[0], out_channels[0],
                                  kernel_sizes[0], padding_sizes[0],
                                  bottle_conv, batch_norm=False)
        if len(in_channels) > 1:
            # Stage 2: with batch norm
            layers += self._make_stage(in_channels[1], out_channels[1],
                                       kernel_sizes[1], padding_sizes[1],
                                       bottle_conv, batch_norm=True)
        self.block = nn.Sequential(*layers)

    @staticmethod
    def _make_stage(in_ch, out_ch, kernel_size, padding, bottle_conv, batch_norm):
        '''Return the layer list of one stage (layer order is part of the
        state_dict layout, so it must not change).'''
        layers = []
        width = out_ch
        if bottle_conv:
            # 1x1 reduction; the main conv then runs at half width
            width = out_ch // 2
            layers += [
                nn.Conv2d(in_ch, width, (1, 1), stride=1, padding=(0, 0)),
                nn.LeakyReLU()
            ]
            in_ch = width
        layers.append(nn.Conv2d(in_ch, width, kernel_size,
                                stride=1, padding=padding))
        if batch_norm:
            layers.append(nn.BatchNorm2d(width))
        layers.append(nn.LeakyReLU())
        if bottle_conv:
            # 1x1 expansion back to twice the reduced width
            layers += [
                nn.Conv2d(width, width * 2, (1, 1), stride=1, padding=(0, 0)),
                nn.LeakyReLU()
            ]
        return layers

    def forward(self, x):
        out = self.block(x)
        # residual connection; out-of-place so the tensor produced by the
        # last layer of the block is left untouched
        if self.res_conn:
            out = out + x
        return out

    
class CRNN(nn.Module):
    '''
    CNN feature extractor followed by a 2-layer bidirectional LSTM.
    Same as cnocr/symbols/crnn.py/crnn_lstm_lite.

    The CNN shrinks the image width to width // 4 - 1 (e.g. 560 => 139) and
    must shrink the height to exactly 1 (true for 32-pixel-high inputs).

    :param dropout: float
        if > 0, a Dropout layer is appended to the CNN
    :param rnn_hidden_size: int
        hidden size of each LSTM direction
    :param batch_first: boolean
        layout of the returned sequence: (N, T, 2 * hidden) if True,
        (T, N, 2 * hidden) otherwise
    '''
    def __init__(self, dropout=0., rnn_hidden_size=100,
                 batch_first=False, **kwargs):
        super(CRNN, self).__init__(**kwargs)
        self.batch_first = batch_first
        # Per-conv hyper-parameters (7 convs), then the 3 pooling layers
        kernels = [(3, 3)] * 6 + [(4, 1)]
        paddings = [(1, 1)] * 6 + [(0, 0)]
        channels = [64, 128, 256, 512, 512, 512, 512]
        pool_window = (2, 2)
        pool_strides = [(2, 2), (2, 2), (2, 1)]

        # Shapes below assume an input of (N, 1, 32, 560)
        stages = OrderedDict()
        # => (N, 128, 32, 560)
        stages['ConvBlock-0'] = ConvReluBlock(
            [1, channels[0]], channels[0:2], kernels[0:2], paddings[0:2])
        # => (N, 128, 16, 280)
        stages['Pool-0'] = nn.MaxPool2d(pool_window, pool_strides[0])
        # => (N, 512, 16, 280)
        stages['ConvBlock-1'] = ConvReluBlock(
            channels[1:3], channels[2:4], kernels[2:4], paddings[2:4])
        # => (N, 512, 8, 140)
        stages['Pool-1'] = nn.MaxPool2d(pool_window, pool_strides[1])
        # => (N, 512, 8, 140)
        stages['BottleBlock-0'] = ConvReluBlock(
            channels[3:5], channels[4:6], kernels[4:6], paddings[4:6],
            bottle_conv=True, res_conn=True)
        # => (N, 512, 4, 139)
        stages['Pool-2'] = nn.MaxPool2d(pool_window, pool_strides[2])
        # => (N, 512, 1, 139)
        stages['BottleBlock-1'] = ConvReluBlock(
            channels[5:6], channels[6:], kernels[6:], paddings[6:],
            bottle_conv=True)
        self.cnn = nn.Sequential(stages)
        if dropout > 0:
            self.cnn.add_module('Dropout-0', nn.Dropout(dropout))

        # 2-layer Bi-LSTM fed with one feature vector per image column
        self.rnn = nn.LSTM(input_size=channels[-1],
                           hidden_size=rnn_hidden_size,
                           num_layers=2,
                           batch_first=batch_first,
                           bidirectional=True)

    def forward(self, x):
        features = self.cnn(x)
        _, _, h, _ = features.size()
        assert h == 1, f'the output height of conv must be 1 instead of {h}'
        # Drop the height axis, then move width to the sequence axis:
        # (N, C, W) -> (N, W, C) when batch_first, else (W, N, C)
        axis_order = (0, 2, 1) if self.batch_first else (2, 0, 1)
        sequence = features.squeeze(2).permute(*axis_order)
        # => (N, 139, 200) / (139, N, 200) with the default hidden size
        rnn_output, _ = self.rnn(sequence)
        return rnn_output
    
    
class OCR(nn.Module):
    '''
    CRNN backbone plus a linear projection to per-time-step class scores.

    :param num_classes: int
        size of the output vocabulary
    :param dropout: float
        forwarded to CRNN
    :param rnn_hidden_size: int
        hidden size of each LSTM direction; the projection consumes
        2 * rnn_hidden_size features because the LSTM is bidirectional
    :param batch_first: boolean
        output layout: (N, T, num_classes) if True, else (T, N, num_classes)
    '''
    def __init__(self, num_classes, dropout=0., rnn_hidden_size=100,
                 batch_first=False, **kwargs):
        super(OCR, self).__init__(**kwargs)
        self.batch_first = batch_first
        self.crnn = CRNN(dropout=dropout,
                         rnn_hidden_size=rnn_hidden_size,
                         batch_first=batch_first)
        self.proj = nn.Linear(2 * rnn_hidden_size, num_classes)

    def forward(self, x):
        rnn_output = self.crnn(x)
        # nn.Linear acts on the last dimension and keeps every leading
        # dimension, so the sequence does not have to be flattened (and made
        # contiguous) first; the result already has the caller's layout.
        return self.proj(rnn_output)

In [5]:
# Quick comparison of the two sequence layouts of the same model.
# Observation from earlier runs: crnn_bf is sometimes even faster than crnn.
# NOTE(review): each %time below is a single un-warmed forward pass on freshly
# constructed modules (no .eval() / torch.no_grad() call here), so the numbers
# are only a rough indication.
crnn_bf = OCR(5000, batch_first=True)
crnn = OCR(5000, batch_first=False)
# Count only the parameters that receive gradients
print('#trainable parameters:', sum(p.numel() for p in crnn.parameters() if p.requires_grad))
print(crnn, '\n', '='*50, '\n')
# Dummy batch: 8 single-channel images of height 32 and width 560
inp = torch.randn(8, 1, 32, 560)
print('crnn')
%time out = crnn(inp)
print('crnn_bf')
%time out_bf = crnn_bf(inp)
# The two outputs differ only in the order of the first two axes
print(out.shape, out_bf.shape)

#trainable parameters: 5520712
OCR(
  (crnn): CRNN(
    (cnn): Sequential(
      (ConvBlock-0): ConvReluBlock(
        (block): Sequential(
          (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (1): LeakyReLU(negative_slope=0.01)
          (2): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (4): LeakyReLU(negative_slope=0.01)
        )
      )
      (Pool-0): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
      (ConvBlock-1): ConvReluBlock(
        (block): Sequential(
          (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (1): LeakyReLU(negative_slope=0.01)
          (2): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (4): Lea

In [1]:
import mxnet as mx
from mxnet.gluon.rnn.rnn_layer import LSTM


class CRNN_mxnet:
    '''
    Builds the MXNet symbol graph of a CRNN: a convolutional stack, a 2-layer
    bidirectional LSTM and a fully connected classifier named 'pred_fc'.

    NOTE(review): the explicit layer names ('conv-%d', 'batchnorm-%d', ...)
    presumably have to match the parameter names stored in the checkpoint
    that is loaded elsewhere -- confirm before renaming anything.
    '''
    def __init__(self, num_classes, dropout=0., rnn_hidden_size=100,
                 inference=True, img_width=560, num_label=20):
        '''
        same as conv-lite-lstm in CnOcr
        :param num_classes: int
            number of outputs of the final FullyConnected layer
        :param dropout: float
            if > 0, a Dropout symbol is appended after the conv stack
        :param rnn_hidden_size: int
            hidden size passed to the LSTM
        :param inference: boolean
            indicates evaluation without training
        :param img_width: int
            input width, only used to derive self.seq_len
        :param num_label: int
            accepted but not read anywhere in this class
        '''
        # 560 => 140 - 1 = 139
        seq_len_cmpr_ratio = 4
        self.seq_len = img_width // seq_len_cmpr_ratio - 1
        self.dropout = dropout
        self.inference = inference
        self.num_classes = num_classes
        self.rnn_hidden_size = rnn_hidden_size

    def convRelu(self, idx, input_data, kernel_size, layer_size, padding_size,
                 batch_norm=True):
        '''
        Convolution -> (optional BatchNorm) -> LeakyReLU.

        :param idx: int, used to build the symbol names
        :param input_data: input symbol
        :param kernel_size: kernel of the convolution
        :param layer_size: number of filters
        :param padding_size: padding of the convolution
        :param batch_norm: boolean, insert BatchNorm before the activation
        :return: output symbol of the LeakyReLU
        '''
        layer = mx.symbol.Convolution(
            name='conv-%d' % idx,
            data=input_data,
            kernel=kernel_size,
            pad=padding_size,
            num_filter=layer_size,
        )
        if batch_norm:
            layer = mx.sym.BatchNorm(data=layer, name='batchnorm-%d' % idx)
        layer = mx.sym.LeakyReLU(data=layer, name='leakyrelu-%d' % idx)
        return layer

    def bottle_conv(self, idx, input_data, kernel_size, layer_size, padding_size,
                    batch_norm=True):
        '''
        Bottleneck variant of convRelu:
        1x1 conv to layer_size // 2 -> LeakyReLU -> conv with kernel_size at
        layer_size // 2 -> LeakyReLU -> 1x1 conv to layer_size
        -> (optional BatchNorm) -> LeakyReLU.

        Parameters and return value have the same meaning as in convRelu.
        '''
        bottle_channel = layer_size // 2
        # 1x1 reduction
        layer = mx.symbol.Convolution(
            name='conv-%d-1-1x1' % idx,
            data=input_data,
            kernel=(1, 1),
            pad=(0, 0),
            num_filter=bottle_channel,
        )
        layer = mx.sym.LeakyReLU(data=layer, name='leakyrelu-%d-1' % idx)
        # main convolution at the reduced width
        layer = mx.symbol.Convolution(
            name='conv-%d' % idx,
            data=layer,
            kernel=kernel_size,
            pad=padding_size,
            num_filter=bottle_channel,
        )
        layer = mx.sym.LeakyReLU(data=layer, name='leakyrelu-%d-2' % idx)
        # 1x1 expansion back to layer_size
        layer = mx.symbol.Convolution(
            name='conv-%d-2-1x1' % idx,
            data=layer,
            kernel=(1, 1),
            pad=(0, 0),
            num_filter=layer_size,
        )
        if batch_norm:
            layer = mx.sym.BatchNorm(data=layer, name='batchnorm-%d' % idx)
        layer = mx.sym.LeakyReLU(data=layer, name='leakyrelu-%d' % idx)
        return layer

    def gen_network(self, data):
        '''
        Build the conv stack and the LSTM on top of `data`.

        :param data: input symbol
        :return: output symbol of the 2-layer bidirectional LSTM
        '''
        kernel_size = [(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]
        padding_size = [(1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1)]
        # => [64, 128, 256, 512, 512, 512]
        layer_size = [min(32 * 2 ** (i + 1), 512) for i in range(len(kernel_size))]

        # Every conv stage below runs with batch_norm=True (the default)
        net = self.convRelu(
            0, data, kernel_size[0], layer_size[0], padding_size[0]
        )
        net = self.convRelu(
            1, net, kernel_size[1], layer_size[1], padding_size[1], True
        )
        net = mx.sym.Pooling(
            data=net, name='pool-0', pool_type='max', kernel=(2, 2), stride=(2, 2)
        )
        net = self.convRelu(
            2, net, kernel_size[2], layer_size[2], padding_size[2]
        )
        net = self.convRelu(
            3, net, kernel_size[3], layer_size[3], padding_size[3], True
        )
        # x keeps the pooled features for the residual sum two stages later
        x = net = mx.sym.Pooling(
            data=net, name='pool-1', pool_type='max', kernel=(2, 2), stride=(2, 2)
        )
        net = self.bottle_conv(4, net, kernel_size[4], layer_size[4], padding_size[4])
        # residual connection around bottleneck stages 4 and 5
        net = self.bottle_conv(5, net, kernel_size[5], layer_size[5], padding_size[5], True) + x
        # stride (2, 1): only the first spatial axis is strided by 2
        net = mx.symbol.Pooling(
            data=net, name='pool-2', pool_type='max', kernel=(2, 2), stride=(2, 1)
        )
        # (4, 1) kernel without padding -- presumably collapses the remaining
        # height of 4 to 1 for 32-pixel-high inputs; TODO confirm
        net = self.bottle_conv(6, net, (4, 1), layer_size[5], (0, 0))
        if self.dropout > 0.:
            net = mx.symbol.Dropout(data=net, p=self.dropout)

        # res: bz x emb_size x seq_len
        net = mx.symbol.squeeze(net, axis=2)
        # => seq_len x bz x emb_size
        net = mx.symbol.transpose(net, axes=(2, 0, 1))
        # Gluon LSTM layer (2 layers, bidirectional) applied to the symbol
        seq_model = LSTM(self.rnn_hidden_size, 2, bidirectional=True)
        hidden_concat = seq_model(net)
        return hidden_concat

    def get_network(self, data=None):
        '''
        Build the complete symbol.

        :param data: not used; the input is always the variable named 'data'
        :return: the softmax symbol when self.inference is True, otherwise a
            Group of [gradient-blocked softmax output, CTC loss]
        '''
        # placeholder of input data
        self.data = mx.sym.Variable('data')
        # Note that the name of label is `label` instead of the
        # default `softmax_label` in mxnet
        self.label = mx.sym.Variable('label')
        output = self.gen_network(self.data)
        # -3 merges the first two axes, -2 copies the remaining ones
        # => (seq_len * batch_size, 2 * rnn_hidden_size); the LSTM is
        # bidirectional, so both directions are concatenated
        output = mx.symbol.reshape(output, shape=(-3, -2))
        # => ((seq_len * batch_size), num_classes)
        pred = mx.sym.FullyConnected(data=output,
                                     num_hidden=self.num_classes,
                                     name='pred_fc')
        if self.inference:
            return mx.sym.softmax(data=pred, name='softmax')
        else:
            # training with CTC loss
            # -4 splits the first axis into (seq_len, inferred), 0 copies the
            # class axis
            # => (seq_len, batch_size, num_classes)
            pred_ctc = mx.sym.Reshape(data=pred, shape=(-4, self.seq_len, -1, 0))
            loss = mx.sym.contrib.ctc_loss(data=pred_ctc, label=self.label)
            ctc_loss = mx.sym.MakeLoss(loss)
            # Softmax output is exposed for monitoring only: BlockGrad stops
            # any gradient from flowing through it
            softmax_class = mx.symbol.SoftmaxActivation(data=pred)
            softmax_loss = mx.sym.MakeLoss(softmax_class)
            softmax_loss = mx.sym.BlockGrad(softmax_loss)
            return mx.sym.Group([softmax_loss, ctc_loss])

In [2]:
import numpy as np


def inference(samples, alphabet, batch_size,
              network, data_shape, context, prefix, epoch):
    '''
    Load a checkpoint, run one forward pass and greedily decode the output.

    alphabet contains num_classes - 1 elements because it has no entry for
    the blank token (class id 0): class id k is looked up as alphabet[k - 1].

    :param samples: batch object handed to Module.forward
    :param network: symbol to run; if None the symbol stored in the
        checkpoint is used
    :param data_shape: data_shapes argument for Module.bind
    :param prefix, epoch: checkpoint to load
    :return: list with one decoded string per batch item
    '''
    checkpoint_sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
    sym = checkpoint_sym if network is None else network
    mod = mx.mod.Module(
        symbol=sym, context=context, data_names=['data', ], label_names=None
    )
    mod.bind(for_training=False, data_shapes=data_shape)
    # DCMMC: in a jupyter environment you need to restart the kernel to
    # release all the models before you create new model instances.
    mod.set_params(arg_params, aux_params, allow_missing=False)

    mod.forward(samples)
    flat_prob = mod.get_outputs()[0].asnumpy()
    # => (seq_len, batch_size, num_classes)
    prob = np.reshape(flat_prob, (-1, batch_size, flat_prob.shape[1]))

    res = []
    for i in range(batch_size):
        best_path = np.argmax(prob[:, i, :], axis=-1)
        chars = []
        for step, cls_id in enumerate(best_path):
            # skip blanks
            if not cls_id:
                continue
            # skip a class that merely repeats the previous time step
            if step > 0 and cls_id == best_path[step - 1]:
                continue
            chars.append(alphabet[cls_id - 1])
        res.append(''.join(chars))
    return res


class SimpleBatch(object):
    '''
    Minimal batch container exposing data/label arrays, their names, and
    (name, shape) descriptors via provide_data / provide_label.

    :param data_names: list of names, one per entry of `data`
    :param data: list of arrays (anything with a `.shape`)
    :param label_names: list of names, one per entry of `label`;
        defaults to a fresh empty list
    :param label: list of label arrays; defaults to a fresh empty list
    '''
    def __init__(self, data_names, data, label_names=None, label=None):
        self._data = data
        # A new list per instance: a `list()` default in the signature would
        # be created once and shared by every SimpleBatch.
        self._label = [] if label is None else label
        self._data_names = data_names
        self._label_names = [] if label_names is None else label_names

        self.pad = 0
        self.index = None  # TODO: what is index?

    @property
    def data(self):
        return self._data

    @property
    def label(self):
        return self._label

    @property
    def data_names(self):
        return self._data_names

    @property
    def label_names(self):
        return self._label_names

    @property
    def provide_data(self):
        # [(name, shape), ...] for every data array
        return [(n, x.shape) for n, x in zip(self._data_names, self._data)]

    @property
    def provide_label(self):
        # [(name, shape), ...] for every label array
        return [(n, x.shape) for n, x in zip(self._label_names, self._label)]

In [3]:
import os
from PIL import Image, ImageOps


# cpu mode:
# gpus = ''
gpus = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = gpus
# CUDA_VISIBLE_DEVICES renumbers the visible devices from 0, so one context
# per listed id is needed. Count the comma-separated ids, not the characters
# of the string ('0,1' or '10' would otherwise give the wrong number).
gpu_ids = [g for g in gpus.split(',') if g.strip()]
context = [mx.context.gpu(i) for i in range(len(gpu_ids))] if gpu_ids else \
    [mx.context.cpu()]
num_classes = 6426
batch_size = 2
img_size = (32, 560)
network = CRNN_mxnet(num_classes, inference=True).get_network()
data_shape = [('data', (batch_size, 1) + img_size)]
prefix = '/data/xiaowentao/.cnocr/1.1.0/conv-lite-lstm/cnocr-v1.1.0-conv-lite-lstm'
# index -> token, one token per line of the label file; the handle is closed
# as soon as the lines are read
with open('./cnocr/examples/label_cn.txt') as label_file:
    alphabet = {idx: v.strip() for idx, v in enumerate(label_file.readlines())}
# the last entry is spelled '<space>' in the file
alphabet[len(alphabet) - 1] = alphabet[len(alphabet) - 1].replace('<space>', ' ')
# no token may be empty after stripping
assert all([len(v) for v in alphabet.values()])
# original version of cnocr is 45
# epoch = 45
epoch = 47

img_paths = [
    './cnocr/examples/chn-00199981.jpg',
    './cnocr/examples/chn-00199985.jpg',
]
# Grayscale, add a 280-pixel border on the right (border tuple is
# left, top, right, bottom), then scale the pixel values by 1 / 255
imgs = [
    np.array(
        ImageOps.expand(Image.open(path).convert('L'), (0, 0, 280, 0)),
        dtype='float32') / 255.
    for path in img_paths
]
# add the channel axis: (N, H, W) -> (N, 1, H, W)
imgs = mx.nd.expand_dims(mx.nd.array(imgs), 1)
print(imgs.shape)
samples = SimpleBatch(data_names=['data'], data=[imgs])
res = inference(samples, alphabet, batch_size,
                network, data_shape, context, prefix, epoch)
print(res)

(2, 1, 32, 560)
['掉比悟厉。谌查门蠕坑', '.马靼蘑熨距颖猬要藕等']


In [2]:
from thinc.api import MXNetWrapper, chain, Softmax, prefer_gpu
import mxnet as mx
import os
import numpy as np
import h5py
from tqdm.notebook import tqdm
from mxnet.gluon import SymbolBlock
import cupy

gpus = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = gpus
is_gpu = prefer_gpu()
print("GPU:", is_gpu)
# CUDA_VISIBLE_DEVICES renumbers the visible devices from 0, so one context
# per listed id is needed. Count the comma-separated ids, not the characters
# of the string ('0,1' or '10' would otherwise give the wrong number).
gpu_ids = [g for g in gpus.split(',') if g.strip()]
context = [mx.context.gpu(i) for i in range(len(gpu_ids))] if gpu_ids else \
    [mx.context.cpu()]
num_classes = 6426
crnn_instance = CRNN_mxnet(num_classes, inference=True)
network = crnn_instance.get_network()
input_symbol = crnn_instance.data
prefix = '/data/xiaowentao/.cnocr/1.1.0/conv-lite-lstm/cnocr-v1.1.0-conv-lite-lstm'
epoch = 47
batch_size = 32
data_shape = [('data', (batch_size, 1, 32, 560))]
label_width = 20
# number of time steps the network emits per image (139 for width 560)
seq_len = crnn_instance.seq_len

pred_fc = network.get_internals()['pred_fc_output']
sym = mx.sym.softmax(data=pred_fc)
# It seems that the Thinc shim only supports MXNet Gluon.
# Therefore, we first wrap the Symbol network into a Gluon Block.
network = SymbolBlock(outputs=sym, inputs=input_symbol)
# load the parameters!
network.collect_params().load('%s-%04d' % (prefix, epoch) + '.params',
                              ctx=context)
# Yet another way!
# with open('/tmp/crnn.json', 'w') as f:
#     f.write(sym.tojson())
# network = SymbolBlock.imports('/tmp/crnn.json', ['data'],
#                               '%s-%04d' % (prefix, epoch) + '.params',
#                               ctx=context)
network.hybridize(static_alloc=True, static_shape=True)

# The wrapped symbol already ends in mx.sym.softmax, so the wrapper is used
# on its own. For a network without that final softmax one would chain a
# Thinc Softmax() layer instead:
# mx_model = chain(wrapper_mxnet_crnn, Softmax())
wrapper_mxnet_crnn = MXNetWrapper(network)
mx_model = wrapper_mxnet_crnn

imgs = []
golds = []
num_batch = 4
with h5py.File('./data_generated/dataset_fonts.h5', 'r') as d:
    for idx in range(batch_size * num_batch):
        key = str(idx)
        imgs.append(d[key]['img'][...] / 255.)
        label = str(d[key]['y'][...])
        golds.append(label)
# add the channel axis: (N, H, W) -> (N, 1, H, W)
imgs = np.expand_dims(np.array(imgs, dtype=np.float32), axis=1)
print('imgs:', imgs.shape)
imgs = mx_model.ops.asarray(imgs, dtype='float32')
# class id -> token; id 0 is reserved for the CTC blank, so the file's first
# line gets id 1. The handle is closed as soon as the lines are read.
with open('./cnocr/examples/label_cn.txt') as label_file:
    alphabet = {(idx+1): tok.strip() for idx, tok in enumerate(
        label_file.readlines())}
# the last entry of the file stands for the space character
alphabet[len(alphabet)] = ' '
# blank token
alphabet[0] = '#'
# Labels are encoded character by character below, so every token has to be
# exactly one character. (Asserting on the bare list could never fail.)
assert all(len(tok) == 1 for tok in alphabet.values()), \
    'every alphabet token must be a single character'
alphabet_inv = {v: k for k, v in alphabet.items()}
golds_ids = [[alphabet_inv[c] for c in g] for g in golds]
# right-pad every label with blanks up to label_width
golds_ids = [g + [0] * (label_width - len(g)) for g in golds_ids]
golds_ids = np.array(golds_ids, dtype=np.int32)
golds_ids = mx_model.ops.asarray(golds_ids, dtype='float32')

# mx_model.initialize(X=imgs[:4], Y=np.ones([4 * 139, num_classes], dtype=np.float32)[:4])
batches = mx_model.ops.multibatch(batch_size, imgs, golds_ids, shuffle=True)

res = []
ground = []
for X, Y in tqdm(batches, leave=False):
    Yh = mx_model.predict(X)
    assert Yh.shape == (batch_size * seq_len, num_classes)
    Yh = cupy.asnumpy(Yh)
    Y = cupy.asnumpy(Y)
    # => (seq_len, batch_size, num_classes)
    prob = np.reshape(Yh, (-1, batch_size, Yh.shape[1]))
    for i in range(batch_size):
        lp = np.argmax(prob[:, i, :], axis=-1)
        # greedy CTC decoding: drop blanks and immediate repeats
        res.append(''.join([
            alphabet[ele] for idx, ele in enumerate(
                lp) if (lp[idx] and (idx == 0 or (lp[idx] != lp[idx - 1])))
        ]))
        # Y holds the label ids as float32; cast back to int for the lookup
        ground.append(''.join([alphabet[int(c)] for c in Y[i]]))
print(res[:4], '\n', ground[:4])

GPU: True


	data: None
  input_sym_arg_type = in_param.infer_type()[0]


imgs: (128, 1, 32, 560)


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

['让客户由衷的体验奢侈之旅，尤其是万元以上', '0821:39:02)专家[warren', '离开区足的很多教因，其中包括因在凶春因泰', '一边与身旁的夏煊泽开玩笑，要分一半奖品给'] 
 ['让客户由衷的体验奢侈之旅，尤其是万元以上', '0821:39:02)专家[warren', '离开辽足的很多教练，其中包括现在长春亚泰', '一边与身旁的夏煊泽开玩笑，要分一半奖品给']


In [3]:
# Shell escape: print the current GPU status (memory use, utilisation)
!nvidia-smi

Sun Apr 26 00:31:26 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.14       Driver Version: 430.14       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:18:00.0  On |                  N/A |
| 22%   32C    P2    61W / 250W |   2272MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:3B:00.0  On |                  N/A |
| 22%   37C    P2    59W / 250W |   4287MiB / 11019MiB |      2%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:86:00.0  On |                  N/A |
| 22%   