In [1]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably where the project-local `dataset` module lives -- confirm).
# The membership test keeps this cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
project_root = os.path.dirname(os.path.abspath('.'))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from dataset import Dataset, load_table

# Lookup tables returned by the project helper (names suggest
# character->id and pinyin->id maps -- confirm against dataset.py).
ch2id, spell2id = load_table()

# Needs roughly 4 GB of GPU memory with this configuration.
batch_size = 32
len_thresh = (10, 50)  # length thresholds

# Training set first, evaluation set second; both are built with the same
# batch size, length thresholds and shuffle flag.
train_data, test_data = (
    Dataset(corpus_path, batch_size, len_thresh, shuffle=True)
    for corpus_path in ('data/data_clean.txt', 'eval/eval_clean.txt')
)

(1532803, 50) (1532803, 50)
(2135, 50) (2135, 50)


## 搭建网络

In [3]:
import tensorflow as tf

# Fixed time dimension of the graph: the second entry of the length thresholds.
t_size = len_thresh[1]
# Number of entries in the ch2id table; used below as the vocabulary size.
voc_size = len(ch2id)

  from ._conv import register_converters as _register_converters


In [4]:
# Drop whatever graph a previous run of the notebook left behind.
tf.reset_default_graph()

# Integer id matrices of shape (batch, t_size): network input and target.
X = tf.placeholder(dtype=tf.int32, shape=[None, t_size])
Y = tf.placeholder(dtype=tf.int32, shape=[None, t_size])
# Training flag, fed as True while training and False while evaluating.
is_training = tf.placeholder(dtype=tf.bool)

### Embedding

In [5]:
emb_size = 300
with tf.variable_scope('spell_emb'):
    # NOTE(review): the table has voc_size = len(ch2id) rows although the
    # scope is named 'spell_emb' -- confirm whether X carries ids from
    # spell2id, in which case len(spell2id) rows would be the natural size.
    emb_init = tf.truncated_normal_initializer(mean=0, stddev=0.01)
    trainable_table = tf.get_variable(name='emb_lookup',
                                      shape=[voc_size, emb_size],
                                      dtype=tf.float32,
                                      initializer=emb_init)

    # Index 0 is the Empty token: replace its row with constant zeros so its
    # embedding is always the zero vector.
    zero_row = tf.zeros([1, emb_size])
    lookup_table = tf.concat((zero_row, trainable_table[1:, :]), axis=0)

    spell_emb = tf.nn.embedding_lookup(lookup_table, X)    # (None,t_size,emb_size)

Instructions for updating:
Colocations handled automatically by placer.


### Pre-net

In [6]:
# 论文中的Pre-net
unit_fc = [emb_size, emb_size//2]
drop_rate = 0.5

with tf.variable_scope('Pre-net'):
    prenet = tf.layers.dense(spell_emb, units=unit_fc[0],
                             activation=tf.nn.relu)    # (None,t_size,unit_fc[0])
    prenet = tf.layers.dropout(prenet, rate=drop_rate,
                               training=is_training)    # (None,t_size,unit_fc[0])
    prenet = tf.layers.dense(prenet, units=unit_fc[1],
                             activation=tf.nn.relu)    # (None,t_size,unit_fc[1])
    prenet = tf.layers.dropout(prenet, rate=drop_rate,
                               training=is_training)    # (None,t_size,unit_fc[1])

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


### Conv1D bank

In [7]:
def batch_norm(inputs, training=True, act_f=None):
    '''
    Batch-normalise `inputs` with the fused implementation, then optionally
    apply an activation.

    The layer is called with fused=True, which expects a rank-4 tensor, so
    rank-2 and rank-3 inputs are temporarily padded with singleton axes and
    squeezed back afterwards.

    Args:
        inputs: tensor whose static rank is 2, 3 or 4.
        training: bool or boolean tensor forwarded to
            tf.layers.batch_normalization.
        act_f: optional callable applied to the normalised tensor.

    Returns:
        A tensor with the same rank as `inputs`.

    Raises:
        ValueError: if the static rank of `inputs` is unknown or is not
            2, 3 or 4.
    '''
    rank_I = inputs.get_shape().ndims

    # Fail early with a clear message instead of letting the fused layer
    # reject the tensor further down.
    if rank_I not in (2, 3, 4):
        raise ValueError(
            'batch_norm expects an input of static rank 2, 3 or 4, '
            'got rank {}'.format(rank_I))

    # Expand to rank 4 so the fused kernel can be used.
    if rank_I == 2:
        inputs = tf.expand_dims(inputs, axis=1)    # (N, C) -> (N, 1, C)
        inputs = tf.expand_dims(inputs, axis=2)    # (N, 1, C) -> (N, 1, 1, C)
    elif rank_I == 3:
        inputs = tf.expand_dims(inputs, axis=1)    # (N, T, C) -> (N, 1, T, C)

    inputs = tf.layers.batch_normalization(inputs, training=training,
                                           fused=True)

    # Restore the original rank.
    if rank_I == 2:
        inputs = tf.squeeze(inputs, axis=[1, 2])    # (N, 1, 1, C) -> (N, C)
    elif rank_I == 3:
        inputs = tf.squeeze(inputs, axis=1)    # (N, 1, T, C) -> (N, T, C)

    if act_f:
        inputs = act_f(inputs)

    return inputs

In [8]:
K = 16
with tf.variable_scope('Conv1D_bank'):
    n_filters = emb_size//2

    # Convolve the Pre-net output with every kernel width in [1, K] and join
    # the results along the channel axis, as in TextCNN.
    # Each branch is (None,t_size,n_filters); branches are created in order of
    # increasing k, so the channel layout matches a k=1,2,...,K concatenation.
    bank_branches = [
        tf.layers.conv1d(prenet, filters=n_filters, kernel_size=k,
                         padding='same', use_bias=False)
        for k in range(1, K+1)
    ]

    # One concat over all branches instead of re-concatenating the growing
    # tensor on every loop iteration.
    conv1d_bank = tf.concat(bank_branches, axis=-1)    # (None,t_size,K*n_filters)

    conv1d_bank = batch_norm(conv1d_bank, training=is_training,
                             act_f=tf.nn.relu)    # (None,t_size,K*n_filters)

# Max-pool along the time axis, as in TextCNN; stride 1 with 'same' padding
# keeps the time dimension unchanged.
max_pooling = tf.layers.max_pooling1d(conv1d_bank, pool_size=2, strides=1,
                                      padding='same')    # (None,t_size,K*n_filters)

Instructions for updating:
Use keras.layers.conv1d instead.
Instructions for updating:
Use keras.layers.batch_normalization instead.
Instructions for updating:
Use keras.layers.max_pooling1d instead.


### Conv1D projections

In [9]:
with tf.variable_scope('conv1d_projections'):
    # Two identical stages stacked on the pooled bank output:
    # width-5 convolution -> batch norm -> ReLU, each (None,t_size,n_filters).
    conv1d_pro = max_pooling
    for stage_idx in range(2):
        conv1d_pro = tf.layers.conv1d(conv1d_pro, filters=n_filters,
                                      kernel_size=5, padding='same',
                                      use_bias=False)
        conv1d_pro = batch_norm(conv1d_pro, training=is_training,
                                act_f=tf.nn.relu)

In [10]:
# Residual connection: add the Pre-net output back onto the projected
# convolution features, (None,t_size,n_filters).
encoding = tf.add(prenet, conv1d_pro)

### Highway layers

In [11]:
def highway_block(inputs, units, scope=None):
    '''
    One highway-network block: out = H(x) * T(x) + x * (1 - T(x)).

    H is a ReLU dense layer and T is a sigmoid gate whose bias starts at
    -1.0. Because the input is mixed element-wise with H, the last
    dimension of `inputs` has to equal `units`.

    Args:
        inputs: tensor whose last dimension equals `units`.
        units: width of the H and T dense layers.
        scope: optional variable-scope name; when omitted a unique
            'highway_block' scope is generated.

    Returns:
        Tensor with the same shape as `inputs`.

    Raises:
        ValueError: if the statically known last dimension of `inputs`
            differs from `units`.
    '''
    # Reject a width mismatch up front; a last dimension of 1 would
    # otherwise broadcast silently in the gating expression below.
    static_shape = inputs.get_shape()
    if static_shape.ndims is not None:
        in_dim = static_shape.as_list()[-1]
        if in_dim is not None and in_dim != units:
            raise ValueError(
                'highway_block needs inputs with last dimension {}, '
                'got {}'.format(units, in_dim))

    # default_name makes the scope=None default usable:
    # tf.variable_scope(None) without a default_name raises.
    with tf.variable_scope(scope, default_name='highway_block'):
        H = tf.layers.dense(inputs, units, activation=tf.nn.relu, name='H')
        T = tf.layers.dense(inputs, units, activation=tf.nn.sigmoid,
                            bias_initializer=tf.constant_initializer(-1.0), name='T')

    return H*T+inputs*(1-T)

In [12]:
# Stack the highway blocks, each under its own 'highway_<index>' scope.
n_highway_block = 4
for block_idx in range(n_highway_block):
    block_scope = 'highway_{}'.format(block_idx)
    encoding = highway_block(encoding, units=emb_size//2,
                             scope=block_scope)    # (None, t_size, emb_size//2)

### BiRNN

In [13]:
with tf.variable_scope('BiRNN'):
    rnn_units = emb_size//2
    gru_fw = tf.nn.rnn_cell.GRUCell(num_units=rnn_units)
    gru_bw = tf.nn.rnn_cell.GRUCell(num_units=rnn_units)

    # NOTE(review): no sequence_length is passed, so both directions run over
    # all t_size steps of every row, padding included -- confirm this is
    # intended.
    (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw,
                                                          cell_bw=gru_bw,
                                                          inputs=encoding,
                                                          dtype=tf.float32)

    # Forward and backward outputs joined on the feature axis:
    # (None,None,emb_size//2*2), the *2 coming from the two directions.
    encoding = tf.concat((out_fw, out_bw), axis=2)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


### Others

In [14]:
# Per-position scores over the character vocabulary and the arg-max prediction.
logits = tf.layers.dense(encoding, len(ch2id), use_bias=False, name='logit')
# tf.argmax(axis=..., output_type=...) replaces the deprecated
# tf.to_int32(tf.arg_max(..., dimension=...)).
preds = tf.argmax(logits, axis=-1, output_type=tf.int32)

with tf.name_scope('Eval'):
    # Id 0 is the Empty token and is excluded from both loss and accuracy.
    non_empty_mask = tf.cast(tf.not_equal(Y, 0), tf.float32)
    n_non_empty = tf.reduce_sum(non_empty_mask)

    # Cross-entropy at every position, padding included.
    all_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y,
                                                              logits=logits)
    # Mean loss over the non-empty positions only.
    loss = tf.reduce_sum(all_loss*non_empty_mask) / n_non_empty

    # Fraction of non-empty positions predicted correctly.
    hits = tf.cast(tf.equal(preds, Y), tf.float32)
    acc = tf.reduce_sum(hits * non_empty_mask) / n_non_empty

# train_op
with tf.name_scope('train_op'):
    lr = 1e-3
    glob_step = tf.Variable(0, name='global_step', trainable=False)
    # Ops registered under UPDATE_OPS (the batch-norm layers add their
    # moving-average updates there) must run with every training step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = tf.train.AdamOptimizer(lr) \
            .minimize(loss, global_step=glob_step)

init = tf.global_variables_initializer()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True    # allocate GPU memory on demand

Instructions for updating:
Use `tf.math.argmax` instead
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


## 训练网络

In [15]:
with tf.Session(config=config) as sess:
    sess.run(init)
    epochs = 1    # training is slow, so only a single pass is made

    # The whole evaluation set is fed in one call, with the training flag off.
    test_feed = {X: test_data.data, Y: test_data.target,
                 is_training: False}

    batch_cnt = 0
    for epoch in range(epochs):
        for batch_data, batch_labels in train_data.next_batch():
            batch_cnt += 1
            train_feed = {X: batch_data, Y: batch_labels,
                          is_training: True}
            loss_val, acc_val, _ = sess.run([loss, acc, train_op],
                                            feed_dict=train_feed)

            # Report the current batch's metrics every 500 batches.
            if not batch_cnt % 500:
                print('epoch: {}, batch_loss: {}, batch_acc: {}'
                      .format(epoch+1, loss_val, acc_val))

            # Report accuracy on the evaluation set every 3000 batches.
            if not batch_cnt % 3000:
                test_acc_val = sess.run(acc, feed_dict=test_feed)
                print('epoch: {}, test_acc: {}'.format(epoch+1, test_acc_val))

    # Keep the final predictions for the evaluation cell; they must be
    # fetched while the session (and its variables) is still open.
    Y_pred = sess.run(preds, feed_dict=test_feed)

epoch: 1, batch_loss: 2.215376377105713, batch_acc: 0.6859421730041504
epoch: 1, batch_loss: 1.5334725379943848, batch_acc: 0.7321063280105591
epoch: 1, batch_loss: 1.2114081382751465, batch_acc: 0.7555321455001831
epoch: 1, batch_loss: 0.9934782981872559, batch_acc: 0.7843137383460999
epoch: 1, batch_loss: 0.7513704895973206, batch_acc: 0.8320935368537903
epoch: 1, batch_loss: 0.6397449374198914, batch_acc: 0.8294416069984436
epoch: 1, test_acc: 0.8576877117156982
epoch: 1, batch_loss: 0.5762260556221008, batch_acc: 0.8477580547332764
epoch: 1, batch_loss: 0.4585951268672943, batch_acc: 0.8772093057632446
epoch: 1, batch_loss: 0.4294915795326233, batch_acc: 0.8851774334907532
epoch: 1, batch_loss: 0.4553367793560028, batch_acc: 0.8808837532997131
epoch: 1, batch_loss: 0.4109019637107849, batch_acc: 0.8803879022598267
epoch: 1, batch_loss: 0.4234901964664459, batch_acc: 0.8888888955116272
epoch: 1, test_acc: 0.9053552746772766
epoch: 1, batch_loss: 0.3857410252094269, batch_acc: 0.8908

## 评估

In [None]:
import distance
import pickle
import numpy as np


def _ids_to_text(ids, length, table):
    '''Decode the first `length` ids through `table` and strip '_' characters.'''
    # Slice the ids, not the joined string, so an entry longer than one
    # character cannot shift the cut-off point.
    return ''.join(table[idx] for idx in ids[:length]).replace('_', '')


# NOTE: pickle.load can execute arbitrary code from the file; only load an
# id2ch.pkl that this project produced itself.
with open('data/id2ch.pkl', 'rb') as pkl_fd:    # closed deterministically
    id2ch = pickle.load(pkl_fd)

with open('eval/eval_res.csv', 'w', encoding='utf-8') as fd:
    fd.write('True,Pred,CER\n')
    total_cer = 0
    n_scored = 0

    for y_test, y_pred in zip(test_data.target, Y_pred):
        # Number of non-zero (non-Empty) target ids; used as the sentence
        # length for both the cut-off and the CER denominator.
        s_len = np.count_nonzero(y_test)
        if s_len == 0:
            # Nothing to score; skipping avoids a division by zero.
            continue

        y_test_ch = _ids_to_text(y_test, s_len, id2ch)
        y_pred_ch = _ids_to_text(y_pred, s_len, id2ch)
        cer = distance.levenshtein(y_test_ch, y_pred_ch)/s_len

        fd.write('{},{},{:.2f}\n'.format(y_test_ch, y_pred_ch, cer))

        total_cer += cer
        n_scored += 1

    # Mean of the per-sentence CER values over the sentences that were scored.
    fd.write('Total CER: {:.2f}\n'.format(total_cer/max(n_scored, 1)))