##### 根據 `seq2seq_eng_to_chn.ipynb` 改寫成 中翻英

In [1]:
import re
import gc
import string
import pandas as pd
from tqdm import tqdm
from string import digits
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import tensorflow
# Error Solved: Fail to find the dnn implementation
# => https://github.com/tensorflow/tensorflow/issues/24496
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Input, Dense, Embedding
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import model_from_json
import pickle as pkl
import numpy as np


In [2]:
def clean_eng_text(text):
    """Lower-case English text, expand common contractions and strip punctuation.

    Args:
        text: Raw English sentence.

    Returns:
        The lower-cased sentence with the contractions listed below expanded
        and the characters ``-()"#/@;:<>{}`+=~|.!?,`` removed. Apostrophes
        that are not part of a handled contraction are left untouched.
    """
    text = text.lower()

    # Order matters: the specific contractions ("won't", "can't") have to be
    # rewritten before the generic suffix rules ("n't", "n'") can match them.
    replacements = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
    )
    for pattern, replacement in replacements:
        text = re.sub(pattern, replacement, text)

    # Finally drop the punctuation characters that carry no meaning here.
    return re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)

def clean_che_text(text):
    """Strip punctuation, digits, ASCII letters and spacing from a sentence.

    The text is lower-cased first, then every match of the patterns below is
    deleted, one pattern after another.

    Args:
        text: Raw sentence.

    Returns:
        The sentence without full-width punctuation, zero-width spaces,
        ASCII digits, lower-case ASCII letters, apostrophes, ASCII commas
        and space characters.
    """
    cleaned = text.lower()

    removal_patterns = (
        r"[，。、？！：；「」《》·‘•“”?]",  # punctuation marks
        r"\u200b",                          # zero-width space
        r"['0-9, ]",                        # digits, plus apostrophe / comma / space
        r"['a-z, ]",                        # ASCII letters, plus apostrophe / comma / space
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)

    return cleaned

def preprocess(text_list):
    """Return the sentences lower-cased with every apostrophe removed.

    Args:
        text_list: Iterable of strings.

    Returns:
        A new list with one processed string per input string.
    """
    return [sentence.lower().replace("'", '') for sentence in text_list]

def removePunc(text_list):
    """Delete every ``string.punctuation`` character from each sentence.

    Space characters are not part of ``string.punctuation``, so the spacing
    between words is kept exactly as it was.

    Args:
        text_list: Iterable of strings.

    Returns:
        A new list with one stripped string per input string.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    return [sentence.translate(strip_table) for sentence in text_list]


#### import dataset

In [14]:
# Load the parallel corpus. The two CSV columns are renamed so that the first
# one holds the translation targets and the second one the source sentences.
data_path = 'language_data.csv'
df = pd.read_csv(data_path)
df.columns = ['targets', 'inputs']

# Plain Python lists are what the cleaning helpers iterate over.
input_sentences = df['inputs'].tolist()
target_sentences = df['targets'].tolist()


#### cleansing

In [15]:
# Source side: clean_che_text -> preprocess -> removePunc.
input_sentences = removePunc(preprocess([clean_che_text(x) for x in input_sentences]))
# Target side: clean_eng_text -> preprocess -> removePunc.
target_sentences = removePunc(preprocess([clean_eng_text(x) for x in target_sentences]))

# Prepend '\t' as the start marker and append '\n' as the end marker; both are
# separated by a space so that they become tokens of their own when splitting.
target_sentences = ['\t ' + x + ' \n' for x in target_sentences]

# Collect the vocabulary of each side.
# The target side is tokenised on single spaces, the source side per character.
target_ = [token for sentence in target_sentences for token in sentence.split(' ')]

# Plain sets give the same sorted vocabularies as the former DataFrame
# round-trip, without concatenating every sentence through repeated `+`
# and without failing on an empty corpus.
input_characters = sorted(set(''.join(input_sentences)))
target_characters = sorted(set(target_))


#### 生成 LSTM 三維 input
把句子中各字符轉換成 one-hot 編碼，生成 LSTM 需要的三維輸入 (`n_samples`, `timesteps`, `one-hot features`)

- `NUM_SAMPLES`，样本条数，这里是输入的句子条数
- `INPUT_LENGTH`（程式中的變數名拼寫為 `INUPT_LENGTH`），输入数据的时间步长度，这里为最长的中文句子长度（以字符计）
- `OUTPUT_LENGTH`，输出数据的时间步长度，这里为最长的英文句子长度（以单词计，含起始符与终止符）
- `INPUT_FEATURE_LENGTH`，每个时刻进入encoder的lstm单元的数据 $x_t$ 的维度，这里为中文中出现的字符数
- `OUTPUT_FEATURE_LENGTH`，每个时刻进入decoder的lstm单元的数据 $x_t$ 的维度，这里为英文中出现的单词数（含起始符与终止符）


In [19]:
# Dataset dimensions: the source side is measured in characters, the target
# side in space-separated tokens.
NUM_SAMPLES = len(input_sentences)
INUPT_LENGTH = max(len(sentence) for sentence in input_sentences)
OUTPUT_LENGTH = max(len(sentence.split(' ')) for sentence in target_sentences)
INPUT_FEATURE_LENGTH = len(input_characters)
OUTPUT_FEATURE_LENGTH = len(target_characters)
print(f'NUM_SAMPLES: {NUM_SAMPLES}, INUPT_LENGTH: {INUPT_LENGTH}, OUTPUT_LENGTH: {OUTPUT_LENGTH}, INPUT_FEATURE_LENGTH: {INPUT_FEATURE_LENGTH}, OUTPUT_FEATURE_LENGTH: {OUTPUT_FEATURE_LENGTH}')

# Lookup tables in both directions: index -> symbol and symbol -> index.
input_dict_reverse = dict(enumerate(input_characters))
input_dict = {symbol: position for position, symbol in input_dict_reverse.items()}
target_dict_reverse = dict(enumerate(target_characters))
target_dict = {symbol: position for position, symbol in target_dict_reverse.items()}

# Encoder input and decoder input/output start out as all-zero 3-D tensors
# of shape (samples, timesteps, features).
encoder_input = np.zeros((NUM_SAMPLES, INUPT_LENGTH, INPUT_FEATURE_LENGTH), dtype='uint8')
decoder_input = np.zeros((NUM_SAMPLES, OUTPUT_LENGTH, OUTPUT_FEATURE_LENGTH), dtype='uint8')
decoder_output = np.zeros((NUM_SAMPLES, OUTPUT_LENGTH, OUTPUT_FEATURE_LENGTH), dtype='uint8')

# One-hot encode every source sentence character by character.
for sample_idx, sentence in enumerate(input_sentences):
    for step, symbol in enumerate(sentence):
        encoder_input[sample_idx, step, input_dict[symbol]] = 1

# One-hot encode every target sentence token by token.
for sample_idx, sentence in enumerate(target_sentences):
    for step, token in enumerate(sentence.split(' ')):
        token_id = target_dict[token]
        decoder_input[sample_idx, step, token_id] = 1
        if step > 0:
            # The expected output is the decoder input shifted one step
            # earlier, so each step is supervised with the following token.
            decoder_output[sample_idx, step - 1, token_id] = 1


# Show the first sample of each tensor as the cell output.
encoder_input[0]
decoder_input[0]
decoder_output[0]

NUM_SAMPLES: 23444, INUPT_LENGTH: 42, OUTPUT_LENGTH: 36, INPUT_FEATURE_LENGTH: 3420, OUTPUT_FEATURE_LENGTH: 6628


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

#### 構建 encoder-decoder 及 infer 模型

In [20]:
def create_model(n_input, n_output, n_units):
    """Build the seq2seq training model and the two inference models.

    The inference models reuse the layers of the training model, so they
    share its weights.

    Args:
        n_input: Feature size of every encoder timestep.
        n_output: Feature size of every decoder timestep; also the width of
            the softmax output layer.
        n_units: Number of units of both the encoder and the decoder LSTM.

    Returns:
        A tuple ``(model, encoder_infer, decoder_infer)``:
        ``model`` maps ``[encoder input, decoder input]`` to the decoder's
        softmax sequence; ``encoder_infer`` maps the encoder input to its
        final ``[h, c]`` states; ``decoder_infer`` maps
        ``[decoder input, h, c]`` to ``[softmax sequence, h, c]``.
    """
    # ---- training graph: encoder (only its final states are used)
    encode_input = Input(shape=(None, n_input))
    encoder_lstm = LSTM(n_units, return_state=True)
    _encoder_seq, state_h, state_c = encoder_lstm(encode_input)
    encoder_state = [state_h, state_c]

    # ---- training graph: decoder, initialised with the encoder states
    decode_input = Input(shape=(None, n_output))
    decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)
    decoder_seq, _h, _c = decoder_lstm(decode_input, initial_state=encoder_state)
    softmax_layer = Dense(n_output, activation='softmax')
    train_output = softmax_layer(decoder_seq)

    model = Model([encode_input, decode_input], train_output)

    # ---- inference: encoder only returns its final states
    encoder_infer = Model(encode_input, encoder_state)

    # ---- inference: decoder takes its previous states as explicit inputs
    # and returns the updated ones, so it can be driven step by step.
    infer_h_in = Input(shape=(n_units, ))
    infer_c_in = Input(shape=(n_units, ))
    infer_state_in = [infer_h_in, infer_c_in]

    infer_seq, infer_h_out, infer_c_out = decoder_lstm(
        decode_input, initial_state=infer_state_in
    )
    infer_probs = softmax_layer(infer_seq)
    decoder_infer = Model(
        [decode_input] + infer_state_in,
        [infer_probs] + [infer_h_out, infer_c_out],
    )

    return model, encoder_infer, decoder_infer

#### 建構預測 function

In [21]:
def predict_chinese(source, encoder_infer, decoder_infer, n_steps, features):
    """Greedily decode one encoded source sequence into a string of tokens.

    Reads the module-level ``target_dict`` and ``target_dict_reverse``
    lookup tables.

    Args:
        source: Encoder input handed to ``encoder_infer.predict``.
        encoder_infer: Model whose ``predict`` returns the initial decoder
            states as a list.
        decoder_infer: Model whose ``predict`` takes ``[token, h, c]`` and
            returns ``(probabilities, h, c)``.
        n_steps: Maximum number of decoding steps.
        features: Size of the one-hot vector fed to the decoder.

    Returns:
        The decoded tokens, each one followed by a single space. The
        end-of-sequence token (newline) is included when it is produced.
    """
    def one_hot(index):
        # A (1, 1, features) batch holding a single one-hot timestep.
        vector = np.zeros((1, 1, features))
        vector[0, 0, index] = 1
        return vector

    # Run the encoder first to obtain the hidden state of the input sequence.
    state = encoder_infer.predict(source)
    # Decoding starts from the start-of-sequence token (tab).
    predict_seq = one_hot(target_dict['\t'])

    decoded = []
    for _ in range(n_steps):
        yhat, h, c = decoder_infer.predict([predict_seq] + state)
        char_index = np.argmax(yhat[0, -1, :])
        char = target_dict_reverse[char_index]
        decoded.append(char)
        if char == '\n':
            # The end-of-sequence token terminates the sentence.
            break
        # Feed the chosen token and the updated states into the next step.
        state = [h, c]
        predict_seq = one_hot(char_index)

    return ''.join(token + ' ' for token in decoded)

#### 模型訓練

In [28]:
# Hyper-parameters used by the training cell below.
batch_size = 64  # Number of samples per training batch (passed to model.fit).
epochs = 100  # Number of training epochs (passed to model.fit).
latent_dim = 256  # Number of LSTM units (passed to create_model as n_units).


In [29]:
# Build the training model together with its two inference counterparts.
model, encoder_infer, decoder_infer = create_model(
    n_input=INPUT_FEATURE_LENGTH,
    n_output=OUTPUT_FEATURE_LENGTH,
    n_units=latent_dim,
)

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Teacher forcing: the decoder is fed the target sequence and is trained to
# output the same sequence shifted by one step. The last 20% of the samples
# are held out for validation.
model.fit(
    x=[encoder_input, decoder_input],
    y=decoder_output,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)
model.save('seq2seq__chn_to_eng.h5')

    

Train on 18755 samples, validate on 4689 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100


Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x23cfd9cd948>

#### 進行中翻英推論預測

In [30]:
# Translate samples 1000-1199 of the encoded data and print, for each one,
# the encoder input shape, the cleaned source sentence and the decoded output.
for i in range(1000, 1200):
    # Slicing (instead of indexing) keeps the leading batch dimension of 1.
    test = encoder_input[i:i + 1]
    print(test.shape)
    out = predict_chinese(
        test,
        encoder_infer,
        decoder_infer,
        OUTPUT_LENGTH,
        OUTPUT_FEATURE_LENGTH,
    )
    print(input_sentences[i])
    print(out)


(1, 42, 3420)
我喜欢阅读
i like reading 
 
(1, 42, 3420)
我喜欢跑步
i like running 
 
(1, 42, 3420)
我喜欢阅读
i like reading 
 
(1, 42, 3420)
我爱我的生活
i love my life 
 
(1, 42, 3420)
我爱我的生命
i love my life 
 
(1, 42, 3420)
我爱我的妻子
i love my wife 
 
(1, 42, 3420)
我愛派對
i love tom 
 
(1, 42, 3420)
我遇见一个朋友
i met a friend 
 
(1, 42, 3420)
我必须拒绝
i have to say no 
 
(1, 42, 3420)
我该回家了
i have to go home 
 
(1, 42, 3420)
我需要加薪
i need a stamp 
 
(1, 42, 3420)
我需要一張郵票
i need a stamp 
 
(1, 42, 3420)
我尽快需要
i need it as quickly as possible 
 
(1, 42, 3420)
我需要我的大衣
i need my coat 
 
(1, 42, 3420)
我需要知道
i need to know 
 
(1, 42, 3420)
我经常打嗝
i often visit 
 
(1, 42, 3420)
我说过了闭嘴
i said shut up 
 
(1, 42, 3420)
我看到了五個男人
i saw five men 
 
(1, 42, 3420)
我看见五个男人
i saw five men 
 
(1, 42, 3420)
我看見她游泳
i saw her swim 
 
(1, 42, 3420)
我应该去做
i should do it 
 
(1, 42, 3420)
我学韩语
i study play tennis 
 
(1, 42, 3420)
我想去玩
i want to play 
 
(1, 42, 3420)
我以前是醫生
i was a doctor 
 
(1, 42, 3420)
我在學習
i was learning 
 
(1, 42, 3420)
