In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import numpy as np
import os
import re
import io
import time
import jieba
from random import shuffle
from tensorflow.python.ops import array_ops

### preprocess

In [5]:
' '.join(jieba.cut('一对乌鸦飞到我们屋顶上的巢里，它们好像专门为拉木而来的。'))

'一对 乌鸦 飞 到 我们 屋顶 上 的 巢里 ， 它们 好像 专门 为拉木 而 来 的 。'

In [9]:
zh_path = '/home/code-master/Documents/deeplearningProject/translate/ai_challenger_translation_train_20170904/translation_train_data_20170904/train.zh'
en_path = '/home/code-master/Documents/deeplearningProject/translate/ai_challenger_translation_train_20170904/translation_train_data_20170904/train.en'

In [4]:
def preprocess_sentence(w):
    w = re.sub(r'([?!,.，。！？])', r' \1 ', w) # 在单词与其后的标点符号间插入空格
    w = re.sub(r'[" "]+', ' ', w)
    w = re.sub(r'[^a-zA-Z\u4e00-\u9fa5?!,.，。！？]+', ' ',w) # 将所有不相关的字符替换为空格
    w = w.strip()
    w = '<start> ' + w + ' <end>' # 给句子前后加上开始和结束预测
    return w

In [5]:
zh_sentence = ' '.join(jieba.cut('一对乌鸦飞到我们屋顶上的巢里，它们好像专门为拉木而来的。'))
en_sentence = 'A pair of crows had come to nest on our roof as if they had come for Lhamo.'

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.582 seconds.
Prefix dict has been built successfully.


In [6]:
print(preprocess_sentence(zh_sentence))
print(preprocess_sentence(en_sentence))

<start> 一对 乌鸦 飞 到 我们 屋顶 上 的 巢里 ， 它们 好像 专门 为拉木 而 来 的 。 <end>
<start> A pair of crows had come to nest on our roof as if they had come for Lhamo . <end>


In [26]:
def create_dataset(num_example):
    zh_lines = open(zh_path).read().split('\n')
    en_lines = open(en_path).read().split('\n')
    inds = np.random.permutation(len(zh_lines))
    zh_lang = [preprocess_sentence(' '.join(jieba.cut(zh_lines[i]))) for i in inds[:num_example]]
    en_lang = [preprocess_sentence(en_lines[i]) for i in inds[:num_example]]
    return zh_lang, en_lang

In [27]:
zh, en = create_dataset(30000)

In [28]:
print(zh[-1])
print(en[-1])

<start> 你 不仅仅 想要 做 化疗 。 <end>
<start> You don t just want chemo . <end>


In [29]:
def write_data(data, path):
    with open(path, mode='w') as wf:
        wf.write(data)

In [30]:
write_data('\n'.join(zh), './data/mini_zh_en/train.zh')
write_data('\n'.join(en), './data/mini_zh_en/train.en')

In [2]:
zh_lang = open('data/mini_zh_en/train.zh').read().split('\n')
en_lang = open('data/mini_zh_en/train.en').read().split('\n')
print(zh_lang[-1])
print(en_lang[-1])

<start> 你 不仅仅 想要 做 化疗 。 <end>
<start> You don t just want chemo . <end>


In [2]:
def tokenize(lang):
    lang_tokenize = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenize.fit_on_texts(lang)
    tensor = lang_tokenize.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenize

In [3]:
def load_dataset(num_examples):
    #zh_lang, en_lang = create_dataset(num_examples)
    zh_lang = open('data/mini_zh_en/train.zh').read().split('\n')
    en_lang = open('data/mini_zh_en/train.en').read().split('\n')
    zh_tensor, zh_tokenize = tokenize(zh_lang)
    en_tensor, en_tokenize = tokenize(en_lang)
    return zh_tensor, en_tensor, zh_tokenize, en_tokenize

In [4]:
num_examples = 30000
zh_tensor, en_tensor, zh_tokenize, en_tokenize = load_dataset(num_examples)

In [6]:
zh_tensor[:3]

array([[    1,   219,  3149,  2364,  3515,  9154,  1795,  3150,     4,
        14157,  6749,     5,    10,  3957,   396, 14158,     4,  1504,
          216,     4,   743, 14159,     3,     2,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0],
       [    1,     6,   229,    60,    10,   175,    21,   302,     2,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     

In [7]:
en_tensor[:3]

array([[    2,    23,  3290,    23,    18,  1533,  4205,     4,   300,
            4,     5,  5861,   551,  2301,  1473,     4,     9,  1080,
        10910,  4864,     5,   335,    26,   300,    10,     5,   166,
            1,     3,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [    2,     6,   176,    14,    25,     9,    92,   25

In [8]:
def convert(token, tensor):
    for t in tensor:
        if t != 0:
            print(t,token.index_word[t])

In [9]:
convert(zh_tokenize, zh_tensor[1])
convert(en_tokenize, en_tensor[1])

1 <start>
6 我
229 以为
60 那
10 是
175 所
21 好
302 学校
2 <end>
2 <start>
6 i
176 thought
14 it
25 was
9 a
92 good
255 school
1 .
3 <end>


In [10]:
print(zh_tensor.shape, en_tensor.shape)

(30000, 66) (30000, 118)


In [5]:
zh_tensor_train, zh_tensor_val, en_tensor_train, en_tensor_val = train_test_split(zh_tensor, en_tensor, test_size=0.2)

In [12]:
print(zh_tensor_train.shape, zh_tensor_val.shape, en_tensor_train.shape, en_tensor_val.shape)

(24000, 66) (6000, 66) (24000, 118) (6000, 118)


### build train set by tf.data

In [6]:
BATCH_SIZE = 64

In [7]:
BUFFER_SIZE = len(zh_tensor_train)

In [8]:
dataset = tf.data.Dataset.from_tensor_slices((zh_tensor_train, en_tensor_train))

In [9]:
dataset = dataset.shuffle(BUFFER_SIZE, reshuffle_each_iteration=True)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [37]:
example_zh_batch, example_en_batch = next(iter(dataset.take(3)))

In [35]:
print(example_zh_batch.shape, example_en_batch.shape)

(64, 66) (64, 118)


In [38]:
print(example_zh_batch.numpy())

[[   1  212 6334 ...    0    0    0]
 [   1    7   77 ...    0    0    0]
 [   1    6  463 ...    0    0    0]
 ...
 [   1  120   49 ...    0    0    0]
 [   1   16 1295 ...    0    0    0]
 [   1   58   10 ...    0    0    0]]


### build model

In [10]:
zh_train_lens = zh_tensor_train.shape[1]
en_train_lens = en_tensor_train.shape[1]
vocab_zh_size = len(zh_tokenize.word_index) + 1
vocab_en_size = len(en_tokenize.word_index) + 1
emb_size = 256
units = 1024
batch_size = 64

In [41]:
print(zh_train_lens, en_train_lens, vocab_zh_size, vocab_en_size)

66 118 34672 23595


In [11]:
# encoder
inp_enc = tf.keras.layers.Input(shape=(zh_train_lens,))
enc_emb = tf.keras.layers.Embedding(vocab_zh_size, emb_size)(inp_enc)
inp_out, h_state, c_state = tf.keras.layers.LSTM(units, return_state=True)(enc_emb)

# decoder
inp_dec = tf.keras.layers.Input(shape=(en_train_lens,))
dec_emb = tf.keras.layers.Embedding(vocab_en_size, emb_size)(inp_dec)
dec_lstm = tf.keras.layers.LSTM(units, return_sequences=True)

In [12]:
dec_lstm.states[0] = inp_out

In [71]:
dec_out = dec_lstm(dec_emb)
out = tf.keras.layers.Dense(vocab_en_size)(dec_out)

In [72]:
model = tf.keras.Model([inp_enc, inp_dec], out)

In [73]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 118)]        0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 118, 256)     6040320     input_8[0][0]                    
__________________________________________________________________________________________________
lstm_7 (LSTM)                   (None, 118, 1024)    5246976     embedding_7[0][0]                
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 66)]         0                                            
_______________________________________________________________________________________