In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import numpy as np
import os
import re
import io
import time
from random import shuffle

In [2]:
import jieba

### Preprocessing

In [5]:
# Segment a sample Chinese sentence with jieba and join the tokens with single spaces.
' '.join(jieba.cut('一对乌鸦飞到我们屋顶上的巢里，它们好像专门为拉木而来的。'))

'一对 乌鸦 飞 到 我们 屋顶 上 的 巢里 ， 它们 好像 专门 为拉木 而 来 的 。'

In [9]:
# Directory containing the parallel training files `train.zh` and `train.en`.
# The default is the original author's local checkout; set the
# TRANSLATION_TRAIN_DIR environment variable to point at your own copy instead
# of editing this cell.
_DEFAULT_TRAIN_DIR = '/home/code-master/Documents/deeplearningProject/translate/ai_challenger_translation_train_20170904/translation_train_data_20170904'
TRAIN_DIR = os.environ.get('TRANSLATION_TRAIN_DIR', _DEFAULT_TRAIN_DIR)

# Line-aligned corpus files: line k of train.zh is paired with line k of train.en.
zh_path = os.path.join(TRAIN_DIR, 'train.zh')
en_path = os.path.join(TRAIN_DIR, 'train.en')

In [4]:
def preprocess_sentence(w):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    The substitutions are applied in order:
      1. Put a space on both sides of each of ``? ! , .`` and their full-width
         forms so that punctuation becomes a separate token.
      2. Collapse runs of spaces and double-quote characters into one space.
      3. Replace every run of characters other than ASCII letters, characters
         in the range U+4E00..U+9FA5 and the punctuation above with one space.

    The result is stripped of surrounding whitespace before the markers are added.

    Args:
        w: Sentence text; Chinese input is expected to be space-segmented already.

    Returns:
        The cleaned sentence as ``'<start> ... <end>'``.
    """
    substitutions = (
        (r'([?!,.，。！？])', r' \1 '),  # isolate punctuation as its own token
        (r'[" "]+', ' '),  # squeeze spaces / double quotes
        (r'[^a-zA-Z\u4e00-\u9fa5?!,.，。！？]+', ' '),  # drop everything irrelevant
    )
    text = w
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Add the start and end markers around the sentence.
    return '<start> ' + text.strip() + ' <end>'

In [5]:
# Sample sentence pair used to try out preprocess_sentence; the Chinese side is
# segmented with jieba and space-joined first, the English side is used as-is.
zh_sentence = ' '.join(jieba.cut('一对乌鸦飞到我们屋顶上的巢里，它们好像专门为拉木而来的。'))
en_sentence = 'A pair of crows had come to nest on our roof as if they had come for Lhamo.'

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.582 seconds.
Prefix dict has been built successfully.


In [6]:
# Run both sample sentences through preprocess_sentence to eyeball the result.
print(preprocess_sentence(zh_sentence))
print(preprocess_sentence(en_sentence))

<start> 一对 乌鸦 飞 到 我们 屋顶 上 的 巢里 ， 它们 好像 专门 为拉木 而 来 的 。 <end>
<start> A pair of crows had come to nest on our roof as if they had come for Lhamo . <end>


In [26]:
def create_dataset(num_example):
    """Draw a random sample of aligned, preprocessed Chinese/English sentence pairs.

    Reads the parallel corpus files at the module-level ``zh_path`` and
    ``en_path``, takes the first ``num_example`` indices of a random permutation
    of the line numbers, segments the Chinese side with jieba, and runs both
    sides through ``preprocess_sentence``.

    Args:
        num_example: Maximum number of pairs to return. If it exceeds the number
            of lines in the corpus, every pair is returned (in shuffled order).

    Returns:
        ``(zh_lang, en_lang)``: two lists of strings where entry ``k`` of each
        list comes from the same corpus line.

    Raises:
        ValueError: If the two files do not contain the same number of lines,
            since the pairs could then no longer be trusted to be aligned.
    """
    # Read as UTF-8 explicitly (the locale default is not guaranteed to handle
    # Chinese text) and close the handles deterministically. Iterating the file
    # splits on newlines only and, unlike read().split('\n'), does not yield a
    # phantom empty last line when the file ends with a newline.
    with open(zh_path, encoding='utf-8') as zh_file, \
            open(en_path, encoding='utf-8') as en_file:
        zh_lines = [line.rstrip('\n') for line in zh_file]
        en_lines = [line.rstrip('\n') for line in en_file]

    if len(zh_lines) != len(en_lines):
        raise ValueError(
            'parallel corpus is misaligned: {} has {} lines but {} has {}'.format(
                zh_path, len(zh_lines), en_path, len(en_lines)))

    # Use the same indices for both languages so the pairs stay aligned.
    chosen = np.random.permutation(len(zh_lines))[:num_example]
    zh_lang = [preprocess_sentence(' '.join(jieba.cut(zh_lines[i]))) for i in chosen]
    en_lang = [preprocess_sentence(en_lines[i]) for i in chosen]
    return zh_lang, en_lang

In [27]:
# Sample 30000 preprocessed sentence pairs; every call re-reads the corpus and
# draws a new random permutation.
zh, en = create_dataset(30000)

In [28]:
# Print the last sampled pair as a spot check that both sides line up.
print(zh[-1])
print(en[-1])

<start> 你 不仅仅 想要 做 化疗 。 <end>
<start> You don t just want chemo . <end>


In [29]:
def write_data(data, path):
    """Write the string ``data`` to ``path`` as UTF-8 text.

    Any missing parent directories of ``path`` are created first. An existing
    file at ``path`` is overwritten.

    Args:
        data: Text to write.
        path: Destination file path.
    """
    parent = os.path.dirname(path)
    if parent:  # a bare filename has no directory component to create
        os.makedirs(parent, exist_ok=True)
    # Explicit encoding: the data contains Chinese text and the locale default
    # encoding is not guaranteed to be able to represent it.
    with open(path, mode='w', encoding='utf-8') as wf:
        wf.write(data)

In [30]:
# Persist the sampled subset, one preprocessed sentence per line, with the two
# files kept line-aligned.
write_data('\n'.join(zh), './data/mini_zh_en/train.zh')
write_data('\n'.join(en), './data/mini_zh_en/train.en')

In [16]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as a padded id matrix.

    Args:
        lang: Sequence of preprocessed, space-separated sentences.

    Returns:
        ``(tensor, tokenizer)``: the integer-encoded sentences with padding
        added at the end of each row (``padding='post'``), and the fitted
        tokenizer that produced them.
    """
    # filters='' turns off the tokenizer's character filtering, so punctuation
    # and the <start>/<end> markers are kept as tokens.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(lang), padding='post')
    return padded, tokenizer

In [18]:
def load_dataset(num_examples):
    """Sample sentence pairs and tokenize each language independently.

    Args:
        num_examples: Number of sentence pairs to request from ``create_dataset``.

    Returns:
        ``(zh_tensor, en_tensor, zh_tokenize, en_tokenize)``: the padded id
        matrices for Chinese and English followed by their fitted tokenizers.
    """
    zh_sentences, en_sentences = create_dataset(num_examples)
    # Each language gets its own tokenizer, and therefore its own vocabulary.
    zh_encoded = tokenize(zh_sentences)
    en_encoded = tokenize(en_sentences)
    return zh_encoded[0], en_encoded[0], zh_encoded[1], en_encoded[1]

In [31]:
# Number of sentence pairs to sample for this experiment.
num_examples = 30000
# NOTE(review): load_dataset calls create_dataset again, which draws a fresh
# random permutation, so these tensors come from a different sample than the
# `zh`/`en` lists written to ./data/mini_zh_en/ earlier — confirm this is intended.
zh_tensor, en_tensor, zh_tokenize, en_tokenize = load_dataset(num_examples)

In [32]:
# First three encoded Chinese sentences; the trailing zeros are the post-padding.
zh_tensor[:3]

array([[    1,  2346,   356,   553,    70,     3,     2,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0],
       [    1,    35,   442,    18,   100,   242,    85,     9,     2,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     

In [33]:
# First three encoded English sentences; the trailing zeros are the post-padding.
en_tensor[:3]

array([[   2,   15,   13,   49,    6,   82,   87,   60,  150,    1,    3,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [   2,   74,  124,  213,   16,  417,   12,    3,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

In [23]:
def convert(token, tensor):
    """Print each non-zero id in ``tensor`` next to the word it stands for.

    Args:
        token: Object exposing an ``index_word`` mapping from id to word
            (here, a fitted Keras tokenizer).
        tensor: Iterable of integer token ids; zeros are skipped.
    """
    for token_id in (t for t in tensor if t != 0):
        print(token_id, token.index_word[token_id])

In [34]:
# Decode the second example of each language back into words.
# Note (from the output below): the marker ids differ per tokenizer —
# Chinese has <start>=1 / <end>=2, English has <start>=2 / <end>=3.
convert(zh_tokenize, zh_tensor[1])
convert(en_tokenize, en_tensor[1])

1 <start>
35 这
442 到底
18 有
100 多
242 重要
85 呢
9 ？
2 <end>
2 <start>
74 how
124 much
213 does
16 that
417 matter
12 ?
3 <end>


In [35]:
# (number of sentences, padded sequence length) for each language.
print(zh_tensor.shape, en_tensor.shape)

(30000, 84) (30000, 125)


In [36]:
# Hold out 20% of the sentence pairs for validation. Passing both arrays in one
# call keeps the Chinese and English rows aligned. The fixed random_state makes
# the split repeatable for a given zh_tensor/en_tensor.
zh_tensor_train, zh_tensor_val, en_tensor_train, en_tensor_val = train_test_split(
    zh_tensor, en_tensor, test_size=0.2, random_state=42)

In [37]:
# Shapes of the training and validation splits for both languages.
print(zh_tensor_train.shape, zh_tensor_val.shape, en_tensor_train.shape, en_tensor_val.shape)

(24000, 84) (6000, 84) (24000, 125) (6000, 125)


### Build the training set with tf.data

In [39]:
# Number of sentence pairs per batch of the tf.data pipeline.
BATCH_SIZE = 64

In [38]:
# Build a dataset whose elements are (Chinese row, English row) pairs taken
# from the training split.
dataset = tf.data.Dataset.from_tensor_slices((zh_tensor_train, en_tensor_train))

In [40]:
# Group pairs into batches of BATCH_SIZE; drop_remainder=True discards a final
# partial batch so every batch has the same size.
# NOTE(review): no .shuffle() is applied, so batches come out in the same order
# on every pass — confirm this is intended.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [41]:
# Pull the first batch out of the pipeline for a sanity check.
example_zh_batch, example_en_batch = next(iter(dataset))

In [42]:
# Each batch should be (BATCH_SIZE, padded sequence length).
print(example_zh_batch.shape, example_en_batch.shape)

(64, 84) (64, 125)


In [43]:
# Inspect the raw token ids of the first Chinese batch.
print(example_zh_batch.numpy())

[[    1   725 30634 ...     0     0     0]
 [    1     6    10 ...     0     0     0]
 [    1    44     6 ...     0     0     0]
 ...
 [    1    34  1997 ...     0     0     0]
 [    1    10     5 ...     0     0     0]
 [    1    32   666 ...     0     0     0]]
