In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import numpy as np
import os
import re
import io
import time
from random import shuffle

In [2]:
import jieba

In [5]:
# Quick check of jieba word segmentation: cut a sample sentence into words
# and join them with single spaces so the result is shown as the cell output.
' '.join(jieba.cut('一对乌鸦飞到我们屋顶上的巢里，它们好像专门为拉木而来的。'))

'一对 乌鸦 飞 到 我们 屋顶 上 的 巢里 ， 它们 好像 专门 为拉木 而 来 的 。'

In [9]:
# Directory that holds both halves of the parallel training corpus.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
train_dir = '/home/code-master/Documents/deeplearningProject/translate/ai_challenger_translation_train_20170904/translation_train_data_20170904'
zh_path = train_dir + '/train.zh'  # Chinese side of the corpus
en_path = train_dir + '/train.en'  # English side of the corpus

In [4]:
def preprocess_sentence(w):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Args:
        w: Raw sentence text (English, or Chinese already split into words).

    Returns:
        The cleaned sentence: punctuation separated by spaces, every run of
        characters other than ASCII letters, CJK characters in the range
        U+4E00-U+9FA5 and the kept punctuation replaced by one space, with
        ``<start> `` prepended and `` <end>`` appended.
    """
    # Applied in order; each entry is (pattern, replacement).
    substitutions = (
        # Put a space on both sides of every kept punctuation mark.
        (r'([?!,.，。！？])', r' \1 '),
        # Squeeze runs of spaces / double quotes into a single space.
        (r'[" "]+', ' '),
        # Replace every run of irrelevant characters with a single space.
        (r'[^a-zA-Z\u4e00-\u9fa5?!,.，。！？]+', ' '),
    )
    cleaned = w
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()
    # Add the start and end markers around the sentence.
    return '<start> ' + cleaned + ' <end>'

In [5]:
# Sample sentence pair: the English side is used as-is, the Chinese side is
# segmented with jieba and its words re-joined with single spaces.
en_sentence = 'A pair of crows had come to nest on our roof as if they had come for Lhamo.'
zh_words = jieba.cut('一对乌鸦飞到我们屋顶上的巢里，它们好像专门为拉木而来的。')
zh_sentence = ' '.join(zh_words)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.582 seconds.
Prefix dict has been built successfully.


In [6]:
# Print the preprocessed form of the Chinese sample, then of the English one.
for sample_sentence in (zh_sentence, en_sentence):
    print(preprocess_sentence(sample_sentence))

<start> 一对 乌鸦 飞 到 我们 屋顶 上 的 巢里 ， 它们 好像 专门 为拉木 而 来 的 。 <end>
<start> A pair of crows had come to nest on our roof as if they had come for Lhamo . <end>


In [7]:
def create_dataset(num_example):
    """Sample aligned Chinese/English sentence pairs from the training files.

    Reads the files named by the module-level ``zh_path`` and ``en_path``,
    pairs them up line by line, draws a random subset of line numbers and
    runs ``preprocess_sentence`` on both sides of each selected pair.

    NOTE(review): the Chinese lines are passed to ``preprocess_sentence``
    as-is, without the jieba segmentation applied to the sample sentence
    elsewhere in this notebook -- confirm whether that is intended.

    Args:
        num_example: Maximum number of pairs to return; ``None`` returns
            every available pair (in shuffled order).

    Returns:
        A tuple ``(zh_lang, en_lang)`` of two equally long lists; element
        ``i`` of both lists comes from the same line number.
    """
    def read_lines(path):
        # Explicit UTF-8 so reading Chinese text does not depend on the
        # platform's default encoding; `with` closes the handle promptly.
        with open(path, encoding='utf-8') as rf:
            lines = rf.read().split('\n')
        # A file ending in a newline yields one trailing empty string, which
        # is not a sentence and must not be sampled.
        if lines and lines[-1] == '':
            lines.pop()
        return lines

    zh_lines = read_lines(zh_path)
    en_lines = read_lines(en_path)
    # Only line numbers present in both files form a pair; sampling beyond
    # the shorter file would raise IndexError.
    pair_count = min(len(zh_lines), len(en_lines))
    inds = np.random.permutation(pair_count)[:num_example]
    zh_lang = [preprocess_sentence(zh_lines[i]) for i in inds]
    en_lang = [preprocess_sentence(en_lines[i]) for i in inds]
    return zh_lang, en_lang

In [13]:
# Draw 30000 random sentence pairs from the training files.
zh, en = create_dataset(num_example=30000)

In [14]:
# Spot check: show the last sampled pair, Chinese first, one per line.
print(zh[-1], en[-1], sep='\n')

<start> 哦 你知道了 ， 是么 ？ <end>
<start> Oh . . . You know about that , huh ? <end>


In [12]:
def write_data(data, path):
    """Write ``data`` to ``path`` as UTF-8 text.

    Missing parent directories are created first, and an existing file at
    ``path`` is overwritten.

    Args:
        data: The text to write.
        path: Destination file path.
    """
    parent = os.path.dirname(path)
    if parent:
        # The output folder may not exist yet (e.g. on a fresh checkout).
        os.makedirs(parent, exist_ok=True)
    # Explicit encoding: the platform default may be unable to encode
    # Chinese text.
    with open(path, mode='w', encoding='utf-8') as wf:
        wf.write(data)

In [15]:
# Save each sampled corpus, one sentence per line, Chinese first.
for sentences, out_path in (
    (zh, './data/mini_zh_en/train.zh'),
    (en, './data/mini_zh_en/train.en'),
):
    write_data('\n'.join(sentences), out_path)

In [16]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as padded id sequences.

    Args:
        lang: List of sentences for one language.

    Returns:
        A tuple ``(tensor, lang_tokenize)``: the integer-encoded sentences,
        padded at the end (``padding='post'``), and the fitted tokenizer.
    """
    # An empty filter string keeps the tokenizer from stripping any
    # characters, so markers such as <start> / <end> stay intact.
    fitted = tf.keras.preprocessing.text.Tokenizer(filters='')
    fitted.fit_on_texts(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        fitted.texts_to_sequences(lang),
        padding='post',
    )
    return padded, fitted

In [18]:
def load_dataset(num_examples):
    """Build tokenized Chinese and English data for ``num_examples`` pairs.

    Args:
        num_examples: Number of sentence pairs, passed to ``create_dataset``.

    Returns:
        A tuple ``(zh_tensor, en_tensor, zh_tokenize, en_tokenize)``: the
        ``tokenize`` output for the Chinese corpus and for the English one.
    """
    # create_dataset returns (zh, en); tokenize each in that order.
    (zh_ids, zh_tok), (en_ids, en_tok) = (
        tokenize(corpus) for corpus in create_dataset(num_examples)
    )
    return zh_ids, en_ids, zh_tok, en_tok

In [None]:
num_examples