In [None]:
import os
import json
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras import callbacks
import numpy as np

# --- Training configuration ---------------------------------------------------
batch_size = 16
epochs = 25
latent_dim = 256  # number of units in each LSTM layer
num_samples = 1000  # maximum number of sentence pairs used for training

# Character vocabularies of the source (English) and target (Chinese) side.
en_characters = set()
ch_characters = set()

# --- Load the parallel corpus ---------------------------------------------------
# The file holds one JSON object per line with the keys 'english' and 'chinese'.
# Reading is capped at num_samples pairs: the one-hot tensors built below grow
# with samples * sequence length * vocabulary size and would not fit in memory
# for the whole corpus.
en_txt = []
ch_txt = []
txt = []
with open('/content/drive/MyDrive/NatureLanguageProgram/training_data/translation2019zh_valid.json',
          encoding='utf-8') as file:
    for line in file:
        if len(txt) >= num_samples:
            break
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)
        en_txt.append(data['english'])
        # '\t' marks the start and '\n' the end of every target sentence, so
        # the decoder has a known first input and a learnable stop signal.
        ch_txt.append('\t' + data['chinese'] + '\n')
        txt.append(data)

if not en_txt:
    raise ValueError('No training samples were loaded from the corpus file.')

# --- Build the character vocabularies -------------------------------------------
# set.update() with a string adds each of its characters individually, so the
# vocabularies contain characters rather than whole sentences.
for sentence in en_txt:
    en_characters.update(sentence)
for sentence in ch_txt:
    ch_characters.update(sentence)

input_characters = sorted(en_characters)
target_characters = sorted(ch_characters)
num_encoder_tokens = len(en_characters)
num_decoder_tokens = len(ch_characters)
max_encoder_seq_length = max(len(sentence) for sentence in en_txt)
max_decoder_seq_length = max(len(sentence) for sentence in ch_txt)

print('Nunmber of samples:', len(en_txt))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length of input:', max_encoder_seq_length)
print('Max sequence length of outputs:', max_decoder_seq_length)

# Map every character to its position along the one-hot axis.
en_token_index = {char: i for i, char in enumerate(input_characters)}
ch_token_index = {char: i for i, char in enumerate(target_characters)}

# --- One-hot encode the sentences -----------------------------------------------
# Each tensor has shape (samples, max sequence length, vocabulary size);
# positions past the end of a sentence stay all-zero.
encoder_input_data = np.zeros((len(en_txt), max_encoder_seq_length, num_encoder_tokens), dtype=np.float32)
decoder_input_data = np.zeros((len(en_txt), max_decoder_seq_length, num_decoder_tokens), dtype=np.float32)
decoder_target_data = np.zeros((len(en_txt), max_decoder_seq_length, num_decoder_tokens), dtype=np.float32)

# Distinct loop names keep the corpus lists en_txt / ch_txt intact.
for i, (en_sentence, ch_sentence) in enumerate(zip(en_txt, ch_txt)):
    # One-hot encode the encoder input sequence.
    for t, char in enumerate(en_sentence):
        encoder_input_data[i, t, en_token_index[char]] = 1.0

    # One-hot encode the decoder input and target sequences.
    for t, char in enumerate(ch_sentence):
        decoder_input_data[i, t, ch_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data omits the start character and is therefore
            # one timestep ahead of decoder_input_data.
            decoder_target_data[i, t - 1, ch_token_index[char]] = 1.0

# Encoder input: sequences of one-hot vectors of width num_encoder_tokens.
# The None time dimension lets the model accept sequences of any length.
encoder_inputs = Input(shape=(None, num_encoder_tokens))

# Encoder LSTM, configured to return its final states alongside its output.
encoder = LSTM(latent_dim, return_state=True)

# Run the encoder. The first returned value is its output (not needed later);
# the remaining two, state_h and state_c, are collected into encoder_state.
encoder_outpus, *encoder_state = encoder(encoder_inputs)
state_h, state_c = encoder_state

# Decoder input: again None allows sequences of any length.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# Decoder LSTM. It returns the full output sequence plus its internal states;
# the states are unused during training but are needed at inference time.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# The encoder's final states seed the decoder's initial state.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_state)

# Fully connected softmax layer applied to the decoder output sequence.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Complete training model: (encoder input, decoder input) -> decoder output.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Early stopping is available but currently disabled:
# callback_list = [callbacks.EarlyStopping(patience=10)]

# Compile the model.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# Train, holding out the last 20% of the samples for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

# Save the trained model.
model.save('s2s_2.h5')


Nunmber of samples: 39323
Number of unique input tokens: 39323
Number of unique output tokens: 39319
Max sequence length of input: 373
Max sequence length of outputs: 197
