In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import os
import requests
import json
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras import callbacks
import numpy as np

batch_size = 64
epochs = 100
latent_dim = 256 # LSTM 的单元个数
num_samples = 1000 # 训练样本的大小

en_characters = set()
ch_characters = set()

# Opening JSON file
file = open('/content/new2.json',encoding='utf-8')
#file = open('translation2019zh_valid.json', encoding='utf-8')
en_txt = []
ch_txt = []
txt = []
for line in file.readlines():
  data = json.loads(line)
  en_txt.append(data['english'])
  ch_txt.append(data['chinese'])
  txt.append(data)

# for i in ch_txt:
#   print(i)
# for i in en_txt:
#   top = jieba.analyse.extract_tags(i)
#   en.append(top)
# for i in ch_txt:
#   top = jieba.analyse.extract_tags(i)
#   zh.append(top)

for word in en_txt:
  # print("word = ")
  # print(word)
  for char in word:
    # print("char = ")
    # print(char)
    if char not in en_characters:
      en_characters.add(char)

for word in ch_txt:
  for char in word:
    if char not in ch_characters:
      ch_characters.add(char)
    # print("in char in ch_txt ")
    # print(char)   
input_characters = sorted(list(en_characters))
target_characters = sorted(list(ch_characters))
num_encoder_tokens = len(en_characters)
num_decoder_tokens = len(ch_characters)
max_encoder_seq_length = max([ len(txt) for txt in en_txt])
max_decoder_seq_length = max([ len(txt) for txt in ch_txt])

print('Nunmber of samples:', len(en_txt))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length of input:', max_encoder_seq_length)
print('Max sequence length of outputs:', max_decoder_seq_length)

en_token_index = dict( [(char, i)for i, char in enumerate(input_characters)] )
ch_token_index = dict( [(char, i) for i, char in enumerate(target_characters)] )

encoder_input_data = np.zeros((len(en_txt), max_encoder_seq_length, num_encoder_tokens), dtype=np.float32)
decoder_input_data = np.zeros((len(en_txt), max_decoder_seq_length, num_decoder_tokens), dtype=np.float32)
decoder_target_data = np.zeros((len(en_txt), max_decoder_seq_length, num_decoder_tokens), dtype=np.float32)

for i, (en_txt, ch_txt) in enumerate(zip(en_txt, ch_txt)):
    # 对编码器的输入序列做one-hot
  for t, char in enumerate(en_txt):
    encoder_input_data[i,t,en_token_index[char]] = 1.0
    # 对解码器的输入与输出做序列做one-hot
  for t, char in enumerate(ch_txt):
    decoder_input_data[i,t,ch_token_index[char]] = 1.0
    if t > 0:
      # decoder_target_data 不包含开始字符，并且比decoder_input_data提前一步
      decoder_target_data[i, t-1, ch_token_index[char]] = 1.0

# 定义编码器的输入
# encoder_inputs (None, num_encoder_tokens), None表示可以处理任意长度的序列
encoder_inputs = Input(shape=(None, num_encoder_tokens))

# 编码器，要求其返回状态
encoder = LSTM(latent_dim, return_state=True)

# 调用编码器，得到编码器的输出（输入其实不需要），以及状态信息 state_h 和 state_c
encoder_outpus, state_h, state_c = encoder(encoder_inputs)

# 丢弃encoder_outputs, 我们只需要编码器的状态
encoder_state = [state_h, state_c]

# 定义解码器的输入
# 同样的，None表示可以处理任意长度的序列
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# 接下来建立解码器，解码器将返回整个输出序列
# 并且返回其中间状态，中间状态在训练阶段不会用到，但是在推理阶段将是有用的
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# 将编码器输出的状态作为初始解码器的初始状态
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_state)

# 添加全连接层
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# 定义整个模型
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# 定义回调函数
#callback_list = [callbacks.EarlyStopping(patience=10)]
# 编译模型
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# 训练
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs = epochs,
          validation_split=0.2)
# 保存模型
model.save('s2s_2.h5')


Nunmber of samples: 1000
Number of unique input tokens: 113
Number of unique output tokens: 2400
Max sequence length of input: 218
Max sequence length of outputs: 92
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/10

In [None]:
from keras.models import Model

In [3]:
import os
import requests
import json
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras import callbacks
import numpy as np

batch_size = 64
epochs = 100
latent_dim = 256 # LSTM 的单元个数
num_samples = 1000 # 训练样本的大小

en_characters = set()
ch_characters = set()

# Opening JSON file
file = open('/content/new2.json',encoding='utf-8')
#file = open('translation2019zh_valid.json', encoding='utf-8')
en_txt = []
ch_txt = []
txt = []
for line in file.readlines():
  data = json.loads(line)
  en_txt.append(data['english'])
  ch_txt.append(data['chinese'])
  txt.append(data)

# for i in ch_txt:
#   print(i)
# for i in en_txt:
#   top = jieba.analyse.extract_tags(i)
#   en.append(top)
# for i in ch_txt:
#   top = jieba.analyse.extract_tags(i)
#   zh.append(top)

for word in en_txt:
  # print("word = ")
  # print(word)
  for char in word:
    # print("char = ")
    # print(char)
    if char not in en_characters:
      en_characters.add(char)

for word in ch_txt:
  for char in word:
    if char not in ch_characters:
      ch_characters.add(char)
    # print("in char in ch_txt ")
    # print(char)   
input_characters = sorted(list(en_characters))
target_characters = sorted(list(ch_characters))
num_encoder_tokens = len(en_characters)
num_decoder_tokens = len(ch_characters)
max_encoder_seq_length = max([ len(txt) for txt in en_txt])
max_decoder_seq_length = max([ len(txt) for txt in ch_txt])

print('Nunmber of samples:', len(en_txt))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length of input:', max_encoder_seq_length)
print('Max sequence length of outputs:', max_decoder_seq_length)

en_token_index = dict( [(char, i)for i, char in enumerate(input_characters)] )
ch_token_index = dict( [(char, i) for i, char in enumerate(target_characters)] )

encoder_input_data = np.zeros((len(en_txt), max_encoder_seq_length, num_encoder_tokens), dtype=np.float32)
decoder_input_data = np.zeros((len(en_txt), max_decoder_seq_length, num_decoder_tokens), dtype=np.float32)
decoder_target_data = np.zeros((len(en_txt), max_decoder_seq_length, num_decoder_tokens), dtype=np.float32)

for i, (en_txt, ch_txt) in enumerate(zip(en_txt, ch_txt)):
    # 对编码器的输入序列做one-hot
  for t, char in enumerate(en_txt):
    encoder_input_data[i,t,en_token_index[char]] = 1.0
    # 对解码器的输入与输出做序列做one-hot
  for t, char in enumerate(ch_txt):
    decoder_input_data[i,t,ch_token_index[char]] = 1.0
    if t > 0:
      # decoder_target_data 不包含开始字符，并且比decoder_input_data提前一步
      decoder_target_data[i, t-1, ch_token_index[char]] = 1.0

# 定义编码器的输入
# encoder_inputs (None, num_encoder_tokens), None表示可以处理任意长度的序列
encoder_inputs = Input(shape=(None, num_encoder_tokens))

# 编码器，要求其返回状态
encoder = LSTM(latent_dim, return_state=True)

# 调用编码器，得到编码器的输出（输入其实不需要），以及状态信息 state_h 和 state_c
encoder_outpus, state_h, state_c = encoder(encoder_inputs)

# 丢弃encoder_outputs, 我们只需要编码器的状态
encoder_state = [state_h, state_c]

# 定义解码器的输入
# 同样的，None表示可以处理任意长度的序列
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# 接下来建立解码器，解码器将返回整个输出序列
# 并且返回其中间状态，中间状态在训练阶段不会用到，但是在推理阶段将是有用的
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# 将编码器输出的状态作为初始解码器的初始状态
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_state)

# 添加全连接层
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# 定义整个模型
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# 定义回调函数
#callback_list = [callbacks.EarlyStopping(patience=10)]
# 编译模型
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# 训练
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs = epochs,
          validation_split=0.2)
# 保存模型
model.save('s2s_2.h5')



# 定义 sampling 模型
# 定义 encoder 模型，得到输出encoder_states
encoder_model = Model(encoder_inputs, encoder_state)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# 得到解码器的输出以及中间状态
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs]+decoder_states)

# 建立 数字->字符 的字典，用于恢复
reverse_input_char_index = dict([(i, char) for char, i in input_token_index.items()])
reverse_target_char_index = dict([(i, char) for char, i in target_token_index.items()])

def decode_sequence(input_seq):
    # 将输入序列进行编码
    states_value = encoder_model.predict(input_seq)
    
    # 生成一个size=1的空序列
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # 将这个空序列的内容设置为开始字符
    target_seq[0, 0, target_token_index['\t']] = 1.
    
    # 进行字符恢复
    # 简单起见，假设batch_size = 1
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        # sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char
        
        # 退出条件：生成 \n 或者 超过最大序列长度
        if sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length :
            stop_condition = True
            
        # 更新target_seq
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        
        # 更新中间状态
        states_value = [h, c]
        
    return decoded_sentence

for seq_index in range(1000, 1100):
    # batch_size = 1
    input_seq = encoder_input_data[seq_index:seq_index+1]
    decoded_sentence = decode_sequence(input_seq)
    
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)


Nunmber of samples: 1000
Number of unique input tokens: 113
Number of unique output tokens: 2400
Max sequence length of input: 218
Max sequence length of outputs: 92
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/10

NameError: ignored