In [26]:
import numpy as np
import pandas as pd

from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import optimizers

In [27]:
# File holding the sentence pairs (the load cell reads this same file name
# with a tab separator).
data_path = 'cmn.txt'
# Number of sentence pairs per batch passed to data_generator.
batch_size = 64

In [28]:
# Load the tab-separated sentence pairs from the configured path. The file has
# no header row, so the column names are supplied here. The "chinses" spelling
# is kept on purpose: later cells index the frame by that exact name.
data = pd.read_csv(data_path, sep='\t', header=None, names=["english", "chinses"])
# Keep the first 10000 pairs. The explicit copy makes `data` an independent
# frame, so the column assignments in later cells do not write into a slice of
# the full table (which triggers pandas' SettingWithCopyWarning).
data = data[:10000].copy()
print(data.shape)
data.head()

(10000, 2)


Unnamed: 0,english,chinses
0,Hi.,嗨。
1,Hi.,你好。
2,Run.,你用跑的。
3,Wait!,等等！
4,Hello!,你好。


In [29]:
# Wrap every sentence of both languages in the sequence markers:
# a tab plus space in front, a space plus newline behind.
for column in ("english", "chinses"):
    data[column] = data[column].map(lambda sentence: "\t " + sentence + " \n")

In [30]:
def word_tokenizer(df):
    """Fit a character-level Keras ``Tokenizer`` on a collection of sentences.

    Parameters
    ----------
    df : object exposing ``.values``
        The texts to tokenize; in this notebook a single DataFrame column.

    Returns
    -------
    tuple
        ``(tokenizer, dtexts, vocab_size, maxlen, index_word, word_index)``:
        the fitted tokenizer, the texts converted to lists of token ids, the
        number of distinct tokens plus one, the length of the longest id
        sequence, an id -> token dict, and a token -> id dict.
    """
    texts = df.values

    tokenizer = Tokenizer(num_words=8000, char_level=True)
    tokenizer.fit_on_texts(texts)
    dtexts = tokenizer.texts_to_sequences(texts)

    vocab = tokenizer.word_index
    # One more than the number of distinct tokens -- presumably so that id 0
    # stays free for padding; confirm against the Tokenizer documentation.
    vocab_size = len(vocab) + 1
    maxlen = np.max([len(sequence) for sequence in dtexts])

    # Reverse lookup (id -> token) and an independent copy of the forward one.
    index_word = {token_id: token for token, token_id in vocab.items()}
    word_index = dict(vocab)

    return tokenizer, dtexts, vocab_size, maxlen, index_word, word_index

In [31]:
# Tokenize the English column, keep the fitted tokenizer and its lookups, and
# store the id sequences next to the raw text.
(
    tokenizer_eng,
    dtexts_eng,
    vocab_size_english,
    maxlen_english,
    index_word_english,
    word_index_english,
) = word_tokenizer(data["english"])
data["english_token"] = dtexts_eng
vocab_size_english, maxlen_english

(76, 34)

In [32]:
# Tokenize the Chinese column the same way and store its id sequences.
(
    tokenizer_chi,
    dtexts_chi,
    vocab_size_chinses,
    maxlen_chinses,
    index_word_chinses,
    word_index_chinses,
) = word_tokenizer(data["chinses"])
data["chinese_token"] = dtexts_chi
vocab_size_chinses, maxlen_chinses

(2624, 24)

In [33]:
# Preview the frame, which now also carries the english_token and
# chinese_token id-sequence columns.
data.head()

Unnamed: 0,english,chinses,english_token,chinese_token
0,\t Hi. \n,\t 嗨。 \n,"[8, 1, 31, 7, 13, 1, 9]","[2, 1, 1265, 4, 1, 3]"
1,\t Hi. \n,\t 你好。 \n,"[8, 1, 31, 7, 13, 1, 9]","[2, 1, 7, 26, 4, 1, 3]"
2,\t Run. \n,\t 你用跑的。 \n,"[8, 1, 57, 16, 10, 13, 1, 9]","[2, 1, 7, 117, 261, 6, 4, 1, 3]"
3,\t Wait! \n,\t 等等！ \n,"[8, 1, 32, 5, 7, 4, 44, 1, 9]","[2, 1, 187, 187, 90, 1, 3]"
4,\t Hello! \n,\t 你好。 \n,"[8, 1, 31, 2, 14, 14, 3, 44, 1, 9]","[2, 1, 7, 26, 4, 1, 3]"


In [34]:
def get_seq_data(row):
    """Expand one tokenized sentence pair into per-step training arrays.

    Reads the module-level ``maxlen_english``, ``maxlen_chinses`` and
    ``vocab_size_chinses``.

    Parameters
    ----------
    row : mapping
        Must provide ``"english_token"`` and ``"chinese_token"`` id lists.

    Returns
    -------
    tuple of lists
        ``(eng_x, chi_x, chi_y)``. ``eng_x`` has ``maxlen_english - 1``
        entries and ``chi_x`` / ``chi_y`` have ``maxlen_chinses - 1`` entries.
        Entry ``k`` (0-based) holds the first ``k + 1`` tokens run through
        ``pad_sequences`` to the language's max length; for Chinese, ``chi_y``
        holds the one-hot encoding of the token that follows that prefix.
        Steps at or beyond the sentence length are filled with all-zero
        vectors (and all-zero targets).
    """
    source_tokens = row["english_token"]
    target_tokens = row["chinese_token"]

    # English side: growing prefixes of the source sentence.
    eng_x = []
    for step in range(1, maxlen_english):
        if step >= len(source_tokens):
            eng_x.append(np.zeros(maxlen_english))
            continue
        padded = pad_sequences([source_tokens[:step]], maxlen=maxlen_english)
        eng_x.append(padded.flatten())

    # Chinese side: growing prefixes as input, the next token as target.
    chi_x, chi_y = [], []
    for step in range(1, maxlen_chinses):
        if step >= len(target_tokens):
            chi_x.append(np.zeros(maxlen_chinses))
            chi_y.append(np.zeros(vocab_size_chinses))
            continue
        padded = pad_sequences([target_tokens[:step]], maxlen=maxlen_chinses)
        next_token = to_categorical(target_tokens[step], num_classes=vocab_size_chinses)
        chi_x.append(padded.flatten())
        chi_y.append(next_token)

    return eng_x, chi_x, chi_y

In [35]:
def data_generator(df, batch_size):
    """Endlessly yield training batches built from the rows of ``df``.

    Each row is expanded with ``get_seq_data``; once ``batch_size`` rows have
    been collected they are stacked into arrays and yielded. The generator
    cycles over ``df`` forever, and rows left over at the end of one pass are
    carried into the first batch of the next pass.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the ``english_token`` and ``chinese_token`` columns that
        ``get_seq_data`` reads.
    batch_size : int
        Number of rows per yielded batch; must be at least 1.

    Yields
    ------
    list
        ``[[batch_eng_x, batch_chi_x], batch_chi_y]`` as numpy arrays.

    Raises
    ------
    ValueError
        On the first ``next()`` call if ``batch_size`` is smaller than 1 or
        ``df`` has no rows. Without these checks the loop below would spin
        forever without ever yielding.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))
    if len(df) == 0:
        raise ValueError("data_generator needs at least one row")

    batch_eng_x, batch_chi_x, batch_chi_y = [], [], []
    while True:
        for _, row in df.iterrows():
            eng_x, chi_x, chi_y = get_seq_data(row)
            batch_eng_x.append(eng_x)
            batch_chi_x.append(chi_x)
            batch_chi_y.append(chi_y)

            if len(batch_eng_x) == batch_size:
                yield [[np.array(batch_eng_x), np.array(batch_chi_x)],
                       np.array(batch_chi_y)]
                batch_eng_x, batch_chi_x, batch_chi_y = [], [], []

In [36]:
# Stand-alone generator used by the next cells to inspect a single batch.
gerato = data_generator(data, batch_size)

In [37]:
# Pull one batch: a == [[batch_eng_x, batch_chi_x], batch_chi_y].
a = next(gerato)

In [38]:
# Shapes of the English input, Chinese input and Chinese target arrays
# of that batch.
a[0][0].shape, a[0][1].shape, a[1].shape

((64, 33, 34), (64, 23, 24), (64, 23, 2624))

In [39]:
# Encoder-decoder training model. Each timestep fed to either LSTM is one
# padded prefix vector from get_seq_data, which is why the feature size of
# the inputs is the maximum sentence length of the respective language.
latent_dim = 256

# Encoder: only its final hidden and cell states are passed on.
encoder_inputs = Input(shape=(None, maxlen_english))
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)

# Decoder: starts from the encoder states and emits one output per timestep.
decoder_inputs = Input(shape=(None, maxlen_chinses))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=[state_h, state_c])

# Per-timestep softmax over the Chinese vocabulary.
decoder_dense = Dense(vocab_size_chinses, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [40]:
# Layer-by-layer overview of the freshly built training model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, None, 34)     0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, None, 24)     0                                            
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, 256), (None, 297984      input_7[0][0]                    
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, None, 256),  287744      input_8[0][0]                    
                                                                 lstm_3[0][1]                     
          

In [42]:
# Compile with the canonical RMSprop class name rather than the lowercase alias.
model.compile(optimizer=optimizers.RMSprop(lr=1e-3),
              loss='categorical_crossentropy')

# steps_per_epoch has to be a whole number of batches; plain division gave a
# float (10000 / 64 = 156.25). Rounding up keeps every row covered at least
# once per epoch -- the generator carries leftover rows into its next pass.
steps_per_epoch = int(np.ceil(data.shape[0] / batch_size))

model_history = model.fit_generator(data_generator(data, batch_size),
                                    steps_per_epoch=steps_per_epoch,
                                    epochs=1,
                                    verbose=2)

Epoch 1/1
 - 13s - loss: 1.8816


In [43]:
# NOTE(review): the save call below is commented out, so nothing in this
# notebook writes 's2sv5.h5'; the file must already exist on disk for a fresh
# run to get past this cell.
#  model.save('s2sv5.h5')
# Rebinding `model` here replaces the network trained in the previous cell
# with whatever is stored in the checkpoint.
model = load_model('s2sv5.h5')

In [44]:
# Overview of the model restored from the checkpoint.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
eng_input (InputLayer)          (None, None, 34)     0                                            
__________________________________________________________________________________________________
chi_input (InputLayer)          (None, None, 24)     0                                            
__________________________________________________________________________________________________
endoder (LSTM)                  [(None, 256), (None, 297984      eng_input[0][0]                  
__________________________________________________________________________________________________
decoder (LSTM)                  [(None, None, 256),  287744      chi_input[0][0]                  
                                                                 endoder[0][1]                    
          

In [45]:
# load model version
# Build the inference-time encoder and decoder from the loaded network.
# Layers are picked by position in `model.layers`: [2] is the encoder LSTM and
# [3] the decoder LSTM (see the summary above); [4] is assumed to be the final
# Dense softmax layer -- TODO confirm, the printed summary is cut off.
latent_dim = 256

# Encoder model: sentence in, final hidden and cell state out.
encoder_inputs = model.input[0]
_, encoder_statesh, encoder_statesc = model.layers[2].output
encoder_model = Model(encoder_inputs, [encoder_statesh, encoder_statesc])

# Decoder model: takes the previous states as explicit inputs so decoding can
# proceed one step at a time. Both state inputs are created here; the cell
# used to reference a `decoder_state_input_c` that no executed cell defines,
# which raises NameError on a fresh kernel.
decoder_inputs = model.input[1]
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = model.layers[3](decoder_inputs, initial_state=decoder_states_inputs)

decoder_outputs = model.layers[4](decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + [state_h, state_c])

In [None]:
# encoder_model = Model(encoder_inputs, [encoder_states])
# decoder_state_input_h = Input(shape=(latent_dim,))
# decoder_state_input_c = Input(shape=(latent_dim,))
# decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)

# decoder_states = [state_h, state_c]
# decoder_outputs = decoder_dense(decoder_outputs)
# decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)

In [47]:
def decode_sequence(input_seq):
    r"""Greedily decode one English sentence into Chinese characters.

    Uses the module-level ``encoder_model``, ``decoder_model``,
    ``tokenizer_chi``, ``index_word_chinses`` and ``maxlen_chinses``.

    Parameters
    ----------
    input_seq : sequence of 1-D arrays
        Encoder input for a single sentence, i.e. the first element returned
        by ``get_seq_data(row)``.

    Returns
    -------
    str
        The generated characters only (the start marker is not included).
        Generation stops after a ``"\n"`` is produced, once more than
        ``maxlen_chinses`` characters were generated, or when the model picks
        an id that has no vocabulary entry.
    """
    states_value = encoder_model.predict(np.array([input_seq]))

    # During training every decoder input is a prefix of a sentence that
    # begins with the "\t" marker, so the prefix fed at inference time must
    # begin with it too. The previous code assigned "\t" and then immediately
    # overwrote it with '', priming the decoder with an all-zero step instead.
    start_marker = "\t"
    decoded_sentence = ''

    while True:
        prefix_ids = tokenizer_chi.texts_to_sequences([start_marker + decoded_sentence])[0]
        step_input = np.array([pad_sequences([prefix_ids], maxlen_chinses)])

        output_tokens, h, c = decoder_model.predict([step_input] + states_value)

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Ids without a vocabulary entry (e.g. 0, which the tokenizer never
        # assigns) used to raise KeyError; treat them as end of output.
        sampled_char = index_word_chinses.get(sampled_token_index)
        if sampled_char is None:
            break

        decoded_sentence += sampled_char
        if sampled_char == '\n' or len(decoded_sentence) > maxlen_chinses:
            break

        # Carry the recurrent state over to the next decoding step.
        states_value = [h, c]

    return decoded_sentence

In [49]:
# Decode the first five sentence pairs and print them next to their source.
for seq_index in range(5):
    row = data.iloc[seq_index]
    eng_x = get_seq_data(row)[0]  # only the encoder input is needed here
    decoded_sentence = decode_sequence(eng_x).strip()
    print('Input sentence:', row["english"].strip())
    try:
        print('Decoded sentence:', decoded_sentence)
    except UnicodeEncodeError:
        # The output stream cannot represent the characters; fall back to an
        # ASCII rendering. Any other error is now allowed to surface instead
        # of being swallowed by a bare except.
        print('Decoded sentence:', decoded_sentence.encode('ascii', 'replace'))

Input sentence: Hi.
Decoded sentence: 你好。
Input sentence: Hi.
Decoded sentence: 你好。
Input sentence: Run.
Decoded sentence: 你用跑的。
Input sentence: Wait!
Decoded sentence: 等等！
Input sentence: Hello!
Decoded sentence: 你好。
