In [1]:
import keras
import json
from datetime import datetime
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import pickle

Using TensorFlow backend.


In [2]:
def _unpickle(path):
    """Open `path` in binary mode and return the object pickled inside it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the three pickled objects the rest of the notebook works with.
# NOTE: pickle can execute arbitrary code on load; only use files you produced.
pad_encode_text = _unpickle('pad_encode_text.pk')
pad_decode_text = _unpickle('pad_decode_text.pk')
tokenizer = _unpickle('tokenizer.pk')

In [3]:
# Notebook-wide hyperparameters.

# -- sequence lengths --
max_length = 20        # time steps of both model inputs and of the one-hot targets
min_length = 2         # not referenced by the cells below

# -- vocabulary / embedding --
VOC_SIZE = 10000       # embedding matrix and softmax are sized VOC_SIZE + 1
EMBEDDING_DIM = 100    # must match the width of the GloVe vectors that are loaded

# -- network --
HIDDEN_DIM = 256       # units in every LSTM layer

# -- training --
epochs = 100           # full passes over the data in the manual training loop
batch_size = 64        # samples per train_on_batch call
fit_size = 20000       # not referenced by the cells below

In [4]:
# Parse the GloVe text file into a {word: vector} lookup.
# Each line is split on whitespace: the first field is the word, the remaining
# fields are converted to a float32 vector.
embeddings_index = {}
with open("./glove/glove.6B.100d.txt", 'r',  encoding="utf8") as glove_file:
    for record in glove_file:
        fields = record.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [5]:
# Build the embedding matrix: row i holds the pretrained vector of the word
# whose tokenizer index is i. Rows for words that have no pretrained vector
# stay all-zero.
embedding_matrix = np.zeros((VOC_SIZE + 1, EMBEDDING_DIM))
count = 0  # how many vocabulary words were found in embeddings_index

# The matrix has VOC_SIZE + 1 rows, so the valid word indices are
# 1..VOC_SIZE inclusive; the previous upper bound (VOC_SIZE, exclusive) could
# never fill the last row. Row 0 is deliberately skipped
# (presumably the padding id -- confirm against how the sequences were padded).
for i in range(1, VOC_SIZE + 1):
    # .get() instead of [] so a fitted vocabulary smaller than VOC_SIZE does
    # not raise KeyError; the missing rows simply stay zero.
    word = tokenizer.index_word.get(i)
    if word is None:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        count += 1
        embedding_matrix[i] = embedding_vector

In [6]:
from keras.layers import Embedding
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.layers import TimeDistributed
from keras.layers import concatenate

In [7]:
# Embedding layer initialised from embedding_matrix and frozen
# (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(
    input_dim=VOC_SIZE + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=False,
)


W1212 00:03:30.261675  3236 deprecation_wrapper.py:119] From C:\others\anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [8]:
# ---- Encoder ----
# Takes integer sequences of length max_length, embeds them with the shared
# embedding_layer and runs them through three stacked LSTMs.
encoder_inputs = Input(shape=(max_length, ))
encoder_embedding = embedding_layer(encoder_inputs)
# The first two layers return the full sequence so the next LSTM can consume it.
encoder_LSTM_1 = LSTM(HIDDEN_DIM, return_sequences=True)(encoder_embedding)
encoder_LSTM_2 = LSTM(HIDDEN_DIM, return_sequences=True)(encoder_LSTM_1)
# The last encoder layer also returns its final hidden and cell state;
# encoder_outputs itself is not used further in this cell.
encoder_LSTM_3 = LSTM(HIDDEN_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder_LSTM_3(encoder_LSTM_2)
encoder_states = [state_h, state_c]

# ---- Decoder ----
# Same input length and the same (shared) embedding layer as the encoder.
decoder_inputs = Input(shape=(max_length, ))
decoder_embedding = embedding_layer(decoder_inputs)
# Only the first decoder LSTM is initialised with the encoder's final states;
# the two layers above it start from their default initial state.
decoder_LSTM_1 = LSTM(HIDDEN_DIM, return_sequences=True)(decoder_embedding, initial_state=encoder_states)
decoder_LSTM_2 = LSTM(HIDDEN_DIM, return_sequences=True)(decoder_LSTM_1)
# The layer object is kept in a variable; its returned states are discarded here.
decoder_LSTM_3 = LSTM(HIDDEN_DIM, return_sequences=True, return_state=True)
outputs, _, _, = decoder_LSTM_3(decoder_LSTM_2)
# Softmax over VOC_SIZE + 1 classes applied to the decoder output sequence.
decoder_dense = Dense(VOC_SIZE+1, activation='softmax')
decoder_outputs = decoder_dense(outputs)

W1212 00:04:09.286657  3236 deprecation_wrapper.py:119] From C:\others\anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1212 00:04:09.288625  3236 deprecation_wrapper.py:119] From C:\others\anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1212 00:04:09.296632  3236 deprecation_wrapper.py:119] From C:\others\anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W1212 00:04:09.297627  3236 deprecation_wrapper.py:119] From C:\others\anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.



In [9]:
# Training model: maps (encoder_inputs, decoder_inputs) to the decoder's
# softmax output. The loss is categorical cross-entropy, so targets must be
# one-hot encoded.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[decoder_outputs],
)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()

W1212 00:04:15.861125  3236 deprecation_wrapper.py:119] From C:\others\anaconda\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 100)      1000100     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 20, 256)      365568      embedding_1[0][0]                
__________

In [None]:
from tqdm import tqdm_notebook

# Manual mini-batch training loop. One-hot targets are built per batch instead
# of once for the whole data set, since each batch's target array already has
# size * max_length * (VOC_SIZE + 1) entries.

# Ceiling division: keeps a final partial batch but, unlike
# int(n / batch_size) + 1, does not yield an extra *empty* batch when the
# number of samples is an exact multiple of batch_size.
batchCount = (pad_decode_text.shape[0] + batch_size - 1) // batch_size
for e in range(1, epochs+1):
    print('-'*15, 'Epoch %d' % e, '-'*15)
    for i in tqdm_notebook(range(batchCount)):
        pad_encode_text_sample = pad_encode_text[i*batch_size:(i+1)*batch_size]
        pad_decode_text_sample = pad_decode_text[i*batch_size:(i+1)*batch_size]
        size = len(pad_decode_text_sample)
        one_hot_target_text = np.zeros((size, max_length, VOC_SIZE+1), dtype='int')
        # The target at time step t is the decoder token at step t + 1: the
        # targets are the decoder inputs shifted left by one position, and the
        # last time step is left all-zero.
        # Indexed assignment equivalent to
        #   one_hot_target_text[k][j-1][pad_decode_text_sample[k][j]] = 1  for j > 0
        # (assumes pad_decode_text is a 2-D integer array, as the .shape
        # access above suggests -- TODO confirm).
        next_tokens = pad_decode_text_sample[:, 1:]
        sample_idx = np.arange(size)[:, None]
        step_idx = np.arange(next_tokens.shape[1])[None, :]
        one_hot_target_text[sample_idx, step_idx, next_tokens] = 1
        model.train_on_batch([pad_encode_text_sample, pad_decode_text_sample], one_hot_target_text)

In [None]:
# Write the model to 's2s_3lstm_voc10000.h5' (path relative to the working
# directory); an existing file of that name would be replaced -- confirm
# against the Keras version in use.
model.save('s2s_3lstm_voc10000.h5')