In [1]:
import keras
import json
from datetime import datetime
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import pickle

Using TensorFlow backend.


In [3]:
# Load the three pickled objects the rest of the notebook works on.
# NOTE: unpickling can execute arbitrary code; only open files you trust.
def _unpickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


pad_encode_text = _unpickle('pad_encode_text.pk')
pad_decode_text = _unpickle('pad_decode_text.pk')
tokenizer = _unpickle('tokenizer.pk')

In [4]:
# Hyper-parameters and sizes shared by the cells below.
min_length = 2  # not referenced in the visible cells; presumably a preprocessing bound - TODO confirm
max_length = 20  # sequence length of both model inputs and of the one-hot targets
VOC_SIZE = 10000  # vocabulary size; the embedding matrix and the softmax use VOC_SIZE + 1 entries
EMBEDDING_DIM = 100  # width of each embedding vector (the GloVe file read below is the 100d one)
HIDDEN_DIM = 256  # number of units in the encoder and decoder LSTMs
batch_size = 64  # mini-batch size passed to model.fit
epochs = 100  # number of passes of the outer training loop
fit_size = 20000  # number of samples in the chunk handed to each model.fit call

In [5]:
# Parse the GloVe text file into a {word: float32 vector} lookup.
# Each line is split on whitespace: first field is the word, the rest
# are the vector components.
embeddings_index = {}
with open("./glove/glove.6B.100d.txt", 'r', encoding="utf8") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [8]:
# generate embedding matrix
# Row i receives the GloVe vector of the word whose tokenizer index is i.
# Row 0 and the rows of words that GloVe does not contain stay all-zero.
embedding_matrix = np.zeros((VOC_SIZE + 1, EMBEDDING_DIM))
count = 0  # number of vocabulary words that received a pre-trained vector
# Visit every row the matrix actually has (1..VOC_SIZE inclusive). Stopping at
# VOC_SIZE - 1 would leave the last row zero, and because the embedding layer
# built from this matrix is created with trainable=False it could never be
# learned afterwards. If the tokenizer never emits index VOC_SIZE the extra
# row is simply unused.
for i in range(1, VOC_SIZE + 1):
    # .get() rather than [] so a tokenizer that knows fewer than VOC_SIZE
    # words does not abort the loop with a KeyError.
    vocab_word = tokenizer.index_word.get(i)
    if vocab_word is None:
        continue
    embedding_vector = embeddings_index.get(vocab_word)
    if embedding_vector is not None:
        count += 1
        embedding_matrix[i] = embedding_vector
# Report how many rows were filled from GloVe (previously computed but never shown).
print(count)

9539


In [6]:
from keras.layers import Embedding
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.layers import TimeDistributed
from keras.layers import concatenate

In [11]:
# Non-trainable embedding layer initialised from embedding_matrix.
embedding_layer = Embedding(
    input_dim=VOC_SIZE + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=False,
)

In [12]:
# Encoder: embed the input sequence and run it through an LSTM.
# return_state=True makes the LSTM also return its final hidden and cell
# states, which are kept as encoder_states; encoder_outputs is not used below.
encoder_inputs = Input(shape=(max_length, ), name='encoder_inputs')
encoder_embedding = embedding_layer(encoder_inputs)
encoder_LSTM = LSTM(HIDDEN_DIM, return_state = True, name='encoder_LSTM')
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: the same embedding_layer instance is applied to the decoder input,
# and the decoder LSTM starts from the encoder's final states.
# return_sequences=True yields one output per timestep; the decoder's own
# returned states are discarded here.
decoder_inputs = Input(shape=(max_length, ), name='decoder_inputs')
decoder_embedding = embedding_layer(decoder_inputs)
decoder_LSTM = LSTM(HIDDEN_DIM, return_sequences=True, return_state=True, name='decoder_LSTM')
outputs, _, _, = decoder_LSTM(decoder_embedding, initial_state=encoder_states)
# Softmax over VOC_SIZE + 1 classes for the decoder outputs.
decoder_dense = Dense(VOC_SIZE+1, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(outputs)

In [14]:
# Assemble the two-input model, configure training, and print the layer summary.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_inputs (InputLayer)     (None, 20)           0                                            
__________________________________________________________________________________________________
encoder_inputs (InputLayer)     (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 20, 100)      1000100     encoder_inputs[0][0]             
                                                                 decoder_inputs[0][0]             
__________________________________________________________________________________________________
encoder_LSTM (LSTM)             [(None, 256), (None, 365568      embedding_2[0][0]                
__________

In [None]:
# training
def _shifted_one_hot(decoder_seqs, seq_len, vocab_size):
    """Build one-hot targets that are the decoder input shifted left by one step.

    Args:
        decoder_seqs: 2-D integer array, one row of token indices per sample.
        seq_len: number of timesteps in the returned target tensor.
        vocab_size: size of the one-hot (last) axis.

    Returns:
        Boolean array of shape (len(decoder_seqs), seq_len, vocab_size) where
        position [k, j - 1, decoder_seqs[k, j]] is True for every j > 0.
        The final timestep of each sample is left all-False.
    """
    decoder_seqs = np.asarray(decoder_seqs)
    one_hot = np.zeros((len(decoder_seqs), seq_len, vocab_size), dtype='bool')
    # Token at step j becomes the target of step j - 1.
    next_tokens = decoder_seqs[:, 1:]
    sample_idx = np.arange(next_tokens.shape[0])[:, None]
    step_idx = np.arange(next_tokens.shape[1])[None, :]
    # One indexed assignment replaces the per-sample / per-timestep Python loop.
    one_hot[sample_idx, step_idx, next_tokens] = True
    return one_hot


# The full one-hot target tensor is built one chunk of fit_size samples at a
# time and rebuilt every epoch rather than being materialised for the whole
# data set at once.
num_samples = pad_decode_text.shape[0]
for epoch in range(epochs):
    # Stepping by fit_size never yields an empty trailing chunk; the previous
    # int(N / fit_size) + 1 bound did whenever N was a multiple of fit_size,
    # which handed model.fit zero samples.
    for start in range(0, num_samples, fit_size):
        stop = start + fit_size
        pad_encode_text_sample = pad_encode_text[start:stop]
        pad_decode_text_sample = pad_decode_text[start:stop]
        one_hot_target_text = _shifted_one_hot(
            pad_decode_text_sample, max_length, VOC_SIZE + 1)
        # epochs=1: the outer loop counts the epochs, each fit call makes a
        # single pass over the current chunk.
        model.fit([pad_encode_text_sample, pad_decode_text_sample], one_hot_target_text,
                  batch_size=batch_size,
                  epochs=1,
                  validation_split=0.1)

In [None]:
# Write the model to disk; the file name mirrors the 10000-word VOC_SIZE setting.
model.save('s2s_voc10000.h5')