In [1]:
import numpy as np
import keras

Using TensorFlow backend.


In [2]:
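# References:
#   TensorFlow seq2seq tutorial: https://www.tensorflow.org/tutorials/seq2seq
#   Keras blog, "A ten-minute introduction to sequence-to-sequence learning in Keras":
#     https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html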

In [3]:
from gensim.corpora import Dictionary
from sklearn.preprocessing import OneHotEncoder

class SentenceToCharVecEncoder:
    """Encode sentences as per-character one-hot matrices.

    Every character of a sentence is looked up in ``dictionary.token2id`` to
    obtain an integer id, and the ids are one-hot encoded so that each
    sentence becomes a matrix with one row per character and one column per
    dictionary entry.
    """

    def __init__(self, dictionary):
        """Store ``dictionary`` and fit a one-hot encoder over all of its ids.

        :param dictionary: object supporting ``len()`` and exposing a
            ``token2id`` mapping (character -> integer id); presumably a
            gensim ``Dictionary`` -- see ``initSentenceToCharVecEncoder``.
        """
        self.dictionary = dictionary
        numchars = len(self.dictionary)
        self.onehot_encoder = OneHotEncoder()
        # Fit on the column vector [0, 1, ..., numchars - 1] so that every
        # dictionary id is a known category of the encoder.
        self.onehot_encoder.fit(np.arange(numchars).reshape((numchars, 1)))

    def encode_sentence(self, sent):
        """One-hot encode a single sentence, character by character.

        :param sent: sequence of characters (typically a ``str``).
        :return: the result of ``onehot_encoder.transform`` on a
            ``(len(sent), 1)`` column of character ids -- a sparse matrix
            with one row per character.
        :raises KeyError: if a character is missing from
            ``dictionary.token2id`` (assuming ``token2id`` is a plain dict).
        """
        char_ids = [self.dictionary.token2id[c] for c in sent]
        return self.onehot_encoder.transform(
            np.array(char_ids).reshape((len(sent), 1))
        )

    def encode_sentences(self, sentences, sparse=False):
        """One-hot encode several sentences.

        :param sentences: iterable of sentences accepted by
            ``encode_sentence``.
        :param sparse: if true, keep each per-sentence matrix exactly as
            ``encode_sentence`` returns it; otherwise convert each one to a
            dense array with ``.toarray()``.
        :return: ``np.array`` over the per-sentence matrices.  Sentences of
            differing lengths (and sparse matrices) give a 1-D object array
            with one matrix per sentence; dense matrices that all share the
            same shape are stacked into a single 3-D array.
        """
        # Materialise a real list: handing a lazy ``map`` object to np.array
        # would wrap the iterator itself in a 0-d object array.
        if sparse:
            encoded = [self.encode_sentence(sent) for sent in sentences]
        else:
            encoded = [self.encode_sentence(sent).toarray() for sent in sentences]

        try:
            return np.array(encoded)
        except ValueError:
            # Recent NumPy versions refuse to infer an array from matrices of
            # different lengths; build the 1-D object array explicitly.
            ragged = np.empty(len(encoded), dtype=object)
            for idx, matrix in enumerate(encoded):
                ragged[idx] = matrix
            return ragged
    
def initSentenceToCharVecEncoder(textfile):
    """Build a ``SentenceToCharVecEncoder`` from the characters found in ``textfile``.

    :param textfile: iterable of lines (e.g. an open file-like object).
        Each line is stripped of surrounding whitespace and empty lines are
        dropped.
    :return: a ``SentenceToCharVecEncoder`` whose dictionary was built by
        treating every remaining line as a document of single characters.
    """
    stripped_lines = [raw_line.strip() for raw_line in textfile]
    # One "document" per non-empty line, tokenised into its characters.
    char_documents = [list(line) for line in stripped_lines if len(line) > 0]
    return SentenceToCharVecEncoder(Dictionary(char_documents))

In [4]:
import urllib2

# Build the character encoder from the big.txt corpus.
# Only the URL is passed to urlopen: its second positional argument is the
# request body (`data`), not a file mode, and supplying it turns the request
# into a POST instead of a plain GET.
corpus_response = urllib2.urlopen('http://norvig.com/big.txt')
try:
    sent_encoder = initSentenceToCharVecEncoder(corpus_response)
finally:
    # Release the HTTP connection once the dictionary has been built.
    corpus_response.close()

In [5]:
# Encode one sentence: each character is mapped to its dictionary id and
# one-hot encoded, giving a matrix with one row per character.
sent_encoder.encode_sentence('abAtrE.')

<7x92 sparse matrix of type '<type 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [6]:
# Default (sparse=False): each sentence's one-hot matrix is converted to a
# dense array with .toarray() before being collected.
sent_encoder.encode_sentences(['I love you.', 'seq2seq model in Keras', 'language model'])

array([ array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
       array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
       array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])], dtype=object)

In [7]:
# sparse=True: each sentence's one-hot matrix is kept exactly as
# encode_sentence returns it (no .toarray() conversion).
sent_encoder.encode_sentences(['I love you.', 'seq2seq model in Keras', 'language model'], sparse=True)

array([ <11x92 sparse matrix of type '<type 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>,
       <22x92 sparse matrix of type '<type 'numpy.float64'>'
	with 22 stored elements in Compressed Sparse Row format>,
       <14x92 sparse matrix of type '<type 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>], dtype=object)

In [8]:
# Number of entries in the encoder's character dictionary; used below as the
# width of the model's input vectors and of its softmax output.
numchars = len(sent_encoder.dictionary)
# Number of units for both the encoder and the decoder LSTM.
# NOTE(review): the "+ 20" margin is not explained anywhere in this notebook -- confirm.
latent_dim = numchars + 20

In [9]:
# Download the corpus and keep its non-empty, whitespace-stripped lines.
# Only the URL is passed to urlopen: its second positional argument is the
# request body (`data`), not a file mode, and supplying it turns the request
# into a POST instead of a plain GET.
textfile = urllib2.urlopen('http://norvig.com/big.txt')
try:
    # Built as a concrete list so `text` can be indexed and iterated repeatedly.
    text = [stripped for stripped in (raw.strip() for raw in textfile) if len(stripped) > 0]
finally:
    # Release the HTTP connection once every line has been read.
    textfile.close()

In [10]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Encoder: takes sequences of `numchars`-wide vectors (the time dimension is
# left as None) and runs them through an LSTM with `latent_dim` units.
encoder_inputs = Input(shape=(None, numchars))
encoder = LSTM(latent_dim, return_state=True)
# With return_state=True the layer call yields three tensors: the output and
# the two LSTM states.
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# `encoder_outputs` is not used again in this cell; only the two states are
# kept and handed to the decoder.
encoder_states = [state_h, state_c]

# Decoder: a second input of the same per-step width, processed by an LSTM
# that starts from `encoder_states`.
decoder_inputs = Input(shape=(None, numchars))
# The decoder LSTM returns the full output sequence (return_sequences=True)
# as well as its internal states (return_state=True). The returned states are
# discarded here (`_`) for the training model; they are intended for use at
# inference time.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Softmax layer with `numchars` units applied to the decoder output sequence.
decoder_dense = Dense(numchars, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: maps the pair [encoder_inputs, decoder_inputs] to
# `decoder_outputs`, i.e. it will turn `encoder_input_data` and
# `decoder_input_data` into `decoder_target_data`.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# preparing training data
