In [1]:
import numpy as np
import keras

Using TensorFlow backend.


In [2]:
import urllib2
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
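# References used for this notebook:
# link1 (TensorFlow seq2seq tutorial): https://www.tensorflow.org/tutorials/seq2seq
# link2 (Keras blog, a ten-minute introduction to sequence-to-sequence learning in Keras): https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html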

In [4]:
from gensim.corpora import Dictionary
from sklearn.preprocessing import OneHotEncoder

class SentenceToCharVecEncoder:
    """One-hot encode sentences character by character.

    Each character is looked up in ``dictionary.token2id`` to obtain an
    integer id, and the ids are expanded into one-hot rows by an
    ``OneHotEncoder`` fitted on every id in ``range(len(dictionary))``.
    """

    def __init__(self, dictionary):
        """Keep ``dictionary`` and fit the one-hot encoder on all of its ids.

        :param dictionary: object supporting ``len()`` and exposing a
            ``token2id`` mapping from characters to integer ids.
        """
        self.dictionary = dictionary
        numchars = len(self.dictionary)
        # A single column holding 0 .. numchars-1, one row per id.
        id_column = np.arange(numchars).reshape((numchars, 1))
        self.onehot_encoder = OneHotEncoder()
        self.onehot_encoder.fit(id_column)

    def encode_sentence(self, sent):
        """Encode one sentence; the encoder is fed one input row per character.

        :param sent: sized sequence of characters found in ``token2id``.
        :return: the result of ``onehot_encoder.transform`` on the id column.
        """
        char_ids = [self.dictionary.token2id[c] for c in sent]
        id_column = np.array(char_ids).reshape((len(sent), 1))
        return self.onehot_encoder.transform(id_column)

    def encode_sentences(self, sentences, sparse=True):
        """Encode every sentence in ``sentences``, preserving their order.

        :param sentences: iterable of sentences accepted by ``encode_sentence``.
        :param sparse: when true, return the encoder output unchanged;
            otherwise call ``.toarray()`` on each encoded sentence.
        :return: one encoded matrix per input sentence.
        """
        if not sparse:
            return [self.encode_sentence(sent).toarray() for sent in sentences]
        return [self.encode_sentence(sent) for sent in sentences]
    
def initSentenceToCharVecEncoder(textfile):
    """Build a SentenceToCharVecEncoder from the characters found in ``textfile``.

    Every line is stripped, blank lines are discarded, and a newline
    character is appended to each remaining line so that the newline is part
    of the character dictionary.

    :param textfile: iterable of text lines (an open file or a list of strings).
    :return: a ``SentenceToCharVecEncoder`` wrapping the resulting ``Dictionary``.
    """
    # Blank lines must be dropped *before* the newline is appended: once '\n'
    # has been added every line has length >= 1, so a length check applied
    # afterwards can never reject anything.
    stripped_lines = (line.strip() for line in textfile)
    lines = [line + '\n' for line in stripped_lines if len(line) > 0]
    # One "document" per line, tokenised into its individual characters.
    dictionary = Dictionary([list(line) for line in lines])
    return SentenceToCharVecEncoder(dictionary)

In [5]:
# Download the corpus.  The second positional argument of urllib2.urlopen is
# the request body (`data`), not a file mode; passing 'r' there would send a
# POST with body "r", so it is omitted to issue a plain GET.
textfile = urllib2.urlopen('http://norvig.com/big.txt')
try:
    # Strip every line and keep only the non-empty ones.
    text = list(line for line in (raw.strip() for raw in textfile) if len(line) > 0)
finally:
    # Release the HTTP connection even if reading fails part-way through.
    textfile.close()

In [6]:
# Build the character dictionary and the one-hot encoder from the downloaded lines.
chartovec_encoder = initSentenceToCharVecEncoder(text)

In [7]:
# Number of distinct characters in the dictionary.
numchars = len(chartovec_encoder.dictionary)
# Size passed to the LSTM layers: 20 more than the number of characters.
latent_dim = numchars + 20

# Single-argument print(...) produces the same output as the print statement
# on Python 2 and matches the call style used by the later cells.
print(numchars)
print(latent_dim)

93
113


In [8]:
# encode_sentence feeds the encoder exactly one row per character, so the
# longest encoded sentence has as many rows as the longest line has
# characters.  Measuring the lines directly gives the same number without
# one-hot encoding the whole corpus just to read the shapes.
max_sentlen = max(len(sent) for sent in text)
print(max_sentlen)

2541


In [9]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# --- Encoder ---
# Input is a sequence of vectors of width `numchars`; the `None` leaves the
# sequence length unspecified.
encoder_inputs = Input(shape=(None, numchars))
# return_state=True makes the layer return its final states along with its output.
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# `encoder_outputs` is not used again in this cell; only the two states are kept.
encoder_states = [state_h, state_c]

# --- Decoder ---
# Same input shape as the encoder; its LSTM starts from `encoder_states`.
decoder_inputs = Input(shape=(None, numchars))
# return_sequences=True yields an output for every timestep.  The returned
# states are discarded here (`_`); the original note says they are intended
# for an inference setup, which is not part of this cell.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Softmax over the `numchars` characters at every timestep.
decoder_dense = Dense(numchars, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: maps [encoder_inputs, decoder_inputs] to decoder_outputs.
# NOTE(review): the model is only constructed here; no compile() call appears in this cell.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [10]:
# preparing training data

In [11]:
# text[:-1] and text[1:] pair every line with the line that follows it:
# the encoder reads line i, the decoder is fed line i+1.
encoder_input = chartovec_encoder.encode_sentences(text[:-1])
decoder_input = chartovec_encoder.encode_sentences(text[1:])
# NOTE(review): decoder_output is encoded from exactly the same lines as
# decoder_input, with no one-step offset between the two.  Teacher-forced
# seq2seq training normally uses a target shifted by one timestep relative to
# the decoder input -- confirm whether that shift is applied elsewhere or is missing.
decoder_output = chartovec_encoder.encode_sentences(text[1:])

In [12]:
# Number of encoder sequences, then the shape of each of the first 100.
print(len(encoder_input))
print(list(encoded.shape for encoded in encoder_input[:100]))

103500
[(64, 93), (25, 93), (45, 93), (68, 93), (68, 93), (42, 93), (68, 93), (68, 93), (34, 93), (68, 93), (68, 93), (68, 93), (68, 93), (55, 93), (63, 93), (63, 93), (64, 93), (40, 93), (30, 93), (40, 93), (42, 93), (11, 93), (17, 93), (29, 93), (79, 93), (37, 93), (17, 93), (15, 93), (2, 93), (22, 93), (8, 93), (23, 93), (25, 93), (23, 93), (31, 93), (23, 93), (32, 93), (40, 93), (40, 93), (41, 93), (38, 93), (38, 93), (40, 93), (35, 93), (2, 93), (1146, 93), (1300, 93), (978, 93), (337, 93), (114, 93), (20, 93), (175, 93), (24, 93), (150, 93), (394, 93), (67, 93), (820, 93), (383, 93), (241, 93), (13, 93), (12, 93), (31, 93), (26, 93), (25, 93), (489, 93), (62, 93), (492, 93), (76, 93), (217, 93), (74, 93), (208, 93), (109, 93), (124, 93), (41, 93), (59, 93), (649, 93), (40, 93), (491, 93), (146, 93), (228, 93), (39, 93), (138, 93), (19, 93), (137, 93), (161, 93), (23, 93), (1126, 93), (193, 93), (170, 93), (270, 93), (183, 93), (298, 93), (25, 93), (8, 93), (232, 93), (39, 93), (2

In [13]:
# Number of decoder input sequences, then the shape of each of the first 100.
print(len(decoder_input))
print(list(encoded.shape for encoded in decoder_input[:100]))

103500
[(25, 93), (45, 93), (68, 93), (68, 93), (42, 93), (68, 93), (68, 93), (34, 93), (68, 93), (68, 93), (68, 93), (68, 93), (55, 93), (63, 93), (63, 93), (64, 93), (40, 93), (30, 93), (40, 93), (42, 93), (11, 93), (17, 93), (29, 93), (79, 93), (37, 93), (17, 93), (15, 93), (2, 93), (22, 93), (8, 93), (23, 93), (25, 93), (23, 93), (31, 93), (23, 93), (32, 93), (40, 93), (40, 93), (41, 93), (38, 93), (38, 93), (40, 93), (35, 93), (2, 93), (1146, 93), (1300, 93), (978, 93), (337, 93), (114, 93), (20, 93), (175, 93), (24, 93), (150, 93), (394, 93), (67, 93), (820, 93), (383, 93), (241, 93), (13, 93), (12, 93), (31, 93), (26, 93), (25, 93), (489, 93), (62, 93), (492, 93), (76, 93), (217, 93), (74, 93), (208, 93), (109, 93), (124, 93), (41, 93), (59, 93), (649, 93), (40, 93), (491, 93), (146, 93), (228, 93), (39, 93), (138, 93), (19, 93), (137, 93), (161, 93), (23, 93), (1126, 93), (193, 93), (170, 93), (270, 93), (183, 93), (298, 93), (25, 93), (8, 93), (232, 93), (39, 93), (293, 93), (

In [14]:
# Number of decoder target sequences, then the shape of each of the first 100.
print(len(decoder_output))
print(list(encoded.shape for encoded in decoder_output[:100]))

103500
[(25, 93), (45, 93), (68, 93), (68, 93), (42, 93), (68, 93), (68, 93), (34, 93), (68, 93), (68, 93), (68, 93), (68, 93), (55, 93), (63, 93), (63, 93), (64, 93), (40, 93), (30, 93), (40, 93), (42, 93), (11, 93), (17, 93), (29, 93), (79, 93), (37, 93), (17, 93), (15, 93), (2, 93), (22, 93), (8, 93), (23, 93), (25, 93), (23, 93), (31, 93), (23, 93), (32, 93), (40, 93), (40, 93), (41, 93), (38, 93), (38, 93), (40, 93), (35, 93), (2, 93), (1146, 93), (1300, 93), (978, 93), (337, 93), (114, 93), (20, 93), (175, 93), (24, 93), (150, 93), (394, 93), (67, 93), (820, 93), (383, 93), (241, 93), (13, 93), (12, 93), (31, 93), (26, 93), (25, 93), (489, 93), (62, 93), (492, 93), (76, 93), (217, 93), (74, 93), (208, 93), (109, 93), (124, 93), (41, 93), (59, 93), (649, 93), (40, 93), (491, 93), (146, 93), (228, 93), (39, 93), (138, 93), (19, 93), (137, 93), (161, 93), (23, 93), (1126, 93), (193, 93), (170, 93), (270, 93), (183, 93), (298, 93), (25, 93), (8, 93), (232, 93), (39, 93), (293, 93), (

In [15]:
# Display the character -> integer id mapping held by the dictionary.
chartovec_encoder.dictionary.token2id

{u'\t': 68,
 u'\n': 0,
 u' ': 1,
 u'!': 53,
 u'"': 41,
 u'#': 33,
 u'$': 78,
 u'%': 84,
 u'&': 77,
 u"'": 70,
 u'(': 34,
 u')': 35,
 u'*': 45,
 u'+': 83,
 u',': 42,
 u'-': 69,
 u'.': 38,
 u'/': 82,
 u'0': 59,
 u'1': 36,
 u'2': 60,
 u'3': 80,
 u'4': 79,
 u'5': 37,
 u'6': 55,
 u'7': 50,
 u'8': 73,
 u'9': 51,
 u':': 54,
 u';': 74,
 u'<': 85,
 u'=': 90,
 u'>': 86,
 u'?': 75,
 u'@': 87,
 u'A': 2,
 u'B': 3,
 u'C': 28,
 u'D': 29,
 u'E': 4,
 u'F': 46,
 u'G': 5,
 u'H': 6,
 u'I': 43,
 u'J': 63,
 u'K': 64,
 u'L': 62,
 u'M': 56,
 u'N': 61,
 u'O': 65,
 u'P': 7,
 u'Q': 76,
 u'R': 52,
 u'S': 8,
 u'T': 9,
 u'U': 66,
 u'V': 47,
 u'W': 48,
 u'X': 71,
 u'Y': 44,
 u'Z': 81,
 u'[': 57,
 u']': 58,
 u'^': 91,
 u'_': 88,
 u'a': 30,
 u'b': 10,
 u'c': 11,
 u'd': 12,
 u'e': 13,
 u'f': 14,
 u'g': 15,
 u'h': 16,
 u'i': 31,
 u'j': 17,
 u'k': 18,
 u'l': 19,
 u'm': 20,
 u'n': 21,
 u'o': 22,
 u'p': 39,
 u'q': 72,
 u'r': 23,
 u's': 24,
 u't': 25,
 u'u': 26,
 u'v': 27,
 u'w': 40,
 u'x': 49,
 u'y': 32,
 u'z': 67,
 u'|': 