In [8]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

Using TensorFlow backend.


In [20]:
batch_size = 64  # Batch size for training.
epochs = 10  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.
# Path to the data txt file on disk.
data_path = 'fra.txt'

In [10]:
import re
import string
from unicodedata import normalize
def clean(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for line in lines:
        # normalize unicode characters
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        # tokenize on white space
        line = line.split()
        # convert to lowercase
        line = [word.lower() for word in line]
        # remove punctuation from each token
        line = [word.translate(table) for word in line]
        # remove non-printable chars form each token
        line = [re_print.sub('', w) for w in line]
        # remove tokens with numbers in them
        line = [word for word in line if word.isalpha()]
        # store as string
        cleaned.append(line)
    return cleaned

In [11]:
import glob
input_texts=[]
target_texts=[]
with open('fra.txt',encoding="utf8") as f:
    for line in f:
        data=line.split("\t")
        input_texts.append(data[0])
        target_texts.append(data[1])
target_texts=clean(target_texts)
input_texts=clean(input_texts)

In [12]:
# Vectorize the data.
input_words = set()
target_words = set()

for d in input_texts:
    for word in d:
        if word not in input_words:
            input_words.add(word)
for d in target_texts:
    for word in d:
        if word not in target_words:
            target_words.add(word)


In [13]:
input_words = sorted(list(input_words))
target_words = sorted(list(target_words))
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)
max_encoder_seq_length = max([len(txt) for txt in input_words])
max_decoder_seq_length = max([len(txt) for txt in target_words])

print('Number of samples:', len(input_words))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)
print(input_words)

Number of samples: 320
Number of unique input tokens: 320
Number of unique output tokens: 683
Max sequence length for inputs: 9
Max sequence length for outputs: 13
['a', 'above', 'after', 'agree', 'ahead', 'aim', 'alive', 'alone', 'am', 'angry', 'answer', 'armed', 'ask', 'attack', 'awake', 'away', 'awesome', 'awful', 'back', 'bad', 'bald', 'bark', 'be', 'beat', 'beats', 'bed', 'beg', 'below', 'birds', 'bless', 'blind', 'broke', 'busy', 'call', 'calm', 'came', 'can', 'care', 'catch', 'cheer', 'cheers', 'clean', 'cold', 'come', 'cook', 'cool', 'cop', 'course', 'crazy', 'cried', 'cringed', 'cry', 'cuff', 'cured', 'cute', 'dark', 'dead', 'deaf', 'deep', 'did', 'die', 'died', 'do', 'dogs', 'done', 'dont', 'down', 'drive', 'drop', 'drunk', 'dying', 'early', 'excuse', 'failed', 'fair', 'fantastic', 'far', 'fast', 'fat', 'feel', 'fell', 'find', 'fine', 'fire', 'first', 'fit', 'fly', 'follow', 'food', 'for', 'forget', 'forgot', 'free', 'full', 'fun', 'fussy', 'get', 'give', 'glad', 'go', 'going

In [14]:
input_token_index = dict(
    [(word, i) for i, word in enumerate(input_words)])
target_token_index = dict(
    [(word, i) for i, word in enumerate(target_words)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

In [15]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, word in enumerate(input_text):
        if(input_token_index[word]):
            encoder_input_data[i, t, input_token_index[word]] = 1.
    for t, word in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        if(target_token_index[word]):
            decoder_input_data[i, t, target_token_index[word]] = 1.
            if t > 0:
                # decoder_target_data will be ahead by one timestep
                # and will not include the start character.
                decoder_target_data[i, t - 1, target_token_index[word]] = 1.

In [16]:
print(encoder_input_data.shape)

(840, 9, 320)


In [17]:
print(decoder_input_data.shape)

(840, 13, 683)


In [18]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [26]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [22]:
# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


In [27]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Train on 672 samples, validate on 168 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1fe5ae1ba8>

In [28]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_word_index = dict(
    (i, word) for word, i in input_token_index.items())
reverse_target_word_index = dict(
    (i, word) for word, i in target_token_index.items())

In [29]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    
    #target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_word_index[sampled_token_index]
        decoded_sentence += sampled_word

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [31]:
for seq_index in range(10):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', " ".join(input_texts[seq_index]))
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: go
Decoded sentence: tomvualleraller
-
Input sentence: run
Decoded sentence: tompasalleraller
-
Input sentence: run
Decoded sentence: tompasalleraller
-
Input sentence: fire
Decoded sentence: tompasalleraller
-
Input sentence: help
Decoded sentence: cavualleraller
-
Input sentence: jump
Decoded sentence: cavualleraller
-
Input sentence: stop
Decoded sentence: cavualleraller
-
Input sentence: stop
Decoded sentence: cavualleraller
-
Input sentence: stop
Decoded sentence: cavualleraller
-
Input sentence: wait
Decoded sentence: cavualleraller
