In [1]:
from model_keras import S2sModel

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np


In [3]:
# --- Training hyper-parameters ---------------------------------------------
# Number of samples per gradient update.
batch_size = 64
# Number of epochs to train for.
# NOTE(review): the recorded training log in this notebook shows "Epoch 1/12",
# so S2sModel may not honor this value -- confirm against model_keras.
epochs = 100
# Latent dimensionality of the encoding space.
latent_dim = 256

# --- Data selection --------------------------------------------------------
# Upper bound on the number of sentence pairs read from the corpus.
num_samples = 10000
# Path to the data txt file on disk.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_path = '/home/congyu/dataset/TEXT/fra-eng/fra.txt'

In [4]:
# Vectorize the data.
#
# Reads the tab-separated parallel corpus at `data_path`, keeps at most
# `num_samples` sentence pairs, and derives from them:
#   * input_texts / target_texts          -- the raw sentence pairs
#   * input_characters / target_characters -- sorted character vocabularies
#   * num_*_tokens, max_*_seq_length       -- vocabulary sizes and max lengths
#   * input_token_index / target_token_index -- char -> integer index lookups
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# `len(lines) - 1` drops the final element of the split -- presumably the
# empty string left behind by the file's trailing newline.
for line in lines[: min(num_samples, len(lines) - 1)]:
    # Only the first two tab-separated fields are used. Slicing (instead of
    # unpacking exactly three fields) keeps ignoring the third column while
    # also accepting files that have only two columns or extra trailing ones.
    # A line with fewer than two fields still raises ValueError.
    input_text, target_text = line.split('\t')[:2]
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'

    input_texts.append(input_text)
    target_texts.append(target_text)

    # Collect the characters needed for the char -> index dictionaries.
    input_characters.update(input_text)
    target_characters.update(target_text)

# From here on the vocabularies are sorted lists, so positions are stable.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

# Each character maps to its position in the sorted vocabulary.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

Number of samples: 10000
Number of unique input tokens: 71
Number of unique output tokens: 93
Max sequence length for inputs: 16
Max sequence length for outputs: 59


In [5]:
# Modify target_token_index so that the start marker "\t" takes the index of
# the first character of the sorted target vocabulary and the end marker "\n"
# takes the index of the last one; the displaced characters receive the
# markers' former indices.
#
# The mapping is rebuilt from `target_characters` first so that this cell is
# idempotent: swapping in place on the existing dict would undo the "\n" swap
# every second time the cell is executed.
target_token_index = {char: i for i, char in enumerate(target_characters)}

first_target = target_characters[0]
target_token_index["\t"], target_token_index[first_target] = (
    target_token_index[first_target], target_token_index["\t"])

last_target = target_characters[-1]
target_token_index["\n"], target_token_index[last_target] = (
    target_token_index[last_target], target_token_index["\n"])


In [6]:
# One-hot tensors, all float32 and freshly zero-initialised on every run:
#   encoder_input_data  : (samples, max_encoder_seq_length, num_encoder_tokens)
#   decoder_input_data  : (samples, max_decoder_seq_length, num_decoder_tokens)
#   decoder_target_data : (samples, max_decoder_seq_length, num_decoder_tokens)
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

# Fill the tensors sample by sample. Every timestep past the end of a
# sentence is padded with the one-hot vector of the space character, so ' '
# must be present in both vocabularies (KeyError otherwise).
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.

    # Pad the input sequence. The offset comes from len() rather than the
    # loop variable `t`, which is unset (or stale from the previous sample)
    # when input_text is empty.
    encoder_input_data[i, len(input_text):, input_token_index[' ']] = 1.

    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

    # The target sequence is one step shorter than the decoder input (no
    # start character), so its padding begins one position earlier.
    decoder_input_data[i, len(target_text):, target_token_index[' ']] = 1.
    decoder_target_data[i, len(target_text) - 1:, target_token_index[' ']] = 1.

In [7]:
# Build the seq2seq wrapper, passing every name currently bound in the
# notebook namespace as a keyword argument.
# NOTE(review): at cell top level locals() is the whole interactive namespace,
# so unrelated objects (imports, loop variables, IPython internals) are
# forwarded too, and which values S2sModel actually consumes cannot be seen
# from here -- presumably the hyper-parameters and vocabulary sizes defined
# above; confirm against model_keras and prefer explicit keyword arguments.
model = S2sModel(**locals())

In [8]:
# Sanity check: display the encoder tensor's shape, which np.zeros allocated
# as (samples, max_encoder_seq_length, num_encoder_tokens).
encoder_input_data.shape

(10000, 16, 71)

In [9]:
# Train the model. The encoder input gets a trailing axis of size 1, giving
# (samples, max_encoder_seq_length, num_encoder_tokens, 1). The axis is
# appended with np.expand_dims instead of a hard-coded reshape(10000, 16, 71, 1)
# so the call keeps working when num_samples or the corpus changes.
# NOTE(review): presumably S2sModel's encoder expects this channel-last 4-D
# layout -- confirm against model_keras.
model.fit(np.expand_dims(encoder_input_data, axis=-1), decoder_input_data, decoder_target_data)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 8000 samples, validate on 2000 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [10]:
# Run inference on the first training sample only. The [0:1] slice keeps the
# batch axis, and np.expand_dims appends the same trailing size-1 axis used
# for training, without hard-coding the (1, 16, 71, 1) shape.
model.predict(np.expand_dims(encoder_input_data[0:1], axis=-1))

array([[6.58987934e-11, 1.75071762e-06, 6.93463062e-06, 5.14246698e-04,
        1.15462473e-10, 8.65724992e-10, 2.16341889e-10, 4.49111290e-08,
        3.58793828e-10, 1.89049082e-10, 3.64267869e-08, 1.37417082e-08,
        1.31206361e-08, 2.08828932e-09, 1.75798182e-10, 1.07099246e-10,
        8.89859630e-10, 1.38830350e-10, 2.39671061e-10, 2.08696749e-09,
        2.52921706e-09, 3.12607949e-06, 8.81350015e-09, 2.70260170e-09,
        1.53443178e-08, 2.20545129e-08, 2.31085675e-08, 1.48013353e-08,
        3.16671911e-10, 1.82800919e-09, 1.17921930e-08, 3.24152793e-08,
        1.15311771e-09, 9.24025834e-09, 1.32597098e-08, 9.97149208e-09,
        1.25579680e-08, 3.01033474e-08, 2.02090789e-08, 7.06497860e-10,
        1.76507187e-09, 3.19181481e-08, 4.71276929e-09, 1.04965192e-08,
        1.60429858e-09, 3.68821077e-07, 9.18334409e-10, 4.40873436e-08,
        1.86244051e-08, 1.27585082e-07, 7.23423454e-09, 4.74862727e-09,
        1.39738589e-07, 2.76636769e-07, 1.74764891e-09, 3.401800