In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import os
import time
import io
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns


def initialization(seed=42):
    """Reset Keras' global state and reseed the NumPy and TensorFlow RNGs.

    Args:
        seed: Integer seed handed to both random number generators.
    """
    keras.backend.clear_session()
    # Same seed for both libraries, NumPy first and TensorFlow second.
    for reseed in (np.random.seed, tf.random.set_seed):
        reseed(seed)

In [2]:
from datetime import date
 
# Proleptic Gregorian ordinal 1 maps to the date printed first below.
dt = date.fromordinal(1)

# One line per representation: str(dt), the "DD, YYYY" fragment that the
# input strings use, and the ISO form that the targets use.
print(dt, dt.strftime("%d, %Y"), dt.isoformat(), sep="\n")

0001-01-01
01, 0001
0001-01-01


In [3]:
from datetime import date
 
# cannot use strftime()'s %B format since it depends on the locale
MONTHS = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]


def random_dates(n_dates):
    """Draw `n_dates` random dates and render each one in two formats.

    Dates are sampled uniformly between 1000-01-01 and 9999-12-31, both
    ends included, using NumPy's global random state (seed it with
    `np.random.seed` for reproducible results).

    Args:
        n_dates: Number of dates to generate.

    Returns:
        A tuple `(x, y)` of two lists of `n_dates` strings each:
        `x[i]` looks like "September 20, 7075" (English month name from
        MONTHS, zero-padded day, year) and `y[i]` is the same date in ISO
        format, e.g. "7075-09-20".
    """
    min_date = date(1000, 1, 1).toordinal()
    max_date = date(9999, 12, 31).toordinal()

    # randint's upper bound is exclusive: the +1 is required for max_date
    # itself to be reachable.
    ordinals = np.random.randint(max_date - min_date + 1, size=n_dates) + min_date
    # int() hands date.fromordinal a plain Python integer rather than a
    # NumPy scalar.
    dates = [date.fromordinal(int(ordinal)) for ordinal in ordinals]
    x = [MONTHS[dt.month - 1] + " " + dt.strftime("%d, %Y") for dt in dates]
    y = [dt.isoformat() for dt in dates]
    return x, y

In [4]:
np.random.seed(42)  # fixed seed so the sample rows are reproducible

n_dates = 3
x_example, y_example = random_dates(n_dates)

# Two left-aligned columns of 25 characters each, with a 50-character rule
# under the header.
row_format = "{:25s}{:25s}"
print(row_format.format("Input", "Target"))
print("-" * 50)
for input_str, target_str in zip(x_example, y_example):
    print(row_format.format(input_str, target_str))

Input                    Target                   
--------------------------------------------------
September 20, 7075       7075-09-20               
May 15, 8579             8579-05-15               
January 11, 7103         7103-01-11               


In [5]:
# Input vocabulary: every distinct character found in the month names, plus
# the digits, the comma and the space, joined in sorted order.
INPUT_CHARS = "".join(sorted(set("".join(MONTHS)) | set("0123456789, ")))
INPUT_CHARS

' ,0123456789ADFJMNOSabceghilmnoprstuvy'

In [7]:
def date_str_to_ids(date_str, chars=INPUT_CHARS):
    """Encode a string as a list of integer ids, one id per character.

    Args:
        date_str: The string to encode.
        chars: The vocabulary; a character's id is its position in `chars`.

    Returns:
        A list of 0-based positions, in the same order as `date_str`.
    """
    return list(map(chars.index, date_str))


date_str_to_ids(date_str=x_example[0], chars=INPUT_CHARS)

[19, 23, 31, 34, 23, 28, 21, 23, 32, 0, 4, 2, 1, 0, 9, 2, 9, 7]

In [8]:
# Target vocabulary: the ten digits followed by the "-" separator.
OUTPUT_CHARS = "0123456789-"
date_str_to_ids(date_str=y_example[0], chars=OUTPUT_CHARS)

[7, 0, 7, 5, 10, 0, 9, 10, 2, 0]

In [9]:
def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    """Turn a batch of strings into one dense tensor of character ids.

    Args:
        date_strs: Iterable of strings, possibly of different lengths.
        chars: Vocabulary passed on to date_str_to_ids().

    Returns:
        The result of converting the ragged id lists to a regular tensor,
        with every id shifted up by one so that real characters start at 1.
    """
    # One (variable-length) list of ids per input string.
    id_lists = [date_str_to_ids(date_str, chars) for date_str in date_strs]
    ragged_ids = tf.ragged.constant(id_lists, ragged_rank=1)
    # Shift by one so that ids start from 1 before densifying.
    shifted_ids = ragged_ids + 1
    return shifted_ids.to_tensor()
 
def create_dataset(n_dates):
    """Generate `n_dates` random dates and encode both string forms.

    Returns:
        A pair `(X, Y)`: the input strings encoded with INPUT_CHARS and the
        target strings encoded with OUTPUT_CHARS, each as produced by
        prepare_date_strs().
    """
    input_strs, target_strs = random_dates(n_dates)
    X = prepare_date_strs(input_strs, INPUT_CHARS)
    Y = prepare_date_strs(target_strs, OUTPUT_CHARS)
    return X, Y

In [10]:
np.random.seed(42)  # reseed so the three splits are reproducible

# The generator is consumed left to right, so the datasets are still drawn
# in train, valid, test order from the same random stream.
(X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test) = (
    create_dataset(size) for size in (10000, 2000, 2000)
)

Y_train[0]

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([ 8,  1,  8,  6, 11,  1, 10, 11,  3,  1], dtype=int32)>

In [11]:
# Start-of-sequence id: one past the largest (shifted) output character id.
sos_id = len(OUTPUT_CHARS) + 1  # == 12


def shifted_output_sequences(Y):
    """Build decoder inputs by shifting the targets one step to the right.

    Args:
        Y: 2-D tensor of target ids, one row per sequence.

    Returns:
        A tensor with the same width as `Y`: a column of `sos_id` followed
        by every column of `Y` except the last one.
    """
    n_sequences = len(Y)
    sos_column = tf.fill(dims=(n_sequences, 1), value=sos_id)
    all_but_last = Y[:, :-1]
    return tf.concat([sos_column, all_but_last], axis=1)
 
# Decoder inputs for each split, computed in train, valid, test order.
X_train_decoder, X_valid_decoder, X_test_decoder = map(
    shifted_output_sequences, (Y_train, Y_valid, Y_test)
)
Y_train

<tf.Tensor: shape=(10000, 10), dtype=int32, numpy=
array([[ 8,  1,  8, ..., 11,  3,  1],
       [ 9,  6,  8, ..., 11,  2,  6],
       [ 8,  2,  1, ..., 11,  2,  2],
       ...,
       [10,  8,  7, ..., 11,  4,  1],
       [ 2,  2,  3, ..., 11,  3,  8],
       [ 8,  9,  4, ..., 11,  3, 10]], dtype=int32)>

In [12]:
# pip install tensorflow-addons
import tensorflow_addons as tfa
from tensorflow import keras

# Build the encoder-decoder training model with a variable-length time axis
# (shape=[None]) on both inputs.
# NOTE(review): the next cell (In [13]) rebuilds the same graph with fixed
# lengths and rebinds every name defined here, including `model`.
np.random.seed(42)
tf.random.set_seed(42)

# Hyperparameters: embedding width on each side and the LSTM state size.
encoder_embedding_size = 32
decoder_embedding_size = 32
units = 128

################################# encoder
encoder_inputs = keras.layers.Input(shape=[None],
                                    dtype=np.int32)  # None: num_time_steps

# NOTE(review): this input is never connected to the model. Its only use
# (sequence_length=...) is commented out in the decoder call below, and it
# is not listed in the Model inputs.
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)
# INPUT_CHARS = ' ,0123456789ADFJMNOSabceghilmnoprstuvy'
# len(INPUT_CHARS) = 38
encoder_embeddings = keras.layers.Embedding(
    input_dim=len(INPUT_CHARS) +
    1,  # +1 because prepare_date_strs() shifts every id up by one (ids start from 1), so the largest id is len(INPUT_CHARS)
    output_dim=encoder_embedding_size)(encoder_inputs)

# return_state=True makes the layer return its output followed by the two
# LSTM states, which are unpacked on the next line.
encoder = keras.layers.LSTM(units, return_state=True)  # return_sequences=False
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
# The encoder's final states become the decoder's initial state.
encoder_state = [state_h, state_c]

################################# decoder
# OUTPUT_CHARS = '0123456789-'
# len(OUTPUT_CHARS) = 11
decoder_inputs = keras.layers.Input(shape=[None],
                                    dtype=np.int32)  # None: num_time_steps
# input_dim is len(OUTPUT_CHARS) + 2: one extra because ids start from 1,
# and one more for the SOS id (sos_id == len(OUTPUT_CHARS) + 1).
# The layer is kept in its own variable because the inference cell passes
# it to the sampler as embedding_fn.
decoder_embedding_layer = keras.layers.Embedding(
    input_dim=len(OUTPUT_CHARS) + 2,
    output_dim=decoder_embedding_size)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# Why keras.layers.LSTMCell? During inference, the output of one step is
# used as the input of the next step, and LSTMCell processes a single step
# of the sequence.
decoder_cell = keras.layers.LSTMCell(units)  # one step or one word
# len(OUTPUT_CHARS) + 1 output units: +1 because ids start from 1. No
# further unit is added for SOS, since the model never has to predict it.
output_layer = keras.layers.Dense(len(OUTPUT_CHARS) + 1)
# https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/TrainingSampler
# A training sampler that simply reads its inputs.
# Its role is to tell the decoder at each step what it should pretend the
# previous output was.
# During inference, this should be the embedding of the token that was actually output.
# During training, it should be the embedding of the previous target token.
# time_major : Python bool. Whether the tensors in inputs are time major.
#              If False (default), they are assumed to be batch major.
sampler = tfa.seq2seq.sampler.TrainingSampler()
# In tfa.seq2seq.BasicDecoder,
# the tfa.seq2seq.Sampler instance passed as argument is responsible for
# sampling from the output distribution and
# producing the input for the next decoding step.
# https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/BasicDecoder
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell,
                                                 sampler,
                                                 output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings,
    initial_state=encoder_state,
    # sequence_length = sequence_lengths
)

# final_outputs.rnn_output holds the logits; the softmax normalizes them
# into Y_proba.
Y_proba = keras.layers.Activation("softmax")(final_outputs.rnn_output)

# The model takes the encoder ids and the (shifted) decoder ids and returns
# the per-step probabilities.
model = keras.models.Model(inputs=[encoder_inputs, decoder_inputs],
                           outputs=[Y_proba])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 32)     1248        input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 32)     416         input_3[0][0]                    
______________________________________________________________________________________________

In [13]:
# pip install tensorflow-addons
import tensorflow_addons as tfa
from tensorflow import keras
 
# Rebuild the encoder-decoder training model with fixed sequence lengths
# (18 input steps, 10 output steps). Every name defined in the previous
# cell is rebound here, so the `model` compiled and trained afterwards is
# this fixed-length one.
np.random.seed(42)
tf.random.set_seed(42)
 
# Hyperparameters: embedding width on each side and the LSTM state size.
encoder_embedding_size = 32
decoder_embedding_size = 32
units = 128
 
################################# encoder 
# Fixed-length input: shorter batches have to be padded to 18 steps before
# being fed to this model (see prepare_date_strs_padded).
encoder_inputs = keras.layers.Input( shape=[18], dtype=np.int32 )# 18: num_time_steps
 
# NOTE(review): this input is never connected to the model. Its only use
# (sequence_length=...) is commented out in the decoder call below, and it
# is not listed in the Model inputs.
sequence_lengths = keras.layers.Input( shape=[], dtype=np.int32 )
# INPUT_CHARS = ' ,0123456789ADFJMNOSabceghilmnoprstuvy'
# len(INPUT_CHARS) = 38
encoder_embeddings = keras.layers.Embedding(
                        input_dim = len(INPUT_CHARS)+1, # +1 because prepare_date_strs() shifts every id up by one (ids start from 1)
                        output_dim=encoder_embedding_size
                     )(encoder_inputs)
 
# return_state=True makes the layer return its output followed by the two
# LSTM states, which are unpacked on the next line.
encoder = keras.layers.LSTM(units, return_state=True) # return_sequences=False
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
# The encoder's final states become the decoder's initial state.
encoder_state = [state_h, state_c]
 
 
################################# decoder
# OUTPUT_CHARS = '0123456789-'
# len(OUTPUT_CHARS) = 11
decoder_inputs = keras.layers.Input( shape=[10], dtype=np.int32 )# 10: num_time_steps
 
# input_dim is len(OUTPUT_CHARS)+2: one extra because ids start from 1,
# and one more for the SOS id (sos_id == len(OUTPUT_CHARS) + 1).
# The layer is kept in its own variable because the inference cell passes
# it to the sampler as embedding_fn.
decoder_embedding_layer = keras.layers.Embedding(
                            input_dim = len(OUTPUT_CHARS)+2,
                            output_dim=decoder_embedding_size
                          )
decoder_embeddings = decoder_embedding_layer( decoder_inputs )
 
# Why keras.layers.LSTMCell? During inference, the output of one step is
# used as the input of the next step, and LSTMCell processes a single step
# of the sequence.
decoder_cell = keras.layers.LSTMCell(units) # one step or one word
# len(OUTPUT_CHARS)+1 output units: +1 because ids start from 1. No further
# unit is added for SOS, since the model never has to predict it.
output_layer = keras.layers.Dense( len(OUTPUT_CHARS)+1 )
# https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/TrainingSampler
# A training sampler that simply reads its inputs.
# Its role is to tell the decoder at each step what it should pretend the
# previous output was.
# During inference, this should be the embedding of the token that was actually output.
# During training, it should be the embedding of the previous target token.
# time_major : Python bool. Whether the tensors in inputs are time major.
#              If False (default), they are assumed to be batch major.
sampler = tfa.seq2seq.sampler.TrainingSampler()
# In tfa.seq2seq.BasicDecoder,
# the tfa.seq2seq.Sampler instance passed as argument is responsible for
# sampling from the output distribution and
# producing the input for the next decoding step.
# https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/BasicDecoder
decoder = tfa.seq2seq.basic_decoder.BasicDecoder( decoder_cell,
                                                  sampler,
                                                  output_layer=output_layer )
final_outputs, final_state, final_sequence_lengths = decoder( decoder_embeddings,
                                                              initial_state=encoder_state,
                                                              # sequence_length = sequence_lengths
                                                            )
 
# final_outputs.rnn_output holds the logits; the softmax normalizes them
# into Y_proba.
Y_proba = keras.layers.Activation( "softmax" )( final_outputs.rnn_output )
 
 
# The model takes the encoder ids and the (shifted) decoder ids and returns
# the per-step probabilities.
model = keras.models.Model( inputs=[encoder_inputs, decoder_inputs],
                            outputs=[Y_proba] )
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 18)]         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 18, 32)       1248        input_4[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 10, 32)       416         input_6[0][0]                    
____________________________________________________________________________________________

In [14]:
# Targets are integer ids, hence the sparse variant of the cross-entropy loss.
optimizer = keras.optimizers.Nadam()
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# The model has two inputs: the encoder ids and the shifted decoder ids.
history = model.fit(
    x=[X_train, X_train_decoder],
    y=Y_train,
    validation_data=([X_valid, X_valid_decoder], Y_valid),
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [15]:
def ids_to_date_strs(ids, chars=OUTPUT_CHARS):
    """Decode sequences of ids back into strings.

    Args:
        ids: Iterable of id sequences.
        chars: Vocabulary the ids were produced from.

    Returns:
        A list with one decoded string per sequence.
    """
    # Ids start from 1, so a space is prepended: id 0 decodes to " " and
    # id k decodes to chars[k - 1].
    lookup = " " + chars
    date_strs = []
    for sequence in ids:
        date_strs.append("".join(lookup[index] for index in sequence))
    return date_strs
             
# X_train comes from tf.ragged.constant(X_ids, ragged_rank=1), i.e. rows of
# non-uniform length, converted to a dense tensor; its second dimension is
# used as the input length to pad to.
max_input_length = X_train.shape[1]  # 18


def prepare_date_strs_padded(date_strs):
    """Encode `date_strs` and zero-pad the result up to `max_input_length`.

    Batches that are already at least `max_input_length` wide are returned
    unchanged.
    """
    X = prepare_date_strs(date_strs)
    missing = max_input_length - X.shape[1]
    if missing > 0:
        # [0, 0]: leave the batch dimension alone.
        # [0, missing]: append `missing` positions at the end of each sequence.
        X = tf.pad(X, paddings=[[0, 0], [0, missing]])
    return X
 
max_output_length = Y_train.shape[1]  # 10


def predict_date_strs(date_strs):
    """Translate date strings with the training model, one step at a time.

    The model is called once per output position. Each call receives the
    ids predicted so far (after the SOS id), padded to `max_output_length`,
    and only the prediction for the current position is kept.

    Args:
        date_strs: List of input date strings.

    Returns:
        A list of decoded output strings, one per input.
    """
    X = prepare_date_strs_padded(date_strs)
    # Start every sequence with just the SOS id.
    Y_pred = tf.fill(dims=(len(X), 1), value=sos_id)
    for step in range(max_output_length):
        # Pad the ids known so far up to the decoder's input length.
        n_missing = max_output_length - Y_pred.shape[1]
        X_decoder = tf.pad(Y_pred, paddings=[[0, 0], [0, n_missing]])
        Y_probas = model.predict([X, X_decoder])
        # Keep the time axis (slice, not index) so the result can be
        # concatenated as one new column.
        step_probas = Y_probas[:, step:step + 1]
        next_ids = tf.argmax(step_probas, axis=-1, output_type=tf.int32)
        Y_pred = tf.concat([Y_pred, next_ids], axis=1)
    # Drop the leading SOS column before decoding.
    return ids_to_date_strs(Y_pred[:, 1:])


predict_date_strs(["July 14, 1789", "May 01, 2020"])

['1789-07-14', '2020-05-01']

In [16]:
################ during inference
# Build a second model for inference. It reuses decoder_cell,
# decoder_embedding_layer, output_layer and encoder_state from the training
# graph, so no new layers are created here.
 
# The sampler is given the decoder's embedding layer as embedding_fn.
inference_sampler = tfa.seq2seq.sampler.GreedyEmbeddingSampler(
                        embedding_fn = decoder_embedding_layer
                      )
inference_decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
                        decoder_cell, 
                        inference_sampler,                    # greedy sampler instead of the TrainingSampler
                        output_layer=output_layer,
                        maximum_iterations = max_output_length# cap on the number of decoding steps
                    )
# [:1] keeps the first entry of the shape as a 1-element tensor, which is
# then used as `dims` to create one SOS id per input sequence.
batch_size = tf.shape(encoder_inputs)[:1]
start_tokens = tf.fill( dims=batch_size, value=sos_id )
# NOTE(review): these three names were also assigned by the training-model
# cell; from here on they refer to the inference decoder's results.
final_outputs, final_state, final_sequence_lengths = inference_decoder(
    start_tokens,# passed where the training decoder received decoder_embeddings
    initial_state = encoder_state,
    start_tokens=start_tokens,
    end_token=0
)
# Unlike the training model, no softmax is applied here:
# Y_proba = keras.layers.Activation( "softmax" )( final_outputs.rnn_output )
# final_outputs.rnn_output gives access to the logits; the training model
# normalizes them with a softmax to obtain Y_proba.
# sample_id is the argmax of the rnn_output
inference_model = keras.models.Model( inputs=[encoder_inputs],
                                      outputs=[final_outputs.sample_id]# predicted ids, not outputs=[Y_proba]
                                    )

In [17]:
def ids_to_date_strs(ids, chars=OUTPUT_CHARS):
    """Convert sequences of ids into their string form.

    Args:
        ids: Iterable of id sequences.
        chars: Vocabulary the ids index into, offset by one.

    Returns:
        A list of strings, one per sequence in `ids`.
    """

    def decode(sequence):
        # " " is prepended since we are using 0 as the padding token ID.
        return "".join((" " + chars)[token_id] for token_id in sequence)

    return [decode(sequence) for sequence in ids]


# X_train comes from tf.ragged.constant(X_ids, ragged_rank=1), i.e. rows of
# non-uniform length, converted to a dense tensor; its second dimension is
# used as the input length to pad to.
max_input_length = X_train.shape[1]  # 18


def prepare_date_strs_padded(date_strs):
    """Encode `date_strs`, padding with zeros up to `max_input_length`.

    A batch that is already at least that wide is returned as is.
    """
    X = prepare_date_strs(date_strs)
    current_length = X.shape[1]
    if current_length >= max_input_length:
        return X
    # Pad only at the end of the sequence axis; the batch axis is untouched.
    return tf.pad(X, [[0, 0], [0, max_input_length - current_length]])


### max_output_length = Y_train.shape[1] #10


def fast_predict_date_strs(date_strs):
    """Translate date strings with a single call to `inference_model`.

    Args:
        date_strs: List of input date strings.

    Returns:
        A list of decoded output strings, one per input.
    """
    encoder_ids = prepare_date_strs_padded(date_strs)
    return ids_to_date_strs(inference_model.predict(encoder_ids))


fast_predict_date_strs(["July 14, 1789", "May 01, 2020"])

['1789-07-14', '2020-05-01']