In [1]:
import tensorflow as tf
tf.__version__

'2.4.1'

In [3]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
#import utils
from ipynb.fs.full.utils import*
import tensorflow_addons as tfa


class Seq2Seq(keras.Model):
    """LSTM encoder-decoder that translates one token sequence into another.

    The encoder embeds the source tokens and runs them through an LSTM; its
    final hidden and cell states become the decoder's initial state.  Two
    ``tfa.seq2seq.BasicDecoder`` instances share the same ``LSTMCell`` and the
    same ``Dense`` output projection:

    * ``decoder_train`` uses a ``TrainingSampler``, which reads the decoder
      inputs it is given.
    * ``decoder_eval`` uses a ``GreedyEmbeddingSampler``, which feeds the
      argmax of each step back in as the next input.

    Args:
        enc_v_dim: Encoder vocabulary size (rows of the encoder embedding).
        dec_v_dim: Decoder vocabulary size (rows of the decoder embedding and
            width of the output logits).
        emb_dim: Embedding dimension used by both embedding layers.
        units: Number of LSTM units in the encoder and the decoder cell.
        max_pred_len: Number of decoding steps performed by ``inference``.
        start_token: Token id fed to the decoder at the first inference step.
        end_token: Token id passed to the greedy sampler as the end marker.
    """

    def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token):
        super().__init__()
        self.units = units

        # encoder
        self.enc_embeddings = keras.layers.Embedding(
            input_dim=enc_v_dim, output_dim=emb_dim,  # [enc_n_vocab, emb_dim]
            embeddings_initializer=tf.initializers.RandomNormal(0, 0.1))

        self.encoder = keras.layers.LSTM(units=units, return_sequences=True, return_state=True)

        # decoder
        self.dec_embeddings = keras.layers.Embedding(
            input_dim=dec_v_dim, output_dim=emb_dim,  # [dec_n_vocab, emb_dim]
            embeddings_initializer=tf.initializers.RandomNormal(0, 0.1))

        self.decoder_cell = keras.layers.LSTMCell(units=units)
        decoder_dense = keras.layers.Dense(dec_v_dim)

        # The training decoder (TrainingSampler) and the predicting decoder
        # (GreedyEmbeddingSampler) share the same decoder cell (LSTM cell) and
        # the same output layer (dense), so weights learned during training
        # are the ones used for prediction.

        # train decoder
        self.decoder_train = tfa.seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            sampler=tfa.seq2seq.sampler.TrainingSampler(),  # reads the supplied decoder inputs
            output_layer=decoder_dense)

        # predict decoder
        self.decoder_eval = tfa.seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            sampler=tfa.seq2seq.sampler.GreedyEmbeddingSampler(),  # feeds back its own prediction
            output_layer=decoder_dense
        )

        self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        self.opt = keras.optimizers.Adam(learning_rate=0.01)
        self.max_pred_len = max_pred_len
        self.start_token = start_token
        self.end_token = end_token

    def encode(self, x):
        """Embed ``x`` and return the encoder's final state ``[h, c]``.

        Args:
            x: Batch of source token ids; ``x.shape[0]`` is used as the batch
                size for the zero initial state.

        Returns:
            A two-element list: the last hidden state and the last cell state
            of the encoder LSTM.
        """
        embedded = self.enc_embeddings(x)
        # An LSTM carries two states (hidden state and cell state), so the
        # initial state holds one zero tensor for each.
        init_s = [tf.zeros((x.shape[0], self.units)), tf.zeros((x.shape[0], self.units))]
        # The LSTM returns: all per-step hidden states, last hidden state, last cell state.
        _, h, c = self.encoder(embedded, initial_state=init_s)
        return [h, c]

    def inference(self, x):
        """Greedily decode ``max_pred_len`` tokens for each row of ``x``.

        Args:
            x: Batch of source token ids.

        Returns:
            An ``int32`` NumPy array of shape ``(x.shape[0], max_pred_len)``
            holding the sampled token id of every decoding step.
        """
        s = self.encode(x)

        # The greedy sampler needs the raw embedding matrix.  The layer only
        # creates it when first built, so build it here if inference runs
        # before any training step; otherwise `variables[0]` would not exist.
        if not self.dec_embeddings.built:
            self.dec_embeddings.build((None,))

        # BasicDecoder.initialize(embedding, start_tokens, end_token, initial_state)
        # returns (finished, start_inputs, initial_state):
        #   * embedding: the decoder word matrix (`dec_embeddings.variables[0]`)
        #   * start_tokens: int32 tensor of shape [batch_size]
        #   * end_token: scalar id marking the end of decoding
        #   * initial_state: the encoder's final [h, c]
        done, i, s = self.decoder_eval.initialize(
            self.dec_embeddings.variables[0],
            start_tokens=tf.fill([x.shape[0], ], self.start_token),
            end_token=self.end_token,
            initial_state=s)

        # BasicDecoder.step(time, inputs, state, training) returns
        # (outputs, next_state, next_inputs, finished).  Prediction passes
        # training=False and advances one time step per loop iteration.
        # `done` is not consulted: the loop always runs `max_pred_len` steps.
        pred_id = np.zeros((x.shape[0], self.max_pred_len), dtype=np.int32)
        for t in range(self.max_pred_len):
            o, s, i, done = self.decoder_eval.step(
                time=t, inputs=i, state=s, training=False)
            pred_id[:, t] = o.sample_id

        return pred_id

    def train_logits(self, x, y, seq_len):
        """Run the training decoder and return its logits.

        Args:
            x: Batch of source token ids.
            y: Batch of target token ids; the last column is dropped to form
                the decoder input.
            seq_len: Per-example decoder sequence lengths, forwarded to the
                decoder as ``sequence_length``.

        Returns:
            The decoder's ``rnn_output`` (the ``Dense`` projection of every
            decoding step).
        """
        s = self.encode(x)

        # Shape diagnostics for the encoder state and the decoder inputs.
        print(s[0].shape)
        print(s[1].shape)
        dec_in = y[:, :-1]   # ignore <EOS>
        print("dec_in:", dec_in.shape)
        dec_emb_in = self.dec_embeddings(dec_in)
        print("dec_emb_in:", dec_emb_in.shape)

        print("seq_len", seq_len.shape)
        o, _, _ = self.decoder_train(dec_emb_in, s, sequence_length=seq_len)
        logits = o.rnn_output
        return logits

    def step(self, x, y, seq_len):
        """Perform one optimisation step and return the loss.

        Args:
            x: Batch of source token ids.
            y: Batch of target token ids; the first column is dropped to form
                the labels.
            seq_len: Per-example decoder sequence lengths.

        Returns:
            The sparse categorical cross-entropy of this batch as a NumPy
            scalar.
        """
        with tf.GradientTape() as tape:
            logits = self.train_logits(x, y, seq_len)
            dec_out = y[:, 1:]  # ignore <GO>

            # Shape diagnostics for the batch, labels and logits.
            print("x:", x.shape)
            print("y:", y.shape)
            print("seq_len:", seq_len.shape)
            print("dec_out:", dec_out.shape)
            print("logits:", logits.shape)
            print("dec_out:", dec_out.shape)
            loss = self.cross_entropy(dec_out, logits)

        # Gradients are taken after leaving the tape context: only the forward
        # pass needs to be recorded.
        grads = tape.gradient(loss, self.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.trainable_variables))
        return loss.numpy()


def train(n_samples=4000, steps=100, batch_size=128, log_every=25):
    """Train a ``Seq2Seq`` model on ``DateData`` and return it.

    Args:
        n_samples: Argument passed to ``DateData`` when creating the dataset.
        steps: Number of training steps (one sampled batch per step).
        batch_size: Argument passed to ``data.sample`` for every step.
        log_every: A progress line with one decoded example is printed
            whenever the step index is a multiple of this value.

    Returns:
        The trained ``Seq2Seq`` model.
    """
    # get and process data
    data = DateData(n_samples)
    print("Chinese time order: yy/mm/dd ", data.date_cn[:3], "\nEnglish time order: dd/M/yyyy ", data.date_en[:3])
    print("vocabularies: ", data.vocab)
    print("x index sample: \n{}\n{}".format(data.idx2str(data.x[0]), data.x[0]),
          "\ny index sample: \n{}\n{}".format(data.idx2str(data.y[0]), data.y[0]))

    model = Seq2Seq(
        data.num_word, data.num_word, emb_dim=16, units=32,
        max_pred_len=10, start_token=data.start_token, end_token=data.end_token)

    # training
    for t in range(steps):
        bx, by, _ = data.sample(batch_size)
        # The decoder consumes y[:, :-1], so every example spans
        # by.shape[1] - 1 decoding steps; derive it instead of hard-coding it.
        decoder_len = np.full((by.shape[0],), by.shape[1] - 1, dtype=int)
        loss = model.step(bx, by, decoder_len)

        if t % log_every == 0:
            # Decode the first example of the batch to monitor progress.
            target = data.idx2str(by[0, 1:-1])
            pred = model.inference(bx[0:1])
            res = data.idx2str(pred[0])
            src = data.idx2str(bx[0])
            print("t: ", t,
                  "| loss: %.3f" % loss,
                  "| input: ", src,
                  "| target: ", target,
                  "| inference: ", res, pred[0], pred)

    return model

In [50]:
data = DateData(4000)

In [51]:
data.start_token

14

In [45]:
bx, by, decoder_len = data.sample(128)
by.shape

(128, 11)

In [46]:
by[0]

array([14,  4, 10,  2, 21,  2,  4, 12, 12,  9, 13])

In [38]:
decoder_len

array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10])

In [6]:
len(data.vocab)

27

In [8]:
data.num_word

27

In [14]:
by.shape

(128, 11)

In [12]:
bx, by, decoder_len = data.sample(128)

In [29]:
decoder_len.shape

(128,)

In [4]:
model = train()

Chinese time order: yy/mm/dd  ['31-04-26', '04-07-18', '33-06-06'] 
English time order: dd/M/yyyy  ['26/Apr/2031', '18/Jul/2004', '06/Jun/2033']
vocabularies:  {'Jul', '4', 'Nov', 'Aug', 'Jun', '3', '/', 'Apr', '2', 'Jan', '<GO>', 'Oct', '6', '0', 'Sep', '<EOS>', 'Feb', '1', 'May', '9', 'Mar', '<PAD>', '-', '7', '5', 'Dec', '8'}
x index sample: 
31-04-26
[6 4 1 3 7 1 5 9] 
y index sample: 
<GO>26/Apr/2031<EOS>
[14  5  9  2 15  2  5  3  6  4 13]
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
t:  0 | loss: 3.296 | input:  96-06-17 | target:  17/Jun/1996 | inference:  ////////// [2 2 2 2 2 2 2 2 2 2] [[2 2 2 2 2 2 2 2 2 2]]
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10

(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 

(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 11)
seq_len: (128,)
dec_out: (128, 10)
logits: (128, 10, 27)
dec_out: (128, 10)
(128, 32)
(128, 32)
dec_in: (128, 10)
dec_emb_in: (128, 10, 16)
seq_len (128,)
x: (128, 8)
y: (128, 

In [16]:
data = DateData(400)

In [18]:
# Reproduce the decoder-input preparation from Seq2Seq.train_logits on the
# whole dataset to inspect the resulting shapes.
dec_in = data.y[:, :-1]   # drop the last column (<EOS>) to get the decoder input
dec_emb_in = model.dec_embeddings(dec_in)  # embedding lookup adds a trailing emb_dim axis

In [20]:
data.y[:, :-1].shape

(400, 10)

In [21]:
model.dec_embeddings(dec_in).shape

TensorShape([400, 10, 16])

In [11]:
bx, by, decoder_len = data.sample(32)

In [15]:
bx[0]

array([12,  9,  1,  3,  9,  1,  4, 10])

In [5]:
data = DateData(4000)

In [8]:
data.x.shape

(4000, 8)

In [9]:
data.y.shape

(4000, 11)

In [6]:
data.start_token

14

In [7]:
data.end_token

13

In [29]:
data.x[1]

array([ 3,  7,  1,  3, 10,  1,  4, 11])

In [30]:
data.y[1]

array([14,  4, 11,  2, 20,  2,  5,  3,  3,  7, 13])

In [25]:
data.vocab

{'-',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '<EOS>',
 '<GO>',
 '<PAD>',
 'Apr',
 'Aug',
 'Dec',
 'Feb',
 'Jan',
 'Jul',
 'Jun',
 'Mar',
 'May',
 'Nov',
 'Oct',
 'Sep'}

In [23]:
data.start_token

14

In [24]:
data.end_token

13

In [10]:
# Build one start token per sample, as Seq2Seq.inference does for its batch;
# 14 is the value data.start_token evaluates to in this notebook.
tf.fill([data.x.shape[0], ], 14)

<tf.Tensor: shape=(4000,), dtype=int32, numpy=array([14, 14, 14, ..., 14, 14, 14])>

In [3]:
# Stand-alone demo of what keras.layers.LSTM returns when both
# return_sequences and return_state are enabled.
SAMPLE = 20      # batch size
TIME_STEP = 10   # sequence length
FEATURE = 100    # features per time step

# Random float32 input of shape (SAMPLE, TIME_STEP, FEATURE).
x = np.random.random((SAMPLE, TIME_STEP, FEATURE)).astype(np.float32)
# NOTE(review): y is not used by any of the cells shown here.
y = np.random.random((TIME_STEP, FEATURE))

lstm = keras.layers.LSTM(units=5, return_sequences=True, return_state=True)
# o: hidden state of every time step, h: last hidden state, c: last cell state
o, h, c = lstm(x)

In [27]:
o.shape

TensorShape([20, 10, 5])

In [28]:
h.shape

TensorShape([20, 5])

In [29]:
c.shape

TensorShape([20, 5])

In [30]:
# Check that the last time step of the full output sequence is the same
# tensor as the returned final hidden state.
o[:, -1, :] == h

<tf.Tensor: shape=(20, 5), dtype=bool, numpy=
array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True]])>