In [1]:
from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
import numpy as np
#import utils
from ipynb.fs.full.utils import*
import tensorflow_addons as tfa

# Shared Faker instance; load_date() draws its random dates from it.
fake = Faker()
# Seed Faker and the stdlib RNG (used for random.choice / random.randint below)
# so that repeated runs generate the same data.
Faker.seed(12345)
random.seed(12345)

def load_date():
    """
    Generate one fake date in a randomly chosen human-readable format.

    Returns:
    human_readable -- the date formatted with a random entry of FORMATS,
                      lower-cased and with commas removed
    machine_readable -- the same date as an ISO string (``dt.isoformat()``)
    dt -- the underlying date object

    If formatting raises AttributeError, (None, None, None) is returned so the
    caller can skip the sample.
    """
    # Formats to sample from. 'full' is repeated ten times and 'd MMMM YYY'
    # appears twice on purpose-preserving grounds: the list contents and order
    # determine what random.choice() returns for a given seed.
    FORMATS = ['short', 'medium', 'long'] + ['full'] * 10 + [
        'd MMM YYY',
        'd MMMM YYY',
        'dd MMM YYY',
        'd MMM, YYY',
        'd MMMM, YYY',
        'dd, MMM YYY',
        'd MM YY',
        'd MMMM YYY',
        'MMMM d YYY',
        'MMMM d, YYY',
        'dd.MM.YY',
    ]

    # Change this if you want it to work with another language.
    LOCALE = 'en_US'

    dt = fake.date_object()

    try:
        chosen_format = random.choice(FORMATS)
        human_readable = format_date(dt, format=chosen_format, locale=LOCALE)
        human_readable = human_readable.lower().replace(',', '')
        machine_readable = dt.isoformat()
    except AttributeError:
        return None, None, None

    return human_readable, machine_readable, dt

def load_dataset(m):
    """
    Generate up to m (human readable, machine readable) date pairs together
    with character-level vocabularies.

    Arguments:
    m -- the number of examples to try to generate; samples for which
         load_date() returns None are skipped

    Returns:
    dataset -- list of (human_readable, machine_readable) string tuples
    human -- dict mapping each human-side character (plus '<unk>' and '<pad>') to an index
    inv_human -- dict mapping index back to human-side character
    machine -- dict mapping each machine-side character (plus '<go>' and '<eos>') to an index
    inv_machine -- dict mapping index back to machine-side character
    """
    human_chars = set()
    machine_chars = set()
    dataset = []

    # Distinct names for the loop values: the original rebound the parameter
    # `m` to the machine-readable string inside the loop.
    for _ in tqdm(range(m)):
        human_readable, machine_readable, _date = load_date()
        if human_readable is not None:
            dataset.append((human_readable, machine_readable))
            human_chars.update(human_readable)
            machine_chars.update(machine_readable)

    # Special tokens are appended after the sorted characters, so they always
    # take the two highest indices.
    human = {tok: idx for idx, tok in enumerate(sorted(human_chars) + ['<unk>', '<pad>'])}
    inv_human = {idx: tok for tok, idx in human.items()}

    inv_machine = dict(enumerate(sorted(machine_chars) + ['<go>', '<eos>']))
    machine = {tok: idx for idx, tok in inv_machine.items()}

    return dataset, human, inv_human, machine, inv_machine

def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    """
    Turn (human, machine) string pairs into index arrays and one-hot arrays.

    Arguments:
    dataset -- iterable of (human_readable, machine_readable) string pairs
    human_vocab -- character-to-index dict for the human-readable side
    machine_vocab -- character-to-index dict for the machine-readable side
    Tx -- length every human-readable sequence is padded / cut to
    Ty -- length passed to string_to_int for the machine-readable side

    Returns:
    X -- array of index sequences for the human-readable strings
    Y -- array of index sequences for the machine-readable strings
    Xoh -- one-hot version of X with len(human_vocab) classes
    Yoh -- one-hot version of Y with len(machine_vocab) classes
    """
    sources, targets = zip(*dataset)

    X = np.array([string_to_int(text, Tx, human_vocab) for text in sources])
    Y = [string_to_int(text, Ty, machine_vocab) for text in targets]

    n_human = len(human_vocab)
    n_machine = len(machine_vocab)
    Xoh = np.array([to_categorical(row, num_classes=n_human) for row in X])
    Yoh = np.array([to_categorical(row, num_classes=n_machine) for row in Y])

    return X, np.array(Y), Xoh, Yoh

def string_to_int(string, length, vocab):
    """
    Converts a string into a fixed-length list of integers giving the position
    of each of its characters in "vocab".

    The string is lower-cased and stripped of commas first, then cut to
    `length` characters or right-padded with the '<pad>' index.

    Arguments:
    string -- input string, e.g. 'Wed 10 Jul 2007'
    length -- the number of time steps you'd like, determines if the output will be padded or cut
    vocab -- vocabulary, dictionary used to index every character of your "string"

    Returns:
    rep -- list (size = length) of vocabulary indices. Characters missing from
           the vocabulary map to the index of '<unk>'; only if the vocabulary
           has no '<unk>' entry is the literal string '<unk>' used instead.

    Raises:
    KeyError -- if padding is needed but "vocab" has no '<pad>' entry
    """

    # make lower to standardize
    string = string.lower().replace(',', '')

    if len(string) > length:
        string = string[:length]

    # Unknown characters must become the *index* of '<unk>', not the string
    # itself, otherwise the result mixes ints and strings and cannot be turned
    # into a numeric array. The literal is kept only as a fallback for
    # vocabularies that do not define '<unk>'.
    unk = vocab.get('<unk>', '<unk>')
    rep = [vocab.get(ch, unk) for ch in string]

    if len(string) < length:
        rep += [vocab['<pad>']] * (length - len(string))

    return rep


def int_to_string(ints, inv_vocab):
    """
    Translate a sequence of vocabulary indices back into their characters.

    Arguments:
    ints -- iterable of indices into the vocabulary
    inv_vocab -- dictionary mapping each index to its character / token

    Returns:
    chars -- list with the entry of inv_vocab for every index in ints, in order
    """
    chars = []
    for index in ints:
        chars.append(inv_vocab[index])
    return chars

In [2]:
class Seq2Seq(keras.Model):
    """
    LSTM encoder / LSTM-cell decoder sequence-to-sequence model built on
    tensorflow_addons' BasicDecoder.

    Arguments:
    enc_v_dim -- size of the encoder (input) vocabulary
    dec_v_dim -- size of the decoder (output) vocabulary
    emb_dim -- embedding dimension used for both encoder and decoder tokens
    units -- number of LSTM units (encoder and decoder share this size)
    max_pred_len -- number of decoding steps run by inference()
    start_token -- decoder vocabulary index fed as the first decoder input
    end_token -- decoder vocabulary index that marks the end of a sequence
    """

    def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, max_pred_len, start_token, end_token):
        super().__init__()
        self.units = units

        # encoder
        self.enc_embeddings = keras.layers.Embedding(
            input_dim=enc_v_dim, output_dim=emb_dim,  # [enc_n_vocab, emb_dim]
            embeddings_initializer=tf.initializers.RandomNormal(0, 0.1))

        self.encoder = keras.layers.LSTM(units=units, return_sequences=True, return_state=True)

        # decoder
        self.dec_embeddings = keras.layers.Embedding(
            input_dim=dec_v_dim, output_dim=emb_dim,  # [dec_n_vocab, emb_dim]
            embeddings_initializer=tf.initializers.RandomNormal(0, 0.1))

        self.decoder_cell = keras.layers.LSTMCell(units=units)
        decoder_dense = keras.layers.Dense(dec_v_dim)

        # The training decoder (TrainingSampler) and the prediction decoder
        # (GreedyEmbeddingSampler) share the same LSTM cell and the same dense
        # output layer, so whatever is learned in training is used at inference.

        # train decoder
        self.decoder_train = tfa.seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            sampler=tfa.seq2seq.sampler.TrainingSampler(),  # reads the provided target inputs at every step
            output_layer=decoder_dense)

        # predict decoder
        self.decoder_eval = tfa.seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            sampler=tfa.seq2seq.sampler.GreedyEmbeddingSampler(),  # feeds back its own greedy prediction
            output_layer=decoder_dense
        )

        self.cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        self.opt = keras.optimizers.Adam(learning_rate=0.01, beta_1=0.99, beta_2=0.999, epsilon=10**-8)
        self.max_pred_len = max_pred_len
        self.start_token = start_token
        self.end_token = end_token

    def encode(self, x):
        """
        Run the encoder over a batch of input index sequences.

        Arguments:
        x -- batch of encoder token indices; x.shape[0] is used as the batch
             size (presumably shaped (batch, Tx) -- confirm against caller)

        Returns:
        [h, c] -- final hidden state and final cell state of the encoder LSTM,
                  used as the decoder's initial state
        """
        embedded = self.enc_embeddings(x)
        # An LSTM carries two states (hidden state and cell state), so the
        # initial state is a pair of zero tensors.
        batch_size = x.shape[0]
        init_s = [tf.zeros((batch_size, self.units)), tf.zeros((batch_size, self.units))]
        # The encoder returns: outputs for every time step, last hidden state, last cell state.
        _, h, c = self.encoder(embedded, initial_state=init_s)
        return [h, c]

    def inference(self, x):
        """
        Greedily decode max_pred_len tokens for every sequence in x.

        Arguments:
        x -- batch of encoder token indices (same layout as for encode())

        Returns:
        pred_id -- int32 numpy array of shape (x.shape[0], max_pred_len) with
                   the predicted decoder vocabulary indices

        Note: decoding always runs for max_pred_len steps; the `finished` flag
        returned by the decoder is not used to stop early.
        """
        s = self.encode(x)

        # decoder_eval.initialize(embedding, start_tokens, end_token, initial_state)
        #   embedding     -- tensor/variable used for the embedding lookup
        #   start_tokens  -- int32 tensor of shape [batch_size] to start decoding from
        #   end_token     -- int32 scalar marking the end of decoding
        #   initial_state -- initial cell state
        # and returns (finished, start_inputs, initial_state).
        # self.dec_embeddings.variables[0] is the decoder embedding matrix; this
        # assumes the embedding layer has already been built (e.g. by a prior
        # training step) -- TODO confirm before calling on an untrained model.
        done, i, s = self.decoder_eval.initialize(
            self.dec_embeddings.variables[0],
            start_tokens=tf.fill([x.shape[0], ], self.start_token),
            end_token=self.end_token,
            initial_state=s)

        # decoder_eval.step(time, inputs, state, training) performs one
        # decoding step and returns (outputs, next_state, next_inputs, finished).
        # training=False because this is prediction only; the loop feeds each
        # step's next_inputs / next_state into the following step.
        pred_id = np.zeros((x.shape[0], self.max_pred_len), dtype=np.int32)
        for l in range(self.max_pred_len):
            o, s, i, done = self.decoder_eval.step(
                time=l, inputs=i, state=s, training=False)
            pred_id[:, l] = o.sample_id

        return pred_id

    def train_logits(self, x, y, seq_len):
        """
        Compute decoder logits with the training sampler.

        Arguments:
        x -- batch of encoder token indices
        y -- batch of decoder token indices; the last column is dropped to form
             the decoder input (the caller in this notebook wraps each target
             in start / end tokens)
        seq_len -- per-example decoder sequence lengths passed to the decoder

        Returns:
        logits -- the decoder's rnn_output (unnormalised scores over the
                  decoder vocabulary for every decoding step)
        """
        s = self.encode(x)

        dec_in = y[:, :-1]   # ignore <EOS>
        dec_emb_in = self.dec_embeddings(dec_in)

        o, _, _ = self.decoder_train(dec_emb_in, s, sequence_length=seq_len)

        logits = o.rnn_output
        return logits

    def step(self, x, y, seq_len):
        """
        Run one optimisation step on the batch (x, y).

        Arguments:
        x -- batch of encoder token indices
        y -- batch of decoder token indices (see train_logits)
        seq_len -- per-example decoder sequence lengths

        Returns:
        loss -- the sparse categorical cross-entropy of this step as a numpy value
        """
        with tf.GradientTape() as tape:
            logits = self.train_logits(x, y, seq_len)
            dec_out = y[:, 1:]  # ignore <GO>
            loss = self.cross_entropy(dec_out, logits)

        # Gradients are taken after leaving the tape context so the backward
        # pass itself is not recorded on the tape.
        grads = tape.gradient(loss, self.trainable_variables)

        self.opt.apply_gradients(zip(grads, self.trainable_variables))
        return loss.numpy()

In [11]:
def train(sample_num, iteration):
    """
    Generate a date dataset, train a Seq2Seq model on it and return the model.

    Every 50 iterations one random training example is decoded and printed
    next to its target so progress can be inspected.

    Arguments:
    sample_num -- number of examples requested from load_dataset()
    iteration -- number of full-batch optimisation steps to run

    Returns:
    model -- the trained Seq2Seq instance
    """
    # get and process data
    dataset, human_vocab, inv_human_vocab, machine_vocab, inv_machine_vocab = load_dataset(sample_num)
    Tx = 30
    Ty = 10
    X, Y, _, _ = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

    # Token ids and sizes come from the vocabularies instead of being
    # hard-coded, so they stay correct if the generated character set changes.
    start_token = machine_vocab['<go>']
    end_token = machine_vocab['<eos>']

    # Wrap every target as <go> ... <eos>.
    Y = np.insert(Y, 0, start_token, axis=1)
    Y = np.insert(Y, Y.shape[1], end_token, axis=1)
    # The decoder sees the Ty target characters plus one special token.
    decoder_len = np.full((Y.shape[0],), Ty + 1, dtype=int)

    print("X's shape: ", X.shape)
    print("X's vocab: ", len(inv_human_vocab))
    print("Y's shape: ", Y.shape)
    print("Y's vocab: ", len(inv_machine_vocab))
    print("Decoder_len.shape: ", decoder_len.shape)

    model = Seq2Seq(
        len(human_vocab), len(machine_vocab), emb_dim=16, units=32,
        max_pred_len=Ty + 1, start_token=start_token, end_token=end_token)

    # load_dataset() may skip samples, so index by the real number of examples.
    n_examples = X.shape[0]

    # training
    for t in range(iteration):
        loss = model.step(X, Y, decoder_len)

        if t % 50 == 0:
            index = random.randint(0, n_examples - 1)
            target = "".join(int_to_string(Y[index, 1:-1], inv_machine_vocab))
            pred = model.inference(X[index:index+1])

            inf = "".join(int_to_string(pred[0], inv_machine_vocab))
            src = "".join(int_to_string(X[index], inv_human_vocab)).replace('<pad>', '')
            print("iteration: ", t,
                  "| loss: %.3f" % loss,
                  "| input: ", src,
                  "| target: ", target,
                  "| inference: ", inf,)

    return model

In [None]:
# m = 100,000, ite = 1000
# train() has two required arguments; the bare train() call raised TypeError on re-run.
model = train(100000, 1000)

100%|███████████████████████████████████████████████████████████████████████| 100000/100000 [00:03<00:00, 29175.91it/s]


X's shape:  (100000, 30)
X's vocab:  37
Y's shape:  (100000, 12)
Y's vocab:  13
Decoder_len.shape:  (100000,)
t:  0 | loss: 2.559 | input:  9/10/70 | target:  1970-09-10 | inference:  00000000000
t:  20 | loss: 1.660 | input:  15.11.19 | target:  2019-11-15 | inference:  1990-0-01<eos><eos>
t:  40 | loss: 1.140 | input:  january 1 2016 | target:  2017-01-01 | inference:  1997-01-19<eos>
t:  60 | loss: 0.976 | input:  19 aug 2005 | target:  2005-08-19 | inference:  1997-01-19<eos>
t:  80 | loss: 0.926 | input:  wednesday july 4 1990 | target:  1990-07-04 | inference:  1998-02-12<eos>
t:  100 | loss: 0.896 | input:  tuesday january 10 1995 | target:  1995-01-10 | inference:  1975-02-25<eos>
t:  120 | loss: 0.870 | input:  16 november 2007 | target:  2007-11-16 | inference:  1974-07-25<eos>
t:  140 | loss: 0.840 | input:  04 oct 2003 | target:  2003-10-04 | inference:  1974-07-24<eos>
t:  160 | loss: 0.810 | input:  07 feb 1982 | target:  1982-02-07 | inference:  1998-08-11<eos>
t:  180 |

In [4]:
# m = 1000, ite = 1000
# train() has two required arguments; the bare train() call raised TypeError on re-run.
model = train(1000, 1000)

100%|███████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 19645.36it/s]


X's shape:  (1000, 30)
X's vocab:  37
Y's shape:  (1000, 12)
Y's vocab:  13
Decoder_len.shape:  (1000,)
t:  0 | loss: 2.566 | input:  9 may 1998 | target:  1998-05-09 | inference:  -----------
t:  50 | loss: 1.190 | input:  thursday march 4 1982 | target:  1982-03-04 | inference:  1998-0-01-2
t:  100 | loss: 0.916 | input:  10/19/70 | target:  1970-10-19 | inference:  1998-02-29<eos>
t:  150 | loss: 0.849 | input:  2 july 1974 | target:  1974-07-02 | inference:  1970-07-21<eos>
t:  200 | loss: 0.801 | input:  28 feb 2005 | target:  2005-02-28 | inference:  2004-03-24<eos>
t:  250 | loss: 0.752 | input:  thursday february 20 1975 | target:  1975-02-20 | inference:  1974-03-28<eos>
t:  300 | loss: 0.698 | input:  saturday january 16 2010 | target:  2010-01-16 | inference:  2000-01-15<eos>
t:  350 | loss: 0.645 | input:  saturday december 22 1984 | target:  1984-12-22 | inference:  1974-02-23<eos>
t:  400 | loss: 0.588 | input:  19 january 1972 | target:  1972-01-19 | inference:  1982-01-

In [12]:
# Train on 1,000 generated examples for 5,000 full-batch iterations.
model = train(sample_num=1000, iteration=5000)

100%|███████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 23599.04it/s]


X's shape:  (1000, 30)
X's vocab:  37
Y's shape:  (1000, 12)
Y's vocab:  13
Decoder_len.shape:  (1000,)
iteration:  0 | loss: 2.571 | input:  wednesday april 25 2018 | target:  2018-04-25 | inference:  1111111-1-1
iteration:  50 | loss: 1.231 | input:  march 21 1990 | target:  1990-03-21 | inference:  1997-01-13<eos>
iteration:  100 | loss: 0.948 | input:  24.08.13 | target:  2013-08-24 | inference:  1970-02-18<eos>
iteration:  150 | loss: 0.899 | input:  4 january 1994 | target:  1994-01-04 | inference:  1976-09-19<eos>
iteration:  200 | loss: 0.870 | input:  12.07.71 | target:  1971-07-12 | inference:  1982-05-27<eos>
iteration:  250 | loss: 0.834 | input:  tuesday june 9 2020 | target:  2020-06-09 | inference:  1991-08-27<eos>
iteration:  300 | loss: 0.795 | input:  june 22 1993 | target:  1993-06-22 | inference:  1991-09-29<eos>
iteration:  350 | loss: 0.751 | input:  5 september 2016 | target:  2016-09-05 | inference:  1996-11-27<eos>
iteration:  400 | loss: 0.699 | input:  tuesda

iteration:  3750 | loss: 0.001 | input:  6 april 2014 | target:  2014-04-06 | inference:  2014-04-06<eos>
iteration:  3800 | loss: 0.001 | input:  8 dec 1979 | target:  1979-12-08 | inference:  1979-12-08<eos>
iteration:  3850 | loss: 0.001 | input:  3 october 1997 | target:  1997-10-03 | inference:  1997-10-03<eos>
iteration:  3900 | loss: 0.001 | input:  30 mar 1986 | target:  1986-03-30 | inference:  1986-03-30<eos>
iteration:  3950 | loss: 0.001 | input:  thursday march 12 1998 | target:  1998-03-12 | inference:  1998-03-12<eos>
iteration:  4000 | loss: 0.001 | input:  friday september 21 2012 | target:  2012-09-21 | inference:  2012-09-21<eos>
iteration:  4050 | loss: 0.001 | input:  thursday february 28 1985 | target:  1985-02-28 | inference:  1985-02-28<eos>
iteration:  4100 | loss: 0.001 | input:  sunday july 8 1984 | target:  1984-07-08 | inference:  1984-07-08<eos>
iteration:  4150 | loss: 0.001 | input:  sunday january 28 1990 | target:  1990-01-28 | inference:  1990-01-28<e

In [None]:
# Build a separate 10,000-example set for trying out the trained model.
# NOTE(review): the vocabularies are rebuilt from this new sample; they are
# assumed to match the ones the model was trained with -- confirm.
m = 10000
dataset, human_vocab, inv_human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)
Tx = 30
Ty = 10
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)
# Wrap every target as <go> ... <eos>, taking the ids from the vocabulary
# rather than hard-coding them.
Y = np.insert(Y, 0, machine_vocab['<go>'], axis=1)
Y = np.insert(Y, Y.shape[1], machine_vocab['<eos>'], axis=1)
# Ty target characters plus one special token per decoder sequence.
decoder_len = np.ones((Y.shape[0],), dtype=int) * (Ty + 1)

In [15]:
# Decode one example from the generated set and show it next to its input.
t = 9998
pred = model.inference(X[t:t + 1])  # slice keeps the batch axis

src = "".join(int_to_string(X[t], inv_human_vocab)).replace('<pad>', '')
inf = "".join(int_to_string(pred[0], inv_machine_vocab))
print("| input: ", src, "| inference: ", inf)


| input:  friday november 23 1979 | inference:  1979-11-23<eos>


In [25]:
# Sanity check: slicing a single example keeps the leading batch axis,
# which is the layout passed to model.inference() above.
X[t:t+1].shape

(1, 30)

In [31]:
# Encode a hand-written date and add the batch axis used by model.inference().
# These were two cells saved in the wrong order (the reshape ran before
# `inputs` was defined), which fails on a fresh top-to-bottom run.
inputs = np.array(string_to_int('July 9 2021', 30, human_vocab))
inputs = inputs.reshape(1, 30)

In [32]:
# Decode the hand-written date held in `inputs` (a batch of one).
# The copy-pasted `t = 9998` was unused in this cell and is dropped.
pred = model.inference(inputs)

inf = "".join(int_to_string(pred[0], inv_machine_vocab))
src = "".join(int_to_string(inputs[0], inv_human_vocab)).replace('<pad>', '')
print(
  "| input: ", src,
  "| inference: ", inf,)

| input:  july 9 2021 | inference:  2012-05-07<eos>


In [29]:
# Show how the first example looks once mapped back to characters, padding included.
int_to_string(X[0], inv_human_vocab)

['5',
 '/',
 '1',
 '0',
 '/',
 '7',
 '0',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']