Train an encoder–decoder model that can convert a date string from one format to another (e.g., from “April 22, 2019” to “2019-04-22”).

The first step is to be able to generate a training dataset.

In [4]:
import tensorflow as tf
import numpy as np

# Seed TensorFlow's global RNG and NumPy's global RNG so that runs are
# repeatable; the date generator below draws from np.random.
tf.random.set_seed(42)
np.random.seed(42)

# Full English month names in calendar order (index 0 is January).
months = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]
# Number of days in each month of a non-leap year, aligned index-by-index
# with `months` (February's leap day is handled by the caller).
months_days = [
    31, 28, 31,
    30, 31, 30,
    31, 31, 30,
    31, 30, 31,
]

def is_leap_year(year):
    """Return True when `year` is a leap year under the Gregorian rules.

    A year is a leap year if it is divisible by 400, or if it is divisible
    by 4 but not by 100.
    """
    if year % 400 == 0:
        return True
    if year % 100 == 0:
        return False
    return year % 4 == 0

def generate_date_string():
    """Return a random, valid calendar date formatted as "Month D, YYYY".

    The year is drawn uniformly from the four-digit range 1000..9999
    (both ends included), the month from 1..12, and the day from 1 up to
    the real length of that month, with February getting 29 days in leap
    years. The day is NOT zero-padded, e.g. "May 4, 1196".
    """
    # np.random.randint excludes its upper bound, so 10000 is required for
    # the year 9999 to be reachable.
    year = np.random.randint(1000, 10000)
    month = np.random.randint(1, 13)  # 1-12
    max_day = months_days[month - 1]
    if month == 2 and is_leap_year(year):
        max_day = 29
    day = np.random.randint(1, max_day + 1)  # +1 because the bound is exclusive
    return f"{months[month - 1]} {day}, {year}"

def generate_label_string(date_string):
    """Convert a "Month D, YYYY" string into its "YYYY-MM-DD" label.

    The month name is looked up in `months`; month and day are zero-padded
    to two digits, while the year text is copied through unchanged.
    """
    tokens = date_string.split(" ")
    month_name, day_token, year_text = tokens[0], tokens[1], tokens[2]
    # The day token still carries the trailing comma ("22,").
    day_number = int(day_token.replace(",", ""))
    month_number = months.index(month_name) + 1
    return "{}-{:02d}-{:02d}".format(year_text, month_number, day_number)

# Quick sanity check: show one randomly generated input date and the label
# produced for a known date.
print(generate_date_string())
print(generate_label_string("April 22, 2019"))

April 29, 8270
2019-04-22


In [5]:
import string


# Character inventories: every distinct character that can appear in an input
# date (month letters, digits, comma, space) and in an output label
# (digits and the hyphen).
input_chars = "".join(sorted(set("".join(months) + "0123456789, ")))
output_chars = "0123456789-"

# Id 0 is reserved for the padding token; the real characters are numbered
# from 1 in the order in which they appear in the inventories above.
input_vocab = {
    '<PAD>': 0,
    **{symbol: index for index, symbol in enumerate(input_chars, start=1)},
}
output_vocab = {
    '<PAD>': 0,
    **{symbol: index for index, symbol in enumerate(output_chars, start=1)},
}

print(output_vocab)
print(input_vocab)

input_vocab_size, output_vocab_size = len(input_vocab), len(output_vocab)

# Reverse lookups (id -> character) used to turn id sequences back into text.
id_to_input_char = dict(zip(input_vocab.values(), input_vocab.keys()))
id_to_output_char = dict(zip(output_vocab.values(), output_vocab.keys()))

def ids_to_string(sequence, id_to_char, pad_id=0):
    """Decode a sequence of token ids into a string.

    `sequence` may be a list, tensor or array of any shape; it is flattened
    before decoding. Ids equal to `pad_id` are skipped, and ids missing
    from `id_to_char` contribute nothing to the result.
    """
    pieces = []
    for raw_id in np.asarray(sequence).ravel():
        token_id = int(raw_id)
        if token_id == pad_id:
            continue
        pieces.append(id_to_char.get(token_id, ''))
    return ''.join(pieces)

def convert_string_to_id(string, vocab: dict) -> list[int]:
    """Map every character of `string` to its id in `vocab`.

    Raises KeyError if a character is not present in `vocab`.
    """
    token_ids = []
    for symbol in string:
        token_ids.append(vocab[symbol])
    return token_ids
        
def generate_label(date_string):
    """Return the target label for `date_string`.

    Thin wrapper that delegates to `generate_label_string`.
    """
    return generate_label_string(date_string)

def generate_sample():
    """Create one random training pair as two lists of token ids.

    Returns:
        (input_ids, label_ids): the random date encoded with `input_vocab`
        and its label encoded with `output_vocab`. Neither list is padded.
    """
    source_text = generate_date_string()
    target_text = generate_label(source_text)
    return (
        convert_string_to_id(source_text, input_vocab),
        convert_string_to_id(target_text, output_vocab),
    )

def generate_dataset(
    num_samples: int = 10000
) -> tuple[tf.Tensor, tf.Tensor]:
    """Generate `num_samples` random (date, label) pairs as padded id tensors.

    Args:
        num_samples: number of random samples to draw.

    Returns:
        (values, labels): two dense int32 tensors with one row per sample.
        Shorter rows are right-padded with 0, which is the '<PAD>' id in
        both vocabularies.

    Note:
        Each tensor is padded to the longest sequence generated in THIS
        call, so the width of `values` can differ between calls.
    """
    values = []
    labels = []

    for _ in range(num_samples):
        value, label = generate_sample()
        values.append(value)
        labels.append(label)

    # The samples have different lengths, so go through a ragged tensor
    # first and then densify it with zero padding.
    values = tf.ragged.constant(values, dtype=tf.int32, ragged_rank=1).to_tensor()
    labels = tf.ragged.constant(labels, dtype=tf.int32, ragged_rank=1).to_tensor()

    return values, labels

{'<PAD>': 0, '0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, '-': 11}
{'<PAD>': 0, ' ': 1, ',': 2, '0': 3, '1': 4, '2': 5, '3': 6, '4': 7, '5': 8, '6': 9, '7': 10, '8': 11, '9': 12, 'A': 13, 'D': 14, 'F': 15, 'J': 16, 'M': 17, 'N': 18, 'O': 19, 'S': 20, 'a': 21, 'b': 22, 'c': 23, 'e': 24, 'g': 25, 'h': 26, 'i': 27, 'l': 28, 'm': 29, 'n': 30, 'o': 31, 'p': 32, 'r': 33, 's': 34, 't': 35, 'u': 36, 'v': 37, 'y': 38}


There are some problems when working with these strings. The numeric parts are easy, but I struggled with the month names, such as the word "April". How can I handle them?

Important: don't forget about the SOS and EOS tokens, and the maximum sequence length.

In [6]:
# Build the train / validation / test splits. Each call draws fresh random
# dates from the same NumPy RNG, so the order of these calls determines which
# samples end up in which split.
train_x, train_y = generate_dataset(10000)
validation_x, validation_y = generate_dataset(2000)
test_x, test_y = generate_dataset(2000)

In [7]:


# Round-trip check: generate ten samples and decode their ids back to text to
# confirm that tokenization and ids_to_string agree.
for i in range(10):
    # NOTE: despite the names, these hold the token ids of ONE sample
    # (input ids, label ids), not a batch.
    x_batch, y_batch = generate_sample()
    converted = ids_to_string(y_batch, id_to_output_char)
    converted_x = ids_to_string(x_batch, id_to_input_char)
    print("Output: " + converted)
    print("Input: " + converted_x)
    print("---\n")
    


Output: 1768-08-27
Input: August 27, 1768
---

Output: 6557-08-15
Input: August 15, 6557
---

Output: 1196-05-04
Input: May 4, 1196
---

Output: 2389-04-07
Input: April 7, 2389
---

Output: 3990-11-14
Input: November 14, 3990
---

Output: 7253-10-05
Input: October 5, 7253
---

Output: 5667-08-19
Input: August 19, 5667
---

Output: 7205-01-19
Input: January 19, 7205
---

Output: 7988-12-07
Input: December 7, 7988
---

Output: 2992-11-28
Input: November 28, 2992
---



In [50]:
# Simple version: input → output only
embedding_size = 32
# Label length taken from the padded training labels (second axis of train_y).
max_length_output = train_y.shape[1]
print(max_length_output)
# NOTE(review): not referenced anywhere else in the visible notebook.
padding_char_sum_constant = 1

# Encoder: embed the input character ids and summarise the whole sequence
# into the LSTM's final output vector.
encoder = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=input_vocab_size,
        output_dim=embedding_size, 
        input_shape=[None]
    ),
    tf.keras.layers.LSTM(128)  # without return_sequences, generates a vector
])

# Decoder: feed that same vector at every output step and predict a
# distribution over the output vocabulary for each step.
decoder = tf.keras.Sequential([
    tf.keras.layers.RepeatVector(max_length_output),  # repeat the vector N times
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.Dense(output_vocab_size, activation='softmax')
])

model = tf.keras.Sequential([
    encoder,
    decoder
])

# Targets are integer ids (not one-hot vectors), hence the sparse loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

10




In [51]:
# Train the simple model for 20 epochs, passing the validation split so Keras
# evaluates on it during training.
model.fit(train_x, train_y, epochs=20, validation_data=(validation_x, validation_y))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x34476dfc0>

The first issue I noticed is that, for every output position to carry the same meaning, the target dates must be generated zero-padded: 2025-12-02, not 2025-12-2. This keeps the format regular and the output length always the same.

It was difficult to add padding to the strings. The input and output vocabularies need a '<PAD>' token to fill the empty positions in the sequences, because not all of them have the same length. We must then feed the model an integer tensor of shape (batch, max_length_string), holding one token id per position.

That adds one char to the possible values of input and output.

In [54]:
def predict_date(date_string, model, max_input_len=18, max_output_len=10):
    """Convert one date string with the simple (non-autoregressive) model.

    Args:
        date_string: input text such as "May 3, 1290"; every character must
            exist in `input_vocab`.
        model: model whose `predict` returns per-step probabilities over the
            output vocabulary.
        max_input_len: width the encoded input is zero-padded to.
        max_output_len: accepted for interface compatibility; this
            implementation does not use it.

    Returns:
        The decoded prediction with padding ids removed.
    """
    token_ids = convert_string_to_id(date_string, input_vocab)

    # Batch of one, right-padded with the '<PAD>' id (0).
    padded_input = np.zeros((1, max_input_len), dtype=np.int32)
    padded_input[0, :len(token_ids)] = token_ids

    step_probs = model.predict(padded_input, verbose=0)[0]  # [T_out, vocab]
    # Greedy decoding: keep the most likely character at every output step.
    best_ids = np.argmax(step_probs, axis=-1)

    return ids_to_string(best_ids, id_to_output_char, pad_id=0).strip()

# Usage: compare the model's prediction with the rule-based label.
test = "May 3, 1290"
predicted = predict_date(test, model)
print(f"Input: {test}")
print(f"Expected: {generate_label(test)}")
print(f"Predicted: {predicted}")

Input: May 3, 1290
Expected: 1290-05-03
Predicted: 1290-05-03


Now that I have a working LSTM encoder–decoder, let's use teacher forcing to improve performance and speed up training.

Teacher forcing consists of feeding the decoder the token it should have output at the previous step, regardless of what it actually output. For the first step, the decoder is given an <SOS> token, and it is expected to end the sequence with an <EOS> token.

Teacher forcing feeds the decoder the ground-truth token from the previous timestep instead of its own prediction. This speeds up and stabilizes training because gradients propagate through the correct context. It exposes the model to valid target sequences at every step, preventing it from wandering into impossible states. It breaks the chain of accumulated errors that would otherwise arise when an early mistake pollutes all subsequent inputs. Finally, it optimizes the exact conditional relationships the loss compares, so that at inference time the decoder can switch to autoregressive mode having already learned how each true token leads to the next.

In [65]:
# Vocabularies for the teacher-forcing model. The three special tokens take
# ids 0-2 in both vocabularies; the regular characters follow in inventory
# order, starting right after the last special token.
teacher_forcing_input_vocab = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2}
teacher_forcing_input_vocab.update(
    (symbol, index)
    for index, symbol in enumerate(input_chars, start=len(teacher_forcing_input_vocab))
)

teacher_forcing_output_vocab = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2}
teacher_forcing_output_vocab.update(
    (symbol, index)
    for index, symbol in enumerate(output_chars, start=len(teacher_forcing_output_vocab))
)

teacher_forcing_input_vocab_size = len(teacher_forcing_input_vocab)
teacher_forcing_output_vocab_size = len(teacher_forcing_output_vocab)

# Reverse lookups (id -> token) for decoding.
teacher_forcing_id_to_input_char = dict(
    zip(teacher_forcing_input_vocab.values(), teacher_forcing_input_vocab.keys())
)
teacher_forcing_id_to_output_char = dict(
    zip(teacher_forcing_output_vocab.values(), teacher_forcing_output_vocab.keys())
)

In [66]:
def generate_teacher_forcing_sample():
    """Create one random (date, label) pair encoded with the teacher-forcing vocabularies.

    Returns:
        (input_ids, label_ids) as plain Python lists. No <SOS>/<EOS> tokens
        and no padding are added here.
    """
    source_text = generate_date_string()
    target_text = generate_label(source_text)
    return (
        convert_string_to_id(source_text, teacher_forcing_input_vocab),
        convert_string_to_id(target_text, teacher_forcing_output_vocab),
    )

def generate_teacher_forcing_dataset(size: int):
    """Build `size` random samples for training with teacher forcing.

    For every sample the label is used twice: prefixed with <SOS> as the
    decoder input, and suffixed with <EOS> as the decoder target, so the
    target is the input shifted one step to the left.

    Returns:
        (encoder_inputs, decoder_inputs, decoder_outputs): three dense
        int32 tensors with one row per sample, right-padded with 0.
    """
    sos_id = teacher_forcing_output_vocab['<SOS>']
    eos_id = teacher_forcing_output_vocab['<EOS>']

    def pad_rows(rows):
        # Variable-length rows -> ragged tensor -> zero-padded dense tensor.
        return tf.ragged.constant(rows, dtype=tf.int32, ragged_rank=1).to_tensor()

    source_rows, shifted_in_rows, shifted_out_rows = [], [], []
    for _ in range(size):
        source_ids, target_ids = generate_teacher_forcing_sample()
        source_rows.append(source_ids)
        shifted_in_rows.append([sos_id, *target_ids])
        shifted_out_rows.append([*target_ids, eos_id])

    return pad_rows(source_rows), pad_rows(shifted_in_rows), pad_rows(shifted_out_rows)



In [67]:
# Build the train / validation / test splits for the teacher-forcing model.
# Each call draws new random dates, so the call order decides the contents.
encoder_inputs_train, decoder_inputs_train, decoder_outputs_train = generate_teacher_forcing_dataset(10000)
encoder_inputs_valid, decoder_inputs_valid, decoder_outputs_valid = generate_teacher_forcing_dataset(2000)
encoder_inputs_test, decoder_inputs_test, decoder_outputs_test = generate_teacher_forcing_dataset(2000)

In [68]:
# Inspect the shape of the padded encoder inputs.
encoder_inputs_train.shape

TensorShape([10000, 18])

In [69]:
# Inspect the token ids of the first encoder input.
encoder_inputs_train[0]

<tf.Tensor: shape=(18,), dtype=int32, numpy=
array([20, 33, 39, 26, 31, 24, 26, 35,  3,  6,  5,  4,  3, 14, 13, 14,  5,
        0], dtype=int32)>

In [70]:
# Inspect the first decoder input (the label prefixed with <SOS>).
decoder_inputs_train[0]

<tf.Tensor: shape=(11,), dtype=int32, numpy=array([ 1, 12, 11, 12,  3, 13,  4,  4, 13,  4,  3], dtype=int32)>

In [71]:
# Inspect the first decoder target (the label followed by <EOS>).
decoder_outputs_train[0]

<tf.Tensor: shape=(11,), dtype=int32, numpy=array([12, 11, 12,  3, 13,  4,  4, 13,  4,  3,  2], dtype=int32)>

The decoder input starts with id 1 (the <SOS> token) and the decoder output ends with id 2 (the <EOS> token).

In [72]:
# Two variable-length integer inputs: the source date ids for the encoder and
# the <SOS>-prefixed label ids for the decoder.
encoder_inputs = tf.keras.layers.Input(shape=[None], dtype=tf.int32)
decoder_inputs = tf.keras.layers.Input(shape=[None], dtype=tf.int32)

# NOTE(review): mask_zero is not set, so padding ids are embedded and fed to
# the LSTM like any other token. input_shape also looks redundant here since
# the layer is called on an Input tensor; confirm before removing.
encoder_embedding = tf.keras.layers.Embedding(
    input_dim=teacher_forcing_input_vocab_size,
    output_dim=embedding_size,
    input_shape=[None]
)(encoder_inputs)

# return_state=True also returns the final hidden and cell states; only those
# two are passed on, and encoder_lstm_output is not used below.
encoder_lstm_output, encoder_state_h, encoder_state_c = tf.keras.layers.LSTM(128, return_state=True)(encoder_embedding)
decoder_initial_state = [encoder_state_h, encoder_state_c]

decoder_embedding = tf.keras.layers.Embedding(
    input_dim=teacher_forcing_output_vocab_size,
    output_dim=embedding_size,
    input_shape=[None]
)(decoder_inputs)

# The decoder LSTM starts from the encoder's final states and emits one output
# per decoder input step.
decoder_lstm_output = tf.keras.layers.LSTM(128, return_sequences=True)(decoder_embedding, initial_state=decoder_initial_state)

# Per-step probability distribution over the output vocabulary.
decoder_dense = tf.keras.layers.Dense(teacher_forcing_output_vocab_size, activation='softmax')(decoder_lstm_output)

teacher_forcing_model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_dense)

teacher_forcing_model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_13 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 input_14 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 embedding_14 (Embedding)    (None, None, 32)             1312      ['input_13[0][0]']            
                                                                                                  
 embedding_15 (Embedding)    (None, None, 32)             448       ['input_14[0][0]']            
                                                                                            

In [73]:
# Same loss and metric as the simple model. This cell uses the legacy Adam
# optimizer, unlike the first model (the reason is not visible here).
teacher_forcing_model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

# Inputs are [encoder ids, <SOS>-prefixed label ids]; targets are the label
# ids followed by <EOS>.
teacher_forcing_model.fit([encoder_inputs_train, decoder_inputs_train], decoder_outputs_train, epochs=20, validation_data=([encoder_inputs_valid, decoder_inputs_valid], decoder_outputs_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x3610a7640>

It performed better, reaching 92% accuracy in just 3 epochs, much better than the previous model. Now let's make some predictions: instead of predicting the whole sequence at once, the new model outputs one character at a time.

In [75]:
def predict_date(date_string, model, max_input_len=18, max_output_len=10):
    """Convert one date string with the teacher-forcing model, one character at a time.

    Decoding is greedy and autoregressive: the decoder starts from <SOS>,
    and after every step the most likely next token is appended to the
    decoder input for the following step.

    Args:
        date_string: input text such as "January 30, 1290"; every character
            must exist in `teacher_forcing_input_vocab`.
        model: model taking `[encoder_input, decoder_input]` and returning
            per-step probabilities over the teacher-forcing output vocabulary.
        max_input_len: width the encoded input is zero-padded to.
        max_output_len: maximum number of characters to generate.

    Returns:
        The decoded prediction. Generation stops at the first <EOS>, which
        is not included in the result, or after `max_output_len` characters.
    """
    tokenized = convert_string_to_id(date_string, teacher_forcing_input_vocab)

    # Batch of one, right-padded with the '<PAD>' id (0).
    encoder_input = np.zeros((1, max_input_len), dtype=np.int32)
    encoder_input[0, :len(tokenized)] = tokenized

    sos_id = teacher_forcing_output_vocab['<SOS>']
    eos_id = teacher_forcing_output_vocab['<EOS>']

    decoder_tokens = [sos_id]  # full prefix fed to the decoder so far
    output_ids = []
    for _ in range(max_output_len):
        decoder_input = np.asarray([decoder_tokens], dtype=np.int32)

        probs = model.predict([encoder_input, decoder_input], verbose=0)

        # Only the distribution of the last timestep predicts the NEXT token.
        next_char_idx = int(np.argmax(probs[0, -1]))

        # Compare ids directly. Decoding the id with the old
        # `id_to_output_char` table could never yield '<EOS>', so the loop
        # never stopped early. Checking before appending also keeps the
        # <EOS> marker out of the returned text.
        if next_char_idx == eos_id:
            break

        output_ids.append(next_char_idx)
        decoder_tokens.append(next_char_idx)

    return ids_to_string(output_ids, teacher_forcing_id_to_output_char).strip()

# Try the autoregressive decoder on a single date.
predicted = predict_date("January 30, 1290", teacher_forcing_model)
print(predicted)

1290-01-30


Unsurprisingly, my first attempt to predict with the new model failed: it kept predicting on and on and never stopped. First of all, I was not updating the decoder's input at every step.

I also found these bugs along the way:

The first issue was a vocabulary mismatch. I generated the targets with the old output vocabulary, then wrapped those IDs with <SOS>/<EOS> from a brand-new vocabulary, and finally tried to decode the predictions with the new dictionary. Unsurprisingly, the model “learned” to emit what it was fed (numbers in the old ID space), but I interpreted them as the new tokens, so everything looked like <SOS> and <EOS>. I fixed it by regenerating the teacher-forcing dataset using a single, consistent teacher_forcing_output_vocab: every digit and the hyphen gets re-encoded in that dictionary before I add the special tokens, so the IDs I train on and the IDs I decode are exactly the same.

The second issue was my inference loop. I kept resetting decoder_input to a single timestep, meaning the LSTM saw only <SOS> (or just the last character) and had no memory of the prefix it was supposed to condition on, causing endless repeats. The fix was to maintain the full prefix in a decoder_tokens list. On each iteration I rebuild the tensor of shape (1, len(decoder_tokens)), call model.predict([encoder_input, decoder_input]), grab only the probabilities from the final timestep (probs[0, -1]), choose that next ID, append it to both the output list and decoder_tokens, and stop when <EOS> appears or the max length is reached. Once both the vocabulary alignment and the autoregressive loop were corrected, the teacher-forcing model started producing accurate date conversions.