In [4]:
!pip install faker



In [5]:
import faker
import random
import babel.dates

fake = faker.Faker ()
faker.Faker.seed(12345)
random.seed(12345)

# Define format of the data we would like to generate
FORMATS = ['short', 'medium', 'long', 'full', 'full',
'full', 'full', 'full', 'full', 'full', 'full',
'full', 'full', 'd MMM YYY', 'd MMMM YYY',
'dd MMM YYY', 'd MMM, YYY', 'd MMMM, YYY',
'dd, MMM YYY', 'd MM YY', 'd MMMM YYY',
'MMMM d YYY', 'MMMM d, YYY', 'dd. MM. YY']
# change this if you want it to work with another language
LOCALES = ['nl_NL', 'de_DE']

def load_date():
  """
  Create a fake date
  :returns: tuple containing human readable string,
            machine readable string, and date object
  """
  dt = fake.date_object ()

  try:
    human_readable = babel.dates.format_date(
    dt,
    format=random. choice (FORMATS),
    locale=random.choice (LOCALES))
    human_readable = human_readable.lower()
    human_readable = human_readable.replace(',','')
    machine_readable = dt.isoformat()

  except AttributeError as e:
    return None, None, None

  return human_readable, machine_readable, dt

In [6]:
load_date()

('6 jul 1992', '1992-07-06', datetime.date(1992, 7, 6))

In [7]:
def load_dataset(m):
  return [load_date()[:2] for _ in range(m)]

In [8]:
ds = load_dataset(20000)

In [9]:
ds[:10]

[('montag 20. juli 1970', '1970-07-20'),
 ('montag 4. august 2014', '2014-08-04'),
 ('20 feb. 1986', '1986-02-20'),
 ('dienstag 28. november 1989', '1989-11-28'),
 ('donnerstag 19. juni 1980', '1980-06-19'),
 ('dinsdag 1 augustus 2000', '2000-08-01'),
 ('27 09 78', '1978-09-27'),
 ('zaterdag 18 september 1976', '1976-09-18'),
 ('24 mai 1993', '1993-05-24'),
 ('16 mei 2000', '2000-05-16')]

In [10]:
train_ds = ds[:10_000]
valid_ds = ds[10_000:15_000]
test_ds = ds[15000:]

In [11]:
len(train_ds), len(valid_ds), len(test_ds)

(10000, 5000, 5000)

In [12]:
train_human, train_machine = zip(*train_ds)
valid_human, valid_machine = zip(*valid_ds)
test_human, test_machine = zip(*test_ds)

In [13]:
train_human[:5]

('montag 20. juli 1970',
 'montag 4. august 2014',
 '20 feb. 1986',
 'dienstag 28. november 1989',
 'donnerstag 19. juni 1980')

In [14]:
train_machine[:5]

('1970-07-20', '2014-08-04', '1986-02-20', '1989-11-28', '1980-06-19')

In [15]:
import tensorflow as tf

text_vec_layer = tf.keras.layers.TextVectorization(split="character", standardize=None)
text_vec_layer.adapt(train_human)

In [16]:
human_text_vec_layer = text_vec_layer(train_human)

In [17]:
text_vec_layer.get_vocabulary()

['',
 '[UNK]',
 ' ',
 '1',
 '2',
 '0',
 'a',
 '9',
 'e',
 'r',
 'n',
 't',
 'g',
 '.',
 'i',
 'm',
 'd',
 'o',
 'u',
 '8',
 '7',
 's',
 'b',
 '3',
 'j',
 '5',
 '6',
 '4',
 'p',
 'z',
 'l',
 'v',
 'f',
 'k',
 'c',
 'w',
 '-',
 'ä',
 'h']

In [18]:
human_text_vec_layer[0]

<tf.Tensor: shape=(29,), dtype=int64, numpy=
array([15, 17, 10, 11,  6, 12,  2,  4,  5, 13,  2, 24, 18, 30, 14,  2,  3,
        7, 20,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0])>

In [19]:
text_vec_layer2 = tf.keras.layers.TextVectorization(split="character", standardize=None)
text_vec_layer2.adapt(train_machine)

In [20]:
machine_text_vec_layer = text_vec_layer2(train_machine)

In [21]:
text_vec_layer2(("6 December 2023"))

<tf.Tensor: shape=(15,), dtype=int64, numpy=array([12,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  5,  3,  5,  9])>

In [22]:
text_vec_layer2.get_vocabulary()

['', '[UNK]', '-', '0', '1', '2', '9', '8', '7', '3', '5', '4', '6']

In [23]:
machine_text_vec_layer[0]

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([4, 6, 8, 3, 2, 3, 8, 2, 5, 3])>

In [24]:
X_train = text_vec_layer(train_human)
y_train = text_vec_layer2(train_machine)

X_valid = text_vec_layer(valid_human)
y_valid = text_vec_layer2(valid_machine)

X_test = text_vec_layer(test_human)
y_test = text_vec_layer2(test_machine)

In [25]:
X_train.shape, y_train.shape

(TensorShape([10000, 29]), TensorShape([10000, 10]))

In [26]:
def get_model_1(input_vocab_size,
                output_vocab_size,
                output_seq_length=10,
                embedding_size=16,
                recurrent_units=64):
    encoder = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=input_vocab_size, output_dim=embedding_size, mask_zero=True),
        tf.keras.layers.GRU(units=recurrent_units)
    ])
    decoder = tf.keras.Sequential([
        tf.keras.layers.GRU(units=recurrent_units, return_sequences=True),
        tf.keras.layers.Dense(units=output_vocab_size, activation="softmax")
    ])

    model = tf.keras. Sequential ([
      encoder,
      tf.keras.layers.RepeatVector(output_seq_length),
      decoder

    ])

    return model

In [27]:
model = get_model_1(len(text_vec_layer.get_vocabulary()),
                    len(text_vec_layer2.get_vocabulary()))

In [28]:
model.compile(optimizer="adam", loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics="accuracy")

In [29]:
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=128, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7b8b008cb730>

In [30]:
import numpy as np

def convert_dates(model, dates):
  sequence = text_vec_layer(dates)
  prediction = model.predict(sequence)
  best_preds = tf.math.argmax(prediction, axis=-1)
  index = np.array(text_vec_layer2.get_vocabulary())

  return ["".join(index[pred]) for pred in best_preds]

In [31]:
convert_dates(model, ["1 mei 2023", "zondag 29 oktober 2023"])



['2023-06-23', '2023-11-22']

# Second model

In [32]:
text_vec_layer2v1 = tf.keras.layers.TextVectorization(split="character", standardize=None)
text_vec_layer2v1.adapt(["." + d + "*" for d in train_machine])

In [33]:
text_vec_layer2.get_vocabulary()

['', '[UNK]', '-', '0', '1', '2', '9', '8', '7', '3', '5', '4', '6']

In [34]:
X_train = tf.constant(train_human)
X_valid = tf.constant(valid_human)

X_train_dec = tf.constant(["." + d for d in train_machine])
X_valid_dec = tf.constant(["." + d for d in valid_machine])

y_train_dec = text_vec_layer2([d + "*" for d in train_machine])
y_valid_dec = text_vec_layer2([d + "*" for d in valid_machine])

In [35]:
X_train[0], X_valid[0], X_train_dec[0], X_valid_dec[0], y_train_dec[0], y_valid_dec[0]

(<tf.Tensor: shape=(), dtype=string, numpy=b'montag 20. juli 1970'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'15 januar 2015'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'.1970-07-20'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'.2015-01-15'>,
 <tf.Tensor: shape=(11,), dtype=int64, numpy=array([4, 6, 8, 3, 2, 3, 8, 2, 5, 3, 1])>,
 <tf.Tensor: shape=(11,), dtype=int64, numpy=array([ 5,  3,  4, 10,  2,  3,  4,  2,  4, 10,  1])>)

In [36]:
def get_model_2(encoder_vectorizer_layer,
  decoder_vectorizer_layer,
  embedding_size=16,
  recurrent_units=64):

  encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
  decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

  encoder_input_ids = text_vec_layer(encoder_inputs)
  decoder_input_ids = text_vec_layer2(decoder_inputs)

  encoder_embedding_layer = tf.keras.layers.Embedding(len(text_vec_layer.get_vocabulary()), embedding_size,  mask_zero=True)
  decoder_embedding_layer = tf.keras.layers.Embedding(len(text_vec_layer2.get_vocabulary()), embedding_size,  mask_zero=True)

  encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
  decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

  encoder = tf.keras.layers.GRU(units=recurrent_units, return_state=True)
  encoder_outputs, encoder_state = encoder(encoder_embeddings)

  decoder = tf.keras.layers.GRU(units=recurrent_units, return_sequences=True)
  decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

  output_layer = tf.keras.layers.Dense(len(text_vec_layer2.get_vocabulary()), activation="softmax")

  y_proba = output_layer(decoder_outputs)

  model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[y_proba])
  return model

In [37]:
model2 = get_model_2(text_vec_layer, text_vec_layer2v1)

In [38]:
model2.compile(loss="sparse_categorical_crossentropy",optimizer="adam", metrics=["accuracy"])

In [39]:
model2.fit((X_train, X_train_dec), y_train_dec,
          epochs=10, batch_size=128,
          validation_data=((X_valid, X_valid_dec), y_valid_dec))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b8af93761a0>

In [40]:
def convert_date_2(model, date):
  encoder_inputs = tf.constant([date])
  current_string = ".2023"
  last_predicted_character = ""

  while last_predicted_character != "*":
    current_string = current_string + last_predicted_character

    decoder_input = tf.constant([current_string])

    model_output = model((encoder_inputs, decoder_input))
    probas = model_output[:, -1]
    predicted_idx = tf.math.argmax(probas, axis=-1)[0]
    vocab = np.array(text_vec_layer2v1.get_vocabulary())
    last_predicted_character = vocab[predicted_idx]

  return current_string[1:]

In [41]:
convert_date_2(model2, "13 december 2023")

'2023-0.-19[UNK][UNK][UNK]9'