In [None]:
!pip install faker

In [None]:
import faker
import random
import tqdm
import babel.dates

# One seed value shared by Faker and Python's `random` module.
SEED = 12345

# Faker instance that load_date() draws its random dates from.
fake = faker.Faker()
faker.Faker.seed(SEED)
random.seed(SEED)

# Define format of the data we would like to generate.
# 'short', 'medium', 'long' and 'full' are Babel's named date formats; the
# remaining entries are custom date patterns handed to babel.dates.format_date.
#
# The year field must be lowercase 'y' (calendar year). Uppercase 'Y' is the
# ISO week-numbering year, which differs from the calendar year for some dates
# around New Year and would make the human readable text disagree with the
# target produced by date.isoformat().
#
# Entries that occur more than once ('full' ten times, 'd MMMM yyyy' twice) are
# drawn proportionally more often by random.choice.
FORMATS = (
    ['short', 'medium', 'long']
    + ['full'] * 10
    + ['d MMM yyyy',
       'd MMMM yyyy',
       'dd MMM yyyy',
       'd MMM, yyyy',
       'd MMMM, yyyy',
       'dd, MMM yyyy',
       'd MM yy',
       'd MMMM yyyy',
       'MMMM d yyyy',
       'MMMM d, yyyy',
       'dd.MM.yy']
)

# change this if you want it to work with another language
LOCALES = ['nl_NL', 'de_DE']


def load_date():
    """
        Draw one random date and render it in a random format and locale.

        The date comes from the module-level Faker instance; the format and the
        locale are picked from FORMATS and LOCALES (in that order).
        :returns: tuple (human readable string, ISO formatted string, date object),
                  or (None, None, None) when formatting raised an AttributeError
    """
    dt = fake.date_object()

    try:
        chosen_format = random.choice(FORMATS)
        chosen_locale = random.choice(LOCALES)
        rendered = babel.dates.format_date(dt, format=chosen_format, locale=chosen_locale)
        # Normalise the text: lower-case it and strip every comma.
        human_readable = rendered.lower().replace(',', '')
        machine_readable = dt.isoformat()
    except AttributeError:
        return None, None, None

    return human_readable, machine_readable, dt


def load_dataset(m):
    """
        Generate a dataset of (human readable, machine readable) date pairs.

        :m: the number of examples to try to generate
        :returns: list of (human_readable, machine_readable) string tuples; the
                  list holds fewer than m entries when load_date() reports a
                  failure (None) for some of the draws
    """

    dataset = []

    for _ in range(m):
        # Distinct names here: the original loop rebound the parameter `m`.
        human, machine, _date = load_date()
        if human is not None:
            dataset.append((human, machine))

    return dataset

In [None]:
# Number of examples requested from the generator.
m = 20_000
# List of (human readable, machine readable) date string pairs.
dataset = load_dataset(m)

In [None]:
# Peek at the first 20 generated pairs.
dataset[:20]

### Start of the assignment

In [None]:
import tensorflow as tf
import numpy as np

Split the generated dataset into three parts: the first 10,000 examples for training,
the next 5,000 for validation and the last 5,000 for testing.

In [None]:
# Positional split of the generated pairs: 10,000 / 5,000 / 5,000 examples.
train, valid, test = (
    dataset[:10_000],
    dataset[10_000:15_000],
    dataset[15_000:20_000],
)

In [None]:
# Unzip every split from a list of (human, machine) pairs into two parallel tuples.
(train_human, train_machine), (valid_human, valid_machine), (test_human, test_machine) = (
    tuple(zip(*split)) for split in (train, valid, test)
)

In [None]:
# Peek at the first 10 human readable training strings.
train_human[:10]

In [None]:
# Character-level vectoriser for the human readable dates: the text is split
# into single characters. standardize=None switches off the layer's text
# standardisation step.
human_text_vec_layer = tf.keras.layers.TextVectorization(
    split="character",
    standardize=None
)
# Learn the vocabulary from the training strings only.
human_text_vec_layer.adapt(train_human)
# Show the learned vocabulary and its size.
print(human_text_vec_layer.get_vocabulary())
print(len(human_text_vec_layer.get_vocabulary()))

In [None]:
# Character-level vectoriser for the machine readable (ISO formatted) dates,
# configured the same way as the one for the human readable strings.
machine_text_vec_layer = tf.keras.layers.TextVectorization(
    split="character",
    standardize=None
)
# Learn the vocabulary from the training targets only.
machine_text_vec_layer.adapt(train_machine)
# Show the learned vocabulary and its size.
print(machine_text_vec_layer.get_vocabulary())
print(len(machine_text_vec_layer.get_vocabulary()))

In [None]:
# Try the vectoriser on two sample strings of different length; the second one
# ends in " ??" — presumably to show how characters that were not seen during
# adapt() are encoded (verify against the printed vocabulary).
human_text_vec_layer(["29 oktober 2023", "1 mei 1978 ??"])

Een eerste gemakkelijk model.

- GRU-model als encoder, met één vector als uitvoer.
- Deze vector bij iedere tijdstap als invoer aan de decoder geven.


In [None]:
# Convert every split to integer token ids: the human readable strings are the
# inputs (X), the machine readable strings are the targets (y).
X_train = human_text_vec_layer(train_human)
y_train = machine_text_vec_layer(train_machine)
X_valid = human_text_vec_layer(valid_human)
y_valid = machine_text_vec_layer(valid_machine)
X_test = human_text_vec_layer(test_human)
y_test = machine_text_vec_layer(test_machine)
# Display the shapes of the training tensors.
X_train.shape, y_train.shape

In [None]:
def get_model_1(input_vocab_size,
                output_vocab_size,
                output_seq_length=10,
                embedding_size=16,
                recurrent_units=64):
  """Build the first, simple encoder/decoder model.

  The encoder embeds the input token ids and reduces the sequence to a single
  GRU output vector. That vector is repeated `output_seq_length` times and fed
  to a second GRU, followed by a softmax layer over the output vocabulary at
  every step.

  input_vocab_size: number of rows of the input embedding table.
  output_vocab_size: number of units of the softmax output layer.
  output_seq_length: how many times the encoder vector is repeated, i.e. the
    number of output steps.
  embedding_size: dimension of the input embeddings.
  recurrent_units: number of units of both GRU layers.

  Returns the assembled (uncompiled) tf.keras.Sequential model.
  """
  # Encoder: token ids -> embeddings (mask_zero=True) -> one GRU output vector.
  token_embedding = tf.keras.layers.Embedding(
      input_dim=input_vocab_size,
      output_dim=embedding_size,
      mask_zero=True)
  summary_rnn = tf.keras.layers.GRU(recurrent_units)
  encoder = tf.keras.Sequential([token_embedding, summary_rnn])

  # Decoder: GRU returning every step, then a softmax over the output tokens.
  step_rnn = tf.keras.layers.GRU(recurrent_units, return_sequences=True)
  token_classifier = tf.keras.layers.Dense(output_vocab_size, activation="softmax")
  decoder = tf.keras.Sequential([step_rnn, token_classifier])

  # The repeated encoder vector is the decoder's input at every output step.
  repeat_encoding = tf.keras.layers.RepeatVector(output_seq_length)
  return tf.keras.Sequential([encoder, repeat_encoding, decoder])

In [None]:
# Reset Keras' global state before building the model.
tf.keras.backend.clear_session()
# The vocabulary sizes come from the adapted vectorisation layers.
model = get_model_1(
    input_vocab_size=len(human_text_vec_layer.get_vocabulary()),
    output_vocab_size=len(machine_text_vec_layer.get_vocabulary()),
)

# The targets are integer token ids, hence the sparse categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)
model.summary()

In [None]:
# Train the first model for 20 epochs and evaluate on the validation split after each epoch.
model.fit(X_train, y_train, batch_size=128, epochs=20, validation_data=(X_valid, y_valid))

In [None]:
# Two sample inputs ("datums" is Dutch for "dates"); show their token ids.
datums = ["1 mei 2023", "zondag 29 oktober 2023"]
human_text_vec_layer(datums)

In [None]:
def convert_dates(model, dates):
  """Convert human readable dates to strings with a trained model.

  model: expects a (batch, seq_length) input and returns
    (batch, seq_out_length, num_out_tokens).
  dates: list of strings holding the input dates.

  The strings are vectorised with the global `human_text_vec_layer`; the most
  probable token at every output step is looked up in the vocabulary of the
  global `machine_text_vec_layer`.

  Returns a list with one decoded string per input date.
  """
  token_ids = human_text_vec_layer(dates)
  probabilities = model(token_ids)
  best_token_ids = tf.math.argmax(probabilities, axis=-1)  # (batch, seq_out_length)
  id_to_token = np.asarray(machine_text_vec_layer.get_vocabulary())

  decoded = []
  for tokens in id_to_token[best_token_ids]:
    decoded.append(''.join(tokens))
  return decoded


In [None]:
# Decode the two sample dates with the first model.
datums = ["1 mei 2023", "zondag 29 oktober 2023"]
convert_dates(model, datums)

## Tweede model

Bekijk het als een vertaalprobleem.

In [None]:
# Character-level vectoriser for the decoder side of the second model.
machine_trslt_text_vec_layer = tf.keras.layers.TextVectorization(
    split="character",
    standardize=None
)
# "." is prepended and "*" appended to every training target before adapting,
# so both marker characters become part of the vocabulary.
machine_trslt_text_vec_layer.adapt(["." + date + "*" for date in train_machine])
# Show the learned vocabulary and its size.
print(machine_trslt_text_vec_layer.get_vocabulary())
print(len(machine_trslt_text_vec_layer.get_vocabulary()))

In [None]:
# Encoder inputs for the second model are the raw human readable strings.
# NOTE: this rebinds X_train / X_valid, which held integer token ids for the
# first model.
X_train = tf.constant(train_human)
X_valid = tf.constant(valid_human)

# Decoder inputs: the target date prefixed with ".".
X_train_dec = tf.constant(["." + date for date in train_machine])
X_valid_dec = tf.constant(["." + date for date in valid_machine])

# Decoder targets: the target date followed by "*", as integer token ids.
# Compared with the decoder input this is the same text shifted one character.
Y_train_dec = machine_trslt_text_vec_layer(tf.constant([date + "*" for date in train_machine]))
Y_valid_dec = machine_trslt_text_vec_layer(tf.constant([date + "*" for date in valid_machine]))


In [None]:
def get_model_2(encoder_vectorizer_layer,
                decoder_vectorizer_layer,
                embedding_size=16,
                recurrent_units=64):
  """Build the second encoder/decoder model, which takes raw strings as input.

  The model has two string inputs ("enc_input" and "dec_input"). Each one is
  vectorised by the given layer and embedded. The encoder GRU's final state is
  the initial state of the decoder GRU, whose output at every step goes
  through a softmax layer that predicts the next character.

  encoder_vectorizer_layer: adapted vectorisation layer for the encoder input.
  decoder_vectorizer_layer: adapted vectorisation layer for the decoder input;
    its vocabulary size is also the size of the output layer.
  embedding_size: dimension of both embedding layers.
  recurrent_units: number of units of both GRU layers.

  Returns the (uncompiled) tf.keras.Model with inputs [enc_input, dec_input]
  and the per-step character probabilities as output.
  """
  # Two string inputs: one for the encoder, one for the decoder.
  source_text = tf.keras.layers.Input(shape=[], dtype=tf.string, name="enc_input")
  target_text = tf.keras.layers.Input(shape=[], dtype=tf.string, name="dec_input")

  # Strings -> integer token ids.
  source_ids = encoder_vectorizer_layer(source_text)
  target_ids = decoder_vectorizer_layer(target_text)

  # One embedding layer per side, sized by that side's vocabulary.
  source_embedder = tf.keras.layers.Embedding(
      input_dim=len(encoder_vectorizer_layer.get_vocabulary()),
      output_dim=embedding_size,
      mask_zero=True,
      name="enc_embed")
  target_embedder = tf.keras.layers.Embedding(
      input_dim=len(decoder_vectorizer_layer.get_vocabulary()),
      output_dim=embedding_size,
      mask_zero=True,
      name="dec_embed")

  # Token ids -> embedding vectors.
  source_vectors = source_embedder(source_ids)
  target_vectors = target_embedder(target_ids)

  # Encoder GRU. Only its state is used further on; for a GRU the output and
  # the state are the same, which would not hold after switching to an LSTM.
  encoder_rnn = tf.keras.layers.GRU(recurrent_units,
                                    return_state=True,
                                    name="enc_rnn")
  _, encoder_state = encoder_rnn(source_vectors)

  # Decoder GRU. It returns the whole sequence because a character is
  # predicted at every time step, and it starts from the encoder's final state.
  decoder_rnn = tf.keras.layers.GRU(recurrent_units,
                                    return_sequences=True,
                                    name="dec_rnn")
  decoder_sequence = decoder_rnn(inputs=target_vectors, initial_state=encoder_state)

  # Fully connected softmax layer, applied to each decoder output to predict
  # the next character.
  next_char_classifier = tf.keras.layers.Dense(
      len(decoder_vectorizer_layer.get_vocabulary()),
      activation='softmax',
      name="fc_out")
  next_char_probas = next_char_classifier(decoder_sequence)

  # Two inputs, one output.
  return tf.keras.Model(
      inputs=[source_text, target_text],
      outputs=[next_char_probas])

In [None]:
# Build the second model: the encoder side reuses the vectoriser of the human
# readable strings, the decoder side uses the vectoriser that knows the "."
# and "*" markers.
model2 = get_model_2(
    encoder_vectorizer_layer=human_text_vec_layer,
    decoder_vectorizer_layer=machine_trslt_text_vec_layer
)

# The targets are integer token ids, hence the sparse categorical cross-entropy.
model2.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam", metrics=["accuracy"])

model2.summary()

In [None]:
# Train the second model for 30 epochs. The inputs are (encoder strings,
# decoder strings) pairs; the targets are the token ids in Y_train_dec.
model2.fit(
    (X_train, X_train_dec),
    Y_train_dec,
    epochs=30,
    batch_size=32,
    validation_data=((X_valid, X_valid_dec), Y_valid_dec)
    )

In [None]:
def convert_date_2(model2, date, max_length=32):
  """Convert a single human readable date, one output character at a time.

  Decoding starts from the start-of-sequence marker ".". At every step the
  model is called with the text generated so far and the most probable next
  character is appended. Decoding stops when the end-of-sequence marker "*" is
  predicted.

  model2: model called with [encoder strings, decoder strings]; expected to
    return per-step probabilities over the vocabulary of the global
    `machine_trslt_text_vec_layer`.
  date: the date to convert, as a single string.
  max_length: upper bound on the number of decoding steps. Without it the loop
    would never end for a model that does not predict "*" (for instance when
    it keeps predicting the empty padding token, which leaves the decoder
    input unchanged).

  Returns the generated text without the start marker. When `max_length` steps
  pass without a "*" prediction, the text generated so far is returned.
  """
  vocabulary = np.asarray(machine_trslt_text_vec_layer.get_vocabulary())

  encoder_inputs = tf.constant([date])
  current_string = "."  # start of sequence token

  for _ in range(max_length):
    decoder_inputs = tf.constant([current_string])
    y_probas = model2([encoder_inputs, decoder_inputs])
    # Only the prediction at the last time step is new.
    y_last_char_ids = tf.math.argmax(y_probas[:, -1], axis=-1)
    last_chars = vocabulary[y_last_char_ids]
    next_character = last_chars[0]  # only one item in batch
    if next_character == '*':  # end of sequence token
      break
    current_string += next_character

  return current_string[1:]



In [None]:
# Convert one sample date with the second model.
convert_date_2(model2, "1 januari 1970")