# RNN - Version 1.0

## Setup

### Import TensorFlow and other libraries

In [1]:
import tensorflow as tf
import numpy as np
import time
import csv


# Print the installed TensorFlow version so it is recorded in the cell output
print(tf.__version__)

2.17.0


#### Import Modules

In [2]:
import sys
import os

# Put the parent of the current working directory on sys.path so the
# project-local `shared` package can be imported from this notebook.
notebook_dir = os.getcwd()
src_path = os.path.abspath(os.path.join(notebook_dir, '..'))
print(src_path)
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Import the shared constants by name (instead of `import *`) so it is clear
# where PAD, SEQ_LEN and VOCAB_SIZE used below come from.
from shared.globals import PAD, SEQ_LEN, VOCAB_SIZE

# print some global var just to check
print(PAD)

/Users/pedromoura/Desktop/projects/arc/arc-model/src
19


Load the arc-agi_training_challenges_bpe Dataset

In [3]:
# Location of the training CSV. The path is relative, so it is resolved
# against the current working directory of the kernel.
dataset_path = "../../../data/arc-agi_training_challenges_bpe.csv"

Add padding on sequences

In [4]:
def pad_sequence(curr_seq_len, curr_seq, seq_len=None, pad_token=None):
    """Left-pad a token sequence to a fixed length.

    Args:
        curr_seq_len: Length of ``curr_seq`` as reported by the caller. Kept
            for backward compatibility; the pad count is derived from
            ``len(curr_seq)`` so a stale value cannot yield a wrong-sized
            result.
        curr_seq: Sequence of token ids to pad.
        seq_len: Target length. Defaults to the module-level ``SEQ_LEN``.
        pad_token: Token used for padding. Defaults to the module-level ``PAD``.

    Returns:
        A new list of exactly ``seq_len`` items: the padding tokens first,
        followed by the original sequence.

    Raises:
        ValueError: If ``curr_seq`` is longer than ``seq_len``. Returning an
            over-long list would silently misalign every sequence that follows
            once the data is flattened and re-batched by ``seq_len``.
    """
    if seq_len is None:
        seq_len = SEQ_LEN
    if pad_token is None:
        pad_token = PAD

    actual_len = len(curr_seq)
    if actual_len > seq_len:
        raise ValueError(
            f"sequence of length {actual_len} does not fit in {seq_len} tokens")

    # Add padding at the beginning, followed by the sequence
    num_pads = seq_len - actual_len
    return [pad_token] * num_pads + list(curr_seq)

In [5]:
padded_sequences = []

# newline='' lets the csv module handle line endings itself, as its
# documentation recommends for files passed to csv.reader.
with open(dataset_path, mode='r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header (no error if the file is empty)

    for row in reader:
        # Column 3 holds the whitespace-separated token ids; split() without
        # an argument tolerates repeated or trailing spaces.
        curr_seq = list(map(int, row[3].split()))
        # Column 2 holds the sequence length recorded in the file
        curr_seq_len = int(row[2])

        # Add padding at the beginning of the sequence
        padded_sequences.append(pad_sequence(curr_seq_len, curr_seq))

Test to see if:
- Padding was added correctly
- Our sequence is a list of type int
- Sequences length match SEQ_LEN, which is 1500

In [6]:
# Spot-check a single padded sequence
i = 12
sample = padded_sequences[i]
print("Sequence ", np.array(sample))  # np.array is used only for a compact display
print("Type: ", type(sample[0]))
print("Length: ", len(sample))

Sequence  [ 19  19  19 ...  93 384   0]
Type:  <class 'int'>
Length:  1500


## Process the sequence

### Flatten sequence
Flatten the list of padded sequences into a single sequence:


In [7]:
# Concatenate all padded sequences, in order, into one flat list of tokens
flattened_sequence = []
for seq in padded_sequences:
    flattened_sequence.extend(seq)

Now convert the flattened sequence to a TensorFlow dataset:

In [8]:
# Turn the flat Python list into a single tensor
tf_dataset = tf.constant(flattened_sequence)
# Bare expression: displays the tensor summary as the cell output
tf_dataset

<tf.Tensor: shape=(90789000,), dtype=int32, numpy=array([19, 19, 19, ...,  0, 12, 14], dtype=int32)>

Batch the dataset to create sequences of length SEQ_LEN


### Create training examples and targets


Use `tf.data.Dataset.from_tensor_slices` to get the slices of the list in the form of objects:
 - TODO:
 -- read more about this

In [9]:
# Wrap the tensor in a tf.data.Dataset; slicing along the first axis makes
# each dataset element a single token
sliced_dataset = tf.data.Dataset.from_tensor_slices(tf_dataset)
sliced_dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

Batch the dataset to create sequences of length SEQ_LEN:

In [10]:
# Group the token stream into chunks of SEQ_LEN tokens, dropping any
# incomplete chunk at the end.
# NOTE(review): this only recovers the original rows if every padded
# sequence is exactly SEQ_LEN tokens long.
sequences = sliced_dataset.batch(SEQ_LEN, drop_remainder=True)

# Preview the first three chunks
for seq in sequences.take(3):
    print(seq)

tf.Tensor([ 19  19  19 ...  10  46 312], shape=(1500,), dtype=int32)
tf.Tensor([ 19  19  19 ...  46 312   7], shape=(1500,), dtype=int32)
tf.Tensor([ 19  19  19 ... 312   7   0], shape=(1500,), dtype=int32)


2024-09-18 23:36:01.435623: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Map to input and target sequences:

In [11]:
def split_input_target(sequence):
    """Split one sequence into a next-token (input, target) pair.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so the target at each position is the
    element that follows the corresponding input element.
    """
    return sequence[:-1], sequence[1:]

In [12]:
# Apply split_input_target to every sequence, yielding (input, target) pairs
dataset = sequences.map(split_input_target)

In [13]:
# Preview three (input, target) pairs; the target is the input shifted by one token
for input_example, target_example in dataset.take(3):
    print("Input :", input_example.numpy())
    print("Target:", target_example.numpy())

Input : [19 19 19 ... 93 10 46]
Target: [ 19  19  19 ...  10  46 312]
Input : [ 19  19  19 ...  10  46 312]
Target: [ 19  19  19 ...  46 312   7]
Input : [ 19  19  19 ...  46 312   7]
Target: [ 19  19  19 ... 312   7   0]


2024-09-18 23:36:01.899077: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


### Create training batches

Shuffle the data and pack it into batches.

In [14]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` rather than re-wrapping `dataset`, so
# that re-running this cell does not batch an already batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 1499), dtype=tf.int32, name=None), TensorSpec(shape=(64, 1499), dtype=tf.int32, name=None))>

## Build The Model

In [17]:
# Size of the token vocabulary; passed to the model as the embedding input
# size and as the number of output logits
vocab_size = VOCAB_SIZE

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [18]:
class MyModel(tf.keras.Model):
    """Next-token model: embedding, then a GRU, then a dense logits layer."""

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: Number of token ids; used as the embedding input size
                and as the width of the output logits.
            embedding_dim: Size of each token embedding.
            rnn_units: Number of GRU units.
        """
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            rnn_units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of token ids.

        Args:
            inputs: Batch of token-id sequences.
            states: Initial GRU state; an all-zero state is used when None.
            return_state: When true, also return the final GRU state.
            training: Forwarded to every layer.

        Returns:
            The logits, or a ``(logits, states)`` pair when ``return_state``
            is true.
        """
        embedded = self.embedding(inputs, training=training)
        if states is None:
            # Start from an explicit zero state so the model can be called
            # without a state (as the "Try the model" step does).
            states = tf.zeros(
                [tf.shape(embedded)[0], self.gru.units], dtype=tf.float32)
        sequence_out, states = self.gru(
            embedded, initial_state=states, training=training)
        logits = self.dense(sequence_out, training=training)

        return (logits, states) if return_state else logits

In [19]:
# Instantiate the model with the hyperparameters defined above
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

## Try the model

Now run the model to see that it behaves as expected.

First check the shape of the output:

In [20]:
# Run one batch through the untrained model. The loop variables and the
# predictions stay defined afterwards and are reused by the loss check below.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 1499, 2048) # (batch_size, sequence_length, vocab_size)


In [21]:
# Print the Keras summary of the model
model.summary()

## Train the model

In [22]:
# from_logits=True: the model's final Dense layer has no activation, so its
# outputs are raw logits rather than probabilities
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [23]:
# Loss of the untrained model on the example batch from the "Try the model" step
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 1499, 2048)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(7.6292086, shape=(), dtype=float32)


A newly initialized model shouldn't be too sure of itself, the output logits should all have similar magnitudes. To confirm this you can check that the exponential of the mean loss is approximately equal to the vocabulary size. A much higher loss means the model is sure of its wrong answers, and is badly initialized:

- TODO:
    ask more about this

In [24]:
# exp(mean loss); per the note above, compare this against the vocabulary size
tf.exp(example_batch_mean_loss).numpy()

2057.4211

Configure the training procedure using the `tf.keras.Model.compile` method. Use `tf.keras.optimizers.Adam` with default arguments and the loss function.

In [25]:
# Attach the Adam optimizer (default arguments) and the loss defined above
model.compile(optimizer='adam', loss=loss)

### Configure checkpoints

Use a `tf.keras.callbacks.ModelCheckpoint` to ensure that checkpoints are saved during training:

In [26]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# File name template for the checkpoints; "{epoch}" is a placeholder filled in
# per checkpoint. The ".weights.h5" suffix was added to fix an issue
# (presumably Keras' file-name requirement for weights-only checkpoints — confirm).
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5") # added ".weights.h5" to fix issue

# Save only the weights (not the full model) during training
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### Execute the training

In [27]:
# Number of passes over the training dataset
EPOCHS = 10

In [28]:
# Train the model, writing a weights checkpoint through checkpoint_callback.
# The recorded output below shows roughly 19 s/step with 945 steps per epoch.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10


2024-09-18 23:44:20.033479: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:7: Filling up shuffle buffer (this may take a while): 9992 of 10000
2024-09-18 23:44:20.041248: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


[1m 47/945[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:37:37[0m 19s/step - loss: 2.4115

## Generate Sequence

The following makes a single step prediction:

In [41]:
class OneStep(tf.keras.Model):
  """Samples one next token at a time from a trained next-token model.

  The wrapped model works on integer token ids, so this class takes and
  returns token ids directly; there is no string/char lookup step.
  """

  def __init__(self, model, temperature=1.0, skip_ids=None):
    """Store the model and build the mask of token ids that are never sampled.

    Args:
      model: Model callable as ``model(inputs=..., states=..., return_state=True)``
        that returns ``(logits, states)``.
      temperature: Divisor applied to the logits before sampling; higher
        values make the sampling more random.
      skip_ids: Iterable of token ids that must never be generated. Defaults
        to ``[PAD]`` so that padding is not emitted.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model

    if skip_ids is None:
      skip_ids = [PAD]

    # Additive mask over the vocabulary: -inf at each forbidden id, 0 elsewhere.
    mask = np.zeros([VOCAB_SIZE], dtype=np.float32)
    mask[np.asarray(list(skip_ids), dtype=np.int64)] = -np.inf
    self.prediction_mask = tf.constant(mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next token id for every sequence in the batch.

    Args:
      inputs: Integer tensor of token ids, shaped ``[batch, time]``. A
        ``[batch]`` tensor is also accepted and treated as one time step.
      states: Recurrent state returned by the previous call, or None to start
        from the model's default state.

    Returns:
      ``(predicted_ids, states)``: an int32 tensor of shape ``[batch]`` with
      the sampled token ids, and the updated recurrent state.
    """
    input_ids = tf.convert_to_tensor(inputs)
    if input_ids.shape.rank == 1:
      # One token per batch row: add the time axis the model expects.
      input_ids = input_ids[:, tf.newaxis]

    # Run the model.
    # predicted_logits.shape is [batch, time, next_token_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent the skipped ids from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(
        predicted_logits, num_samples=1, dtype=tf.int32)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Return the token ids and model state.
    return predicted_ids, states

In [42]:
# Wrap the model in the single-step generator, using the default temperature
one_step_model = OneStep(model)

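Run it in a loop to generate a sequence. The model in this notebook is trained on integer token ids from the ARC BPE dataset, so the output is a sequence of token ids, not readable text; the remarks about capitalization, paragraphs and a Shakespeare-like vocabulary in the original text-generation tutorial do not apply here. With the small number of training epochs, it has likely not yet learned to produce coherent sequences.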

In [None]:
start = time.time()
states = None

# The model consumes integer token ids, so seed the generation with the first
# non-padding tokens of a training sequence instead of a text prompt.
seed_ids = [token for token in padded_sequences[0] if token != PAD][:32]
next_ids = tf.constant([seed_ids], dtype=tf.int32)
result = [next_ids]

for n in range(1000):
  next_ids, states = one_step_model.generate_one_step(next_ids, states=states)
  # Keep a [batch, 1] int32 shape so the sampled token can be fed back as the
  # next input and concatenated with the earlier tokens.
  next_ids = tf.reshape(tf.cast(next_ids, tf.int32), [-1, 1])
  result.append(next_ids)

result = tf.concat(result, axis=-1)
end = time.time()
print(result[0].numpy(), '\n\n' + '_'*80)
print('\nRun time:', end - start)

The easiest thing you can do to improve the results is to train it for longer (try `EPOCHS = 30`).

You can also experiment with a different start string, try adding another RNN layer to improve the model's accuracy, or adjust the temperature parameter to generate more or less random predictions.

If you want the model to generate text *faster* the easiest thing you can do is batch the text generation. In the example below the model generates 5 outputs in about the same time it took to generate 1 above. 

In [None]:
start = time.time()
states = None

# Same integer-token seed repeated five times, so five sequences are
# generated in one batch.
seed_ids = [token for token in padded_sequences[0] if token != PAD][:32]
next_ids = tf.constant([seed_ids] * 5, dtype=tf.int32)
result = [next_ids]

for n in range(1000):
  next_ids, states = one_step_model.generate_one_step(next_ids, states=states)
  # Keep a [batch, 1] int32 shape so the sampled tokens can be fed back as the
  # next input and concatenated with the earlier tokens.
  next_ids = tf.reshape(tf.cast(next_ids, tf.int32), [-1, 1])
  result.append(next_ids)

result = tf.concat(result, axis=-1)
end = time.time()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)

## Export the generator

This single-step model can easily be [saved and restored](https://www.tensorflow.org/guide/saved_model), allowing you to use it anywhere a `tf.saved_model` is accepted.

In [None]:
# Write the single-step generator to the 'one_step' directory as a SavedModel,
# then load it back to check that the exported copy can be used
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')

In [None]:
states = None

# The model consumes integer token ids, so seed the generation with the first
# non-padding tokens of a training sequence instead of a text prompt.
seed_ids = [token for token in padded_sequences[0] if token != PAD][:32]
next_ids = tf.constant([seed_ids], dtype=tf.int32)
result = [next_ids]

for n in range(100):
  next_ids, states = one_step_reloaded.generate_one_step(next_ids, states=states)
  # Keep a [batch, 1] int32 shape so the sampled token can be fed back as the
  # next input and concatenated with the earlier tokens.
  next_ids = tf.reshape(tf.cast(next_ids, tf.int32), [-1, 1])
  result.append(next_ids)

print(tf.concat(result, axis=-1)[0].numpy())

## Advanced: Customized Training

The above training procedure is simple, but does not give you much control.
It uses teacher-forcing which prevents bad predictions from being fed back to the model, so the model never learns to recover from mistakes.

So now that you've seen how to run the model manually next you'll implement the training loop. This gives a starting point if, for example, you want to implement _curriculum  learning_ to help stabilize the model's open-loop output.

The most important part of a custom training loop is the train step function.

Use `tf.GradientTape` to track the gradients. You can learn more about this approach by reading the [eager execution guide](https://www.tensorflow.org/guide/eager).

The basic procedure is:

1. Execute the model and calculate the loss under a `tf.GradientTape`.
2. Calculate the updates and apply them to the model using the optimizer.

In [None]:
class CustomTraining(MyModel):
  """MyModel with a hand-written training step usable by compile()/fit()."""

  @tf.function
  def train_step(self, inputs):
      """Run one optimization step on a batch.

      Args:
        inputs: An ``(inputs, labels)`` pair for one batch.

      Returns:
        A dict with the batch loss under the key ``'loss'``.
      """
      inputs, labels = inputs
      with tf.GradientTape() as tape:
          predictions = self(inputs, training=True)
          loss = self.loss(labels, predictions)
      # Differentiate with respect to this instance's own variables. Reading
      # them from the global `model` only works while that name happens to be
      # bound to this same object.
      grads = tape.gradient(loss, self.trainable_variables)
      self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

      return {'loss': loss}

The above implementation of the `train_step` method follows [Keras' `train_step` conventions](https://www.tensorflow.org/guide/keras/customizing_what_happens_in_fit). This is optional, but it allows you to change the behavior of the train step and still use keras' `Model.compile` and `Model.fit` methods.

In [None]:
# Use the same vocabulary size as the first model. This notebook has no
# `ids_from_chars` lookup layer, so the size comes from `vocab_size`.
model = CustomTraining(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
# Compile with Adam and the same logits-based cross-entropy loss as before;
# CustomTraining.train_step reads these back as self.optimizer and self.loss
model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

In [None]:
# Train for a single epoch through Keras' fit()
model.fit(dataset, epochs=1)

Or if you need more control, you can write your own complete custom training loop:

In [None]:
EPOCHS = 10

# Running mean of the per-batch loss within an epoch
mean = tf.metrics.Mean()

# Make sure the checkpoint directory exists before the first manual save
os.makedirs(os.path.dirname(checkpoint_prefix), exist_ok=True)

for epoch in range(EPOCHS):
    start = time.time()

    # reset_state() is the metric reset method; the older reset_states()
    # spelling is not available on Keras 3 metrics.
    mean.reset_state()
    for (batch_n, (inp, target)) in enumerate(dataset):
        logs = model.train_step([inp, target])
        mean.update_state(logs['loss'])

        if batch_n % 50 == 0:
            template = f"Epoch {epoch+1} Batch {batch_n} Loss {logs['loss']:.4f}"
            print(template)

    # saving (checkpoint) the model every 5 epochs; the file name uses the
    # same 1-based epoch number as the log lines
    if (epoch + 1) % 5 == 0:
        model.save_weights(checkpoint_prefix.format(epoch=epoch + 1))

    print()
    print(f'Epoch {epoch+1} Loss: {mean.result().numpy():.4f}')
    print(f'Time taken for 1 epoch {time.time() - start:.2f} sec')
    print("_"*80)

# Final save, named after the total number of epochs rather than the loop
# variable, so it is also well defined when EPOCHS is 0
model.save_weights(checkpoint_prefix.format(epoch=EPOCHS))