In [14]:
import tensorflow as tf
import numpy as np

In [28]:
#-----------HYPERPARAMETERS----------------#

SEQUENCE_LENGTH = [2, 4, 8, 16, 32]
HIDDEN_LENGTH = 20
N_FEATURES = 1

LEARNING_RATE = 1e-3
EPOCHS = 100
CUMSUM = True
NUM_OF_LAYERS = 3

In [29]:
def create_dataset_for_sum_prediction(n_samples, sequence_length, n_features, validation):
    """
    Function to create a dataset for the sum prediction task. The function generates random integers between 0 and 10 and calculates the sum of the sequence
    :param n_samples:  number of samples in the dataset
    :param sequence_length: length of the sequence
    :param n_features: number of features in the sequence
    :return: a tf.data.Dataset object containing the input and output pairs
    """
    x = tf.cast(np.random.randint(low=0, high=11, size=(n_samples, sequence_length, n_features)), tf.float32)
    sums = []
    for sample in range(n_samples):
        value = x[sample, :, :]
        sums.append(tf.reduce_sum(value))
    y = tf.cast(tf.expand_dims(tf.convert_to_tensor(sums), axis=1), tf.float32)
    if not validation:
        return tf.data.Dataset.from_tensor_slices((x, y)).shuffle(n_samples).batch(32).prefetch(tf.data.AUTOTUNE)
    else:
        return tf.data.Dataset.from_tensor_slices((x, y)).batch(512).prefetch(tf.data.AUTOTUNE)

In [30]:
def create_dataset_for_cum_sum_prediction(n_samples, sequence_length, n_features, validation):
    """
    Function to create a dataset for the cumulative sum prediction task. The function generates random integers between 0 and 10 and calculates the cumulative sum of the sequence
    :param n_samples: number of samples in the dataset
    :param sequence_length: length of the sequence
    :param n_features: number of features in the sequence
    :return: a tf.data.Dataset object containing the input and output pairs
    """
    x = tf.cast(np.random.randint(low=0, high=11, size=(n_samples, sequence_length, n_features)), tf.float32)
    sums = []
    for sample in range(n_samples):
        value = x[sample, :, :]
        sums.append(tf.cumsum(value, axis=0))
    y = tf.cast(tf.convert_to_tensor(sums), tf.float32)
    if not validation:
        return tf.data.Dataset.from_tensor_slices((x, y)).batch(32).prefetch(tf.data.AUTOTUNE)
    else:
        return tf.data.Dataset.from_tensor_slices((x, y)).shuffle(n_samples).batch(512).prefetch(tf.data.AUTOTUNE)

In [31]:
class PositionalEncoding(tf.keras.layers.Layer):
    # positional encoding to give model information on relative position of tokens in the sequence
    # positional encoding vector is added to  embedding vector
    
    def __init__(self, position, dim):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(position, dim)
        
    def get_angles(self, position, i, dim):
        position = tf.cast(position, dtype=tf.float32)
        i = tf.cast(i, dtype=tf.float32)
        dim = tf.cast(dim, dtype=tf.float32)
        
        angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(dim, tf.float32))
        return position * angles
    
    def positional_encoding(self, position, dim):
        
        angle_rads = self.get_angles(
            tf.range(position)[:, tf.newaxis],
            tf.range(dim)[tf.newaxis, :],
            dim
        )
        
        # apply sin to even indices
        sines = tf.math.sin(angle_rads[:, 0::2])
        # apply cos to odd indices
        cosines = tf.math.cos(angle_rads[:, 1::2])
        
        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]
        
        return tf.cast(pos_encoding, tf.float32)
    
    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

In [32]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.dim = dim
        
        assert dim % self.num_heads == 0
        
        self.depth = dim // self.num_heads
        
        self.weights_query = tf.keras.layers.Dense(dim)
        self.weights_key = tf.keras.layers.Dense(dim)
        self.weights_value = tf.keras.layers.Dense(dim)
        
        self.dense = tf.keras.layers.Dense(dim)
        
    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, query, key, value, mask):
        batch_size = tf.shape(query)[0]
        
        query = self.weights_query(query)
        key = self.weights_key(key)
        value = self.weights_value(value)
        
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)
        
        scaled_attention, attention_weights = self.scaled_dot_product_attention(query, key, value, mask)
        
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.dim))
        
        output = self.dense(concat_attention)
        
        return output, attention_weights
    
    def scaled_dot_product_attention(self, query, key, value, mask):
        # calc attention weights
        matmul_qk = tf.matmul(query, key, transpose_b=True)
        
        # scale tensor and add mask
        dk = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
        
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)
        
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        
        output = tf.matmul(attention_weights, value)
        
        return output, attention_weights

In [33]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, dim, num_heads, dim_feedforward, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.multiheadattention = MultiHeadAttention(dim, num_heads)
    self.ffn = self.feed_forward(dim, dim_feedforward)

    self.normalization1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalization2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
  
  def feed_forward(self, dim, dim_feedforward):
    inputs = tf.keras.Input(shape=(None, dim), name="inputs")
    dense1 = tf.keras.layers.Dense(dim_feedforward, activation='relu')(inputs)
    outputs = tf.keras.layers.Dense(dim)(dense1)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model
    
  def call(self, x, training, mask):

    attention_output, _ = self.multiheadattention(x, x, x, mask) 
    attention_output = self.dropout1(attention_output, training=training)
    out1 = self.normalization1(x + attention_output) 
    
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.normalization2(out1 + ffn_output) 
    
    return out2

In [34]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, dim, num_heads, dim_feedforward, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.multiheadattention1 = MultiHeadAttention(dim, num_heads)
    self.multiheadattention2 = MultiHeadAttention(dim, num_heads)

    self.ffn = self.feed_forward(dim, dim_feedforward)
 
    self.normalization1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalization2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalization3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
  
  def feed_forward(self, dim, dim_feedforward):
    inputs = tf.keras.Input(shape=(None, dim), name="inputs")
    dense1 = tf.keras.layers.Dense(dim_feedforward, activation='relu')(inputs)
    outputs = tf.keras.layers.Dense(dim)(dense1)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

    attention1, attention_weights1 = self.multiheadattention1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attention1 = self.dropout1(attention1, training=training)
    out1 = self.normalization1(attention1 + x)
    
    attention2, attention_weights2 = self.multiheadattention2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attention2 = self.dropout2(attention2, training=training)
    out2 = self.normalization2(attention2 + out1)  # (batch_size, target_seq_len, d_model)
    
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.normalization3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    
    return out3, attention_weights1, attention_weights2

In [35]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, dim, num_heads, dim_feedforward, input_vocab_size,
               position, rate=0.1):
    super(Encoder, self).__init__()

    self.dim = dim
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, dim)
    self.pos_encoding = PositionalEncoding(position, self.dim)
    
    self.encoder = [EncoderLayer(dim, num_heads, dim_feedforward, rate) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(rate)
        
  def call(self, x, training, mask):
    seq_len = tf.shape(x)[1]
    
    # adding embeddings
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.dim, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)
    
    for i in range(self.num_layers):
      x = self.encoder[i](x, training, mask)
    
    return x 

In [36]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, dim, num_heads, dim_feedforward, target_vocab_size,
               position, rate=0.1):
    super(Decoder, self).__init__()

    self.dim = dim
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, dim)
    self.pos_encoding = PositionalEncoding(position, dim)
    
    self.decoder = [DecoderLayer(dim, num_heads, dim_feedforward, rate) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
    
  def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    seq_len = tf.shape(x)[1]
    attention_weights = {}
    
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    
    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.decoder[i](x, enc_output, training, look_ahead_mask, padding_mask)
      
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
    return x, attention_weights

In [37]:
class TransformerModel(tf.keras.Model):
    def __init__(self, num_layers: int, dim: int, num_heads: int, dim_feedforward: int,input_vocab_size: int, target_vocab_size: int, rate=0.1):
        super(TransformerModel, self).__init__()
     
        self.encoder = Encoder(num_layers, dim, num_heads, dim_feedforward, 
                           input_vocab_size, position=input_vocab_size, rate=rate)
        self.decoder = Decoder(num_layers, dim, num_heads, dim_feedforward, 
                            target_vocab_size, position=target_vocab_size, rate=rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
        
        self.metrics_list = [tf.keras.metrics.Mean(name="loss")]
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
        self.loss_function = tf.keras.losses.MeanAbsoluteError()
    
    @property
    def metrics(self):
        return self.metrics_list

    def reset_metrics(self):
        for metric in self.metrics:
            metric.reset_state()
    
    def create_masks(self, inp, tar):
        seq = tf.cast(tf.math.equal(inp, 0), tf.float32)
        padding_mask = seq[:, tf.newaxis, tf.newaxis, :]
        
        target_size = tf.shape(tar)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((target_size, target_size)), -1, 0)

        seq_target = tf.cast(tf.math.equal(tar, 0), tf.float32)
        target_padding_mask = seq_target[:, tf.newaxis, tf.newaxis, :]
        future_token_mask = tf.maximum(target_padding_mask, look_ahead_mask)
        
        return padding_mask, future_token_mask

    @tf.function
    def call(self, input, target, padding_mask, 
            look_ahead_mask, training=False):

        enc_output = self.encoder(input, training, padding_mask)  
        dec_output, attention_weights = self.decoder(target, enc_output, training, look_ahead_mask, padding_mask)
        
        final_output = self.final_layer(dec_output)
        
        return final_output, attention_weights
    
    def train_step(self, data):
        """
        Standard train_step method
        :param data: 
        :return: 
        """
        sequence, label = data

        #prep seq & labels for seq2seq model: shifting so model learns to predict next token in sequence
        # remove last token from each target sequence
        # creates sequences shifted by one position compared to original targets for decoder input
        tar_inp = label[:, :-1]
        # remove first token from each target sequence
        # creates sequences shifted by one position compared to original targets as expected outputs for each input sequence & to compute loss
        tar_real = label[:, 1:]
        
        padding_mask, look_ahead_mask = self.create_masks(sequence, tar_inp)
  
        with tf.GradientTape() as tape:
            output, _ = self.call(sequence, tar_inp, padding_mask, look_ahead_mask, training=True)
            loss = self.loss_function(tar_real, output) + tf.reduce_sum(self.losses)
        gradients = tape.gradient(loss, self.trainable_variables)

        self.optimizer.apply_gradients(grads_and_vars=zip(gradients, self.trainable_variables))

        self.metrics[0].update_state(loss)

        return {m.name : m.result() for m in self.metrics}

    def test_step(self, data):
        """
        Standard test_step method
        :param data: 
        :return: 
        """
        sequence, label = data
        
         #prep seq & labels for seq2seq model: shifting so model learns to predict next token in sequence
        # remove last token from each target sequence
        # creates sequences shifted by one position compared to original targets for decoder input
        tar_inp = label[:, :-1]
        # remove first token from each target sequence
        # creates sequences shifted by one position compared to original targets as expected outputs for each input sequence & to compute loss
        tar_real = label[:, 1:]
        
        padding_mask, look_ahead_mask = self.create_masks(sequence, tar_inp)
  
        self.call(sequence, tar_inp, padding_mask, look_ahead_mask, training=True)
        

        # Forward pass through the model
        predictions, _ = self.call(sequence, tar_inp,
                                    False,  # Ensure no dropout during inference
                                    padding_mask,
                                    look_ahead_mask)

        # Compute loss and metrics
        loss = self.loss_function(tar_real, predictions) #+ tf.reduce_sum(self.losses)
        self.metrics[0].update_state(loss)

        return {m.name : m.result() for m in self.metrics}

In [38]:
#----------Training------------#

import tqdm

def training_loop(model, train, test, train_summary_writer, test_summary_writer):
    # Lists to store training and validation metrics across epochs
    train_loss = []
    val_loss = []
    train_acc = []
    val_acc = []

    # Loop through epochs
    for epoch in range(EPOCHS):

        # Training
        for data in tqdm.tqdm(train, position=0, leave=False, desc=f"Epoch {epoch}"):
            # Perform a training step using the model
            metrics = model.train_step(data)

            # Log training metrics to TensorBoard
            with train_summary_writer.as_default():
                for metric in model.metrics:
                    tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)

        # Store training metrics for the epoch
        train_loss.append(metrics["loss"].numpy())

        # Print and reset training metrics
        if epoch % 20 == 0:
            print(f"EPOCH {epoch}")
            print([f"{key}: {value.numpy()}" for (key, value) in metrics.items()])
        model.reset_metrics()

        # Testing
        for data in test:
            # Perform a testing step using the model
            metrics = model.test_step(data)

            # Log validation metrics to TensorBoard
            with test_summary_writer.as_default():
                for metric in model.metrics:
                    tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)

        # Store validation metrics for the epoch
        val_loss.append(metrics["loss"].numpy())

        # Print validation metrics
        if epoch % 20 == 0:
            print([f"val_{key}: {value.numpy()}" for (key, value) in metrics.items()])

        # Reset validation metrics
        model.reset_metrics()

    # Return lists of training and validation metrics for analysis or plotting
    return train_loss, train_acc, val_loss, val_acc

In [39]:
#-----------Loop for different sequence lengths with custom training loop----------------#

config_name= "Transformer"

for length in SEQUENCE_LENGTH:
    train_dataset = create_dataset_for_cum_sum_prediction(1024, length, 1, False)
    val_dataset = create_dataset_for_cum_sum_prediction(512, length, 1, True)
    
    train_log_path = f"logs/{config_name}/{length}/train"
    test_log_path = f"logs/{config_name}/{length}/val"
    
    # log writer for training metrics
    train_summary_writer = tf.summary.create_file_writer(train_log_path)
    
    # log writer for validation metrics
    test_summary_writer = tf.summary.create_file_writer(test_log_path)
    
    model = TransformerModel(num_layers=NUM_OF_LAYERS, dim=512, num_heads=8, dim_feedforward=2048, input_vocab_size=8500, target_vocab_size=8000)
    train_losses, train_accuracies, val_losses, val_accuracies = training_loop(model, train_dataset, val_dataset, train_summary_writer, test_summary_writer)
    

                                               

ValueError: in user code:

    File "/var/folders/p8/rh_91kl969g6rhp4ksqdmjhr0000gn/T/ipykernel_18354/2046511709.py", line 41, in call  *
        enc_output = self.encoder(input, training, padding_mask)
    File "/Users/christinearnoldt/mambaforge/envs/iannwtf_final_project/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/Users/christinearnoldt/mambaforge/envs/iannwtf_final_project/lib/python3.12/site-packages/keras/src/layers/layer.py", line 721, in __call__
        raise ValueError(

    ValueError: Only input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: True (of type <class 'bool'>)


In [None]:
#-----------Loop for different sequence lengths with compile and fit----------------#

for length in SEQUENCE_LENGTH:
    train_dataset = create_dataset_for_cum_sum_prediction(1024, length, 1, False)
    val_dataset = create_dataset_for_cum_sum_prediction(512, length, 1, True)
    
    model = RNNModel(num_layers=NUM_OF_LAYERS, sequence_length=length, hidden_length=HIDDEN_LENGTH, cumsum=CUMSUM)
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    loss = tf.keras.losses.MeanAbsoluteError()

    # compile the model
    model.compile(optimizer = optimizer, loss=loss)
    
    EXPERIMENT_NAME = "LSTM_sum_prediction"
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logging_callback = tf.keras.callbacks.TensorBoard(log_dir=f"./logs/{EXPERIMENT_NAME}/{length}")
    
    history = model.fit(train_dataset, 
                        validation_data=val_dataset,
                        epochs=EPOCHS,
                        callbacks=[logging_callback],
                        verbose=0)

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir="logs/transformer_sum_prediction" --port=6007