In [None]:
from tensorflow import keras
import tensorflow as tf

In [None]:
'''
A layer encapsulates both a state (the layer's "weights") and a transformation from inputs to outputs (a "call", the layer's forward pass)
Create Linear Layer
'''
class Linear(keras.layers.Layer):
    """Affine layer computing ``inputs @ w + b`` with weights created eagerly.

    Args:
        units: Number of output features (second dimension of ``w``).
        input_dim: Number of input features (first dimension of ``w``).
    """

    def __init__(self, units=32, input_dim=32):
        super().__init__()
        # Kernel: random-normal values of shape (input_dim, units)
        kernel_initializer = tf.random_normal_initializer()
        kernel_value = kernel_initializer(shape=(input_dim, units), dtype="float32")
        self.w = tf.Variable(initial_value=kernel_value, trainable=True)
        # Bias: zeros, one entry per output unit
        bias_initializer = tf.zeros_initializer()
        bias_value = bias_initializer(shape=(units,), dtype="float32")
        self.b = tf.Variable(initial_value=bias_value, trainable=True)

    def call(self, inputs):
        """Forward pass: matrix-multiply ``inputs`` by ``w``, then add ``b``."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

# Usage of Linear Layer: a (2, 2) batch of ones goes through a layer with 2 inputs and 4 units
x = tf.ones(shape=(2, 2))
linear_layer = Linear(units=4, input_dim=2)
output = linear_layer(x)  # calling the layer object invokes its 'call' method
print("Output of Linear Layer for tensor x: \n\n", output)

# The layer's `weights` list is expected to hold exactly w and b, in that order.
# (assert does nothing when the condition is True and raises AssertionError otherwise)
assert linear_layer.weights == [linear_layer.w, linear_layer.b]

Output of Linear Layer for tensor x: 

 tf.Tensor(
[[-0.11631936 -0.03059545  0.06272326  0.06283144]
 [-0.11631936 -0.03059545  0.06272326  0.06283144]], shape=(2, 4), dtype=float32)


Adding weight to a layer using add_weight() function

In [None]:
class Linear(keras.layers.Layer):
    """Affine layer ``inputs @ w + b`` whose weights are created with ``add_weight``.

    Args:
        units: Number of output features.
        input_dim: Number of input features.
    """

    def __init__(self, units=32, input_dim=32):
        super().__init__()
        kernel_shape = (input_dim, units)
        bias_shape = (units,)
        # Kernel starts from random-normal values
        self.w = self.add_weight(shape=kernel_shape, initializer="random_normal", trainable=True)
        # Bias starts at 0
        self.b = self.add_weight(shape=bias_shape, initializer="zeros", trainable=True)

    def call(self, inputs):
        """Forward pass: matrix-multiply ``inputs`` by ``w``, then add ``b``."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b


# Same experiment as before: a (2, 2) batch of ones, 2 input features, 4 units
x = tf.ones(shape=(2, 2))
linear_layer = Linear(units=4, input_dim=2)
y = linear_layer(x)
print(y)

tf.Tensor(
[[-0.00187546  0.06312266  0.01194087  0.04764556]
 [-0.00187546  0.06312266  0.01194087  0.04764556]], shape=(2, 4), dtype=float32)


Layers can have non-trainable weights

In [None]:
# The accumulator variable is created with trainable=False
class ComputeSum(keras.layers.Layer):
    """Layer that keeps a running sum (over axis 0) of all inputs it has been called on.

    Args:
        input_dim: Length of the accumulator vector ``total``.
    """

    def __init__(self, input_dim):
        super().__init__()
        initial_total = tf.zeros((input_dim,))
        self.total = tf.Variable(initial_value=initial_total, trainable=False)

    def call(self, inputs):
        """Add the axis-0 sum of ``inputs`` to ``total`` and return ``total``."""
        batch_sum = tf.reduce_sum(inputs, axis=0)
        self.total.assign_add(batch_sum)
        return self.total

'''
tf.Variable.assign_add() adds a value to a variable in place and returns the updated value.
In TensorFlow 2 it is a method of tf.Variable (the TF1-style free function is tf.compat.v1.assign_add).
The method has the following signature:
variable.assign_add(
    delta,
    use_locking=False,
    name=None,
    read_value=True
)

where:
* variable is the tf.Variable object to be updated (here, self.total).
* delta is a Tensor, which represents the value to be added to the variable.
* use_locking (optional) is a boolean that controls whether the operation should use locking.
* name (optional) is a string that defines the name of the operation.
* read_value (optional) is a boolean; when True (the default) the updated value is returned.
'''

x = tf.ones((2, 2))
my_sum = ComputeSum(2)   # Input dimension is 2

# Each call adds the axis-0 sum of x, i.e. (2,2), to the stored total:
# (0,0) => (2,2) => (4,4) => (6,6)
for _call_number in range(3):
    y = my_sum(x)
    print(y.numpy())

[2. 2.]
[4. 4.]
[6. 6.]


In [None]:
# `total` is the only variable the layer holds, so it shows up once in `weights`
# and once in `non_trainable_weights`
print("weights:", len(my_sum.weights))
print("non-trainable weights:", len(my_sum.non_trainable_weights))

# `total` was created with trainable=False, so it's not included in the trainable weights:
print("trainable_weights:", my_sum.trainable_weights)

weights: 1
non-trainable weights: 1
trainable_weights: []


In many cases, you may not know in advance the size of your inputs, and you would like to lazily create weights when that value becomes known, some time after instantiating the layer.

In the Keras API, we recommend creating layer weights in the build(self, input_shape) method of your layer. Like this:

In [None]:
class Linear(keras.layers.Layer):
    """Affine layer ``inputs @ w + b`` that defers weight creation to ``build``.

    Args:
        units: Number of output features.
    """

    def __init__(self, units=32):
        super().__init__()
        self.units = units

    def build(self, input_shape):
        """Create ``w`` and ``b`` once the size of the last input axis is known.

        The layer's __call__() runs this automatically the first time it is called.
        """
        input_features = input_shape[-1]
        self.w = self.add_weight(
            shape=(input_features, self.units), initializer="random_normal", trainable=True
        )
        self.b = self.add_weight(
            shape=(self.units,), initializer="random_normal", trainable=True
        )

    def call(self, inputs):
        """Forward pass: matrix-multiply ``inputs`` by ``w``, then add ``b``."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

In [None]:
# At instantiation, we don't know on what inputs this is going to get called
linear_layer = Linear(32)

# The layer's weights are created dynamically the first time the layer is called.
# NOTE: `x` is not defined in this cell; it is the (2, 2) tensor of ones created in an
# earlier cell, so that cell has to be run first.
y = linear_layer(x)

Layers are recursively composable.
We recommend creating such sublayers in the __init__() method and leaving it to the first __call__() to trigger building their weights.

In [None]:
class MLPBlock(keras.layers.Layer):
    """Three stacked ``Linear`` layers (32, 32 and 1 units) with ReLU after the first two."""

    def __init__(self):
        super().__init__()
        # Two hidden layers of 32 units followed by a single-unit output layer.
        # (keras.layers.Dense could be used in place of Linear.)
        self.linear_1 = Linear(32)
        self.linear_2 = Linear(32)
        self.linear_3 = Linear(1)

    def call(self, inputs):
        """Run ``inputs`` through linear_1 -> ReLU -> linear_2 -> ReLU -> linear_3."""
        hidden = tf.nn.relu(self.linear_1(inputs))
        hidden = tf.nn.relu(self.linear_2(hidden))
        return self.linear_3(hidden)


mlp = MLPBlock()
# Tensor of shape (3, 64) is passed as input to the mlp object to create the weights
y = mlp(tf.ones(shape=(3, 64)))  # The first call to the `mlp` will create the weights
# Three Linear sublayers, each with a kernel `w` and a bias `b`, all created with trainable=True
print("weights:", len(mlp.weights))
print("trainable weights:", len(mlp.trainable_weights))

weights: 6
trainable weights: 6


add_loss() method

In [None]:
# A layer that creates an activity regularization loss
class ActivityRegularizationLayer(keras.layers.Layer):
    """Pass-through layer that registers ``rate * sum(inputs)`` as a loss on every call.

    Args:
        rate: Multiplier applied to the sum of the inputs.
    """

    def __init__(self, rate=1e-2):
        super().__init__()
        self.rate = rate

    def call(self, inputs):
        """Record the activity loss via ``add_loss`` and return ``inputs`` unchanged."""
        activity_penalty = self.rate * tf.reduce_sum(inputs)
        self.add_loss(activity_penalty)
        return inputs
'''
This property is reset at the start of every __call__() to the top-level layer, 
so that layer.losses always contains the loss values created during the last forward pass.

This implementation is adding the regularization loss during the forward pass of the network, 
which means that the regularization term will be included in the gradients calculated during 
backpropagation and therefore included in the optimization process.
'''

'\nThis property is reset at the start of every __call__() to the top-level layer, \nso that layer.losses always contains the loss values created during the last forward pass.\n\nThis implementation is adding the regularization loss during the forward pass of the network, \nwhich means that the regularization term will be included in the gradients calculated during \nbackpropagation and therefore included in the optimization process.\n'

In [None]:
class OuterLayer(keras.layers.Layer):
    """Wrapper layer that delegates to an inner ``ActivityRegularizationLayer``."""

    def __init__(self):
        super().__init__()
        self.activity_reg = ActivityRegularizationLayer(1e-2)

    def call(self, inputs):
        """Forward ``inputs`` through the inner regularization layer."""
        regularized = self.activity_reg(inputs)
        return regularized

layer = OuterLayer()
assert len(layer.losses) == 0  # No losses yet since the layer has never been called

# tf.zeros takes the shape as a single first argument; the second positional
# argument is the dtype, so the (1, 1) shape must be passed as a tuple.
_ = layer(tf.zeros((1, 1)))
assert len(layer.losses) == 1  # We created one loss value

# `layer.losses` gets reset at the start of each __call__
_ = layer(tf.zeros((1, 1)))
assert len(layer.losses) == 1  # This is the loss created during the call above

In addition, the losses property also contains regularization losses created for the weights of any inner layer:

In [None]:
class OuterLayerWithKernelRegularizer(keras.layers.Layer):
    """Wrapper around a 32-unit Dense layer whose kernel carries an L2 regularizer."""

    def __init__(self):
        super().__init__()
        # A Dense layer with no activation is technically equivalent to the Linear layer
        l2_penalty = tf.keras.regularizers.l2(1e-3)
        self.dense = keras.layers.Dense(32, kernel_regularizer=l2_penalty)

    def call(self, inputs):
        """Forward ``inputs`` through the regularized Dense layer."""
        dense_output = self.dense(inputs)
        return dense_output

# Call the layer once so the inner Dense layer is built and its regularization loss exists
layer = OuterLayerWithKernelRegularizer()
_ = layer(tf.zeros((1, 1)))

# This is `1e-3 * sum(layer.dense.kernel ** 2)`,
# created by the `kernel_regularizer` above.
print(layer.losses)

[<tf.Tensor: shape=(), dtype=float32, numpy=0.0024622313>]


In [None]:
# ! Do not run this cell: it only shows an example of GradientTape usage.
# `train_dataset` is not defined at this point in the notebook; it stands for any
# iterable of (inputs, labels) batches.

# Instantiate an optimizer.
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Iterate over the batches of a dataset.
for x_batch_train, y_batch_train in train_dataset:
  with tf.GradientTape() as tape:
    logits = layer(x_batch_train)  # Logits for this minibatch
    # Loss value for this minibatch
    loss_value = loss_fn(y_batch_train, logits)
    # Add extra losses created during this forward pass. They must be read from the
    # same object that ran the forward pass (`layer`), not from an unrelated `model`.
    loss_value += sum(layer.losses)

  grads = tape.gradient(loss_value, layer.trainable_weights)
  optimizer.apply_gradients(zip(grads, layer.trainable_weights))


# For More detailed training loops:
# https://www.tensorflow.org/guide/keras/writing_a_training_loop_from_scratch/

In [None]:
import numpy as np

# Functional model whose only layer is the pass-through ActivityRegularizationLayer
inputs = keras.Input(shape=(3,))
outputs = ActivityRegularizationLayer()(inputs)
model = keras.Model(inputs, outputs)

# If there is a loss passed in `compile`, the regularization
# losses get added to it
model.compile(optimizer="adam", loss="mse")
# Random (2, 3) arrays serve as dummy inputs and targets
model.fit(np.random.random((2, 3)), np.random.random((2, 3)))

# It's also possible not to pass any loss in `compile`,
# since the model already has a loss to minimize, via the `add_loss`
# call during the forward pass!
model.compile(optimizer="adam")
model.fit(np.random.random((2, 3)), np.random.random((2, 3)))



<keras.callbacks.History at 0x7f72f8e6ab80>

add_metric() method

Layers also have an add_metric() method for tracking the moving average of a quantity during training.

Consider the following layer: a "logistic endpoint" layer. It takes as inputs predictions & targets, it computes a loss which it tracks via add_loss(), and it computes an accuracy scalar, which it tracks via add_metric().

In [None]:
class LogisticEndpoint(keras.layers.Layer):
    """Endpoint layer that records a binary cross-entropy loss and an accuracy metric.

    Args:
        name: Optional layer name forwarded to the base ``Layer``.
    """

    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = keras.losses.BinaryCrossentropy(from_logits=True)
        self.accuracy_fn = keras.metrics.BinaryAccuracy()

    def call(self, targets, logits, sample_weights=None):
        """Register loss and accuracy for (targets, logits); return softmax(logits).

        Args:
            targets: Ground-truth values passed to the loss and the metric.
            logits: Raw predictions passed to the loss and the metric.
            sample_weights: Optional weights forwarded to the loss and the metric.
        """
        # Training-time loss, attached to the layer through `self.add_loss()`
        bce = self.loss_fn(targets, logits, sample_weights)
        self.add_loss(bce)

        # Accuracy, tracked on the layer through `self.add_metric()`
        batch_accuracy = self.accuracy_fn(targets, logits, sample_weights)
        self.add_metric(batch_accuracy, name="accuracy")

        # Inference-time prediction tensor (for `.predict()`).
        # NOTE(review): the loss is a *binary* cross-entropy on logits while the output
        # uses softmax; confirm softmax (rather than sigmoid) is intended.
        predictions = tf.nn.softmax(logits)
        return predictions

In [None]:
layer = LogisticEndpoint()

# Dummy (2, 2) targets and logits, all ones
targets = tf.ones((2,2))
logits = tf.ones((2,2))

# Calling the layer registers both the loss and the accuracy metric
y = layer(targets, logits)

# The metric added with add_metric() is exposed through `layer.metrics`
print("layer.metrics:", layer.metrics)
print("Accuracy:", float(layer.metrics[0].result()))

layer.metrics: [<keras.metrics.accuracy_metrics.BinaryAccuracy object at 0x7f72e94c5fd0>]
Accuracy: 1.0


In [None]:
# Two named inputs: the features and the targets (targets are fed in as data so the
# endpoint layer can compute the loss itself).
inputs = keras.Input(shape=(3,), name='inputs')
targets = keras.Input(shape=(10,), name='targets')
logits = keras.layers.Dense(10)(inputs)
# LogisticEndpoint.call is declared as call(targets, logits, ...), so the targets must
# be passed first; swapping them would compute the loss with the roles reversed.
predictions = LogisticEndpoint(name='predictions')(targets, logits)

model = keras.Model(inputs=[inputs, targets], outputs=predictions)
# No loss is passed to compile(): the loss comes from the endpoint's add_loss() call
model.compile(optimizer='adam')

# Dictionary keys match the names of the two Input layers
data = {
    'inputs': np.random.random((3,3)),
    'targets': np.random.random((3,10)),
}

model.fit(data)



<keras.callbacks.History at 0x7f72e7ff43a0>

You can optionally enable serialization on your layers

get_config() method

In [None]:
class Linear(keras.layers.Layer):
  """Affine layer ``inputs @ w + b`` with a minimal ``get_config`` for serialization.

  Args:
    units: Number of output features.
  """

  def __init__(self, units=32):
    super().__init__()
    self.units = units

  def build(self, input_shape):
    """Create ``w`` and ``b`` from the size of the last input axis."""
    input_features = input_shape[-1]
    self.w = self.add_weight(
        shape=(input_features, self.units), initializer='random_normal', trainable=True
    )
    self.b = self.add_weight(
        shape=(self.units,), initializer='random_normal', trainable=True
    )

  def call(self, inputs):
    """Forward pass: matrix-multiply ``inputs`` by ``w``, then add ``b``."""
    projected = tf.matmul(inputs, self.w)
    return projected + self.b

  def get_config(self):
    """Return only the constructor argument needed to recreate the layer."""
    config = {'units': self.units}
    return config

In [None]:
layer = Linear(64)

# The config here holds only what this Linear.get_config() returns: the `units` value
config = layer.get_config()

print(config)
# Rebuild a new, equivalent layer from the config dictionary
new_layer = Linear.from_config(config)

{'units': 64}


In [None]:
class Linear(keras.layers.Layer):
    """Affine layer ``inputs @ w + b`` with full serialization support.

    Args:
        units: Number of output features.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, units=32, **kwargs):
        super().__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        """Create ``w`` and ``b`` from the size of the last input axis."""
        input_features = input_shape[-1]
        self.w = self.add_weight(
            shape=(input_features, self.units), initializer="random_normal", trainable=True
        )
        self.b = self.add_weight(
            shape=(self.units,), initializer="random_normal", trainable=True
        )

    def call(self, inputs):
        """Forward pass: matrix-multiply ``inputs`` by ``w``, then add ``b``."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

    def get_config(self):
        """Return the base layer config extended with this layer's ``units``."""
        config = super().get_config()
        config["units"] = self.units
        return config


In [None]:
layer = Linear(64)

# This time the config also carries the entries from the base Layer.get_config()
config = layer.get_config()

print(config)
# from_config() passes the config entries back to __init__, which accepts them via **kwargs
new_layer = Linear.from_config(config)

{'name': 'linear_8', 'trainable': True, 'dtype': 'float32', 'units': 64}


Privileged training argument in the call() method

Some layers, in particular the **BatchNormalization** layer and the **Dropout** layer, have different behaviors during **training** and **inference**. 

Expose a **training** (boolean) argument in the call() method.

In [None]:
class CustomDropout(keras.layers.Layer):
  """Dropout layer that is only active when ``training`` is truthy.

  Args:
    rate: Dropout rate passed to ``tf.nn.dropout``.
    **kwargs: Forwarded to the base ``Layer`` constructor.
  """

  def __init__(self, rate, **kwargs):
    super().__init__(**kwargs)
    self.rate = rate

  def call(self, inputs, training=None):
    """Apply dropout in training mode; otherwise return ``inputs`` untouched."""
    if not training:
      return inputs
    return tf.nn.dropout(inputs, rate=self.rate)

The other privileged argument supported by call() is the mask argument.

You will find it in all Keras RNN layers. A mask is a boolean tensor (one boolean value per timestep in the input) used to skip certain input timesteps when processing timeseries data.

Keras will automatically pass the correct mask argument to __call__() for layers that support it, when a mask is generated by a prior layer. Mask-generating layers are the Embedding layer configured with mask_zero=True, and the Masking layer.

To learn more about masking and how to write masking-enabled layers, please check out the guide "understanding padding and masking".

https://www.tensorflow.org/guide/keras/masking_and_padding?hl=en

### Model class

The Model class has the same API as Layer, with the following differences:

* It exposes built-in training, evaluation, and prediction loops (model.fit(), model.evaluate(), model.predict()).
* It exposes the list of its inner layers, via the model.layers property.
* It exposes saving and serialization APIs (save(), save_weights()...)

In [None]:
# ! Do not run this cell: it only shows an example of subclassing keras.Model.
# `ResNetBlock`, `dataset` and `filepath` are placeholders that are not defined in this notebook.

class ResNet(tf.keras.Model):
  """Sketch of a Model subclass: two residual blocks, global pooling and a classifier.

  Args:
    num_classes: Number of units of the final Dense classifier.
  """

  def __init__(self, num_classes=1000):
    super().__init__()
    self.block_1 = ResNetBlock()
    self.block_2 = ResNetBlock()
    # Use the `keras` name imported by this notebook (a bare `layers` is never imported)
    self.global_pool = keras.layers.GlobalAveragePooling2D()
    self.classifier = keras.layers.Dense(num_classes)

  def call(self, inputs):
    """Run inputs through block_1 -> block_2 -> global pooling -> classifier."""
    x = self.block_1(inputs)
    x = self.block_2(x)
    x = self.global_pool(x)
    return self.classifier(x)


# Being a Model, it gets fit() and save() on top of the Layer API
resnet = ResNet()
dataset = ...
resnet.fit(dataset, epochs=10)
resnet.save(filepath)

## End-to-end example

* Layers can be recursively nested to create new, bigger computation blocks.
* Layers can create and track losses (typically regularization losses) as well as metrics, via add_loss() and add_metric()
* The outer container, the thing you want to train, is a Model. A Model is just like a Layer, but with added training and serialization utilities.

### Variational Autoencoder

Dataset: MNIST

The VAE will be a subclass of Model, and it will feature a regularization loss (KL divergence).

![](https://pub.mdpi-res.com/make/make-02-00020/article_deploy/html/images/make-02-00020-g001-550.jpg?1602276661)

In [31]:
import tensorflow as tf
from tensorflow import keras
from keras import backend as K

# z_mean, z_log_var to sample z, the vector encoding a digit
class Sampling(keras.layers.Layer):
  """Samples z from (z_mean, z_log_var) as ``z_mean + exp(0.5 * z_log_var) * epsilon``."""

  def call(self, inputs):
    """Draw one sample per row.

    Args:
      inputs: A pair ``(z_mean, z_log_var)`` of 2-D tensors with matching shapes.

    Returns:
      A tensor with the same shape as ``z_mean``.
    """
    z_mean, z_log_var = inputs
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    # Draw the noise with tf.random.normal so the layer depends only on `tf` (no
    # standalone `keras.backend` mixed with `tensorflow.keras`), and match the dtype
    # of z_mean so the arithmetic below never mixes float types.
    epsilon = tf.random.normal(shape=(batch, dim), dtype=z_mean.dtype)

    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

### Encoder Layer

In [32]:
# Maps MNIST Digits to a triplet (z_mean, z_log_var, z).
# Latent dimension: 32
# Intermediate dimension: 64
class Encoder(keras.layers.Layer):
  """Maps inputs to the triplet ``(z_mean, z_log_var, z)``.

  Args:
    latent_dim: Size of ``z_mean``, ``z_log_var`` and ``z``.
    intermediate_dim: Number of units of the hidden ReLU projection.
    name: Layer name.
    **kwargs: Forwarded to the base ``Layer`` constructor.
  """

  def __init__(self, latent_dim=32, intermediate_dim=64, name='encoder', **kwargs):
    super().__init__(name=name, **kwargs)
    self.dense_proj = keras.layers.Dense(intermediate_dim, activation='relu')
    self.dense_mean = keras.layers.Dense(latent_dim)
    self.dense_log_var = keras.layers.Dense(latent_dim)
    self.sampling = Sampling()  # create Sampling object

  def call(self, inputs):
    """Project the inputs, derive mean and log-variance, then sample z from them."""
    hidden = self.dense_proj(inputs)
    z_mean, z_log_var = self.dense_mean(hidden), self.dense_log_var(hidden)
    z = self.sampling((z_mean, z_log_var))
    return z_mean, z_log_var, z

### Decoder Layer

In [47]:
# Converts z, the encoded digit vector, back into readable digit
class Decoder(keras.layers.Layer):
  """Maps an encoded vector z back to a vector of size ``original_dim``.

  Args:
    original_dim: Number of units of the sigmoid output layer.
    intermediate_dim: Number of units of the hidden ReLU projection.
    name: Layer name.
    **kwargs: Forwarded to the base ``Layer`` constructor.
  """

  def __init__(self, original_dim, intermediate_dim=64, name='decoder', **kwargs):
    super().__init__(name=name, **kwargs)
    self.dense_proj = keras.layers.Dense(intermediate_dim, activation='relu')
    self.dense_output = keras.layers.Dense(original_dim, activation='sigmoid')

  def call(self, inputs):
    """Apply the hidden projection followed by the sigmoid output layer."""
    hidden = self.dense_proj(inputs)
    reconstruction = self.dense_output(hidden)
    return reconstruction

### VariationalAutoEncoder Model

In [48]:
# Combines the encoder and decoder into a model for training
class VariationalAutoEncoder(keras.Model):
  """End-to-end model chaining ``Encoder`` and ``Decoder`` and adding a KL loss.

  Args:
    original_dim: Size of each input (and reconstructed) vector.
    intermediate_dim: Hidden size used by both encoder and decoder.
    latent_dim: Size of the latent vector z.
    name: Model name.
    **kwargs: Forwarded to the base ``Model`` constructor.
  """

  def __init__(self, original_dim, intermediate_dim=64, latent_dim=32, name='autoencoder', **kwargs):
    super().__init__(name=name, **kwargs)
    self.original_dim = original_dim
    self.encoder = Encoder(latent_dim=latent_dim, intermediate_dim=intermediate_dim)
    self.decoder = Decoder(original_dim, intermediate_dim=intermediate_dim)

  def call(self, inputs):
    """Encode, decode, register the KL divergence loss, and return the reconstruction."""
    z_mean, z_log_var, z = self.encoder(inputs)
    reconstructed = self.decoder(z)
    # KL divergence regularization term, attached to the model through add_loss()
    kl_terms = z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1
    kl_loss = -0.5 * tf.reduce_mean(kl_terms)
    self.add_loss(kl_loss)
    return reconstructed

### Configure global parameters and functions

In [49]:
# Hyperparameters and shared objects used by the custom training loop below
original_dim = 784   # 28x28 pixels
learning_rate = 1e-3 # 0.001
epochs = 2
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
mse_loss_fn = keras.losses.MeanSquaredError()   # MSE, used as the reconstruction loss
loss_metric = keras.metrics.Mean()              # running mean of the per-batch loss

### Prepare data

In [50]:
# Only the training images are needed: labels and the test split are discarded
(x_train, _), _ = keras.datasets.mnist.load_data()
# Flatten every image into one vector and scale pixel values to [0, 1].
# The sample count and the flattened size are derived from the array itself
# instead of being hard-coded.
x_train = x_train.reshape(len(x_train), -1).astype('float32') / 255.0

# Shuffle with a 1024-element buffer and group into batches of 64
train_dataset = tf.data.Dataset.from_tensor_slices(x_train)
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(64)

In [51]:
# Display the dataset's repr to check the element shape and dtype
train_dataset

<_BatchDataset element_spec=TensorSpec(shape=(None, 784), dtype=tf.float32, name=None)>

### Run Variational AutoEncoder model

In [52]:
# Build the model; `original_dim` comes from the global-parameters cell
vae = VariationalAutoEncoder(original_dim, intermediate_dim=64, latent_dim=32)

In [54]:
# Custom training loop: one optimizer step per batch, progress printed every 100 steps.
# `loss_metric` is not reset inside this loop, so the printed value is the running
# mean over every batch seen so far, across epochs.
for epoch in range(epochs):
  print('Epoch %d' % epoch)

  for step, x_batch_train in enumerate(train_dataset):
    # Record the forward pass so gradients can be computed afterwards
    with tf.GradientTape() as tape:
      reconstructed = vae(x_batch_train)
      # Reconstruction loss plus the KL divergence loss registered by the model
      loss = mse_loss_fn(x_batch_train, reconstructed)
      loss += sum(vae.losses)

    # Backpropagation: compute gradients and let the optimizer update the weights
    trainable = vae.trainable_weights
    gradients = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(gradients, trainable))

    loss_metric(loss)

    if not step % 100:
      print("step %d: mean loss = %.4f" % (step, loss_metric.result()))

Epoch 0
step 0: mean loss = 0.3819
step 100: mean loss = 0.1278
step 200: mean loss = 0.1003
step 300: mean loss = 0.0899
step 400: mean loss = 0.0848
step 500: mean loss = 0.0814
step 600: mean loss = 0.0791
step 700: mean loss = 0.0775
step 800: mean loss = 0.0763
step 900: mean loss = 0.0752
Epoch 1
step 0: mean loss = 0.0749
step 100: mean loss = 0.0742
step 200: mean loss = 0.0737
step 300: mean loss = 0.0732
step 400: mean loss = 0.0729
step 500: mean loss = 0.0725
step 600: mean loss = 0.0722
step 700: mean loss = 0.0719
step 800: mean loss = 0.0716
step 900: mean loss = 0.0713


### Alternative way to train VariationalAutoEncoder model

In [55]:
# Fresh model and optimizer, so this run does not continue from the custom loop above
vae = VariationalAutoEncoder(original_dim=784, intermediate_dim=64, latent_dim=32)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# MSE is the compiled (reconstruction) loss; the model also registers a KL term via add_loss()
vae.compile(optimizer, loss=tf.keras.losses.MeanSquaredError())

# Autoencoder setup: the inputs are also the targets
vae.fit(x_train, x_train, epochs=2, batch_size=64)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f32993ef4c0>

### Remake the example without object oriented structure

In [56]:
# Same sizes as in the subclassed version above
original_dim = 784       # size of each flattened input vector
intermediate_dim = 64    # hidden layer size
latent_dim = 32          # size of the latent vector z

### Encoder model

In [57]:
# Functional-API encoder: input -> hidden ReLU layer -> (z_mean, z_log_var) -> sampled z
original_inputs = tf.keras.Input(shape=(original_dim, ), name='encoder_input')
x = keras.layers.Dense(intermediate_dim, activation='relu')(original_inputs)
z_mean = keras.layers.Dense(latent_dim, name='z_mean')(x)
z_log_var = keras.layers.Dense(latent_dim, name='z_log_var')(x)
# Reuses the Sampling layer defined earlier in the notebook
z = Sampling()((z_mean, z_log_var))
# The encoder model outputs only the sampled z
encoder = keras.Model(inputs=original_inputs, outputs=z, name='encoder')

### Decoder Model

In [58]:
# Functional-API decoder: latent vector -> hidden ReLU layer -> sigmoid output of size original_dim
latent_inputs = keras.Input(shape=(latent_dim, ), name='z_sampling')
x = keras.layers.Dense(intermediate_dim, activation='relu')(latent_inputs)
outputs = keras.layers.Dense(original_dim, activation='sigmoid')(x)
decoder = keras.Model(inputs=latent_inputs, outputs=outputs, name='decoder')

### Variational Autoencoder Model

In [59]:
# Chain the decoder onto the encoder's sampled z to get the end-to-end model
outputs = decoder(z)
vae = keras.Model(inputs=original_inputs, outputs=outputs, name='vae')

### KL Divergence Regularization Loss

In [60]:
# Same KL divergence expression as in VariationalAutoEncoder.call, built here from the
# functional graph tensors z_mean and z_log_var and attached to the model with add_loss()
kl_loss = -0.5 * tf.reduce_mean(z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1)
vae.add_loss(kl_loss)

### Train the Model

In [61]:
# Train with the built-in loop: MSE is the compiled loss, the KL term was added via add_loss()
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
vae.compile(optimizer, loss=keras.losses.MeanSquaredError())
# Autoencoder setup: the inputs are also the targets
vae.fit(x_train, x_train, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f329955d610>

### Reference

https://www.tensorflow.org/guide/keras/custom_layers_and_models?hl=en