In [2]:
import tensorflow as tf
import tensorflow.keras as keras

# Show which Keras version this notebook runs against. A bare expression in
# the middle of a cell is not displayed, so print it explicitly.
print(keras.__version__)

# Allocate only as much GPU memory as needed for the runtime allocations,
# instead of letting TensorFlow reserve all of it up front.
gpus = tf.config.list_physical_devices('GPU')
try:
    # Iterate instead of indexing gpus[0]: an empty list (CPU-only machine)
    # is then a no-op rather than an IndexError, and on multi-GPU machines
    # the setting stays identical across all devices.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth must be configured before the GPUs are initialized;
    # this happens when the cell is re-run in a kernel that already used them.
    print(err)

Sequential model:
- exactly one input and one output
- linear stack of layers

Functional API is good for tasks with ...
- several independent inputs
- multiple outputs
- graph-like (DAG) layer structure


An example task requiring functional API -- price of a second-hand piece of clothing

Three inputs:

1. user-provided metadata -- one-hot encode it and use a densely connected network.
2. user-provided text description -- use an RNN or a 1D convnet.
3. a picture of the item -- 2D convnet.

Then merge the three modules to predict the price.

Show side by side a simple Sequential model and its equivalent in the functional API

In [3]:
"""
Sequential model
"""

from tensorflow.keras import Input, layers
from tensorflow.keras import Sequential, Model

# Sequential API: hand the whole linear stack to the constructor as a list,
# which is equivalent to calling .add() once per layer in the same order.
# The first layer carries the input shape (64 features per sample).
seq_model = Sequential([
    layers.Dense(32, activation='relu', input_shape=(64,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

seq_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                330       
Total params: 3,466
Trainable params: 3,466
Non-trainable params: 0
_________________________________________________________________


In [4]:
"""
Equivalent functional API
"""

# Functional API: start from an explicit Input placeholder and treat every
# layer as a callable that maps a tensor to a new tensor.
input_tensor = Input(shape=(64,))

# Create the three layers first (same order as in the Sequential version)...
hidden_1 = layers.Dense(32, activation='relu')
hidden_2 = layers.Dense(32, activation='relu')
classifier = layers.Dense(10, activation='softmax')

# ...then wire them together by calling them on tensors.
x = hidden_1(input_tensor)
y = hidden_2(x)
output_tensor = classifier(y)

# A Model is fully described by its input tensor and its output tensor;
# Keras retrieves every layer involved in going from one to the other.
model = Model(inputs=input_tensor, outputs=output_tensor)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 64)]              0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_5 (Dense)              (None, 10)                330       
Total params: 3,466
Trainable params: 3,466
Non-trainable params: 0
_________________________________________________________________


In [5]:
"""
When it comes to compiling, training, or evaluating such an instance of Model, the "functional API" is the same as that of "Sequential".
"""

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Generate dummy data for demonstration.
import numpy as np

num_train_samples = 1000
num_features = 64   # must match Input(shape=(64,))
num_classes = 10    # must match the size of the softmax output layer

x_train = np.random.random((num_train_samples, num_features))

# categorical_crossentropy expects every target row to be a probability
# distribution over the classes. Unconstrained random floats are not, so draw
# a random class index per sample and one-hot encode it.
train_labels = np.random.randint(0, num_classes, size=num_train_samples)
y_train = np.eye(num_classes)[train_labels]

model.fit(x_train, y_train, epochs=10, batch_size=128)
score = model.evaluate(x_train, y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Multi-Input models

The functional API can be used to build models that have multiple inputs. Typically, such models at some point merge their different input branches using a layer that can combine several tensors: by adding them, concatenating them, and so on.

## A two-input question-answering model

A typical question-answering model has two inputs: `a natural-language question` and `a text snippet (such as a news article) providing information to be used for answering the question`. The model must then produce `an answer`: in the simplest possible setup, this is a one-word answer obtained via a softmax over some predefined vocabulary.

In [6]:
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras import Input

# Vocabulary sizes used by the embedding layers (text, question) and by the
# softmax output layer (answer) of the question-answering model.
text_vocabulary_size = question_vocabulary_size = 10_000
answer_vocabulary_size = 500

In [7]:
""" Input one -- text """
# Variable-length sequence of integer token ids. Naming the input is optional,
# but it allows the data to be fed by name later on.
text_input = Input(shape=(None,), dtype='int32', name='text')

# Map every token id to a 64-dimensional vector...
embedded_text = layers.Embedding(text_vocabulary_size, 64)(text_input)

# ...and let an LSTM collapse the whole sequence into one 32-dimensional vector.
encoded_text = layers.LSTM(32)(embedded_text)


""" Input two -- question """
# Same pipeline as for the text, but with its own (smaller) layer instances.
question_input = Input(shape=(None,), dtype='int32', name='question')
embedded_question = layers.Embedding(question_vocabulary_size, 32)(question_input)
encoded_question = layers.LSTM(16)(embedded_question)


""" Merging branches -- Concatenates the encoded question and encoded text """
# Join the two encodings along the feature axis.
concatenated = layers.Concatenate(axis=-1)([encoded_text, encoded_question])

# Softmax classifier over the answer vocabulary on top of the merged vector.
answer = layers.Dense(answer_vocabulary_size, activation='softmax')(concatenated)

# The model is instantiated from its two inputs and its single output.
model = Model(inputs=[text_input, question_input], outputs=answer)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
question (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 64)     640000      text[0][0]                       
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 32)     320000      question[0][0]                   
_______________________________________________________________________________________

How do you train this two-input model? There are two possible APIs: 

1. you can feed the model a list of Numpy arrays as inputs, or 
2. you can feed it a dictionary that maps input names to Numpy arrays. 

Naturally, the latter option is available only if you give names to your inputs.

In [8]:
"""
Feeding data to the model
"""

import numpy as np

num_samples = 1000
max_length = 100

# Generate two dummy training inputs -- {text, question}
# Token ids are drawn from [1, vocabulary_size); the upper bound is exclusive.
text = np.random.randint(1, text_vocabulary_size, size=(num_samples, max_length))
question = np.random.randint(1, question_vocabulary_size, size=(num_samples, max_length))

# Generate one dummy training output.
# Answers are one-hot encoded, not integers: pick one random answer index per
# sample and set exactly that column to 1. (np.random.randint(0, 1, ...) would
# return only zeros because its upper bound is exclusive.)
answer_indices = np.random.randint(0, answer_vocabulary_size, size=num_samples)
answers = np.zeros((num_samples, answer_vocabulary_size))
answers[np.arange(num_samples), answer_indices] = 1

# 1. Fitting using a list of inputs
# model.fit([text, question], answers, epochs=10, batch_size=128)

# 2. Fitting using a dictionary of inputs (only if inputs are named)
model.fit({'text': text, 'question': question}, answers, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7feecc56a940>

# Multi-output models

In the same way, you can use the functional API to build models with multiple outputs (or multiple heads).

A simple example is a network that attempts to simultaneously predict different properties of the data, such as a network that takes as input a series of social media posts from a single anonymous person and tries to predict attributes of that person, such as age, gender, and income level.

In [28]:
from tensorflow.keras.models import Model
from tensorflow.keras import Input, layers

vocabulary_size = 50000
num_income_groups = 10

# Variable-length sequence of integer token ids, embedded into 256-d vectors.
posts_input = Input(shape=(None,), dtype='int32', name='posts')
embedded_posts = layers.Embedding(vocabulary_size, 256)(posts_input)

# Shared 1D-convnet trunk. Conv1D and MaxPooling1D use their default 'valid'
# padding, so every layer shortens the sequence; working backwards through the
# kernel size 5 / pool size 5 stack, the input needs at least 269 time steps.
x = layers.Conv1D(128, 5, activation='relu')(embedded_posts)
x = layers.MaxPooling1D(5)(x)
for step in range(2):
    x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.MaxPooling1D(5)(x)
for step in range(2):
    x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation='relu')(x)

# Three heads on the shared representation. The output layers are named so
# that losses and targets can be specified by name.
age_prediction = layers.Dense(1, name='age')(x)
income_prediction = layers.Dense(num_income_groups, activation='softmax', name='income')(x)
gender_prediction = layers.Dense(1, activation='sigmoid', name='gender')(x)

model = Model(
    inputs=posts_input,
    outputs=[age_prediction, income_prediction, gender_prediction],
)
model.summary()

Model: "functional_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
posts (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, None, 256)    12800000    posts[0][0]                      
__________________________________________________________________________________________________
conv1d_25 (Conv1D)              (None, None, 128)    163968      embedding_7[0][0]                
__________________________________________________________________________________________________
max_pooling1d_10 (MaxPooling1D) (None, None, 128)    0           conv1d_25[0][0]                  
______________________________________________________________________________________

## Compilation options of a multi-output model: multiple losses

Importantly, training such a model requires the ability to specify different loss functions for different heads of the network: for instance, age prediction is a scalar regression task, but gender prediction is a binary classification task, requiring a different training procedure. But because gradient descent requires you to minimize a scalar, you must combine these losses into a single value in order to train the model. The simplest way to combine different losses is to sum them all. In Keras, you can use either a list or a dictionary of losses in compile to specify different objects for different outputs; the resulting loss values are summed into a global loss, which is minimized during training.

In [29]:
# Positional variant: one loss per output, in the order of the model outputs.
# model.compile(
#     optimizer='rmsprop',
#     loss=['mse', 'categorical_crossentropy', 'binary_crossentropy']
# )

# Equivalent variant keyed by output-layer name
# (possible only if you give names to the output layers).
losses_by_output = {
    'age': 'mse',
    'income': 'categorical_crossentropy',
    'gender': 'binary_crossentropy',
}
model.compile(optimizer='rmsprop', loss=losses_by_output)

## Loss weighting

Note that very imbalanced loss contributions will cause the model representations to be optimized preferentially for the task with the largest individual loss, at the expense of the other tasks. To remedy this, you can assign different levels of importance to the loss values in their contribution to the final loss. This is useful in particular if the losses' values use different scales.

In [30]:
# Positional variant: losses and their weights in the order of the model outputs.
# model.compile(
#     optimizer='rmsprop',
#     loss=['mse', 'categorical_crossentropy', 'binary_crossentropy'],
#     loss_weights=[0.25, 1., 10.]
# )

# Equivalent variant keyed by output-layer name
# (possible only if you give names to the output layers).
losses_by_output = {
    'age': 'mse',
    'income': 'categorical_crossentropy',
    'gender': 'binary_crossentropy',
}
# Relative contribution of each head's loss to the summed global loss.
weights_by_output = {
    'age': 0.25,
    'income': 1.,
    'gender': 10.,
}
model.compile(
    optimizer='rmsprop',
    loss=losses_by_output,
    loss_weights=weights_by_output,
)

## Feeding data to a multi-output model 

Much as in the case of multi-input models, you can pass Numpy data to the model for training either via a list of arrays or via a dictionary of arrays.

In [31]:
# Generate dummy training inputs.
num_posts = 1000
max_length = 100  # Posts have max word length of this much.

posts = np.random.randint(1, vocabulary_size, size=(num_posts,max_length))
age_targets = np.random.randint(18, 50, size=num_posts)               # ???? What is the right size???
income_targets = np.random.uniform(1000, 100000, size=num_posts)
gender_targets = np.random.randint(0, 1, size=num_posts)

# age_targets, income_targets, and gender_targets are assumed to be Numpy arrays.
# model.fit(
#     posts, 
#     [age_targets, income_targets, gender_targets], 
#     epochs=10, 
#     batch_size=64
# )

# Equivalent (possible only if you give names to the output layers)
model.fit(
    posts, 
    {'age': age_targets, 'income': income_targets, 'gender': gender_targets},
    epochs=10, 
    batch_size=64
)

Epoch 1/10


ValueError: in user code:

    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:747 train_step
        y_pred = self(x, training=True)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/functional.py:386 call
        inputs, training=training, mask=mask)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/functional.py:508 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/keras/layers/convolutional.py:247 call
        outputs = self._convolution_op(inputs, self.kernel)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py:1018 convolution_v2
        name=name)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py:1148 convolution_internal
        name=name)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:574 new_func
        return func(*args, **kwargs)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:574 new_func
        return func(*args, **kwargs)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py:1889 conv1d
        name=name)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py:979 conv2d
        data_format=data_format, dilations=dilations, name=name)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:744 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/framework/func_graph.py:593 _create_op_internal
        compute_device)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py:3485 _create_op_internal
        op_def=op_def)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py:1975 __init__
        control_input_ops, op_def)
    /home/ansel/.local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py:1815 _create_c_op
        raise ValueError(str(e))

    ValueError: Negative dimension size caused by subtracting 5 from 2 for '{{node functional_15/conv1d_28/conv1d}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](functional_15/conv1d_28/conv1d/ExpandDims, functional_15/conv1d_28/conv1d/ExpandDims_1)' with input shapes: [?,1,2,256], [1,5,256,256].


# Directed acyclic graphs of layers

With the functional API, not only can you build models with multiple inputs and multiple outputs, but you can also implement networks with a complex internal topology. Neural networks in Keras are allowed to be arbitrary directed acyclic graphs of layers. The qualifier acyclic is important: these graphs can’t have cycles. It’s impossible for a tensor x to become the input of one of the layers that generated x. The only processing loops that are allowed (that is, recurrent connections) are those internal to recurrent layers.

## Inception modules

Inception is a popular type of network architecture for convolutional neural networks; it was developed by Christian Szegedy and his colleagues at Google in 2013–2014, inspired by the earlier network-in-network architecture. It consists of a stack of modules that themselves look like small independent networks, split into several parallel branches. Here's an example, taken from Inception V3:

In [None]:
# Stand-in for the 4D feature map entering the module
# (batch, height, width, channels); any real feature map can be used instead.
x = Input(shape=(32, 32, 256))

# Every branch has the same overall stride (2) and uses padding='same', which
# is necessary to keep all branch outputs the same spatial size so you can
# concatenate them. With the default 'valid' padding the 1x1 and 3x3 paths
# would shrink the feature map by different amounts.
branch_a = layers.Conv2D(128, 1, activation='relu', strides=2, padding='same')(x)

# In this branch, the striding occurs in the spatial convolution layer.
branch_b = layers.Conv2D(128, 1, activation='relu', padding='same')(x)
branch_b = layers.Conv2D(128, 3, activation='relu', strides=2, padding='same')(branch_b)

# In this branch, the striding occurs in the average pooling layer.
branch_c = layers.AveragePooling2D(3, strides=2, padding='same')(x)
branch_c = layers.Conv2D(128, 3, activation='relu', padding='same')(branch_c)

# In this branch, the striding occurs in the last spatial convolution layer.
branch_d = layers.Conv2D(128, 1, activation='relu', padding='same')(x)
branch_d = layers.Conv2D(128, 3, activation='relu', padding='same')(branch_d)
branch_d = layers.Conv2D(128, 3, activation='relu', strides=2, padding='same')(branch_d)

# Concatenates the branch outputs along the channel axis to obtain the module output
output = layers.concatenate([branch_a, branch_b, branch_c, branch_d], axis=-1)

## Residual connections

Residual connections are a common graph-like network component found in many post-2015 network architectures, including Xception. A residual connection consists of making the output of an earlier layer available as input to a later layer, effectively creating a shortcut in a sequential network. Rather than being concatenated to the later activation, the earlier output is summed with the later activation, which assumes that both activations are the same size. If they’re different sizes, you can use a linear transformation to reshape the earlier activation into the target shape (for example, a Dense layer without an activation or, for convolutional feature maps, a 1 × 1 convolution without an activation).

Here’s how to implement a residual connection in Keras when the feature-map sizes are the same, using identity residual connections. This example assumes the existence of a 4D input tensor x:

In [None]:
from tensorflow.keras import Input, layers

# Stand-in 4D input tensor. It already has 128 channels, so it can be added
# to the transformed features without any reshaping.
x = Input(shape=(32, 32, 128))

# Applies a transformation to x; padding='same' keeps the spatial size
# unchanged so that y and x stay shape-compatible.
y = layers.Conv2D(128, 3, activation='relu', padding='same')(x)
y = layers.Conv2D(128, 3, activation='relu', padding='same')(y)
y = layers.Conv2D(128, 3, activation='relu', padding='same')(y)

# Adds the original x back to the output features
y = layers.add([y, x])

The following implements a residual connection when the feature-map sizes differ, using a linear residual connection (again, assuming the existence of a 4D input tensor x):

In [None]:
# Use tensorflow.keras like the rest of the notebook; layers from the
# standalone keras package should not be mixed with tf.keras layers.
from tensorflow.keras import Input, layers

# Stand-in 4D input tensor whose shape differs from the block output
# (64 channels in, 128 channels and half the resolution out).
x = Input(shape=(32, 32, 64))

y = layers.Conv2D(128, 3, activation='relu', padding='same')(x)
y = layers.Conv2D(128, 3, activation='relu', padding='same')(y)
# padding='same' makes the pooled size ceil(n / 2), identical to the strided
# 1 x 1 convolution below, so the addition also works for odd input sizes.
y = layers.MaxPooling2D(2, strides=2, padding='same')(y)

# Uses a 1 × 1 convolution (no activation, i.e. linear) to downsample
# the original x tensor to the same shape as y
residual = layers.Conv2D(128, 1, strides=2, padding='same')(x)

# Adds the residual tensor back to the output features
y = layers.add([y, residual])

## Layer weight sharing

One more important feature of the functional API is the ability to reuse a layer instance several times. When you call a layer instance twice, instead of instantiating a new layer for each call, you reuse the same weights with every call. This allows you to build models that have branches that share the same knowledge and perform the same operations. That is, they share the same representations and learn these representations simultaneously for different sets of inputs.

For example, consider a model that attempts to assess the semantic similarity between two sentences. The model has two inputs (the two sentences to compare) and outputs a score between 0 and 1, where 0 means unrelated sentences and 1 means sentences that are either identical or reformulations of each other.

In this setup, the two input sentences are interchangeable, because semantic similarity is a symmetrical relationship: the similarity of A to B is identical to the similarity of B to A. For this reason, it wouldn’t make sense to learn two independent models for processing each input sentence. Rather, you want to process both with a single LSTM layer. The representations of this LSTM layer (its weights) are learned based on both inputs simultaneously. This is what we call a Siamese LSTM model or a shared LSTM.


In [None]:
# Instantiates a single LSTM layer, once
import numpy as np

# Instantiates a single LSTM layer, once
lstm = layers.LSTM(32)

# Building the left branch of the model:
# inputs are variable-length sequences of vectors of size 128.
left_input = Input(shape=(None, 128))
left_output = lstm(left_input)

# Building the right branch of the model:
# when you call an existing layer instance, you reuse its weights.
right_input = Input(shape=(None, 128))
right_output = lstm(right_input)

# Builds the classifier on top
merged = layers.concatenate([left_output, right_output], axis=-1)
predictions = layers.Dense(1, activation='sigmoid')(merged)

# Instantiating the model: when you train such a model,
# the weights of the LSTM layer are updated based on both inputs.
model = Model([left_input, right_input], predictions)

# A model has to be compiled before it can be trained; the sigmoid output
# is a similarity score in [0, 1], hence binary cross-entropy.
model.compile(optimizer='rmsprop', loss='binary_crossentropy')

# Dummy sentence pairs for demonstration: sequences of 128-d vectors
# and a 0/1 "similar or not" target per pair.
num_pairs = 1000
sequence_length = 20
left_data = np.random.random((num_pairs, sequence_length, 128))
right_data = np.random.random((num_pairs, sequence_length, 128))
targets = np.random.randint(0, 2, size=(num_pairs, 1))

model.fit([left_data, right_data], targets)

## Models as layers

Importantly, in the functional API, models can be used as you’d use layers—effectively, you can think of a model as a “bigger layer.” This is true of both the Sequential and Model classes. When you call a model instance, you’re reusing the weights of the model, exactly like what happens when you call a layer instance. Calling an instance, whether it’s a layer instance or a model instance, will always reuse the existing learned representations of the instance, which is intuitive.

One simple practical example of what you can build by reusing a model instance is a vision model that uses a dual camera as its input: two parallel cameras, a few centimeters (one inch) apart. Such a model can perceive depth, which can be useful in many applications.

You shouldn’t need two independent models to extract visual features from the left camera and the right camera before merging the two feeds. Such low-level processing can be shared across the two inputs: that is, done via layers that use the same weights and thus share the same representations. Here's how you'd implement a Siamese vision model (shared convolutional base) in Keras:


In [None]:
from tensorflow.keras import layers, Model
from tensorflow.keras import applications
from tensorflow.keras import Input

# The base image-processing model is the Xception network (convolutional base only).
# The base image-processing model is the Xception network (convolutional base
# only). weights=None means random initialization, so nothing is downloaded.
xception_base = applications.Xception(weights=None, include_top=False)

# The inputs are 250 × 250 RGB images.
left_input = Input(shape=(250, 250, 3))
right_input = Input(shape=(250, 250, 3))

# Calls the same vision model twice, so both feeds share the same weights.
# The results get their own names: rebinding right_input to the feature
# tensor would lose the handle to the right Input, which is still needed to
# build a Model from [left_input, right_input].
left_features = xception_base(left_input)
right_features = xception_base(right_input)

# The merged features contain information from the right visual feed and the left visual feed.
merged_features = layers.concatenate([left_features, right_features], axis=-1)