# The Import Section 

In [2]:
from pandas import read_csv
import numpy as np
from keras import Model
from keras.layers import Layer
import keras.backend as K
from keras.layers import Input, Dense, SimpleRNN
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.metrics import mean_squared_error

# Preparing the dataset

The following function generates a sequence of n Fibonacci numbers. If scale_data is set to True, it also uses the MinMaxScaler from scikit-learn to scale the values between 0 and 1. Let's see its output for n=10.

Next, we need a function get_fib_XY() that reformats the sequence into training examples and target values to be used by the Keras input layer. When given time_steps as a parameter, get_fib_XY() constructs each row of the dataset with time_steps number of columns. This function not only constructs the training set and test set from the Fibonacci sequence but also shuffles the training examples and reshapes them to the required TensorFlow format, i.e., total_samples x time_steps x features. Also, the function returns the scaler object that scales the values if scale_data is set to True.

Let’s generate a small training set to see what it looks like. We have set time_steps=3 and total_fib_numbers=12, with approximately 70% of the examples going toward the training set and the remainder held out as test points. Note the training and test examples have been shuffled by the permutation() function.

In [3]:
def get_fib_seq(n, scale_data=True):
    """Build the first ``n`` terms of the sequence 1, 2, 3, 5, 8, ...

    Args:
        n: Number of terms to generate.
        scale_data: When True, rescale the terms into [0, 1] with a
            ``MinMaxScaler`` fitted on the generated sequence.

    Returns:
        A ``(seq, scaler)`` pair. ``seq`` is a 1-D float array of length
        ``n``. ``scaler`` is the fitted ``MinMaxScaler`` when scaling was
        requested and an empty list otherwise.
    """
    values = np.zeros(n)
    older, newer = 0.0, 1.0
    for pos in range(n):
        # Each term is the sum of the two before it (seeded with 0 and 1).
        values[pos] = older + newer
        older, newer = newer, values[pos]

    if not scale_data:
        return values, []

    fitted = MinMaxScaler(feature_range=(0, 1))
    # The scaler works on a 2-D column; flatten back to 1-D afterwards.
    column = np.reshape(values, (n, 1))
    return fitted.fit_transform(column).flatten(), fitted
 
# Preview the first ten raw (unscaled) terms; only the sequence is kept.
fib_seq = get_fib_seq(n=10, scale_data=False)[0]
print(fib_seq)

[ 1.  2.  3.  5.  8. 13. 21. 34. 55. 89.]


Next, we need a function get_fib_XY() that reformats the sequence into training examples and target values to be used by the Keras input layer. When given time_steps as a parameter, get_fib_XY() constructs each row of the dataset with time_steps number of columns. This function not only constructs the training set and test set from the Fibonacci sequence but also shuffles the training examples and reshapes them to the required TensorFlow format, i.e., total_samples x time_steps x features. Also, the function returns the scaler object that scales the values if scale_data is set to True.

Let’s generate a small training set to see what it looks like. We have set time_steps=3 and total_fib_numbers=12, with approximately 70% of the examples going toward the training set and the remainder held out as test points. Note the training and test examples have been shuffled by the permutation() function.

In [4]:
def get_fib_XY(total_fib_numbers, time_steps, train_percent, scale_data=True, seed=13):
    """Turn the Fibonacci sequence into shuffled sliding-window train/test sets.

    Each sample is ``time_steps`` consecutive terms of the sequence and its
    target is the term that immediately follows that window. Samples are
    shuffled before the split, so the test samples are drawn from anywhere in
    the sequence rather than from its tail.

    Args:
        total_fib_numbers: Length of the sequence to generate.
        time_steps: Number of consecutive terms per input window (>= 1).
        train_percent: Fraction of the samples, in [0, 1], used for training.
        scale_data: Forwarded to ``get_fib_seq``.
        seed: Seed of the ``RandomState`` that shuffles the samples. The
            default of 13 keeps the split that was previously hard-coded.

    Returns:
        ``(trainX, trainY, testX, testY, scaler)``. The X arrays have shape
        ``(samples, time_steps, 1)``, the Y arrays have shape ``(samples,)``
        and ``scaler`` is the second value returned by ``get_fib_seq``.

    Raises:
        ValueError: If ``time_steps`` is smaller than 1, ``train_percent`` is
            outside [0, 1], or the sequence is too short to form at least one
            window together with its target.
    """
    if time_steps < 1:
        raise ValueError(f"time_steps must be >= 1, got {time_steps}")
    if not 0.0 <= train_percent <= 1.0:
        raise ValueError(f"train_percent must be within [0, 1], got {train_percent}")
    if total_fib_numbers <= time_steps:
        raise ValueError(
            "total_fib_numbers must exceed time_steps to form at least one "
            f"sample, got total_fib_numbers={total_fib_numbers}, time_steps={time_steps}"
        )

    dat, scaler = get_fib_seq(total_fib_numbers, scale_data)

    # Targets are every term that has a full window of predecessors.
    Y = dat[time_steps:]
    rows_x = len(Y)

    # Column k holds the k-th term of every window, so row i is dat[i:i+time_steps].
    X = np.column_stack([dat[offset:offset + rows_x] for offset in range(time_steps)])

    # Reproducible shuffle followed by a contiguous split of the shuffled order.
    rand = np.random.RandomState(seed=seed)
    idx = rand.permutation(rows_x)
    split = int(train_percent * rows_x)
    train_ind, test_ind = idx[:split], idx[split:]

    # Keras recurrent layers expect (samples, time_steps, features).
    trainX = np.reshape(X[train_ind], (len(train_ind), time_steps, 1))
    testX = np.reshape(X[test_ind], (len(test_ind), time_steps, 1))
    return trainX, Y[train_ind], testX, Y[test_ind], scaler
 
# Small unscaled demo: 12 terms, windows of 3, 70% of the samples for training.
trainX, trainY, testX, testY, scaler = get_fib_XY(
    total_fib_numbers=12, time_steps=3, train_percent=0.7, scale_data=False
)
print('trainX = ', trainX)
print('trainY = ', trainY)

trainX =  [[[ 8.]
  [13.]
  [21.]]

 [[ 5.]
  [ 8.]
  [13.]]

 [[ 2.]
  [ 3.]
  [ 5.]]

 [[13.]
  [21.]
  [34.]]

 [[21.]
  [34.]
  [55.]]

 [[34.]
  [55.]
  [89.]]]
trainY =  [ 34.  21.   8.  55.  89. 144.]


# Setting Up the Network

Now let’s set up a small network with two layers. The first one is the SimpleRNN layer, and the second one is the Dense layer. Below is a summary of the model.

In [5]:
# Hyperparameters shared by the models built in this notebook
epochs = 30          # number of passes over the training set
hidden_units = 2     # units in the SimpleRNN layer
time_steps = 20      # length of each input window
 
# Baseline model: a SimpleRNN layer feeding a Dense output layer
def create_RNN(hidden_units, dense_units, input_shape, activation):
    """Build and compile the plain (attention-free) recurrent model.

    Args:
        hidden_units: Number of units in the SimpleRNN layer.
        dense_units: Number of units in the Dense output layer.
        input_shape: Shape of one input sample, passed to the SimpleRNN layer.
        activation: Indexable pair of activations; item 0 is used by the
            SimpleRNN layer and item 1 by the Dense layer.

    Returns:
        The Sequential model, compiled with MSE loss and the Adam optimizer.
    """
    rnn_activation = activation[0]
    dense_activation = activation[1]

    recurrent = SimpleRNN(hidden_units, input_shape=input_shape, activation=rnn_activation)
    head = Dense(units=dense_units, activation=dense_activation)

    model = Sequential()
    for layer in (recurrent, head):
        model.add(layer)
    model.compile(loss='mse', optimizer='adam')
    return model
 
# Instantiate the baseline model (tanh in both layers) and list its layers
model_RNN = create_RNN(
    hidden_units=hidden_units,
    dense_units=1,
    input_shape=(time_steps, 1),
    activation=['tanh', 'tanh'],
)
model_RNN.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 2)                 8         
                                                                 
 dense (Dense)               (None, 1)                 3         
                                                                 
Total params: 11
Trainable params: 11
Non-trainable params: 0
_________________________________________________________________


# Train the Network and Evaluate

The next step is to add code that generates a dataset, trains the network, and evaluates it. This time around, we’ll scale the data between 0 and 1. We don’t need to pass the scale_data parameter as its default value is True. 

In [6]:
# Build the dataset; scale_data is left at its default of True
trainX, trainY, testX, testY, scaler = get_fib_XY(
    total_fib_numbers=1200, time_steps=time_steps, train_percent=0.7
)

# Fit the baseline model one sample per batch
model_RNN.fit(trainX, trainY, batch_size=1, epochs=epochs, verbose=2)

# Evaluate the model on both splits
train_mse = model_RNN.evaluate(trainX, trainY)
test_mse = model_RNN.evaluate(testX, testY)

# Report the errors
print("Train set MSE = ", train_mse)
print("Test set MSE = ", test_mse)

Epoch 1/30
826/826 - 3s - loss: 9.8271e-04 - 3s/epoch - 4ms/step
Epoch 2/30
826/826 - 2s - loss: 8.2982e-04 - 2s/epoch - 3ms/step
Epoch 3/30
826/826 - 2s - loss: 6.8397e-04 - 2s/epoch - 2ms/step
Epoch 4/30
826/826 - 2s - loss: 5.4779e-04 - 2s/epoch - 2ms/step
Epoch 5/30
826/826 - 2s - loss: 4.5016e-04 - 2s/epoch - 2ms/step
Epoch 6/30
826/826 - 2s - loss: 3.5202e-04 - 2s/epoch - 2ms/step
Epoch 7/30
826/826 - 2s - loss: 2.7377e-04 - 2s/epoch - 2ms/step
Epoch 8/30
826/826 - 2s - loss: 2.0426e-04 - 2s/epoch - 3ms/step
Epoch 9/30
826/826 - 2s - loss: 1.4902e-04 - 2s/epoch - 2ms/step
Epoch 10/30
826/826 - 2s - loss: 1.1409e-04 - 2s/epoch - 3ms/step
Epoch 11/30
826/826 - 2s - loss: 9.2090e-05 - 2s/epoch - 3ms/step
Epoch 12/30
826/826 - 3s - loss: 7.2412e-05 - 3s/epoch - 3ms/step
Epoch 13/30
826/826 - 2s - loss: 6.5676e-05 - 2s/epoch - 3ms/step
Epoch 14/30
826/826 - 2s - loss: 6.2180e-05 - 2s/epoch - 3ms/step
Epoch 15/30
826/826 - 2s - loss: 4.4382e-05 - 2s/epoch - 2ms/step
Epoch 16/30
826/826

# Adding a Custom Attention Layer to the Network

In Keras, it is easy to create a custom layer that implements attention by subclassing the Layer class. The Keras guide lists clear steps for creating a new layer via subclassing. You’ll use those guidelines here. All the weights and biases corresponding to a single layer are encapsulated by this class. You need to write the __init__ method as well as override the following methods:

build(): The Keras guide recommends adding weights in this method once the size of the inputs is known. This method “lazily” creates weights. The built-in function add_weight() can be used to add the weights and biases of the attention layer.
call(): The call() method implements the mapping of inputs to outputs. It should implement the forward pass during training.

# The Call Method for the Attention Layer

The call method of the attention layer has to compute the alignment scores, weights, and context. You can go through the details of these parameters in Stefania’s excellent article on The Attention Mechanism from Scratch. You’ll implement the Bahdanau attention in your call() method.

The good thing about inheriting a layer from the Keras Layer class and adding the weights via the add_weight() method is that weights are automatically tuned. Keras does an equivalent of “reverse engineering” of the operations/computations of the call() method and calculates the gradients during training. It is important to specify trainable=True when adding the weights. If you need your own method for weight training, you can also override the train_step() method of a custom Model that contains the layer.


In [7]:
# Custom attention layer that pools per-time-step outputs into one vector
class attention(Layer):
    """Attention pooling over axis 1 (the time axis) of a 3-D input.

    A scalar score is computed for every time step, the scores are turned
    into weights with a softmax, and the input is summed over axis 1 using
    those weights.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable score weights once the input shape is known."""
        n_steps = input_shape[1]
        n_features = input_shape[-1]

        # Projects the features of one time step onto a single score.
        self.W = self.add_weight(name='attention_weight',
                                 shape=(n_features, 1),
                                 initializer='random_normal',
                                 trainable=True)
        # One bias per time step.
        self.b = self.add_weight(name='attention_bias',
                                 shape=(n_steps, 1),
                                 initializer='zeros',
                                 trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # Alignment scores squashed by tanh, with the trailing size-1 axis dropped.
        scores = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)

        # Softmax turns the scores into weights; restore the trailing axis so
        # the weights broadcast against the features of ``x``.
        weights = K.expand_dims(K.softmax(scores), axis=-1)

        # Context vector: weighted sum over the time axis.
        return K.sum(x * weights, axis=1)

# RNN Network with Attention Layer

Let’s now add an attention layer to the RNN network you created earlier. The function create_RNN_with_attention() now specifies an RNN layer, an attention layer, and a Dense layer in the network. Make sure to set return_sequences=True when specifying the SimpleRNN. This will return the output of the hidden units for all the previous time steps.

In [8]:
def create_RNN_with_attention(hidden_units, dense_units, input_shape, activation):
    """Build and compile the SimpleRNN -> attention -> Dense model.

    Args:
        hidden_units: Number of units in the SimpleRNN layer.
        dense_units: Number of units in the Dense output layer.
        input_shape: Shape of one input sample, used for the Input layer.
        activation: Activation applied by both the SimpleRNN and Dense layers.

    Returns:
        The functional Model, compiled with MSE loss and the Adam optimizer.
    """
    inputs = Input(shape=input_shape)

    # return_sequences=True keeps the output of every time step so the
    # attention layer has a sequence to pool over.
    recurrent = SimpleRNN(hidden_units, return_sequences=True, activation=activation)
    step_outputs = recurrent(inputs)

    context = attention()(step_outputs)

    head = Dense(dense_units, trainable=True, activation=activation)
    predictions = head(context)

    model = Model(inputs, predictions)
    model.compile(loss='mse', optimizer='adam')
    return model
 
# Instantiate the attention model with the shared hyperparameters and list its layers
model_attention = create_RNN_with_attention(
    hidden_units=hidden_units,
    dense_units=1,
    input_shape=(time_steps, 1),
    activation='tanh',
)
model_attention.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20, 1)]           0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 20, 2)             8         
                                                                 
 attention (attention)       (None, 2)                 22        
                                                                 
 dense_1 (Dense)             (None, 1)                 3         
                                                                 
Total params: 33
Trainable params: 33
Non-trainable params: 0
_________________________________________________________________


# Train and Evaluate the Deep Learning Network with Attention

It’s time to train and test your model and see how it performs in predicting the next Fibonacci number of a sequence.

In [9]:
# Fit the attention model one sample per batch
model_attention.fit(trainX, trainY, batch_size=1, epochs=epochs, verbose=2)

# Evaluate the model on both splits
train_mse_attn = model_attention.evaluate(trainX, trainY)
test_mse_attn = model_attention.evaluate(testX, testY)

# Report the errors
print("Train set MSE with attention = ", train_mse_attn)
print("Test set MSE with attention = ", test_mse_attn)

Epoch 1/30
826/826 - 3s - loss: 0.0014 - 3s/epoch - 4ms/step
Epoch 2/30
826/826 - 2s - loss: 0.0013 - 2s/epoch - 2ms/step
Epoch 3/30
826/826 - 2s - loss: 0.0013 - 2s/epoch - 2ms/step
Epoch 4/30
826/826 - 3s - loss: 0.0013 - 3s/epoch - 4ms/step
Epoch 5/30
826/826 - 2s - loss: 0.0013 - 2s/epoch - 3ms/step
Epoch 6/30
826/826 - 2s - loss: 0.0013 - 2s/epoch - 3ms/step
Epoch 7/30
826/826 - 3s - loss: 0.0012 - 3s/epoch - 3ms/step
Epoch 8/30
826/826 - 3s - loss: 0.0012 - 3s/epoch - 4ms/step
Epoch 9/30
826/826 - 2s - loss: 0.0012 - 2s/epoch - 3ms/step
Epoch 10/30
826/826 - 2s - loss: 0.0012 - 2s/epoch - 2ms/step
Epoch 11/30
826/826 - 2s - loss: 0.0011 - 2s/epoch - 3ms/step
Epoch 12/30
826/826 - 2s - loss: 0.0011 - 2s/epoch - 3ms/step
Epoch 13/30
826/826 - 2s - loss: 0.0010 - 2s/epoch - 3ms/step
Epoch 14/30
826/826 - 2s - loss: 9.9590e-04 - 2s/epoch - 3ms/step
Epoch 15/30
826/826 - 2s - loss: 9.7308e-04 - 2s/epoch - 3ms/step
Epoch 16/30
826/826 - 2s - loss: 9.3787e-04 - 2s/epoch - 2ms/step
Epoch