In [1]:
import numpy as np
import pandas as pd 

In [10]:
# Seed the legacy global NumPy RNG first so every random draw below is reproducible.
np.random.seed(42)

# Problem dimensions: width of each state vector, width of the attention
# network's hidden layer, and number of encoder states.
hidden_size, attention_size, input_length = 16, 10, 5

# Synthetic test data: one row per encoder state, plus a single decoder state row.
encoder_states = np.random.standard_normal((input_length, hidden_size))
decoder_state = np.random.standard_normal((1, hidden_size))

In [11]:
# Shape: (input_length, hidden_size) = (5, 16), one encoder state per row
encoder_states

array([[ 0.49671415, -0.1382643 ,  0.64768854,  1.52302986, -0.23415337,
        -0.23413696,  1.57921282,  0.76743473, -0.46947439,  0.54256004,
        -0.46341769, -0.46572975,  0.24196227, -1.91328024, -1.72491783,
        -0.56228753],
       [-1.01283112,  0.31424733, -0.90802408, -1.4123037 ,  1.46564877,
        -0.2257763 ,  0.0675282 , -1.42474819, -0.54438272,  0.11092259,
        -1.15099358,  0.37569802, -0.60063869, -0.29169375, -0.60170661,
         1.85227818],
       [-0.01349722, -1.05771093,  0.82254491, -1.22084365,  0.2088636 ,
        -1.95967012, -1.32818605,  0.19686124,  0.73846658,  0.17136828,
        -0.11564828, -0.3011037 , -1.47852199, -0.71984421, -0.46063877,
         1.05712223],
       [ 0.34361829, -1.76304016,  0.32408397, -0.38508228, -0.676922  ,
         0.61167629,  1.03099952,  0.93128012, -0.83921752, -0.30921238,
         0.33126343,  0.97554513, -0.47917424, -0.18565898, -1.10633497,
        -1.19620662],
       [ 0.81252582,  1.35624003, -0

In [12]:
# Shape: (1, hidden_size) = (1, 16), a single decoder state
decoder_state

array([[-0.21967189,  0.35711257,  1.47789404, -0.51827022, -0.8084936 ,
        -0.50175704,  0.91540212,  0.32875111, -0.5297602 ,  0.51326743,
         0.09707755,  0.96864499, -0.70205309, -0.32766215, -0.39210815,
        -1.46351495]])

In [13]:
# Stack input_length copies of the single decoder state so it has one row per
# encoder state and can be concatenated column-wise with encoder_states.
repeated_decoder_state = np.tile(decoder_state, (input_length, 1))
repeated_decoder_state

array([[-0.21967189,  0.35711257,  1.47789404, -0.51827022, -0.8084936 ,
        -0.50175704,  0.91540212,  0.32875111, -0.5297602 ,  0.51326743,
         0.09707755,  0.96864499, -0.70205309, -0.32766215, -0.39210815,
        -1.46351495],
       [-0.21967189,  0.35711257,  1.47789404, -0.51827022, -0.8084936 ,
        -0.50175704,  0.91540212,  0.32875111, -0.5297602 ,  0.51326743,
         0.09707755,  0.96864499, -0.70205309, -0.32766215, -0.39210815,
        -1.46351495],
       [-0.21967189,  0.35711257,  1.47789404, -0.51827022, -0.8084936 ,
        -0.50175704,  0.91540212,  0.32875111, -0.5297602 ,  0.51326743,
         0.09707755,  0.96864499, -0.70205309, -0.32766215, -0.39210815,
        -1.46351495],
       [-0.21967189,  0.35711257,  1.47789404, -0.51827022, -0.8084936 ,
        -0.50175704,  0.91540212,  0.32875111, -0.5297602 ,  0.51326743,
         0.09707755,  0.96864499, -0.70205309, -0.32766215, -0.39210815,
        -1.46351495],
       [-0.21967189,  0.35711257,  1

In [14]:
# Random stand-ins for the attention network's weights; in a real model these
# would be learned during training. The alignment function below reads them.
# layer_1 maps a concatenated (encoder, decoder) state pair to the hidden layer;
# layer_2 maps the hidden layer to a single score.
layer_1 = np.random.standard_normal((2 * hidden_size, attention_size))
layer_2 = np.random.standard_normal((attention_size, 1))


In [5]:
# NOTE(review): this cell's execution count (In [5]) is lower than that of the
# cell defining layer_2 (In [14]), so the values displayed below may be stale.
# Re-run this cell after the weights cell to confirm.
layer_2

array([[ 1.55115198],
       [ 0.11567463],
       [ 1.17929718],
       [ 0.06751848],
       [ 2.06074792],
       [ 1.75534084],
       [-0.24896415],
       [ 0.97157095],
       [ 0.64537595],
       [ 1.36863156]])

In [15]:
def alignment(encoder_states, decoder_state):
    """ Score every encoder state against the decoder state with a two-layer network

        Each encoder state is concatenated with the decoder state, multiplied by
        layer_1 with a tanh activation, then multiplied by layer_2 to give one
        scalar score per encoder state.

        Arguments:
        encoder_states: NxM numpy array, one encoder state per row
        decoder_state: 1xM numpy array (shared by every encoder state), or an NxM
                       array holding one decoder row per encoder state

        Returns:
        Nx1 numpy array of unnormalised alignment scores

        Reads the notebook-level weight matrices layer_1 and layer_2.
    """
    encoder_states = np.asarray(encoder_states)
    num_states = encoder_states.shape[0]

    # Give every encoder state its own decoder row. This is a no-op when the
    # caller already repeated the decoder state, and expands a single row otherwise.
    decoder_rows = np.broadcast_to(
        decoder_state, (num_states, np.shape(decoder_state)[-1])
    )

    # First, concatenate the encoder states and the decoder state.
    inputs = np.concatenate((encoder_states, decoder_rows), axis=1)
    # Shapes are checked against the actual inputs and weights rather than
    # notebook-level size constants, so any sequence length is accepted.
    assert inputs.shape == (num_states, layer_1.shape[0])

    # Matrix multiplication of the concatenated inputs and the first layer, with tanh activation
    activations = np.tanh(np.matmul(inputs, layer_1))
    assert activations.shape == (num_states, layer_2.shape[0])

    # Matrix multiplication of the activations with the second layer (no activation here)
    scores = np.matmul(activations, layer_2)
    assert scores.shape == (num_states, 1)

    return scores

In [16]:
# Run this to test your alignment function: it should produce one score per
# encoder state, i.e. an array of shape (input_length, 1)
scores = alignment(encoder_states, repeated_decoder_state)
print(scores)

(5, 32)
[[4.35790943]
 [5.92373433]
 [4.18673175]
 [2.11437202]
 [0.95767155]]


In [17]:
def softmax(x, axis=0):
    """ Calculate softmax function for an array x along specified axis
    
        axis=0 calculates softmax across rows which means each column sums to 1 
        axis=1 calculates softmax across columns which means each row sums to 1

        The maximum along `axis` is subtracted before exponentiating. This does
        not change the mathematical result, but it stops np.exp from overflowing
        to inf (and the division from producing nan) when inputs are large.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty reduction.
        return np.exp(x)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

In [19]:
def attention(encoder_states, decoder_state):
    """ Example function that calculates attention, returns the context vector 
    
        Arguments:
        encoder_states: NxM numpy array, where N is the number of vectors and M is the vector length
        decoder_state: 1xM numpy array, M is the vector length, must be the same M as encoder_states
                       (an NxM array with the decoder state already repeated once per encoder state also works)

        Returns:
        length-M numpy array, the sum of the encoder states weighted by their attention weights
    """ 
    encoder_states = np.asarray(encoder_states)

    # Expand the decoder state to one row per encoder state before scoring,
    # because the scoring step pairs the two arrays row by row.
    decoder_rows = np.broadcast_to(
        decoder_state, (encoder_states.shape[0], np.shape(decoder_state)[-1])
    )

    # First, score each encoder state against the decoder state
    scores = alignment(encoder_states, decoder_rows)
    
    # Then take the softmax of those scores to get a weight distribution
    weights = softmax(scores)
    
    # Multiply each encoder state by its respective weight
    # (the Nx1 weights broadcast across the M columns)
    weighted_states = encoder_states * weights
    
    # Sum up the weighted encoder states
    context = np.sum(weighted_states, axis=0)
    
    return context

# Compute the context vector for the synthetic states; the result has one entry
# per hidden dimension (hidden_size values)
context_vector = attention(encoder_states, repeated_decoder_state)
print(context_vector)

(5, 32)
[-0.63514569  0.04917298 -0.43930867 -0.9268003   1.01903919 -0.43181409
  0.13365099 -0.84746874 -0.37572203  0.18279832 -0.90452701  0.17872958
 -0.58015282 -0.58294027 -0.75457577  1.32985756]
