The image was taken from this [reference](https://medium.com/heuritech/attention-mechanism-5aba9a2d4727).


![image](workshop/attention_tutorial.webp)

<div class="alert alert-block alert-info">
<b>Example:</b> <br>
- I have a batch which contains 64 sentences <br>
- Each sentence contains 30 words <br>
- Each word is embedded into a vecotor with length 100 => data shape is: (64, 30, 100) <br>
- I pass this input to a Bidirectional LSTM with 128 units <br>
- My goal is to predict whether this is a posivie sentence (0) or a negative sentence (1) => Binary classification <br>
</div>

In [1]:
from keras import backend as K
import tensorflow as tf
from keras import Model
from keras.layers import Layer, Dense, LSTM, Input, Bidirectional

In [39]:
print("tensorflow version:", tf.__version__)

tensorflow version: 2.6.0


In [2]:
input = tf.random.normal([64, 30, 100])
lstm = Bidirectional(LSTM(128, return_sequences=True))
lstm_output = lstm(input)

print("Input shape:", input.shape)
print("Output shape", lstm_output.shape)

Input shape: (64, 30, 100)
Output shape (64, 30, 256)


<div class="alert alert-block alert-info">
<b>Choices:</b> <br>
Feed this output to a Dense layer with 1 neuron to get 0 or 1 for classes, or use attention mechanism.
<br>
</div>

<div class="alert alert-block alert-success">
<b>Steps:</b>
    <ol>
     <li>Calculate the score function (dot product)</li>
     <li>Normalize the scores</li>
     <li>Get the attention weights</li>
     <li>Multiply Values with the attention weights</li>
     <li>Compute the context vector (summation)</li>  
    </ol>
</div>

<div class="alert alert-block alert-warning">
<b>dot product:</b> <br>
    - $\ a = [1,2,3]$ <br>
    - $\ b = [4,5,6]$ <br>
    - $\ a.b = (a[0]*b[0]) + (a[1]*b[1]) + (a[2]*b[2])$ <br>
</div>

<div class="alert alert-block alert-info">
Compute the score function
</div>

In [12]:
attention_input = lstm_output
num_neurons = 1
num_dim_perword = attention_input.shape[-1]
w = tf.random.normal(shape=(num_dim_perword, num_neurons))
score = K.dot(attention_input, w)
print("weight size:", w.shape)
print("attention layer's input size:", attention_input.shape)
print("score shape", score.shape)

weight size: (256, 1)
attention layer's input size: (64, 30, 256)
score shape (64, 30, 1)


<div class="alert alert-block alert-info">
    Let's add a bias and apply an activation function
</div>

In [18]:
num_words_per_sentence = attention_input.shape[-2]
b = tf.random.normal(shape=(num_words_per_sentence, num_neurons))
score = score + b
# apply tanh
score = K.tanh(score)
print("bias shape:", b.shape)
print("score shape", score.shape)

bias shape: (30, 1)
score shape (64, 30, 1)


<div class="alert alert-block alert-warning">
<b>Why would I want to apply tanh?</b>
</div>

In [19]:
score[0]

<tf.Tensor: shape=(30, 1), dtype=float32, numpy=
array([[ 0.00671984],
       [-0.79233164],
       [ 0.84262574],
       [ 0.9393658 ],
       [ 0.96200097],
       [ 0.9514732 ],
       [-0.7956847 ],
       [-0.9320937 ],
       [-0.8195906 ],
       [-0.9541469 ],
       [ 0.5583366 ],
       [ 0.66266966],
       [ 0.3721411 ],
       [-0.95057094],
       [ 0.94560724],
       [-0.99997675],
       [-0.9090281 ],
       [-0.99658227],
       [ 0.48647708],
       [ 0.99285144],
       [ 0.9977037 ],
       [-0.8550069 ],
       [ 0.37468526],
       [ 0.98339844],
       [ 0.14576234],
       [ 0.75789636],
       [ 0.9262583 ],
       [ 0.7487535 ],
       [-0.5285568 ],
       [ 0.9952256 ]], dtype=float32)>

<div class="alert alert-block alert-info">
    Get the attention weights by normalizing the weights (Ex: apply softmax)
</div>

In [26]:
attention_weights = K.softmax(K.squeeze(score, axis=-1))
attention_weights = K.expand_dims(attention_weights, axis=-1)
print("attention_weights shape:", attention_weights.shape)

attention_weights shape: (64, 30, 1)


In [24]:
attention_weights[0]

<tf.Tensor: shape=(30, 1), dtype=float32, numpy=
array([[0.02223728],
       [0.01000134],
       [0.05129925],
       [0.05650992],
       [0.05780362],
       [0.05719826],
       [0.00996786],
       [0.00869681],
       [0.00973239],
       [0.00850712],
       [0.0386052 ],
       [0.04285062],
       [0.03204661],
       [0.0085376 ],
       [0.05686372],
       [0.00812604],
       [0.00889974],
       [0.00815367],
       [0.03592838],
       [0.05961467],
       [0.05990464],
       [0.00939374],
       [0.03212824],
       [0.05905379],
       [0.02555448],
       [0.04713174],
       [0.05577405],
       [0.04670278],
       [0.01302009],
       [0.05975638]], dtype=float32)>

<div class="alert alert-block alert-info">
Compute the weighted representation of the values
</div>

In [25]:
print("attention_input shape:", attention_input.shape)
print("attention_weights shape:", attention_weights.shape)
values_weighted_representation = attention_input*attention_weights # here we are using attention input as the values
print("Weighted representation of values (keys in this case):", values_weighted_representation.shape)

attention_input shape: (64, 30, 256)
attention_weights shape: (64, 30, 1)
Weighted representation of values (keys in this case): (64, 30, 256)


<div class="alert alert-block alert-info">
Compute the context vector
</div>

In [32]:
context_vector = K.sum(values_weighted_representation, axis=1)
print("context vector shape:", context_vector.shape)

context vector shape: (64, 256)


<div class="alert alert-block alert-info">
<b>In one catch:</b>
</div>

In [34]:
attention_input = lstm_output
# initialize the learning parameters
w = tf.random.normal(shape=(256, 1))
b = tf.random.normal(shape=(30, 1))
# compute the score
score = K.dot(attention_input, w)
score = score + b
# Get the attention weights
attention_weights = K.softmax(score)
# Compute the weighted reepresentation of the values
values_weighted_representation = attention_input*attention_weights # here we are using attention_input as the values
# Compute the context vector
context_vector = K.sum(values_weighted_representation, axis=1)
print("Now we can pass the context vector to the next layer")

Now we can pass the context vector to the next layer


<div class="alert alert-block alert-success">
<b>Let's write it in a Layer format that can be applied to a model.</b>
</div>

In [33]:
class Attention(Layer): 
    def __init__(self, num_neurons=1):    
        self.num_neurons = num_neurons
        super(Attention, self).__init__()
        
    def build(self, input_shape):
        self.num_dim_perword = input_shape[-1]
        self.words_pertweet = input_shape[-2]

        self.W = self.add_weight(
            name="att_weight",
            shape=(self.num_dim_perword, self.num_neurons),
            initializer='normal')

        self.b = self.add_weight(
            name="att_bias",
            shape=(self.words_pertweet, self.num_neurons),
            initializer='zero')
        super(Attention, self).build(input_shape)
        
    def call(self, x):
        e = K.squeeze(K.tanh(K.dot(x,self.W)+self.b),axis=-1)
        a = K.softmax(e)
        a = K.expand_dims(a,axis=-1)
        output = x*a
        return K.sum(output, axis=1)
    
input_layer = Input(shape=(30, 100))
lstm = Bidirectional(LSTM(units=128, return_sequences=True))(input_layer)
attn = Attention()(lstm)
output = Dense(units=1, activation="linear")(attn)
model = Model([input_layer], [output])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 30, 100)]         0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 30, 256)           234496    
_________________________________________________________________
attention (Attention)        (None, 256)               286       
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 235,039
Trainable params: 235,039
Non-trainable params: 0
_________________________________________________________________


<div class="alert alert-block alert-success">
<b>Simple Attention mechanism explained by ChatGPT</b>
</div>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
"""
This implementation uses the TensorFlow Layer class to define the attention layer. 
The build method initializes the weights of the layer (W, b, and u), and the call 
method applies the attention mechanism to the input. The attention mechanism uses 
a tanh activation and a softmax activation to calculate the attention scores, and 
then multiplies the input with the scores to get the weighted input. Finally, the 
sum of the weighted input is returned.
"""
class AttentionLayer(layers.Layer):
    def __init__(self, neurons, **kwargs):
        self.neurons = neurons
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='W', shape=(input_shape[-1], self.neurons), initializer='uniform', trainable=True)
        self.b = self.add_weight(name='b', shape=(self.neurons,), initializer='zeros', trainable=True)
        self.u = self.add_weight(name='u', shape=(self.neurons, 1), initializer='uniform', trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs, **kwargs):
        uit = tf.tanh(tf.add(tf.matmul(inputs, self.W), self.b))
        ait = tf.nn.softmax(tf.matmul(uit, self.u), axis=-1)
        weighted_input = tf.multiply(inputs, ait)
        return tf.reduce_sum(weighted_input, axis=1)

<div class="alert alert-block alert-warning">
<b>Explore this class at home and check what each line is doing. <br>
    Also think about why? <br>
    Apply this layer to a model and see if it can improve the performance.</b>
</div>

The following section is coming from this [reference](https://colab.research.google.com/github/whitead/dmol-book/blob/master/dl/attention.ipynb#scrollTo=tXLxMdk4R9Yu)

<div class="alert alert-block alert-info">
<b>Example 2:</b> <br>
    - We have the following sentence: The sleepy child reads a book <br>
    - The goal is to see what parts of the sentence the query (for instance "book") should be influenced by. <br>
    - Let's embed each word into a vector with length 3 <br>
    - consider the query as the word "book"
    - consider the values to be the sentiment of the word. Is it a positive word ("happy") or a negative word ("angry")
</div>



| Index| Embedding| Word|
|:-----|:--------:|----:|
| 0    |  0,0,0   | The |
| 1    |  2,0,1   | Sleepy |
| 2    |  1,-1,-2   | Child |
| 3    |  2,3,1   | Reads |
| 4    |  -2,0,0   | A |
| 5    |  0,2,1   | Book |

Keys: $(6, 3)$
\begin{equation}
\mathbf{K} = \left[
\begin{array}{lccccr}
0 & 2 & 1 & 2 & -2 & 0\\
0 & 0 & -1 & 3 & 0 & 2\\
0 & 1 & -2 & 1 & 0 & 1\\
\end{array}\right]
\end{equation}

Values: $(6, 1)$
\begin{equation}
\mathbf{V} = \left[ 0, -0.2, 0.3, 0.4, 0, 0.1\right]
\end{equation}

Query: $(3,)$
\begin{equation}
\vec{q} = \left[0, 2, 1\right]
\end{equation}

In [3]:
import numpy as np

In [9]:
i_query = np.random.normal(size=(3,))
i_keys = np.random.normal(size=(6, 3))
i_values = np.random.normal(size=(6, 1))


<div class="alert alert-block alert-warning">
<b>@: dot product</b>
</div>

<div class="alert alert-block alert-success">
<b> General attenion:</b>
</div>
\begin{equation}
    \vec{b} = \mathrm{softmax}\left(\frac{1}{\sqrt{d}}\vec{q}\cdot \mathbf{K}\right)
\end{equation}

In [39]:
def softmax(x, axis=None):
    return np.exp(x) / np.sum(np.exp(x), axis=axis)


def tensor_dot(q, k):
    b = softmax((k @ q) / np.sqrt(q.shape[0]))
    return b

def attention_layer(q, k, v):
    b = tensor_dot(q, k)
    return b @ v

attention_layer(i_query, i_keys, i_values)

array([-0.11157827])

<div class="alert alert-block alert-success">
<b> Self attention:</b> when the query, values and keys are equal (We only use the keys)
</div>

In [40]:
def batched_tensor_dot(q, k):
    # a will be batch x seq x feature dim
    # which is N x N x 4
    # batched dot product in einstein notation
    a = np.einsum("ij,kj->ik", q, k) / np.sqrt(q.shape[0])
    # now we softmax over sequence
    b = softmax(a, axis=1)
    return b


def self_attention(x):
    b = batched_tensor_dot(x, x)
    return b @ x


attention_result = self_attention(i_keys)
print(attention_result.shape)
attention_result

(6, 3)


array([[ 0.20009993,  0.30845095,  0.19241971],
       [ 0.75329141,  0.33190861, -0.17081802],
       [ 2.12516013,  0.48464586, -0.8761569 ],
       [-0.78917967, -2.29579892, -1.50165363],
       [-0.07414448,  0.94402292,  0.78108597],
       [ 0.53707919,  0.30445758, -0.01764354]])

<div class="alert alert-block alert-success">
<b> Adding Trainable Parameters:</b>
</div>

In [41]:
# weights should be input feature_dim -> desired output feature_dim
w_q = np.random.normal(size=(3, 3))
w_k = np.random.normal(size=(3, 3))
w_v = np.random.normal(size=(3, 2))


def trainable_self_attention(x, w_q, w_k, w_v):
    q = x @ w_q
    k = x @ w_k
    v = x @ w_v
    b = batched_tensor_dot(q, k)
    return b @ v


trainable_self_attention(i_keys, w_q, w_k, w_v)

array([[ 0.74485375,  1.61535542],
       [ 0.16417471, -1.22207086],
       [ 4.72068071, -3.76600936],
       [-0.40738699,  0.40194374],
       [ 1.96006861,  4.9651016 ],
       [ 0.57183174, -0.63849133]])

<div class="alert alert-block alert-success">
<b> Multi-head:</b>
</div>

In [42]:
w_q_h1 = np.random.normal(size=(3, 3))
w_k_h1 = np.random.normal(size=(3, 3))
w_v_h1 = np.random.normal(size=(3, 2))
w_q_h2 = np.random.normal(size=(3, 3))
w_k_h2 = np.random.normal(size=(3, 3))
w_v_h2 = np.random.normal(size=(3, 2))
w_h = np.random.normal(size=2)


def multihead_attention(x, w_q_h1, w_k_h1, w_v_h1, w_q_h2, w_k_h2, w_v_h2):
    h1_out = trainable_self_attention(x, w_q_h1, w_k_h1, w_v_h1)
    h2_out = trainable_self_attention(x, w_q_h2, w_k_h2, w_v_h2)
    # join along last axis so we can use dot.
    all_h = np.stack((h1_out, h2_out), -1)
    return all_h @ w_h


multihead_attention(i_keys, w_q_h1, w_k_h1, w_v_h1, w_q_h2, w_k_h2, w_v_h2)

array([[-8.42021125e-03, -5.85620417e-02],
       [ 5.48249739e-02,  1.42116146e+00],
       [ 1.03971379e+00,  2.32066883e+01],
       [-1.00868714e-01,  4.20652577e-01],
       [-1.47464783e-01, -3.74156949e-01],
       [-1.98759090e-03,  7.04947641e-02]])