In [416]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import gensim.downloader as api

In [417]:
# Fetch the 'glove-wiki-gigaword-50' model through gensim's downloader API.
# `wv` is the module-level lookup table that embed() indexes with `wv[x]`.
wv = api.load('glove-wiki-gigaword-50')

Setup parameters

In [418]:
# Hyper-parameters shared by the cells below.
d_model = 50   # embedding width (presumably the 50 in 'glove-wiki-gigaword-50' — confirm)
n_head = 5     # number of attention heads; each head uses d_model // n_head columns
n_batch = 2    # batch size of the toy inputs

# Every weight and toy input in this notebook is drawn with np.random.rand,
# so seed NumPy's global RNG once here to make Restart & Run All reproducible.
SEED = 42
np.random.seed(SEED)

In [419]:
def tokenize(x):
    """Split `x` on runs of whitespace and return the resulting list of tokens."""
    tokens = x.split()
    return tokens

def embed(x):
    """Return `wv[x]`, i.e. whatever the module-level `wv` table yields for key(s) `x`."""
    return wv[x]

def posEncode(x):
    """Add sinusoidal positional encodings to a 2-D array of embeddings.

    Parameters
    ----------
    x : array-like, shape (n_seq, width)
        One row per sequence position. The encoding width is taken from
        `x` itself rather than from a module-level constant.

    Returns
    -------
    numpy.ndarray, shape (n_seq, width)
        `x + PE`, where for position p and pair index k
        PE[p, 2k]     = sin(p / 10000 ** (2k / width))
        PE[p, 2k + 1] = cos(p / 10000 ** (2k / width)).
        The result dtype is the NumPy promotion of `x.dtype` with float16,
        so float32 embeddings stay float32.
    """
    x = np.asarray(x)
    n_seq, width = x.shape

    # Angles are computed in float64: half precision cannot represent every
    # integer position above 2048 and gives sin/cos only ~3 significant digits.
    i = np.arange(0, width, 2, dtype=np.float64)
    denominator = np.power(10000.0, i / width)
    position = np.arange(n_seq, dtype=np.float64).reshape(-1, 1)
    angles = position / denominator

    # Interleave: even columns take the sine, odd columns the cosine.
    encoding = np.empty((n_seq, width), dtype=np.float64)
    encoding[:, 0::2] = np.sin(angles)
    # For an odd width there is one fewer odd column than even columns.
    encoding[:, 1::2] = np.cos(angles[:, : width // 2])

    # Cast back so the sum does not silently upcast the caller's embeddings.
    out_dtype = np.result_type(x.dtype, np.float16)
    return x + encoding.astype(out_dtype)

def masking(x):
    """Add a causal (lower-triangular) additive mask to `x`.

    A mask with the same shape as `x` is built whose entries are 0 on and
    below the main diagonal of the last two axes and -inf above it, then
    added to `x`.

    Parameters
    ----------
    x : array or tensor exposing `.shape` and supporting `+` with a NumPy array.

    Returns
    -------
    `x + mask`; entries above the diagonal become -inf.
    """
    # np.tril operates on the last two axes, so leading axes share one pattern.
    keep = np.tril(np.ones(x.shape)) == 1
    # np.inf instead of np.infty: the `infty` alias was removed in NumPy 2.0.
    mask = np.where(keep, 0.0, -np.inf)
    return x + mask

In [420]:
def init_QKV(n_head, d_model):
    """Create the three per-head projection weights used by the attention cell.

    Each weight is a float32 `tf.Variable` of shape
    (1, n_head, d_model, d_model // n_head), filled by `np.random.rand`.

    Returns
    -------
    (Q, K, V) : tuple of three tf.Variable, drawn in that order.
    """
    shape = (1, n_head, d_model, d_model // n_head)
    # The comprehension is evaluated left to right, so the draw order is Q, K, V.
    Q, K, V = [
        tf.Variable(np.random.rand(*shape), dtype='float32')
        for _ in range(3)
    ]
    return Q, K, V

In [421]:
def context(input, Q,K,V, mask=False):
    """Scaled dot-product attention over per-head projections of `input`.

    Parameters
    ----------
    input : array/tensor that can be matrix-multiplied with Q, K and V
        (assumed (batch, 1, seq, d_model) so it broadcasts over heads — confirm).
    Q, K, V : projection weights, e.g. as returned by init_QKV.
    mask : bool, default False
        When truthy, the raw attention logits are passed through masking()
        before the softmax.

    Returns
    -------
    softmax(((input @ Q) @ (input @ K)^T) / sqrt(d_k)) @ (input @ V),
    where d_k is the size of K's last axis and ^T swaps the last two axes.
    """
    queries = input @ Q
    keys = input @ K
    values = input @ V

    raw_attention = queries @ tf.transpose(keys, perm=(0, 1, 3, 2))
    if mask:
        raw_attention = masking(raw_attention)

    # Scale by the per-head key width, not the full model width: the dot
    # products are sums over d_k terms, so sqrt(d_k) is the normaliser that
    # keeps the softmax logits at unit scale.
    d_k = float(K.shape[-1])
    score = tf.nn.softmax(raw_attention / d_k ** 0.5)

    return score @ values

def concat4D(x):
    """Merge axis 1 of a 4-D tensor into its last axis.

    For `x` of static shape (a, b, c, d) the result has shape (a, 1, c, b*d)
    with out[n, 0, s, h*d + j] == x[n, h, s, j].
    """
    batch, heads, seq, head_dim = x.shape
    # Bring axis 1 next to the last axis, then flatten the pair row-major.
    by_position = tf.transpose(x, perm=(0, 2, 1, 3))
    return tf.reshape(by_position, [batch, 1, seq, heads * head_dim])

Autodiff test P1 (attention context + concat4D): passed

In [422]:
# Projection weights for the first attention block.
Q1, K1, V1 = init_QKV(n_head, d_model)
# Random toy input; the literal 5 is presumably the sequence length — confirm.
x = np.random.rand(n_batch, 1, 5, d_model)

In [423]:
# Unmasked forward pass, then merge the heads back into one feature axis.
conxt = context(x, Q1, K1, V1, mask=False)
concat_conxt = concat4D(conxt)

In [424]:
# Re-run the attention forward pass inside a persistent GradientTape so the
# operations from x through context() and concat4D() are recorded.
# No tape.gradient(...) call appears in this cell.
with tf.GradientTape(persistent=True) as tape:
    # Explicitly register the three projection weights with the tape.
    tape.watch([Q1,K1,V1])
    conxt = context(x,Q1,K1,V1)
    concat = concat4D(conxt)

In [446]:
def init_g_b(n_batch, d_model):
    """Create the scale (gamma) and shift (beta) variables used by add_norm.

    Both are float32 `tf.Variable`s of shape (n_batch, 1, 1, d_model);
    gamma starts at ones and beta at zeros.

    NOTE(review): the leading n_batch axis gives every batch slot its own
    parameters — confirm this is intended rather than a shared (1, 1, 1, d_model).
    """
    shape = (n_batch, 1, 1, d_model)
    gamma = tf.Variable(np.ones(shape), dtype='float32')
    beta = tf.Variable(np.zeros(shape), dtype='float32')
    return gamma, beta

In [433]:
def add_norm(context, prev_input, gamma, beta, eps=1e-6):
    """Residual connection followed by normalisation over the last axis.

    Parameters
    ----------
    context : tensor
        Output of the sub-layer (e.g. the concatenated attention heads).
    prev_input : array/tensor broadcastable with `context`
        The sub-layer's input, added back as the residual.
    gamma, beta : tensors broadcastable with the normalised result
        Scale and shift applied after normalisation.
    eps : float, default 1e-6
        Added to the variance before the square root so a constant row
        produces zeros instead of NaN/Inf from a division by zero.

    Returns
    -------
    ((context + prev_input) - mean) / sqrt(var + eps) * gamma + beta,
    with mean and var taken over the last axis.
    """
    residual = context + prev_input

    # keepdims=True leaves a trailing axis of size 1, so the statistics
    # broadcast directly and no reshape/transpose round trip is needed
    # (that round trip only worked when axis 1 had size 1).
    mean = tf.reduce_mean(residual, axis=-1, keepdims=True)
    variance = tf.math.reduce_variance(residual, axis=-1, keepdims=True)

    normalized = (residual - mean) / tf.sqrt(variance + eps)
    return normalized * gamma + beta

In [447]:
# Scale/shift parameters for the first Add & Norm step.
gamma1, beta1 = init_g_b(n_batch=n_batch, d_model=d_model)

In [451]:
# Add & Norm on the concatenated attention output, with x as the residual.
input3 = add_norm(context=concat_conxt, prev_input=x, gamma=gamma1, beta=beta1)

In [452]:
# Record the full attention -> concat -> Add & Norm pipeline under a
# persistent GradientTape. This rebinds `tape`, `conxt` and `concat` from
# the earlier tape cell. No tape.gradient(...) call appears in this cell.
with tf.GradientTape(persistent=True) as tape:
    # Explicitly register the projection weights and the norm parameters.
    tape.watch([Q1,K1,V1,gamma1,beta1])
    conxt = context(x,Q1,K1,V1)
    concat = concat4D(conxt)
    # AN is the normalised output of the first Add & Norm step.
    AN = add_norm(concat, x, gamma1, beta1)

Add and Norm 1:
Add & Norm: passed
Gradient: passed