# In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import gensim.downloader as api

# In [2]:
# Word-vector table loaded through gensim's downloader under the dataset name
# 'glove-wiki-gigaword-50'. Transformer.embed indexes into this object.
# NOTE(review): presumably this downloads/caches the model on first use and
# yields 50-dimensional vectors (matching Transformer.d_model) - confirm.
wv = api.load('glove-wiki-gigaword-50')

# In [3]:
class Transformer:
    """Hand-written building blocks for a Transformer layer.

    The methods are stateless helpers: parameters (Q/K/V projections, layer
    norm gain/bias, feed-forward weights) are created by the ``init_*``
    methods and passed back in explicitly by the caller.

    Tensors are handled as 4-D arrays.  Judging by how the methods index
    them, the layout is ``(batch, heads, sequence, features)`` with
    ``heads == 1`` before the per-head projection and after ``concat4D``.
    """

    def __init__(self):
        # Width of every token vector fed through the layer.
        self.d_model = 50
        # Number of attention heads; d_model // n_head is the per-head width.
        self.n_head = 2
        # dtype used for the positional-encoding arithmetic.
        self.PosEn_dtype = 'float64'
        # dtype of every trainable tf.Variable created by the init_* methods.
        self.Param_dtype = 'float64'
        # Leading dimension given to the bias-like parameters (gamma/beta/b).
        self.n_batch = 2

    def tokenize(self, x):
        """Split the string ``x`` on whitespace and return the list of tokens."""
        return x.split()

    def embed(self, x):
        """Return ``wv[x]``, the lookup of ``x`` in the module-level ``wv``."""
        return wv[x]

    def posEncode(self, x):
        """Add sinusoidal positional encodings to ``x``.

        Args:
            x: 4-D array.  Axis 2 is used as the sequence position; axis 3
                must be broadcast-compatible with ``d_model``.

        Returns:
            ``x`` plus a ``(sequence, d_model)`` table whose even columns hold
            ``sin(pos / 10000**(i / d_model))`` and whose odd columns hold the
            matching ``cos``; the table broadcasts over the leading axes.
        """
        _, _, input_size, _ = x.shape
        # The original passed ``dtype=self.d_model`` (the integer 50), which
        # is not a dtype and made every call raise.
        i = np.arange(0, self.d_model, 2, dtype=self.PosEn_dtype)
        denominator = np.power(10000, i / self.d_model)
        position = np.arange(input_size, dtype=self.PosEn_dtype).reshape(-1, 1)
        angles = position / denominator

        # Interleave sin/cos by column with slice assignment.  Trimming the
        # cosine block keeps this valid when d_model is odd.
        encoding = np.empty((input_size, self.d_model), dtype=self.PosEn_dtype)
        encoding[:, 0::2] = np.sin(angles)
        encoding[:, 1::2] = np.cos(angles[:, : self.d_model // 2])
        return x + encoding

    def masking(self, x):
        """Apply a causal (lower-triangular) mask to attention logits.

        Positions on or below the diagonal of the last two axes are left
        unchanged; positions above it have ``-inf`` added so that a following
        softmax gives them zero weight.

        Args:
            x: array/tensor of logits whose last two axes are
                ``(query position, key position)``.

        Returns:
            ``x + mask`` where ``mask`` has the same shape as ``x``.
        """
        visible = np.tril(np.ones(x.shape, dtype=bool))
        # ``np.inf`` rather than ``np.infty``: the latter alias was removed
        # in NumPy 2.0.
        mask = np.where(visible, 0.0, -np.inf)
        return x + mask

    def init_QKV(self):
        """Create the query, key and value projection variables.

        Returns:
            Tuple ``(Q, K, V)`` of ``tf.Variable`` objects, each of shape
            ``(1, n_head, d_model, d_model // n_head)`` filled with uniform
            samples from ``[0, 1)``.
        """
        head_dim = self.d_model // self.n_head
        shape = (1, self.n_head, self.d_model, head_dim)
        # Drawn in Q, K, V order so a seeded NumPy RNG gives the same values
        # as three separate statements would.
        Q, K, V = (
            tf.Variable(np.random.rand(*shape), dtype=self.Param_dtype)
            for _ in range(3)
        )
        return Q, K, V

    def init_GB(self):
        """Create the layer-norm gain (ones) and bias (zeros) variables.

        Returns:
            Tuple ``(gamma, beta)``, each of shape
            ``(n_batch, 1, 1, d_model)``.
        """
        # NOTE(review): the leading axis is n_batch, so each batch element
        # gets its own gain/bias - confirm this is intended rather than a
        # shared (1, 1, 1, d_model) parameter.
        shape = (self.n_batch, 1, 1, self.d_model)
        gamma = tf.Variable(np.ones(shape), dtype=self.Param_dtype)
        beta = tf.Variable(np.zeros(shape), dtype=self.Param_dtype)
        return gamma, beta

    def context(self, x, Q, K, V, mask=False):
        """Compute scaled dot-product attention for every head.

        Args:
            x: input tensor; matrix-multiplied with ``Q``, ``K`` and ``V``.
            Q, K, V: projection variables as returned by ``init_QKV``.
            mask: when truthy, ``masking`` is applied to the logits before
                the softmax.

        Returns:
            ``softmax(q @ k^T / sqrt(d_k)) @ v`` with ``q = x @ Q``,
            ``k = x @ K`` and ``v = x @ V``.
        """
        queries = x @ Q
        keys = x @ K
        values = x @ V

        raw_attention = queries @ tf.transpose(keys, perm=(0, 1, 3, 2))
        if mask:
            raw_attention = self.masking(raw_attention)

        # Scale by the per-head key width d_k (last axis of the projected
        # keys).  The original divided by sqrt(d_model), which over-damps
        # the logits whenever n_head > 1.
        d_k = tf.cast(tf.shape(keys)[-1], raw_attention.dtype)
        score = tf.nn.softmax(raw_attention / tf.sqrt(d_k))
        return score @ values

    def concat4D(self, x):
        """Concatenate the heads of ``x`` along the feature axis.

        Args:
            x: tensor of shape ``(a, b, c, d)``.

        Returns:
            Tensor of shape ``(a, 1, c, b * d)`` where element
            ``[i, 0, t, h * d + j]`` equals ``x[i, h, t, j]``.
        """
        a, b, c, d = x.shape
        # Move the head axis next to the feature axis so that a plain
        # reshape lays the heads side by side.
        by_position = tf.transpose(x, perm=(0, 2, 1, 3))
        return tf.reshape(by_position, [a, 1, c, b * d])

    def add_norm(self, context, prev_input, gamma, beta):
        """Residual connection followed by layer normalisation.

        Args:
            context: output of the sub-layer.
            prev_input: the sub-layer's input, added back as the residual.
            gamma: gain, broadcast against the normalised tensor.
            beta: bias, broadcast against the normalised tensor.

        Returns:
            ``gamma * (r - mean(r)) / std(r) + beta`` where
            ``r = context + prev_input`` and the statistics are taken over
            axis 3.
        """
        residual = context + prev_input
        # keepdims=True gives the same (a, 1, b, 1) statistics the original
        # built with reshape + transpose, without requiring axis 1 to be 1.
        mean = tf.reduce_mean(residual, axis=3, keepdims=True)
        sigma = tf.math.reduce_std(residual, axis=3, keepdims=True)

        # A constant row has sigma == 0; divide_no_nan yields 0 there instead
        # of NaN and is an ordinary division everywhere else.
        normalized = tf.math.divide_no_nan(residual - mean, sigma)
        return normalized * gamma + beta

    def init_WB(self, input_size, neurons):
        """Create the weight and bias variables of one dense layer.

        Args:
            input_size: size of the incoming feature axis.
            neurons: size of the outgoing feature axis.

        Returns:
            Tuple ``(w, b)``: ``w`` has shape ``(1, 1, input_size, neurons)``
            with uniform ``[0, 1)`` samples, ``b`` has shape
            ``(n_batch, 1, 1, neurons)`` filled with zeros.
        """
        w = tf.Variable(
            np.random.rand(1, 1, input_size, neurons), dtype=self.Param_dtype
        )
        b = tf.Variable(
            np.zeros((self.n_batch, 1, 1, neurons)), dtype=self.Param_dtype
        )
        return w, b

    def feed_forward(self, x, w, b):
        """Apply one dense layer with ReLU activation: ``relu(x @ w + b)``."""
        return tf.nn.relu(x @ w + b)
