In [423]:
import gensim.downloader as api
# Pretrained word vectors fetched through gensim's downloader. `wv` is indexed
# by token in embed() and its `index_to_key` list maps output indices to words.
wv = api.load('word2vec-google-news-300')

In [424]:
import numpy as np
import tensorflow as tf

A. ENCODER PART

In [425]:
# Model width: token vectors, attention outputs and weight matrices below are
# all sized from this. Presumably chosen to match the 300-d word2vec vectors
# loaded above -- TODO confirm.
d_model = 300
# Number of attention heads; each head projects to d_model // n_head columns.
n_head = 4

Embedding Steps

In [426]:
def tokenize(x):
    """Break a sentence into tokens.

    Splitting uses ``x.split()`` with no separator, so any run of whitespace
    delimits tokens and leading/trailing whitespace produces no empty tokens.
    """
    tokens = x.split()
    return tokens

In [427]:
def embed(x):
    """Look up the pretrained vectors for ``x`` in the module-level ``wv``.

    ``x`` is handed to ``wv[...]`` unchanged; in this notebook it is the list
    of tokens produced by ``tokenize``. Whatever ``wv`` raises for an unknown
    key propagates to the caller.
    """
    return wv[x]

In [428]:
def posEncode(x):
    """Add sinusoidal positional encodings to an embedding matrix.

    For position ``p`` and even column index ``i`` the encoding is
    ``sin(p / 10000**(i/dim))`` in column ``i`` and
    ``cos(p / 10000**(i/dim))`` in column ``i + 1``.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array of shape ``(n_seq, dim)``, one row per token.

    Returns
    -------
    numpy.ndarray
        ``x`` plus the positional encoding, same shape as ``x``. The dtype is
        what ``x + float16`` would promote to (float32 stays float32), which is
        the dtype the previous float16-based implementation returned.
    """
    # The width is taken from x itself so the function works for any
    # embedding size instead of relying on a module-level constant.
    n_seq, dim = x.shape

    # Angles are computed in float64: float16 keeps only ~3 significant digits,
    # which visibly distorts the slowly varying high-index components.
    even_idx = np.arange(0, dim, 2, dtype=np.float64)
    denominator = np.power(10000.0, even_idx / dim)
    position = np.arange(n_seq, dtype=np.float64).reshape(-1, 1)
    angles = position / denominator

    # Interleave: sin in even columns, cos in odd columns.
    pe = np.empty((n_seq, dim), dtype=np.float64)
    pe[:, 0::2] = np.sin(angles)
    pe[:, 1::2] = np.cos(angles[:, : dim // 2])

    out_dtype = np.result_type(x.dtype, np.float16)
    return (x + pe).astype(out_dtype)

In [429]:
def masking(x):
    """Apply a causal (look-ahead) mask to a square score matrix.

    Entries strictly above the main diagonal get ``-inf`` added, so a
    following softmax assigns them zero weight; entries on or below the
    diagonal are returned unchanged.

    Parameters
    ----------
    x : numpy.ndarray
        Square matrix of raw attention scores, shape ``(n, n)``.

    Returns
    -------
    numpy.ndarray
        A new array ``x + mask``; ``x`` itself is not modified.
    """
    n = len(x)
    # True exactly where column > row, i.e. "future" positions.
    future = np.triu(np.ones((n, n), dtype=bool), k=1)
    # np.inf is used because the np.infty alias was removed in NumPy 2.0.
    return x + np.where(future, -np.inf, 0.0)

In [430]:
# Example source sentence that is tokenized, embedded and run through the encoder.
sentence = 'hi my name is bill I am from Canada'

In [431]:
# Embedding pipeline: whitespace tokens -> pretrained vectors -> + positional encoding.
tokens = tokenize(sentence)
embeddings = embed(tokens)
# NOTE(review): `input` shadows the Python builtin for the rest of the kernel
# session. Later cells read this name, so a rename has to be done notebook-wide.
input = posEncode(embeddings)

Encoder Self-Attention
4 heads

In [432]:
def init_QKV():
    """Create random query/key/value projection matrices, one set per head.

    Returns three lists ``(Q, K, V)`` of length ``n_head``. Every entry is a
    ``(d_model, d_model // n_head)`` array of ``np.random.rand`` draws shifted
    by -0.5. Matrices are drawn head by head in the order Q, K, V.
    """
    head_dim = d_model // n_head
    Q, K, V = [], [], []
    for _ in range(n_head):
        # Same draw order as before: Q, then K, then V for each head.
        for bucket in (Q, K, V):
            bucket.append(np.random.rand(d_model, head_dim) - 0.5)
    return Q, K, V

In [433]:
def context(input, Q,K,V, mask=False):
    """Multi-head scaled dot-product self-attention over ``input``.

    For each of the ``n_head`` heads the input is projected with ``Q[h]``,
    ``K[h]`` and ``V[h]``, the query/key scores are (optionally) causally
    masked, scaled, soft-maxed row-wise, and used to mix the value rows. The
    per-head results are concatenated along the feature axis.

    Parameters
    ----------
    input : array of shape (n_seq, d_model)
        Token representations; queries, keys and values all derive from it.
    Q, K, V : sequences of arrays
        Per-head projection matrices, indexed ``0 .. n_head - 1``.
    mask : bool, optional
        When truthy, ``masking`` is applied so a position cannot attend to
        later positions.

    Returns
    -------
    numpy.ndarray
        Concatenated head outputs, one row per input row.
    """
    heads = []
    for h in range(n_head):
        queries = input @ Q[h]
        keys = input @ K[h]
        values = input @ V[h]

        raw_attention = queries @ keys.T
        if mask:
            raw_attention = masking(raw_attention)

        # Scale by the per-head key width d_k (not the full model width):
        # the dot products are sums over d_k terms, so sqrt(d_k) is the factor
        # that keeps their variance independent of the head size.
        d_k = queries.shape[-1]
        score = tf.nn.softmax(raw_attention / d_k ** .5)
        heads.append(score @ values)

    return np.concatenate(heads, axis=1)

In [434]:
# Random (untrained) projection weights for the encoder self-attention heads.
Q,K,V = init_QKV()
# Unmasked self-attention over the position-encoded encoder input.
context1 = context(input, Q,K,V)

Add and Normalization
Feed Forward

In [435]:
# Scale (gamma) and shift (beta) vectors for the first encoder add_norm call;
# randomly initialised -- no training step is visible in this notebook.
gamma1 = np.random.rand(d_model)
beta1 = np.random.rand(d_model) 

# Same, for the add_norm call that follows the encoder feed-forward step.
gamma2 = np.random.rand(d_model) 
beta2 = np.random.rand(d_model) 

In [436]:
# Weight matrix and bias of the encoder's single-layer feed-forward step.
w1 = np.random.rand(d_model, d_model) 
b1 = np.random.rand(d_model) 

In [437]:
def add_norm(context, prev_input, gamma, beta, eps=1e-6):
    """Residual connection followed by row-wise layer normalisation.

    Adds ``prev_input`` to ``context``, standardises every row (axis 1) to
    zero mean and unit variance, then applies the element-wise scale ``gamma``
    and shift ``beta``.

    Parameters
    ----------
    context : numpy.ndarray
        2-D output of the preceding sub-layer.
    prev_input : numpy.ndarray
        Residual to add; must be broadcast-compatible with ``context``.
    gamma, beta : numpy.ndarray
        Scale and shift, broadcast across rows.
    eps : float, optional
        Small constant added to the row variance before the square root, so a
        constant row (zero variance) normalises to zeros instead of producing
        NaN/inf from a division by zero.

    Returns
    -------
    numpy.ndarray
        Normalised, scaled and shifted array with the shape of the sum.
    """
    summed = context + prev_input
    mean = summed.mean(axis=1, keepdims=True)
    var = summed.var(axis=1, keepdims=True)
    normalised = (summed - mean) / np.sqrt(var + eps)
    return normalised * gamma + beta

def feed_forward(context, w, b):
    """Single linear layer followed by ReLU (no hidden layer).

    Parameters
    ----------
    context : numpy.ndarray
        2-D input, one row per position.
    w : numpy.ndarray
        Weight matrix applied as ``context @ w``.
    b : numpy.ndarray
        Bias added to every row.

    Returns
    -------
    numpy.ndarray
        ``max(context @ w + b, 0)`` element-wise.
    """
    projected = context @ w + b
    # ReLU done directly in NumPy: avoids converting to a TensorFlow tensor
    # and back just for an element-wise maximum.
    return np.maximum(projected, 0)

In [438]:
# Encoder sub-layers: (attention + residual) -> norm -> feed-forward -> (+ residual) -> norm.
context2 = add_norm(context1, input, gamma1, beta1)
context3 = feed_forward(context2, w1, b1)
# context4 is the encoder output that the decoder's cross-attention reads.
context4 = add_norm(context3, context2, gamma2, beta2)

--------------------------------------------------------------

B. DECODER PART

In [439]:
# Decoder input: the single token "</s>" (presumably serving as the
# start-of-sequence marker -- TODO confirm).
sentence2 = "</s>"
input2 = posEncode(embed(tokenize(sentence2))) #residual for the next addNorm

In [440]:
# Random (untrained) projection weights for the decoder's masked self-attention.
Q_d, K_d, V_d = init_QKV()

In [441]:
# Masked self-attention: mask=True routes the scores through masking(), so a
# decoder position only attends to itself and earlier positions.
context_d_1 = context(input2, Q_d, K_d, V_d, mask=True)

Add and Norm

In [442]:
# Scale/shift vectors for the add_norm call after the decoder self-attention.
gamma_d_1 = np.random.rand(d_model)
beta_d_1 = np.random.rand(d_model) 

In [443]:
# Residual (decoder input) + self-attention output, then normalise.
context_d_2 = add_norm(context_d_1, input2, gamma_d_1, beta_d_1) #residual for the next addNorm

In [444]:
def cross_context(input_e, input_d, Q_d,K_e,V_e):
    """Multi-head encoder-decoder (cross) attention.

    Queries are projected from the decoder rows ``input_d``; keys and values
    are projected from the encoder rows ``input_e``. Each head's scaled
    scores are soft-maxed row-wise and used to mix the encoder value rows;
    head outputs are concatenated along the feature axis.

    Parameters
    ----------
    input_e : array of shape (n_enc, d_model)
        Encoder output.
    input_d : array of shape (n_dec, d_model)
        Decoder representations that issue the queries.
    Q_d, K_e, V_e : sequences of arrays
        Per-head projection matrices, indexed ``0 .. n_head - 1``.

    Returns
    -------
    numpy.ndarray
        Concatenated head outputs, one row per row of ``input_d``.
    """
    heads = []
    for h in range(n_head):
        queries = input_d @ Q_d[h]
        keys = input_e @ K_e[h]
        values = input_e @ V_e[h]

        # Scale by the per-head key width d_k (not the full model width):
        # the dot products are sums over d_k terms, so sqrt(d_k) is the factor
        # that keeps their variance independent of the head size.
        d_k = queries.shape[-1]
        score = tf.nn.softmax((queries @ keys.T) / d_k ** .5)
        heads.append(score @ values)

    return np.concatenate(heads, axis=1)

In [445]:
# Random projection weights for the encoder-decoder (cross) attention.
# NOTE(review): this rebinds Q_d, discarding the matrices created for the
# decoder self-attention in an earlier cell; re-running that self-attention
# cell after this one would silently use these weights instead.
Q_d, K_e, V_e = init_QKV()

In [446]:
cross_context = cross_context(context4, context_d_2, Q_d, K_e, V_e)

Add and Norm 

In [447]:
gamma_d_2 = np.random.rand(d_model) 
beta_d_2 = np.random.rand(d_model)

In [448]:
context_d_3 = add_norm(context_d_2, cross_context, gamma_d_2, beta_d_2) #residual for the next addNorm

Feed Forward and Norm

In [449]:
# Weight matrix and bias of the decoder's single-layer feed-forward step.
w2 = np.random.rand(d_model, d_model) 
b2 = np.random.rand(d_model) 

In [450]:
# Decoder feed-forward step applied to the normalised cross-attention output.
context_d_4 = feed_forward(context_d_3, w2, b2)

In [451]:
# Scale/shift vectors for the final decoder add_norm call.
gamma_d_3 = np.random.rand(d_model) 
beta_d_3 = np.random.rand(d_model)

In [452]:
# Residual (feed-forward input) + feed-forward output, then normalise; this is
# the decoder output passed to the linear projection.
context_d_5 = add_norm(context_d_4, context_d_3, gamma_d_3, beta_d_3)

LINEAR STEP

In [None]:
# Output projection: one column (and one bias entry) per vocabulary word.
# The size is read from the loaded vectors instead of being hard-coded, so it
# always matches `wv.index_to_key`, which is used to decode the prediction.
# Heads-up: with d_model = 300 and a vocabulary of about 3 million words this
# float64 matrix needs roughly 7 GB of RAM.
vocab_size = len(wv.index_to_key)
w_linear = np.random.rand(d_model, vocab_size)
b_linear = np.random.rand(vocab_size)

In [454]:
# Project the decoder output to one logit per column of w_linear.
raw_prediction = context_d_5 @ w_linear + b_linear

In [455]:
# Convert logits to probabilities; tf.nn.softmax normalises over the last axis
# by default, i.e. across the vocabulary for each decoder position.
prediction = tf.nn.softmax(raw_prediction)

In [456]:
# Greedy pick of the next word. Only the last decoder position's distribution
# is used: a flat argmax over the whole (positions, vocab) matrix would yield
# an index past the vocabulary once the decoder input has more than one token.
wv.index_to_key[np.argmax(prediction[-1])]

'Danware'

: 