In [1]:
import numpy as np
import tensorflow as tf
from sklearn.decomposition import PCA

In [2]:
# load data from the local file
# NOTE(review): the path is an absolute, machine-specific Windows location, so this
# cell only runs on a machine where that exact file exists.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only use it on files that come from a trusted source.
data = np.load('E:\\Transformer Data\\AirRaid-v0_10x1000_0.npy', allow_pickle =True)

In [3]:
# sess = tf.InteractiveSession()

In [4]:
# ---- sizes derived from the data ----
# first recorded state of the first trajectory; only used to size the raw feature vector
state_arr = data[0][0]['state']
# length of one flattened state (product of its first three axes)
n_x = state_arr.shape[0] * state_arr.shape[1] * state_arr.shape[2]

# ---- sequence / model sizes ----
# number of trajectory steps used (fixed here rather than read from data.shape[1])
steps = 512
# number of input features per state given to the encoder
N = 256

# rows of each query / key weight matrix
dk = 64
# rows of each value weight matrix
dv = 64

# number of attention heads
n_h = 8

# dropout rate (same value as in the attention research paper)
Pdrop = 0.1

# width of the feed-forward hidden layer; 2048 is computationally expensive,
# 64 was used as a cheaper alternative
dff = 2048

# number of training epochs
num_epochs = 500

In [5]:
# Seed NumPy's global random generator.
np.random.seed(1)

# Pre-allocated buffers. np.empty does not initialise the memory, so the
# contents are meaningless until the arrays are filled.
init_S = np.empty(shape=(steps, n_x))          # raw flattened states, one row per step
S = np.empty(shape=(steps, N))                 # encoder inputs, one row per step
A = np.empty(shape=(steps, 1), dtype=int)      # action for each step

In [6]:
def get_state_action(data, empty_state, empty_action, steps, n_x):
    '''
    Copy the first `steps` records of trajectory 0 into the supplied buffers.

    parameters:
        data: games data; data[0][i] must provide a 'state' array and an 'action'
        empty_state: pre-allocated array with `steps` rows of width n_x (written in place)
        empty_action: pre-allocated array with `steps` rows of width 1 (written in place)
        steps: Number of steps in the trajectory
        n_x: number of values in one flattened state
    return:
        (empty_state, empty_action): the same two arrays that were passed in, now filled
    '''
    for step in range(steps):
        # flatten the state of this step into a single row of length n_x
        empty_state[step, :] = data[0][step]['state'].reshape(1, n_x)
        empty_action[step, :] = data[0][step]['action']

    return (empty_state, empty_action)

In [7]:
# Reduce the dimensionality of the feature set with PCA
def reduce_dim(init_S, N):
    '''
    parameters:
        init_S: initial input states, one row per step
        N: number of principal components to keep (features given to the encoder)
    return:
        the PCA-transformed states with N columns
    '''
    return PCA(n_components=N).fit_transform(init_S)

In [8]:
# Fill the pre-allocated buffers; get_state_action writes into init_S and A in
# place and returns those same two arrays.
init_S, A = get_state_action(data, init_S, A, steps, n_x)

In [9]:
# Project the raw states down to N features; this rebinds S, replacing the
# np.empty buffer allocated earlier.
S = reduce_dim(init_S, N)

In [10]:
# NOTE: this binds a second name to the same array object as S (no copy is
# made), so any in-place write through one name is visible through the other.
input_X = S

In [11]:
# np.unique(A)
# A.shape

In [12]:
# #assign first 500 indices to -1 and rest to 1
# A[0:int(steps/3),] = 0
# A[int(steps/3):int(steps/3)*2,] = 1
# A[int(steps/3)*2:steps,] = 2


# # A[0:499,] = 0
# A[499:1000,] = 1

In [13]:
# number of distinct action values among the loaded steps; used below as the
# number of output classes
n_c = len(np.unique(A))

In [14]:
# Create the per-head query/key/value weights for the encoder, the masked
# decoder attention and the encoder-decoder attention
def initialize_weights(n_h, N, dk, dv):
    '''
    parameters:
        n_h: number of heads
        N: number of features per state (column count of encoder/decoder weights)
        dk: row count of every query and key weight matrix
        dv: row count of every value weight matrix
    return:
        (lst_encoder_weights, lst_masked_decoder_weights, lst_decoder_weights):
        three lists of length n_h. Element i of each list is a dict with the
        Q, K and V variables of head i, keyed 'W<i>Q' / 'md_W<i>Q' / 'd_W<i>Q'
        (and likewise for K and V).
    '''

    tf.set_random_seed(1)

    def build_heads(stem, n_cols):
        # One dict per head; variables are created in Q, K, V order.
        per_head_dicts = []
        for head in range(n_h):
            head_vars = {}
            for suffix, n_rows in (('Q', dk), ('K', dk), ('V', dv)):
                var_name = stem + str(head) + suffix
                head_vars[var_name] = tf.get_variable(
                    var_name, [n_rows, n_cols],
                    initializer=tf.contrib.layers.xavier_initializer(seed=1))
            per_head_dicts.append(head_vars)
        return per_head_dicts

    ## ENCODER WEIGHTS ##
    lst_encoder_weights = build_heads('W', N)

    ## MASKED DECODER WEIGHTS ## (these take a single-column input)
    lst_masked_decoder_weights = build_heads('md_W', 1)

    ## DECODER WEIGHTS ##
    lst_decoder_weights = build_heads('d_W', N)

    return (lst_encoder_weights, lst_masked_decoder_weights, lst_decoder_weights)

In [15]:
# Create the output-projection weights applied to the concatenated heads
def initialize_multihead_attention_weights(N, dk, n_h):
    '''
    parameters:
        N: number of features per state (column count of each weight)
        dk: per-head width used to size the rows (rows = dk * n_h)
        n_h: number of heads
    return:
        (multi_head_attention_weight, masked_multi_head_attention_weight,
         decoder_multi_head_attention_weight): variables 'W1O', 'md_W1O' and
        'd_W1O', each of shape [dk*n_h, N]
    '''
    # NOTE(review): the row count uses dk, while the heads concatenated elsewhere
    # in this notebook have dv columns each - the shapes only agree when dk == dv.
    projections = []
    for var_name in ('W1O', 'md_W1O', 'd_W1O'):
        projections.append(
            tf.get_variable(var_name, [dk*n_h, N],
                            initializer=tf.contrib.layers.xavier_initializer(seed=1)))

    return (projections[0], projections[1], projections[2])

In [16]:
# Create the feed-forward weights for the encoder and the decoder stack
def initialize_FFN_weights(N, dff):
    '''
    parameters:
        N: number of features per state
        dff: width of the feed-forward hidden layer
    return:
        (FFN_weights, D_FFN_weights): two dicts. The first holds 'ffnW1'
        ([N, dff]) and 'ffnW2' ([dff, N]); the second holds 'd_ffnW1' and
        'd_ffnW2' with the same shapes for the decoder stack.
    '''
    def make_pair(first_name, second_name):
        # first matrix maps N -> dff, second maps dff -> N
        pair = {}
        for var_name, shape in ((first_name, [N, dff]), (second_name, [dff, N])):
            pair[var_name] = tf.get_variable(
                var_name, shape,
                initializer=tf.contrib.layers.xavier_initializer(seed=1))
        return pair

    FFN_weights = make_pair('ffnW1', 'ffnW2')
    D_FFN_weights = make_pair('d_ffnW1', 'd_ffnW2')

    return (FFN_weights, D_FFN_weights)

In [17]:
# Create the feed-forward biases for the encoder and the decoder stack
def initialize_bias(N, dff):
    '''
    parameters:
        N: number of features per state
        dff: width of the feed-forward hidden layer
    return:
        (b_weights, d_b_weights): two dicts. The first holds 'b1' ([dff]) and
        'b2' ([N]); the second holds 'd_b1' and 'd_b2' with the same shapes
        for the decoder stack.
    '''
    def make_pair(first_name, second_name):
        # first bias matches the hidden layer, second the layer output
        pair = {}
        for var_name, shape in ((first_name, [dff]), (second_name, [N])):
            pair[var_name] = tf.get_variable(
                var_name, shape,
                initializer=tf.contrib.layers.xavier_initializer(seed=1))
        return pair

    b_weights = make_pair('b1', 'b2')
    d_b_weights = make_pair('d_b1', 'd_b2')

    return (b_weights, d_b_weights)

In [18]:
# Create the parameters of the output layer
def initialize_weight_output(N, n_c):
    '''
    parameters:
        N: number of features
        n_c: number of classes (column count of W_out)
    return:
        W_out: weights for output layer, shape [N, n_c]
        b_out: bias for output layer, shape [1]
    '''
    xavier = tf.contrib.layers.xavier_initializer

    # one column per class; for a single-output problem this would be [N, 1]
    W_out = tf.get_variable('W_out', [N, n_c], initializer=xavier(seed=1))

    # NOTE(review): the bias has shape [1], i.e. one value shared by all n_c
    # columns when it is added to the logits - confirm this is intended.
    b_out = tf.get_variable('b_out', [1], initializer=xavier(seed=1))

    return W_out, b_out

In [19]:
# W, md_W, d_W = initialize_weights(n_h, N, dk, dv)

In [20]:
# invoke all weights & biases here as global variables
# Every helper below creates its variables through tf.get_variable with fixed
# names in the default graph.
# NOTE(review): presumably this means the cell cannot be run twice in the same
# graph without resetting it first - confirm.

W, md_W, d_W = initialize_weights(n_h, N, dk, dv)
mha_W, m_mha_W, d_mha_W = initialize_multihead_attention_weights(N, dk, n_h)
ffn_W, d_ffn_W = initialize_FFN_weights(N, dff)

b_weights, d_b_weights = initialize_bias(N, dff)

W_out, b_out = initialize_weight_output(N, n_c)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



**(Note: Positional Encoding)** The scale of the positional encoding still needs to be resolved: when the encoding value is too small it has a negligible effect on the input state, whereas when the value is increased it affects the output value.

In [21]:
# This function calculates the positional encoding of the input
def positional_encoding(input_data, N, steps):
    '''
    Return a copy of `input_data` with a sinusoidal offset added to each row.

    Row r (0-based) receives the scalar sin((r + 1) / 10000 ** (2 * steps / N)),
    added to every entry of that row. The exponent depends only on `steps`
    and `N`, so there is one offset per row, not one per feature.

    parameters:
        input_data: numpy array with one row per step
        N: number of features per state (used as d_model)
        steps: length of trajectory (used as the exponent index)
    return:
        a new array with the same shape and dtype as input_data holding the
        encoded values. input_data itself is not modified, so other names
        bound to the same array keep their original contents. For integer
        inputs the sum is cast back to the integer dtype on assignment.
    '''
    # work on a copy so the caller's array (and any alias of it) stays untouched
    encoded = np.array(input_data, copy=True)
    n_rows = encoded.shape[0]

    # pos holds the 1-based position of every row
    pos = np.arange(1, n_rows + 1)

    # d_model is same as N
    d_model = N

    # calculate positional embedding with sin function (one scalar per row)
    PE = np.sin(pos / (10000 ** (2 * steps / d_model)))

    # shape the offsets as (n_rows, 1, ..., 1) so they broadcast across each row
    offsets = PE.reshape((n_rows,) + (1,) * (encoded.ndim - 1))

    # item assignment keeps encoded's dtype, matching a row-by-row assignment
    encoded[...] = encoded + offsets
    return encoded

In [22]:
# Session shared by the cells below: it is later bound to the name `sess`
# and closed at the end of the prediction loop.
int_sess = tf.InteractiveSession()

In [23]:
# Temporary helper: position-encode a numpy array and turn it into a tensor
def convert_numpyPE_to_tfPE(S, N, steps):
    '''
    Apply positional_encoding to S and convert the result to a float32 tensor.

    parameters:
        S: input state (numpy array)
        N: number of features per state
        steps: length of trajectory
    return:
        data_tf: float32 tensor holding positional_encoding(S, N, steps)
    '''
    # the positional encoding is applied here, so callers should pass the
    # array without encoding it first
    encoded = positional_encoding(S, N, steps)
    data_tf = tf.convert_to_tensor(encoded, np.float32)
    return data_tf

In [24]:
def mask_output(Z, time_stamp = 0, mask_value = -1e9):
    '''
    Keep the first `time_stamp` rows of Z and replace all other rows.

    parameters:
        Z: tensor to mask (its static shape is used to build the mask)
        time_stamp: number of leading rows whose values are kept
        mask_value: value written into every entry of the remaining rows
    return:
        tensor of the same shape as Z with rows >= time_stamp set to mask_value
    '''
    replacement = tf.fill(Z.shape, mask_value)

    # boolean selector: True where the value of Z is kept
    keep_rows = np.zeros(Z.shape, dtype=bool)
    for row in range(time_stamp):
        keep_rows[row, :] = True

    selector = tf.convert_to_tensor(keep_rows)
    return tf.where(selector, Z, replacement)

In [25]:
# This function will calculate Q, K and V
def get_q_k_v(query_data, key_data, value_data, WQ, WK, WV):
    '''
    Project the three inputs with the transposes of their weight matrices.

    parameters:
        query_data: data to calculate query
        key_data: data to calculate key
        value_data: data to calculate value
        WQ: query weights
        WK: key weights
        WV: value weights
    return:
        (Q, K, V): query_data x WQ^T, key_data x WK^T, value_data x WV^T
    '''
    def project(inputs, weights):
        # weights are stored with the input features along their columns
        return tf.matmul(inputs, tf.transpose(weights))

    return (project(query_data, WQ), project(key_data, WK), project(value_data, WV))

In [26]:
# Compute one attention head from already-projected queries, keys and values
def calaculate_single_attention(Q, K, V, dk, mask = False, time_stamp = 0):
    '''
    parameters:
        Q: query
        K: key
        V: value
        dk: number of query; scores are divided by sqrt(dk)
        mask: when True, pass the raw scores through mask_output
        time_stamp: forwarded to mask_output (number of score rows kept)
    return:
        Z: attention head, softmax(Q x K^T / sqrt(dk)) x V
    '''
    # raw scores between every query row and every key row
    scores = tf.matmul(Q, tf.transpose(K))

    # masking happens on the raw scores, before scaling and softmax
    if mask == True:
        scores = mask_output(scores, time_stamp)

    scaled_scores = tf.divide(scores, np.sqrt(dk))
    attention_weights = tf.nn.softmax(scaled_scores)

    # weighted sum of the values
    return tf.matmul(attention_weights, V)

In [27]:
# Compute n_h attention heads, concatenate them and project the result
def multi_head_attention(query_data, key_data, value_data, dk, n_h, W, mha_W, mask = False):
    '''
    parameters:
        query_data: data to calculate query
        key_data: data to calculate key
        value_data: data to calculate value
        dk: number of query
        n_h: number of heads
        W: list of per-head weight dicts keyed 'W<i>Q', 'W<i>K', 'W<i>V'
        mha_W: multihead_attention_weights applied to the concatenated heads
        mask: accepted but not used by this function
    return:
        Z_: concatenated heads multiplied by mha_W
    '''
    def single_head(idx):
        # pick the Q/K/V weights of head idx and run one attention head
        head_vars = W[idx]
        stem = 'W' + str(idx)
        Q, K, V = get_q_k_v(query_data, key_data, value_data,
                            head_vars[stem + 'Q'], head_vars[stem + 'K'], head_vars[stem + 'V'])
        return calaculate_single_attention(Q, K, V, dk)

    heads = [single_head(idx) for idx in range(n_h)]

    # join the heads side by side (axis 1), then apply the output projection
    return tf.matmul(tf.concat(heads, 1), mha_W)

In [28]:
# Residual connection: add the two inputs and apply dropout to the sum
def add_norm(data1, data2):
    '''
    parameters:
        data1: input data after some operations
        data2: skipped data, bypass from the operations
    return:
        addNorm: data1 + data2 passed through dropout with the global rate Pdrop

    NOTE(review): despite the name, no normalisation step is applied here.
    '''
    residual_sum = tf.add(data1, data2)
    return tf.nn.dropout(residual_sum, rate = Pdrop)

In [29]:
# This function performs the position-wise feed forward NN
def feed_forward_layer(attention_norm, W1, W2, b1, b2):
    '''
    Two-layer feed-forward network: relu(x W1 + b1) W2 + b2.

    Parameters:
        attention_norm: input data after attention and the residual step
        W1: weights of the first (hidden) layer
        W2: weights of the second (output) layer
        b1: bias of the first layer
        b2: bias of the second layer
    return:
        z2: output of the second layer (no activation applied)
    '''
    z1 = tf.add(tf.matmul(attention_norm, W1), b1)

    # relu already clamps negatives to zero, so no further maximum(0, .) is needed
    a1 = tf.nn.relu(z1)

    z2 = tf.add(tf.matmul(a1, W2), b2)

    return z2

In [30]:
# One encoder layer: self-attention and feed-forward, each followed by add_norm
def encode_layer(S, dk, n_h, W, mha_W, ffn_W, b_weights):
    '''
    parameters:
        S: position-encoded input tensor
        dk: number of query
        n_h: number of attention head
        W: attention head weights
        mha_W: weights applied to the concatenated heads
        ffn_W: feed forward network weights (keys 'ffnW1', 'ffnW2')
        b_weights: Bias for feed forward network (keys 'b1', 'b2')
    return:
        output of the encoder layer
    '''
    W1, W2 = ffn_W['ffnW1'], ffn_W['ffnW2']
    b1, b2 = b_weights['b1'], b_weights['b2']

    # self-attention: queries, keys and values all come from S
    attended = multi_head_attention(query_data=S, key_data=S, value_data=S,
                                    dk=dk, n_h=n_h, W=W, mha_W=mha_W)
    attention_norm = add_norm(attended, S)

    transformed = feed_forward_layer(attention_norm, W1, W2, b1, b2)
    return add_norm(transformed, attention_norm)

In [31]:
# add positional encoding to the data and convert to tensorflow format
# convert_numpyPE_to_tfPE applies positional_encoding itself, so it is the only
# call needed; calling positional_encoding beforehand as well would add the
# encoding twice.
S = convert_numpyPE_to_tfPE(S, N, steps)

In [32]:
# Build the encoder part of the graph (a single encoder layer) on S.
encode_value = encode_layer(S, dk, n_h, W, mha_W, ffn_W, b_weights)

# encode_value = encode_layer(encode_value, dk, n_h, W, mha_W, ffn_W, b_weights)

In [33]:
"""
DECODER STACK
"""

'\nDECODER STACK\n'

In [34]:
# Keep a handle on the numpy action array before A is rebound to a tensor.
# NOTE: this is a second name for the same array object (no copy is made).
A_Actual = A

In [35]:
# convert_numpyPE_to_tfPE applies positional_encoding itself, so it is the only
# call needed; calling positional_encoding beforehand as well would add the
# encoding twice.
A = convert_numpyPE_to_tfPE(A, N, steps)

In [36]:
# Masked multi-head attention for the decoder input (shifted-right output)
def masked_multi_head_attention(query_data, key_data, value_data, dk, n_h, md_W, m_mha_W, time_stamp):
    '''
    parameters:
        query_data: data to calculate query
        key_data: data to calculate key
        value_data: data to calculate value
        dk: number of query
        n_h: number of heads
        md_W: list of per-head weight dicts keyed 'md_W<i>Q', 'md_W<i>K', 'md_W<i>V'
        m_mha_W: masked_multihead_attention_weights applied to the concatenated heads
        time_stamp: forwarded to the masking step of every head
    return:
        Z_: concatenated masked heads multiplied by m_mha_W
    '''
    def single_masked_head(idx):
        # pick the Q/K/V weights of head idx and run one masked attention head
        head_vars = md_W[idx]
        stem = 'md_W' + str(idx)
        Q, K, V = get_q_k_v(query_data, key_data, value_data,
                            head_vars[stem + 'Q'], head_vars[stem + 'K'], head_vars[stem + 'V'])
        return calaculate_single_attention(Q, K, V, dk, mask = True, time_stamp = time_stamp)

    masked_heads = [single_masked_head(idx) for idx in range(n_h)]

    # join the heads side by side (axis 1), then apply the output projection
    return tf.matmul(tf.concat(masked_heads, 1), m_mha_W)

In [37]:
# Unmasked multi-head attention of the decoder (uses the d_W weights)
def decoder_multi_head_attention(query_data, key_data, value_data, dk, n_h, d_W, d_mha_W):
    '''
    parameters:
        query_data: data to calculate query
        key_data: data to calculate key
        value_data: data to calculate value
        dk: number of query
        n_h: number of heads
        d_W: list of per-head weight dicts keyed 'd_W<i>Q', 'd_W<i>K', 'd_W<i>V'
        d_mha_W: decoder_multihead_attention_weights applied to the concatenated heads
    return:
        Z_: concatenated heads multiplied by d_mha_W
    '''
    def single_decoder_head(idx):
        # pick the Q/K/V weights of head idx and run one (unmasked) attention head
        head_vars = d_W[idx]
        stem = 'd_W' + str(idx)
        Q, K, V = get_q_k_v(query_data, key_data, value_data,
                            head_vars[stem + 'Q'], head_vars[stem + 'K'], head_vars[stem + 'V'])
        return calaculate_single_attention(Q, K, V, dk)

    decoder_heads = [single_decoder_head(idx) for idx in range(n_h)]

    # join the heads side by side (axis 1), then apply the output projection
    return tf.matmul(tf.concat(decoder_heads, 1), d_mha_W)

In [38]:
# One decoder layer: masked self-attention, attention over the encoder output
# and feed-forward, each followed by add_norm
def decoder_layer(A, encode_value, dk, n_h, md_W, m_mha_W, d_W, d_mha_W, d_ffn_W, d_b_weights, time_stamp=0):
    '''
    parameters:
        A: position-encoded decoder input tensor
        encode_value: encoded value from the encoder layer
        dk: number of query
        n_h: number of attention head
        md_W: masked attention weights
        m_mha_W: weights applied to the concatenated masked heads
        d_W: decoder attention head weights
        d_mha_W: decoder multihead attention weight
        d_ffn_W: decoder feed forward network weights (keys 'd_ffnW1', 'd_ffnW2')
        d_b_weights: decoder Bias for feed forward network (keys 'd_b1', 'd_b2')
        time_stamp: forwarded to the masked attention
    return:
        decode: output of the decoder layer
    '''
    W1, W2 = d_ffn_W['d_ffnW1'], d_ffn_W['d_ffnW2']
    b1, b2 = d_b_weights['d_b1'], d_b_weights['d_b2']

    # 1) masked self-attention on the decoder input
    masked_out = masked_multi_head_attention(query_data=A, key_data=A, value_data=A,
                                             dk=dk, n_h=n_h, md_W=md_W,
                                             m_mha_W=m_mha_W, time_stamp=time_stamp)
    masked_attention_norm = add_norm(masked_out, A)

    # 2) queries from the decoder, keys and values from the encoder output
    cross_out = decoder_multi_head_attention(query_data=masked_attention_norm,
                                             key_data=encode_value,
                                             value_data=encode_value,
                                             dk=dk, n_h=n_h, d_W=d_W, d_mha_W=d_mha_W)
    decoder_attention_norm = add_norm(cross_out, masked_attention_norm)

    # 3) position-wise feed-forward network
    transformed = feed_forward_layer(decoder_attention_norm, W1, W2, b1, b2)
    return add_norm(transformed, decoder_attention_norm)

In [39]:
# First decoder pass; time_stamp is left at its default of 0.
decode_value = decoder_layer(A, encode_value, dk, n_h, md_W, m_mha_W, d_W, d_mha_W, d_ffn_W, d_b_weights)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [40]:
# This function will get prediction from the decoder layer
def get_prediction(decode_value, A, W_out, b_out):
    '''
    parameters:
        decode_value: decoded value from the decoder layer
        A: output values; rows are looked up by the predicted index
        W_out: weight for the output layer
        b_out: bias for the output layer
    return:
        predicted_class: rows of A gathered at the arg-max index of each output row
        Z: logits of the output layer (decode_value x W_out + b_out)
    '''
    Z = tf.matmul(decode_value, W_out) + b_out

    # softmax is used for more than 2 outputs (sigmoid would be used for 2 outputs)
    output_prob = tf.nn.softmax(Z)

    # `axis` replaces the deprecated `dimension` argument of tf.argmax
    predicted_indices = tf.argmax(output_prob, axis=1, name="predictions")

    # NOTE(review): this uses the predicted column index as a row index into A,
    # i.e. it returns A[k] for predicted index k - confirm this is the intended
    # mapping from index to action.
    predicted_class = tf.gather(A, predicted_indices)

    return predicted_class, Z

In [41]:
# cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predicted_class, labels=A))


In [42]:
# Prediction node for the first decoder pass; the logits are discarded here.
predicted_class, _ = get_prediction(decode_value, A, W_out, b_out)

Instructions for updating:
Use the `axis` argument instead


In [43]:
# this function will convert array values to one hot encoding values
def get_one_hot_encoding(A_Actual, n_c):
    '''
    parameters:
        A_Actual: label data, integer class indices with one entry per row
        n_c: number of classes; the result has at least this many columns.
             None falls back to (largest label + 1).
    return:
        b: one hot encoded values of labels, shape (number of labels, width)
           where width = max(n_c, largest label + 1)
    '''
    a = A_Actual.reshape(A_Actual.shape[0],)

    # honour n_c so the width stays correct even when the highest class does
    # not occur in this sample; never go narrower than the labels require
    width = 0 if n_c is None else int(n_c)
    if a.size:
        width = max(width, int(a.max()) + 1)

    b = np.zeros((a.size, width))
    b[np.arange(a.size), a] = 1

    return b

In [44]:
# output_Y = A_Actual.astype(np.float32)

# losses = tf.nn.sigmoid_cross_entropy_with_logits(labels = output_Y, logits = logits)

# cost = tf.reduce_mean(losses)

# optimizer = tf.train.AdamOptimizer(learning_rate=0.007).minimize(cost)

In [45]:
# tf.reset_default_graph()

# op that initialises the variables created so far
init = tf.global_variables_initializer() 

# sess = tf.Session()

# reuse the interactive session opened earlier instead of creating a new one
sess = int_sess

sess.run(init)


In [46]:
# this function will replace output array with unique numbers
def convert_to_unique_numbers(A_Actual):
    '''
    Map every label to the rank of its value among the distinct labels.

    parameters:
        A_Actual: label values (numpy array)
    return:
        numbers: list with one integer per label; the smallest distinct label
                 maps to 0, the next one to 1, and so on
    '''
    names = A_Actual.tolist()

    # sort the distinct labels so the label -> index mapping is deterministic
    # instead of depending on set iteration order
    d = {ni: indi for indi, ni in enumerate(sorted(set(names)))}

    return [d[ni] for ni in names]

In [47]:
# reshape output array to 1D array, then replace each label by an integer id
A_Actual = A_Actual.reshape(A_Actual.shape[0],)
numbers = convert_to_unique_numbers(A_Actual)

In [48]:
# Labels as a float32 one-hot matrix, plus two placeholders.
# NOTE(review): no visible cell builds the model from X or Y - the encoder is
# built on the tensor S and the loss below uses output_Y directly - so feeding
# these placeholders appears to have no effect on the graph. Confirm.
'''for binary class'''
# output_Y = A_Actual.astype(np.float32)

# '''for multiclass get one hot encoded values '''
output_Y = get_one_hot_encoding(np.array(numbers), n_c).astype(np.float32)

# n_y -- scalar, number of classes    
X = tf.placeholder(tf.float32, [steps, N])
# Y = tf.placeholder(tf.float32, [None, n_y])
Y = tf.placeholder(tf.float32, [output_Y.shape[0], output_Y.shape[1]])

In [49]:
# Step-by-step decoding: at step i the current prediction node is evaluated,
# its result is fed back through a newly built decoder pass with time_stamp=i,
# and a new prediction node replaces the old one.
# NOTE(review): decoder_layer and get_prediction are called inside the loop,
# so every iteration adds new ops to the graph and each sess.run evaluates a
# longer chain than the one before.
# NOTE(review): add_norm applies dropout, so repeated evaluations of
# predicted_class are presumably not deterministic - confirm.
for i in range(1, steps+1):
# for i in range(1, 10):
    print(i)
    # evaluate the current prediction node and cast the result to integers
    A_tsi = sess.run([predicted_class])[0].astype(int) # tsi: time stamp at ith value
    
    # keep the numpy prediction of the final step
    if i == steps: #9:#steps---A.shape[0]
        A_hat = A_tsi
    
    # NOTE(review): convert_numpyPE_to_tfPE calls positional_encoding itself,
    # so together with this explicit call the encoding is applied twice.
    A_tsi = positional_encoding(A_tsi, N, steps)

    A_tsi = convert_numpyPE_to_tfPE(A_tsi, N, steps)

    decode_value = decoder_layer(A_tsi, encode_value, dk, n_h, md_W, m_mha_W, d_W, d_mha_W, d_ffn_W, d_b_weights, time_stamp=i)
    
    # `logits` is only bound on the last iteration
    if i == steps: #9:#steps---A.shape[0]
        predicted_class, logits = get_prediction(decode_value, A, W_out, b_out) # logits is fc layer (last)
    else:
        predicted_class, _ = get_prediction(decode_value, A, W_out, b_out)

# `sess` is the same object as int_sess, so this closes the interactive session
sess.close()


# int_sess.close()

#     A_tsi = sess.run([predicted_class])
# A_hat

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [50]:
logits

<tf.Tensor 'add_3081:0' shape=(512, 5) dtype=float32>

In [51]:
# NOTE(review): not referenced by any other visible cell - possibly a leftover.
cost_dict = {}

In [None]:
# Loss, optimiser and training loop on the logits of the last decoding step.
# NOTE(review): get_prediction applies softmax to these logits, while this
# loss treats every class as an independent sigmoid output - confirm which of
# the two is intended for the multi-class case.
# NOTE(review): the labels are the numpy array output_Y, not the placeholder Y.
losses = tf.nn.sigmoid_cross_entropy_with_logits(labels = output_Y, logits = logits)

cost = tf.reduce_mean(losses)

optimizer = tf.train.AdamOptimizer(learning_rate=0.007,
                                   beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False).minimize(cost)


# run the last node of the graph in this code
# (created after the optimiser so that its variables are initialised as well)
init = tf.global_variables_initializer()


# cap this process at 60% of the GPU memory
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.6

# a fresh session is used here; all variables are re-initialised inside it
with tf.Session(config=config) as sess:
    sess.run(init)  
    
    # cost value of every epoch, in order
    cost_lst = []
    
    for epoch in range(num_epochs):
        # NOTE(review): X and Y are fed, but no visible cell connects them to
        # `cost`, so the fed values appear to have no effect - confirm.
        _ , epoch_cost = sess.run([optimizer, cost], feed_dict={X: input_X, Y: output_Y})
        
        cost_lst.append(epoch_cost)
        
        print('Epoch: ', epoch, ' Cost : ', epoch_cost)


In [None]:
from matplotlib import pyplot as plt

In [None]:
# plt.ylim(top=1000)
# print('Transformer on Sample Data')
# Training curve: natural logarithm of the cost recorded for every epoch.
plt.plot(np.log(cost_lst))
plt.xlabel('Epochs')
plt.ylabel('Error on Logramathic scale')
plt.title('Transformer on Games Data \n Steps = '+ str(steps))
plt.show()

In [55]:
# plt.axes.set

In [57]:
# NOTE: this assignment comes after the training cell in file order, so it only
# takes effect if the training cell is executed again afterwards.
num_epochs=1000