In [1]:
import tensorflow as tf
import keras 

In [2]:
def scaled_dot_attention (Q,K,V) :
    d_k = tf.cast(tf.shape(K)[-1],tf.float32)
    scores = tf.matmul(Q,K,transpose_b=True)/tf.math.sqrt(d_k)
    attention_weight = tf.nn.softmax(scores,axis=-1)
    output = tf.matmul(attention_weight,V)
    return output

In [3]:
Q = tf.random.normal([1, 3, 4])
K = tf.random.normal([1, 3, 4])
V = tf.random.normal([1, 3, 4])

In [4]:
print(f"Q : {Q}")
print(f"K : {K}")
print(f"V : {V}")

Q : [[[-1.9292568   1.2269667   0.6370855   0.268685  ]
  [ 0.09105452 -0.64205617 -1.4091802   0.6219789 ]
  [-0.04484814  1.2287714   0.5167272  -0.01811061]]]
K : [[[-0.96645766 -0.5565629  -0.45644137  0.18691523]
  [-1.6269337  -1.5180072   0.01363173 -1.1714674 ]
  [-0.10310423  0.24987963  0.7463436  -1.4164997 ]]]
V : [[[-0.3739626  -0.70063055  0.898552   -0.315544  ]
  [-0.04521862  0.23821616  0.52189904 -0.10142639]
  [ 0.7520454   0.61420715 -0.39475915 -0.6265716 ]]]


In [5]:
scaled_dot_attention(Q,K,V)

<tf.Tensor: shape=(1, 3, 4), dtype=float32, numpy=
array([[[ 0.07502148,  0.02067154,  0.38318983, -0.33131033],
        [-0.13379401, -0.23168473,  0.6230169 , -0.27831373],
        [ 0.3288317 ,  0.21215089,  0.09149134, -0.45905122]]],
      dtype=float32)>

In [6]:
import tensorflow as tf
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    d_k = tf.cast(tf.shape(K)[-1], tf.float32)
    scores = tf.matmul(Q, K, transpose_b=True) / tf.math.sqrt(d_k)
    attention_weights = tf.nn.softmax(scores, axis=-1)
    output = tf.matmul(attention_weights, V)
    return output

# Contoh input (3 kata dengan dimensi 4)
Q = tf.random.normal([1, 3, 4])
K = tf.random.normal([1, 3, 4])
V = tf.random.normal([1, 3, 4])

output = scaled_dot_product_attention(Q, K, V)
print(output)


tf.Tensor(
[[[ 1.0591817   0.50720805 -0.11987717 -0.11517995]
  [ 0.92439175  0.30485007 -0.04524641 -0.04805614]
  [ 0.899948    0.32792068 -0.10444834 -0.0222591 ]]], shape=(1, 3, 4), dtype=float32)


In [7]:
class Scaled_dot_Attention (tf.keras.layers.Layer) :
    def __init__ (self):
        super(Scaled_dot_Attention,self).__init__()

    def call(self,Q,K,V,mask=None) :
        d_k = tf.cast(td.shape(K)[-1],tf.float32) 
        scores = tf.matmul(Q,K,transpose_b=True)/tf.math.sqrt(d_k)
        if mask is not None :
            scores += (mask * -1e9)
        weight = tf.nn.softmax(scores,axis=-1)
        output = tf.matmul(weight,V)
        return output,weight

In [8]:
class MultiheadAttention (tf.keras.layers.Layer) :
    def __init__ (self,num_heads,d_model) :
        super(MultiheadAttention,self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.Wq = keras.layers.Dense(d_model)
        self.Wk = keras.layers.Dense(d_model)
        self.Wv = keras.layers.Dense(d_model)
        self.dense = keras.layers.Dense(d_model)
        self.attention = Scaled_dot_Attention()
    
    def splitsHeads (self,x,batch_size):
        x = tf.reshape(x,(batch_size,-1,self.num_heads,self.depth))
        return tf.transpose(x,perm=[0,2,1,3])
    
    def call (self,Q,K,V,mask=None) :
        batch_size = tf.shape(Q)[0]
        Q = self.splitsHeads(self.Wq, batch_size)
        K = self.splitsHeads(self.Wk, batch_size)
        V = self.splitsHeads(self.Wv, batch_size)

        output,attention_weight = self.attention(Q,K,V,mask)
        output = tf.transpose(output,perm=[0,2,1,3])
        output = tf.reshape(output,(batch_size,-1,self.d_model))
        output = tf.dense(output)
        
        return output,attention_weight


In [9]:
class Encoder_Layers (keras.layers.Layer) :
    def __init__ (self,d_model,num_heads,dff,rate=0.1) :
        super(Encoder_Layers,self).__init__()
        self.mha = MultiheadAttention(num_heads=num_heads,d_model=d_model)
        self.ffn = tf.keras.Sequential([
            keras.layers.Dense(dff,activation='relu'),
            keras.layers.Dense(d_model)
        ])

        self.layernormal1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernormal2 = keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call (self,x,mask) :
        attn_output = self.mha(x,x,x,mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernormal1(x + attn_output)
        
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernormal2(out1 + ffn_output)
        
        return out2
        

In [10]:
class DecoderLayer (keras.layers.Layer) :
    def __init__ (self,d_model,num_heads, dff,rate=0.1) :
        super(DecoderLayer,self).__init__()
        self.mha1 = MultiheadAttention(num_heads=num_heads,d_model=d_model)
        self.mha2 = MultiheadAttention(num_heads=num_heads,d_model=d_model)

        self.ffn = tf.keras.Sequential(
            [
                keras.layers.Dense(dff,activation='relu'),
                keras.layers.Dense(d_model)

            ]
        )
        self.layernormal1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernormal2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernormal3 = keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)
        self.dropout3 = keras.layers.Dropout(rate)
    
    def call(self,x,enc_output,look_ahead_mask,padding_mask) :
        attn1, _ = self.mha1(x, x, x, look_ahead_mask)  # Masked Self-Attention
        attn1 = self.dropout1(attn1)
        out1 = self.layernorm1(x + attn1)

        attn2, _ = self.mha2(out1, enc_output, enc_output, padding_mask)  # Encoder-Decoder Attention
        attn2 = self.dropout2(attn2)
        out2 = self.layernorm2(out1 + attn2)

        ffn_output = self.ffn(out2)  # Feed Forward Network
        ffn_output = self.dropout3(ffn_output)
        out3 = self.layernorm3(out2 + ffn_output)

        return out3

In [11]:
import tensorflow as tf
import numpy as np

# Tentukan dimensi model (ukuran vektor setiap kata)
d_model = 128  # Ukuran embedding
num_heads = 8  # Jumlah kepala attention
dff = 512  # Ukuran hidden layer pada FFN
dropout_rate = 0.1  # Dropout untuk regularisasi


In [12]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super(EncoderLayer, self).__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training, mask):
        attn_output = self.mha(x, x, x, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super(DecoderLayer, self).__init__()
        self.mha1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.mha2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)

        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attn1 = self.mha1(x, x, x, attention_mask=look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(x + attn1)

        attn2 = self.mha2(out1, enc_output, enc_output, attention_mask=padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(out1 + attn2)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(out2 + ffn_output)

        return out3


In [13]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, dropout_rate):
        super(Encoder, self).__init__()
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training, mask):
        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)
        return x

class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, dropout_rate):
        super(Decoder, self).__init__()
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        for dec_layer in self.dec_layers:
            x = dec_layer(x, enc_output, training, look_ahead_mask, padding_mask)
        return x


In [14]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, dropout_rate):
        super(Transformer, self).__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, dropout_rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, dropout_rate)
        self.final_layer = tf.keras.layers.Dense(d_model)  # Layer akhir untuk output

    def call(self, enc_input, dec_input, training, enc_mask, look_ahead_mask, dec_mask):
        enc_output = self.encoder(enc_input, training, enc_mask)
        dec_output = self.decoder(dec_input, enc_output, training, look_ahead_mask, dec_mask)
        final_output = self.final_layer(dec_output)  # Proses ke layer akhir
        return final_output


In [None]:
# Buat input acak (seolah-olah ini adalah embedding dari token kata)
batch_size = 2  # Contoh 2 kalimat
seq_length = 10  # Maksimal 10 kata per kalimat
dummy_enc_input = tf.random.uniform((batch_size, seq_length, d_model))  # Encoder input
dummy_dec_input = tf.random.uniform((batch_size, seq_length, d_model))  # Decoder input

# Buat Transformer dengan 2 layer Encoder & Decoder
transformer = Transformer(num_layers=2, d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate)



In [16]:
# Jalankan model
output = transformer(dummy_enc_input, dummy_dec_input, training=False, 
                     enc_mask=None, look_ahead_mask=None, dec_mask=None)

print("Output Transformer shape:", output.shape)  # Harusnya (batch_size, seq_length, d_model)


ValueError: Exception encountered when calling Transformer.call().

[1mOnly input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: False (of type <class 'bool'>)[0m

Arguments received by Transformer.call():
  • enc_input=tf.Tensor(shape=(2, 10, 128), dtype=float32)
  • dec_input=tf.Tensor(shape=(2, 10, 128), dtype=float32)
  • training=False
  • enc_mask=None
  • look_ahead_mask=None
  • dec_mask=None