In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LayerNormalization, Dropout
from tensorflow.keras.models import Model

class MultiHeadSelfAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product self-attention.

    Lets every position of a sequence attend to every other position of the
    same sequence. The input is projected to queries, keys and values of width
    ``embed_dim``, split into ``num_heads`` heads of width
    ``embed_dim // num_heads``, attended independently per head, then merged
    and projected back to ``embed_dim``.

    Args:
        embed_dim: Width of the query/key/value projections and of the output.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8):
        # Reject an invalid configuration before any layer state is created.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # Width handled by each individual head.
        self.projection_dim = embed_dim // num_heads
        # Linear projections for query, key and value, plus the output
        # projection applied after the heads are merged again.
        self.query_dense = Dense(embed_dim)
        self.key_dense = Dense(embed_dim)
        self.value_dense = Dense(embed_dim)
        self.combine_heads = Dense(embed_dim)

    def attention(self, query, key, value):
        """Compute scaled dot-product attention.

        Args:
            query: Tensor whose last two axes are (positions, depth).
            key: Tensor matching ``query`` in depth.
            value: Tensor with as many positions as ``key``.

        Returns:
            A ``(output, weights)`` tuple: the attention-weighted combination
            of ``value`` and the softmax attention weights.
        """
        score = tf.matmul(query, key, transpose_b=True)
        # Use the dtype of the scores for the scale factor. A hard-coded
        # float32 makes the division fail with a dtype mismatch as soon as the
        # activations are float16/bfloat16 (e.g. under mixed precision).
        dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
        scaled_score = score / tf.math.sqrt(dim_key)
        # Normalise over the key axis so each query's weights sum to 1.
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)

        return output, weights

    def separate_heads(self, x, batch_size):
        """Split the last axis of ``x`` into heads.

        Reshapes ``x`` to (batch_size, positions, num_heads, projection_dim)
        and returns it transposed to
        (batch_size, num_heads, positions, projection_dim).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs``.

        Args:
            inputs: Tensor whose first axis is the batch axis
                (presumably (batch, positions, features) - confirm with caller).

        Returns:
            Tensor of shape (batch, positions, embed_dim).
        """
        batch_size = tf.shape(inputs)[0]
        # Project the same input three ways: self-attention uses one source
        # for queries, keys and values.
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)

        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)

        # The attention weights are not needed here, only the attended values.
        attention, _ = self.attention(query, key, value)
        # Move the head axis back next to the depth axis and merge the heads.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        output = self.combine_heads(concat_attention)
        return output



2024-05-29 00:19:21.272716: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-29 00:19:21.329800: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-29 00:19:21.329847: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-29 00:19:21.332439: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-29 00:19:21.350225: I tensorflow/core/platform/cpu_feature_guar

In [2]:
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalisation.

    Args:
        embed_dim: Width of the block's input and output.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        rate: Dropout rate applied to the output of each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        # Position-wise feed-forward network: expand to ff_dim, project back.
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim`` (required by
                the residual additions below).
            training: Forwarded to the dropout layers. Defaults to ``None`` so
                the block can be called without the flag, as the model in this
                notebook does; Keras then decides the mode.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # Residual connection + normalisation around the attention sub-layer.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # Residual connection + normalisation around the feed-forward sub-layer.
        return self.layernorm2(out1 + ffn_output)



In [3]:
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Sum of a token embedding and a position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        make_embedding = tf.keras.layers.Embedding
        # One table indexed by token id, one indexed by position.
        self.token_emb = make_embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = make_embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add their position embeddings."""
        # The sequence length is read from the input at run time, not from the
        # ``maxlen`` given to the constructor.
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(seq_len))
        token_vectors = self.token_emb(x)
        # The position vectors have no batch axis; the addition broadcasts them.
        return token_vectors + position_vectors



In [4]:
class TransformerModel(tf.keras.Model):
    """Sequence classifier: embeddings, transformer blocks, mean pool, softmax.

    Args:
        maxlen: Number of positions in the position-embedding table.
        vocab_size: Number of distinct token ids.
        embed_dim: Embedding width used throughout the model.
        num_heads: Attention heads per transformer block.
        ff_dim: Hidden width of each block's feed-forward sub-layer.
        num_blocks: Number of stacked transformer blocks.
        num_classes: Number of units in the softmax output layer. Defaults to
            ``None``, which keeps the original behaviour of using
            ``vocab_size`` units.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, ff_dim, num_blocks,
                 num_classes=None):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_blocks = [
            TransformerBlock(embed_dim, num_heads, ff_dim) for _ in range(num_blocks)
        ]
        output_units = vocab_size if num_classes is None else num_classes
        self.dense_layer = Dense(output_units, activation="softmax")

    def call(self, inputs, training=None):
        """Return one probability distribution per input sequence.

        Args:
            inputs: Batch of token-id sequences.
            training: Forwarded explicitly to every transformer block so their
                dropout layers follow the caller's mode instead of relying on
                implicit propagation by Keras.

        Returns:
            Tensor of shape (batch, output_units) holding softmax probabilities.
        """
        x = self.embedding_layer(inputs)
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x, training=training)
        # Average over the sequence axis to get one vector per sequence.
        x = tf.reduce_mean(x, axis=1)
        return self.dense_layer(x)



In [18]:
# Example usage: hyper-parameters for a small demo model.
maxlen = 100        # maximum length of an input sequence
vocab_size = 20000  # vocabulary size
embed_dim = 128     # embedding width
num_heads = 8       # attention heads per block
ff_dim = 512        # hidden width of the feed-forward sub-layer
num_blocks = 4      # number of transformer blocks

# Keyword arguments make the mapping to the constructor explicit.
model = TransformerModel(
    maxlen=maxlen,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
    num_blocks=num_blocks,
)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)


In [21]:
from tensorflow.keras.utils import plot_model

# Build the model for batches of sequences of length `maxlen` before plotting
# and printing the summary.
model.build(input_shape=(None, maxlen))

file = 'transformer_model'  # base name of the diagram; ".png" is appended below
plot_model(
    model,
    to_file=f"{file}.png",
    show_shapes=True,
    show_dtype=False,
)
model.summary()


Model: "transformer_model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_and_position_embeddi  multiple                  2572800   
 ng_6 (TokenAndPositionEmbe                                      
 dding)                                                          
                                                                 
 transformer_block_24 (Tran  multiple                  198272    
 sformerBlock)                                                   
                                                                 
 transformer_block_25 (Tran  multiple                  198272    
 sformerBlock)                                                   
                                                                 
 transformer_block_26 (Tran  multiple                  198272    
 sformerBlock)                                                   
                                               

In [None]:
# Synthetic example data.
import numpy as np

# A seeded generator makes the synthetic data identical on every run.
# This seeds only the data; weight initialisation and dropout are not affected.
SEED = 42
rng = np.random.default_rng(SEED)

# 1000 random token-id sequences of length `maxlen`, ids in [0, vocab_size).
x_train = rng.integers(0, vocab_size, size=(1000, maxlen))
# NOTE(review): labels are drawn from {0, 1} while the model's output layer has
# `vocab_size` units. This trains because 0 and 1 are valid class indices, but
# confirm the intended number of classes.
y_train = rng.integers(0, 2, size=(1000, 1))

model.fit(x_train, y_train, batch_size=32, epochs=2)