<a href="https://colab.research.google.com/github/AdityaKane2001/transformer2017/blob/main/transformer2017.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras 

In [None]:


# Model hyperparameters shared by the layers defined below.
MAX_SEQ_LEN = 64
# BATCH_SIZE = 25000
DROPOUT_RATE = 0.1
EMBEDDING_DIMS = 512
VOCABULARY_SIZE = 4096
N_TRANSFORMERS = 6
FFNN_DIMS = 2048
NUM_HEADS = 8
# Per-head sizes are layer dimensions, so they must be integers: use floor
# division rather than true division (which would yield 64.0).
KEY_DIMS = EMBEDDING_DIMS // NUM_HEADS
VALUE_DIMS = EMBEDDING_DIMS // NUM_HEADS

"""
Input pipeline:
1. We get batch_size number of pairs of sentences from the dataset: 
    batch_size x  ("My name is Aditya Kane", "<start> Ich bin Aditya Kane <end>")
2. These sentences are then tokenized: 
    batch_size x  ([2,3,4,5,6],[1,15,7,8,9,1000])
3. The sentences are then padded to the length of the longest sentence:
    batch_size x ([2,3,4,5,6,0,0,0,0], [1,15,7,8,9,1000,0,0,0])
4. They are then converted to embeddings:
    batch_size x max_seq_len x embedding_dims
5. Add positional embeddings to this
    batch_size x max_seq_len x embedding_dims
This is the input to our model.
"""

class PositionAwareEmbeddings(layers.Layer):
    """Token embeddings summed with fixed sinusoidal positional encodings.

    The layer looks up an embedding vector for every token id, adds the
    sinusoidal position table for the input's sequence length, and applies
    dropout to the sum.
    """

    def __init__(self):
        super().__init__()
        self.dropout = layers.Dropout(DROPOUT_RATE)
        self.embed_dims = EMBEDDING_DIMS
        self.vocab_size = VOCABULARY_SIZE
        self.max_seq_len = MAX_SEQ_LEN
        # `input_length` is intentionally not passed: `call` derives the
        # sequence length from the input itself, and the argument is
        # deprecated in current Keras releases.
        self.embeddings = layers.Embedding(VOCABULARY_SIZE, EMBEDDING_DIMS)

    def get_positional_embeddings(self, input_seq_len):
        """Build the sinusoidal position table.

        Follows the 2017 Transformer formulation, with ``d = self.embed_dims``:

            PE[pos, 2i]     = sin(pos / 10000 ** (2i / d))
            PE[pos, 2i + 1] = cos(pos / 10000 ** (2i / d))

        Args:
            input_seq_len: Number of positions to encode (Python int or
                scalar integer tensor).

        Returns:
            A float32 tensor of shape ``(input_seq_len, self.embed_dims)``.
        """
        # Angles are computed in float64 and only the final sin/cos values
        # are narrowed to float32.
        positions = tf.expand_dims(
            tf.cast(tf.range(input_seq_len), tf.float64), axis=-1
        )
        exponents = (
            tf.cast(tf.range(0, self.embed_dims, delta=2), tf.float64)
            / self.embed_dims
        )
        freqs = tf.math.pow(tf.constant(10000.0, dtype=tf.float64), -exponents)

        angles = positions * freqs  # (input_seq_len, embed_dims / 2)
        sin_embs = tf.cast(tf.math.sin(angles), tf.float32)
        cos_embs = tf.cast(tf.math.cos(angles), tf.float32)

        # Stacking on a new last axis and flattening it interleaves the two
        # tables: even feature indices carry sine, odd ones carry cosine.
        # This replaces the scatter with hard-coded `range(512)` indices, so
        # the table size now always follows `self.embed_dims`.
        interleaved = tf.stack([sin_embs, cos_embs], axis=-1)
        return tf.reshape(interleaved, (-1, self.embed_dims))

    def call(self, inputs, training=None):
        """Embed token ids and add positional information.

        Args:
            inputs: Integer token ids whose last axis is the sequence axis.
            training: Forwarded to the dropout layer; dropout is only active
                in training mode.

        Returns:
            The embedded inputs with positional encodings added and dropout
            applied, with a trailing axis of size ``self.embed_dims``.
        """
        input_seq_len = inputs.shape[-1]
        if input_seq_len is None:
            # The static length is unknown (dynamic shape); read it at runtime.
            input_seq_len = tf.shape(inputs)[-1]

        pos_emb = self.get_positional_embeddings(input_seq_len)
        outputs = self.embeddings(inputs)
        # Match the embedding dtype so the sum also works when the layer
        # computes in a dtype other than float32.
        outputs += tf.cast(pos_emb, outputs.dtype)

        return self.dropout(outputs, training=training)

class MultiheadAttention(layers.Layer):
    """Placeholder for the multi-head attention layer (forward pass not written yet)."""

    def __init__(self):
        super().__init__()
        # Head count comes from the module-level NUM_HEADS setting.
        self.heads = NUM_HEADS

    def call(self, inputs):
        """Not implemented yet: ignores ``inputs`` and returns ``None``."""
        return None

class ResidualAddNormMHA(layers.Layer):
    """Placeholder for the residual add-and-normalise wrapper around attention.

    The base class must be ``layers.Layer`` (capital L); ``layers.layer`` is
    not the Keras layer base class, so the class statement could not define
    a working layer.
    """

    def __init__(self):
        super().__init__()

    def call(self, inputs):
        """Not implemented yet: ignores ``inputs`` and returns ``None``."""
        pass

In [None]:
pe = PositionAwareEmbeddings()
# get_positional_embeddings returns one (seq_len, embed_dims) tensor, so it
# must be bound to a single name rather than unpacked into three.
pos_embs = pe.get_positional_embeddings(100)
# print(pos_embs.shape)

In [None]:
# Shape probe: two identical 4x4 blocks whose rows are filled with 5, 6, 7, 8.
tf.constant([[[value] * 4 for value in range(5, 9)]] * 2).shape

TensorShape([2, 4, 4])

In [None]:
import pathlib

# Download and extract the spa-eng archive, then build the path to spa.txt
# inside the "spa-eng" folder that sits next to the downloaded file.
text_file = pathlib.Path(
    tf.keras.utils.get_file(
        extract=True,
        origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
        fname="spa-eng.zip",
    )
).parent.joinpath("spa-eng", "spa.txt")

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [None]:
# Read the tab-separated sentence pairs. The encoding is explicit so the
# accented Spanish text decodes the same way on every platform.
with open(text_file, encoding="utf-8") as f:
    # Drop empty lines instead of blindly discarding the last element, so the
    # final pair is kept even when the file has no trailing newline.
    lines = [line for line in f.read().split("\n") if line]

# Each pair is (english, spanish); the Spanish side is wrapped in
# "[start]" / "[end]" marker tokens.
text_pairs = []
for line in lines:
    eng, spa = line.split("\t")
    spa = "[start] " + spa + " [end]"
    text_pairs.append((eng, spa))

In [None]:
import random

# Spot-check the parsed data: show five randomly chosen pairs, one per line.
print(*(random.choice(text_pairs) for _ in range(5)), sep="\n")

("We stayed up until about 2:30 studying for today's test.", '[start] Permanecimos despiertos, hasta eso de las dos y media, estudiando para el examen de hoy. [end]')
('The knife is dull.', '[start] El cuchillo está romo. [end]')
('Beijing is bigger than Rome.', '[start] Pekín es más grande que Roma. [end]')
("I'm looking for a bank. Is there one near here?", '[start] Estoy buscando un banco. ¿Hay alguno por aquí cerca? [end]')
('Stars can be seen at night.', '[start] Por la noche se pueden ver las estrellas. [end]')


In [None]:
# Report how many sentence pairs were loaded.
print(len(text_pairs))

118964
