# Transformers

Build a Transformer and train it on text data.

</br></br>

### Import Data and Libraries
---

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
# Read the whole corpus into memory. The encoding is pinned so the decoded
# text does not depend on the platform's default locale encoding.
with open("input.txt", encoding="utf-8") as f:
  text = f.read()

In [None]:
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

</br></br>

### Create Tokenizer and Preprocessor
---

In [None]:
# Sort the unique characters so that every character receives the same id on
# every run; iterating a bare set of strings may yield a different order in
# each interpreter session, which would silently change the vocabulary mapping.
chars = sorted(set(text))

# Character -> integer id, and the inverse id -> character lookup.
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(x):
  """Return the list of integer ids for the characters of the string `x`.

  Raises KeyError for a character that does not occur in the corpus.
  """
  return [stoi[ch] for ch in x]


def decode(x):
  """Return the list of characters for the iterable of integer ids `x`.

  Raises KeyError for an id outside the vocabulary.
  """
  return [itos[idx] for idx in x]

In [None]:
# Fraction of the tokenized corpus used for training; the remainder is held out.
TRAIN_SIZE = 0.8
# Number of tokens in each input window (also the size of the positional embedding table).
CONTEXT_LENGTH = 8
# Number of windows sampled per mini-batch.
BATCH_SIZE = 16
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Length of every token / positional embedding vector.
EMBEDDING_DIM = 32
# Number of distinct characters in the corpus.
VOCAB_SIZE = len(chars)
# Number of Block modules stacked inside the Transformer.
TRANSFORMER_BLOCKS = 2
# Number of attention heads per block; each head works on EMBEDDING_DIM // ATTENTION_HEADS features.
ATTENTION_HEADS = 4

In [None]:
encode("hello"), decode([35, 14, 15, 15, 44])

([35, 14, 15, 15, 44], ['h', 'e', 'l', 'l', 'o'])

In [None]:
# Encode the full corpus into a 1-D tensor holding one integer id per character.
tokenized_text = torch.tensor(encode(text))
tokenized_text.shape

torch.Size([1115394])

In [None]:
# Position of the first held-out token: the leading TRAIN_SIZE fraction of the
# corpus is used for training and everything after it for testing.
split_idx = int(len(tokenized_text) * TRAIN_SIZE)
train_data, test_data = tokenized_text[:split_idx], tokenized_text[split_idx:]

In [None]:
torch.stack((torch.tensor([1, 2, 3]), torch.tensor([1, 2, 3])))

tensor([[1, 2, 3],
        [1, 2, 3]])

In [None]:
# Kind of a stochastic mini batch approach

def get_batch(data, context_length=None, batch_size=None):
  """Sample a random mini-batch of (input, target) windows from `data`.

  Args:
    data: 1-D tensor of token ids to sample windows from.
    context_length: Number of tokens per window. Defaults to the global
      CONTEXT_LENGTH.
    batch_size: Number of windows in the batch. Defaults to the global
      BATCH_SIZE.

  Returns:
    A tuple (X_batch, Y_batch), each of shape (batch_size, context_length).
    Y_batch holds the same windows as X_batch shifted one token to the right,
    i.e. Y_batch[b, t] is the token that follows X_batch[b, t] in `data`.

  Raises:
    ValueError: If `data` has fewer than context_length + 1 tokens, so that
      not even one (input, target) window pair fits.
  """
  if context_length is None:
    context_length = CONTEXT_LENGTH
  if batch_size is None:
    batch_size = BATCH_SIZE

  # Bound the start positions by the tensor that was actually passed in, not
  # by the training split, so smaller splits never yield truncated windows.
  # The last valid start leaves room for the one-token shift of the target.
  num_starts = data.shape[0] - context_length
  if num_starts < 1:
    raise ValueError(
        f"data has {data.shape[0]} tokens but at least {context_length + 1} "
        "are required to build one window"
    )

  # torch.randint's upper bound is exclusive, so starts lie in [0, num_starts).
  starts = torch.randint(high=num_starts, size=(batch_size,)).tolist()

  # torch.stack copies the slices into fresh (batch_size, context_length) tensors.
  X_batch = torch.stack([data[s:s + context_length] for s in starts])
  Y_batch = torch.stack([data[s + 1:s + context_length + 1] for s in starts])

  return (X_batch, Y_batch)

In [None]:
get_batch(train_data)

(tensor([[14, 15, 30, 20, 46, 43, 30, 34],
         [52,  3, 56, 31, 36, 23, 23, 56],
         [30, 43, 14, 15,  7, 24, 14, 26],
         [14, 34, 20,  2, 58, 14, 30, 52],
         [43, 44, 63, 14, 46, 30, 32, 26],
         [ 7, 45, 35, 30, 35,  7, 61, 30],
         [30, 20, 46, 43, 30, 34, 20, 15],
         [35, 26,  2, 58, 45, 58, 30, 32],
         [44, 24, 14, 30, 35,  7, 58, 30],
         [59,  4, 56, 46, 43, 30, 45, 35],
         [14, 46, 30, 43,  7, 14, 16,  4],
         [ 2, 45, 30, 58, 14, 15, 43, 44],
         [ 0,  4, 60, 35, 20, 46,  5, 14],
         [14, 30, 44, 32, 30, 35,  7, 58],
         [14, 30, 26,  2, 15, 14, 43, 30],
         [26, 20, 34, 14, 30,  7, 61, 61]]),
 tensor([[15, 30, 20, 46, 43, 30, 34, 44],
         [ 3, 56, 31, 36, 23, 23, 56, 47],
         [43, 14, 15,  7, 24, 14, 26, 58],
         [34, 20,  2, 58, 14, 30, 52, 30],
         [44, 63, 14, 46, 30, 32, 26,  7],
         [45, 35, 30, 35,  7, 61, 30, 35],
         [20, 46, 43, 30, 34, 20, 15, 15],
         

</br></br>

### Create Model Architecture
---

`Transformer Class`
- **Embedding Layer**: converts token sequences to embeddings of length 32, embedding layer uses vocab size of the length of unique characters

</br>

- **Positional Embedding Layer**: Each row of a batch holds 8 tokens (the context length). In a sequence such as "the cat on the thing the", every "the" would otherwise receive exactly the same embedding, but we want each occurrence to differ slightly depending on where it appears. To achieve this, a second trainable embedding layer, indexed by position, is added to each token embedding. It has a "vocab" size of 8 (one entry per position) and an embedding length of 32, and it is looked up with the indices [0, 1, 2, 3, 4, 5, 6, 7].

</br>

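- **Transformer Blocks**: The summed token and positional embeddings are passed through `TRANSFORMER_BLOCKS` (2) `Block` modules applied one after another. Each block returns vectors of the same embedding length (32), which is what allows the blocks to be stacked.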

</br>

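- **Linear and Softmax**: A final linear layer maps each 32-dimensional output vector to `VOCAB_SIZE` scores (one per character), and a softmax turns those scores into a probability distribution over the next character.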

`Block Class`
- **Self-Attention**: `ATTENTION_HEADS` (4) attention heads run on the same input, each producing vectors of length `EMBEDDING_DIM // ATTENTION_HEADS` (8); their outputs are concatenated along the last dimension, giving back vectors of length 32.

</br>

- **Residual Connection and Layer Normalization**: The attention output is added to the block's input and the sum is layer-normalized.

</br>

- **Feed-Forward Network**: Each position is passed through two linear layers (32 → 128 → 32) with ReLU activation, followed by dropout with probability 0.2.

</br>

- **Residual Connection and Layer Normalization**: The feed-forward output is added to its input and the sum is layer-normalized again, producing the block's output.

In [None]:
class FFNN(nn.Module):
  """Position-wise feed-forward network used inside a transformer block.

  Expands each vector from `emb_dim` to `4 * emb_dim` features, applies ReLU,
  projects back to `emb_dim`, and applies dropout (p=0.2).

  Args:
    emb_dim: Size of the last dimension of the input and of the output.
  """

  def __init__(self, emb_dim):
    super().__init__()
    self.l1 = nn.Linear(emb_dim, emb_dim * 4)
    self.l2 = nn.Linear(emb_dim * 4, emb_dim)
    self.dropout = nn.Dropout(0.2)

  def forward(self, x):
    """Return a tensor with the same shape as `x` (last dimension `emb_dim`)."""
    hidden = F.relu(self.l1(x))
    # No activation after the output projection: the result is added to a
    # residual stream, and a trailing ReLU would only ever allow non-negative
    # updates to it.
    projected = self.l2(hidden)
    return self.dropout(projected)


class AttentionHead(nn.Module):
  """Single head of causal (masked) scaled dot-product self-attention.

  Every position attends only to itself and to earlier positions, which is
  what next-character prediction requires: the target for position t is the
  token at position t + 1, so later tokens must stay hidden.

  Args:
    head_len: Size of the query/key/value vectors, and therefore of the
      output's last dimension. Defaults to `emb_dim // ATTENTION_HEADS`.
    emb_dim: Size of the last dimension of the input. Defaults to the global
      EMBEDDING_DIM.
  """

  def __init__(self, head_len=None, emb_dim=None):
    super().__init__()
    if emb_dim is None:
      emb_dim = EMBEDDING_DIM
    if head_len is None:
      head_len = emb_dim // ATTENTION_HEADS

    self.query = nn.Linear(emb_dim, head_len, bias=False)
    self.key   = nn.Linear(emb_dim, head_len, bias=False)
    self.value = nn.Linear(emb_dim, head_len, bias=False)

  def forward(self, x):
    """Attend over the sequence dimension of `x`.

    Args:
      x: Tensor of shape (..., seq_len, emb_dim).

    Returns:
      Tensor of shape (..., seq_len, head_len).
    """
    q = self.query(x)
    k = self.key(x)
    v = self.value(x)

    # Scale by 1/sqrt(head_len) so the scores do not grow with the head size.
    scores = (q @ k.transpose(-2, -1)) * k.shape[-1] ** -0.5

    # Lower-triangular mask: position t may look at positions 0..t only. It is
    # built from the actual sequence length so shorter inputs also work.
    seq_len = x.shape[-2]
    causal = torch.tril(
        torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device)
    )
    scores = scores.masked_fill(~causal, float("-inf"))

    weights = F.softmax(scores, dim=-1)
    return weights @ v


class SelfAttention(nn.Module):
  """Multi-head self-attention built from ATTENTION_HEADS AttentionHead modules.

  Each head is constructed with a head size of EMBEDDING_DIM // ATTENTION_HEADS
  and the head outputs are concatenated along the last dimension.
  """

  def __init__(self):
    super().__init__()
    head_len = EMBEDDING_DIM // ATTENTION_HEADS
    self.att = nn.ModuleList([
        AttentionHead(head_len) for _ in range(ATTENTION_HEADS)
    ])

  def forward(self, x):
    """Run every head on `x` and return their outputs joined on the last dim."""
    # The result must be returned: the caller adds it to the residual stream.
    return torch.cat([head(x) for head in self.att], dim=-1)


class Block(nn.Module):
  """One transformer block: self-attention and a feed-forward network, each
  followed by a residual addition and layer normalization.

  Input and output both have EMBEDDING_DIM features in the last dimension.
  """

  def __init__(self):
    super().__init__()
    self.att = SelfAttention()
    self.ffnn = FFNN(EMBEDDING_DIM)
    # Separate LayerNorm instances: the two normalization sites see different
    # activations, so each gets its own learnable scale and shift instead of
    # sharing one set of parameters.
    self.att_norm  = nn.LayerNorm(EMBEDDING_DIM)
    self.ffnn_norm = nn.LayerNorm(EMBEDDING_DIM)

  def forward(self, x):
    """Return the block output for `x`; the shape is unchanged."""
    att_out  = self.att(x)
    ln_out   = self.att_norm(x + att_out)
    ffnn_out = self.ffnn(ln_out)
    out      = self.ffnn_norm(ln_out + ffnn_out)

    return out


class Transformer(nn.Module):
  """Character-level transformer that outputs a distribution over characters.

  Token ids are embedded, a learned positional embedding is added, the result
  is passed through TRANSFORMER_BLOCKS Block modules, and a final linear layer
  followed by softmax yields per-position probabilities over the vocabulary.
  """

  def __init__(self):
    super().__init__()
    self.emb     = nn.Embedding(VOCAB_SIZE,     EMBEDDING_DIM)
    self.pos_emb = nn.Embedding(CONTEXT_LENGTH, EMBEDDING_DIM)

    self.transformer_blocks = nn.Sequential(
        *[Block() for _ in range(TRANSFORMER_BLOCKS)]
    )

    self.lin = nn.Linear(EMBEDDING_DIM, VOCAB_SIZE)

  def forward(self, x):
    """Compute character probabilities for every position of `x`.

    Args:
      x: Integer tensor of token ids with shape (batch, seq_len), where
        seq_len must not exceed CONTEXT_LENGTH (the size of the positional
        embedding table).

    Returns:
      Tensor of shape (batch, seq_len, VOCAB_SIZE) whose last dimension sums
      to 1. These are probabilities, not logits.
    """
    # Derive the position ids from the input itself so that sequences shorter
    # than CONTEXT_LENGTH work and the ids live on the same device as `x`.
    positions = torch.arange(x.shape[-1], device=x.device)
    emb = self.emb(x) + self.pos_emb(positions)

    transformer_out = self.transformer_blocks(emb)

    # Normalize over the vocabulary axis. Without an explicit `dim`, softmax
    # on a 3-D tensor falls back to dim 0, i.e. it would normalize across the
    # batch instead of across characters.
    char_distro = F.softmax(self.lin(transformer_out), dim=-1)
    return char_distro

In [None]:
# Smoke test: build the model and push the inputs of one training batch
# through it to inspect the shape of the output.
transformer = Transformer()
transformer(get_batch(train_data)[0]).shape

TypeError: SelfAttention.__init__() missing 1 required positional argument: 'emb_dim'

In [None]:
# Scratch check of the nn.Sequential(*[...]) stacking pattern, using FFNN
# modules as stand-ins; displaying the container lists its sub-modules.
Transformer_blocks = nn.Sequential(
    *[FFNN(32) for _ in range(TRANSFORMER_BLOCKS)]
)

Transformer_blocks

Sequential(
  (0): FFNN(
    (l1): Linear(in_features=32, out_features=128, bias=True)
    (l2): Linear(in_features=128, out_features=32, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (1): FFNN(
    (l1): Linear(in_features=32, out_features=128, bias=True)
    (l2): Linear(in_features=128, out_features=32, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

In [None]:
# Scratch check of nn.LayerNorm: normalize a random 4 x 5 tensor over its
# last dimension (size 5) and compare the result with the raw values.
thing = torch.randn((4, 5))
print(thing)
nn.LayerNorm(5)(thing)

tensor([[-1.3097, -1.0693,  1.4562,  0.5970, -1.7053],
        [ 1.1939, -0.5129,  0.1139, -0.4218,  0.2866],
        [ 0.6820, -0.8391,  0.4124,  0.3746,  1.8601],
        [-0.2809,  0.6187,  1.3372, -0.0051, -0.6741]])


tensor([[-0.7417, -0.5444,  1.5289,  0.8236, -1.0665],
        [ 1.7344, -1.0532, -0.0294, -0.9044,  0.2526],
        [ 0.2139, -1.5544, -0.0996, -0.1435,  1.5835],
        [-0.6781,  0.5926,  1.6075, -0.2885, -1.2335]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
torch.cat([torch.randn((4, 5)) for i in range(3)], dim=1).shape

torch.Size([4, 15])