In [1]:
import torch
import tiktoken
from torch import nn

# Architecture

<img src="./resources/gptarch.png" alt="GPT architecture diagram">

### Configuration

In [2]:
# Load tiktoken's "gpt2" encoding; its vocabulary size (n_vocab) is read into GPT_CONFIG below.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Hyperparameters consumed by the model classes defined later in this notebook.
GPT_CONFIG = {
    "vocab_size": tokenizer.n_vocab,  # rows of the token embedding / width of the output head
    "context_length": 1024,           # rows of the positional embedding table
    "emb_dim": 768,                   # embedding width used throughout the model
    "n_heads": 12,                    # forwarded to the attention layer as num_heads
    "n_layers": 12,                   # number of stacked transformer blocks
    "drop_rate": 0.1,                 # probability handed to every nn.Dropout
    "qkv_bias": False,                # forwarded to the attention layer as qkv_bias
}

In [4]:
# Display the resolved configuration (vocab_size now holds the tokenizer's n_vocab).
GPT_CONFIG

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': False}

# Transformer Block

<img src="./resources/gpttf.png" alt="Transformer block diagram">

## Normalizing Activations with layer normalization

In [6]:
# Fix the RNG seed so the random demo batch below is reproducible.
torch.manual_seed(123)
# 5 samples with 4 features each, drawn uniformly from [0, 1).
batch = torch.rand(size=(5, 4))
batch

tensor([[0.2961, 0.5166, 0.2517, 0.6886],
        [0.0740, 0.8665, 0.1366, 0.1025],
        [0.1841, 0.7264, 0.3153, 0.6871],
        [0.0756, 0.1966, 0.3164, 0.4017],
        [0.1186, 0.8274, 0.3821, 0.6605]])

In [7]:
# A small Linear(4 -> 8) layer followed by ReLU, applied to the demo batch.
layer = nn.Sequential(
    nn.Linear(in_features=4, out_features=8),
    nn.ReLU(),
)
out = layer(batch)
out

tensor([[0.6374, 0.3983, 0.0873, 0.0000, 0.0000, 0.5753, 0.0609, 0.0043],
        [0.2929, 0.3200, 0.0000, 0.0000, 0.0000, 0.2298, 0.0381, 0.0000],
        [0.6254, 0.4421, 0.0000, 0.0000, 0.0000, 0.5259, 0.0636, 0.0297],
        [0.4001, 0.2804, 0.1059, 0.0000, 0.0000, 0.5429, 0.0369, 0.0000],
        [0.6079, 0.4485, 0.0000, 0.0000, 0.0000, 0.4893, 0.0605, 0.0403]],
       grad_fn=<ReluBackward0>)

In [10]:
# Per-row mean and variance of the raw activations, before any normalization.
print(out.mean(dim=-1,keepdim=True))
print(out.var(dim=-1,keepdim=True))

tensor([[0.2204],
        [0.1101],
        [0.2108],
        [0.1708],
        [0.2058]], grad_fn=<MeanBackward1>)
tensor([[0.0741],
        [0.0208],
        [0.0732],
        [0.0447],
        [0.0681]], grad_fn=<VarBackward0>)


In [13]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Each slice along the last dimension is shifted to zero mean and divided by
    ``sqrt(var + eps)``, then multiplied by ``scale`` and offset by ``shift``.

    Args:
        emb_dim: Size of the last input dimension; also the length of the
            ``scale`` (initialized to ones) and ``shift`` (initialized to zeros)
            parameters.
        unbias: Forwarded to ``Tensor.var`` as ``unbiased``. The default
            ``False`` selects the biased (divide-by-n) variance estimate.
        eps: Constant added to the variance before taking the square root so
            the division stays finite when a slice has zero variance.
    """

    def __init__(self, emb_dim, unbias=False, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.unbiased = unbias

    def forward(self, x):
        """Normalize ``x`` along its last dimension; the output has the shape of ``x``."""
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=self.unbiased)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [14]:
# Size the custom LayerNorm to the last dimension of `out` and normalize those activations.
norm_layer = LayerNorm(out.shape[-1])
logit = norm_layer(out)

In [16]:
# Display the normalized activations.
logit

tensor([[ 1.6378,  0.6984, -0.5228, -0.8658, -0.8658,  1.3939, -0.6266, -0.8489],
        [ 1.3553,  1.5559, -0.8163, -0.8163, -0.8163,  0.8877, -0.5336, -0.8163],
        [ 1.6377,  0.9137, -0.8330, -0.8330, -0.8330,  1.2449, -0.5817, -0.7157],
        [ 1.1597,  0.5544, -0.3281, -0.8637, -0.8637,  1.8819, -0.6769, -0.8637],
        [ 1.6474,  0.9942, -0.8432, -0.8432, -0.8432,  1.1615, -0.5955, -0.6780]],
       grad_fn=<AddBackward0>)

In [15]:
# Sanity-check the normalized rows: each mean should be ~0.
# NOTE: Tensor.var() defaults to the unbiased estimator (divides by n-1), whereas
# LayerNorm above normalizes with unbiased=False (divides by n). With n = 8 features
# the variance printed here is therefore about 8/7 ≈ 1.143 rather than 1;
# pass unbiased=False to var() to see values of ~1.
print(logit.mean(dim=-1,keepdim=True))
print(logit.var(dim=-1,keepdim=True))

tensor([[ 2.2352e-08],
        [ 3.7253e-08],
        [ 1.4901e-08],
        [ 8.1956e-08],
        [-2.9802e-08]], grad_fn=<MeanBackward1>)
tensor([[1.1427],
        [1.1422],
        [1.1427],
        [1.1426],
        [1.1427]], grad_fn=<VarBackward0>)


## Feed-Forward Network and GELU Activation

#### GELU

In [23]:
class GeLU(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """

    # sqrt(2/pi) as a plain Python float: evaluated once when the class is created
    # instead of building a fresh tensor on every forward call. Multiplying by a
    # Python scalar also leaves the dtype and device of ``x`` untouched.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def forward(self, x):
        """Apply the activation element-wise; the result has the shape of ``x``."""
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [18]:
# Small random input of shape (2, 1, 4) for trying out the activation.
# NOTE: no seed is set in this cell, so the values depend on the RNG state left by earlier cells.
inp = torch.rand(2,1,4)
inp

tensor([[[0.5789, 0.9131, 0.0275, 0.1634]],

        [[0.3009, 0.5201, 0.3834, 0.4451]]])

In [26]:
# Apply the custom GeLU module element-wise to the sample input.
gelu = GeLU()
gelu(inp)

tensor([[[0.4160, 0.7481, 0.0141, 0.0923]],

        [[0.1860, 0.3632, 0.2489, 0.2990]]])

#### Feed Forward

In [27]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x the embedding width, apply GeLU, project back.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, the input and output width.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        # Created in the same order as they sit in the Sequential container.
        expand = nn.Linear(width, hidden)
        contract = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, GeLU(), contract)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension keeps size ``emb_dim``."""
        return self.layers(x)

In [28]:
# Shape check: the feed-forward module should return a tensor shaped like its input.
inputs = torch.rand(5,6,768)

ffn = FeedForward(GPT_CONFIG)

# NOTE: this rebinds the notebook-global `out`, which earlier held the Linear+ReLU activations.
out = ffn(inputs)
out.shape

torch.Size([5, 6, 768])

### Transformer Block 

In [30]:
from MultiHeadAttention import MultiHeadAttentionLayer

In [32]:
class TransformerBlock(nn.Module):
    """One transformer layer: attention and feed-forward sub-layers with residuals.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``, i.e. the
    layer norm runs before the sub-layer and the un-normalized input is added
    back afterwards.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        drop_rate = cfg["drop_rate"]

        self.attention = MultiHeadAttentionLayer(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=drop_rate,
            qkv_bias=cfg["qkv_bias"],
        )
        self.ffn = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # A single Dropout module is shared by both residual branches.
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, x):
        """Apply the attention branch, then the feed-forward branch, to ``x``."""
        # Attention branch with residual connection.
        attn_out = self.dropout(self.attention(self.norm1(x)))
        x = attn_out + x

        # Feed-forward branch with residual connection.
        ffn_out = self.dropout(self.ffn(self.norm2(x)))
        return ffn_out + x

In [35]:
# Shape check: a transformer block should return a tensor shaped like its input.
torch.manual_seed(123)

x = torch.rand(2, 4, 768)  # Shape: [batch_size, num_tokens, emb_dim]
block = TransformerBlock(GPT_CONFIG)
output = block(x)

print("Input shape:", x.shape)
print("Output shape:", output.shape)

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


# GPT Model

In [33]:
class GPTModel(nn.Module):
    """Token and positional embeddings, a stack of transformer blocks, and a vocabulary head.

    Args:
        cfg: Mapping that provides ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"drop_rate"`` and ``"n_layers"``, plus whatever
            ``TransformerBlock`` reads from it.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids to logits over the vocabulary.

        ``in_idx`` is unpacked as ``(batch_size, seq_len)``; the result has one
        extra trailing dimension of size ``vocab_size``.
        """
        _, num_tokens = in_idx.shape
        # Position ids 0..num_tokens-1, created on the same device as the input.
        positions = torch.arange(num_tokens, device=in_idx.device)
        # The positional embeddings broadcast across the batch dimension.
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [40]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Hello, I am"
txt2 = "Once you reach your"

# Encode each prompt to int32 token ids and stack them into one (2, n_tokens) batch.
# torch.stack needs equal lengths; both prompts encode to the same number of tokens.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text), dtype=torch.int) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[15496,    11,   314,   716],
        [ 7454,   345,  3151,   534]], dtype=torch.int32)


In [41]:
# Build the full model with a fixed seed and run the two-prompt batch through it.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG)

# NOTE: model.eval() has not been called yet, so the dropout layers are active in this pass.
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[15496,    11,   314,   716],
        [ 7454,   345,  3151,   534]], dtype=torch.int32)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-0.1654, -0.5162, -0.3986,  ..., -0.2268,  0.3410, -0.4689],
         [ 0.9846, -0.8799, -0.6668,  ..., -0.4058,  0.2684, -0.2774],
         [ 0.9290, -0.1931,  0.1867,  ...,  0.0074,  0.7953,  0.1561],
         [-0.3444,  0.0945, -0.2120,  ...,  1.4769, -0.7510, -0.9095]],

        [[-0.7491, -0.5921, -0.3185,  ..., -0.7649, -0.0388, -0.6094],
         [ 0.4283, -0.2397, -1.0481,  ...,  0.2326,  0.6545, -0.5148],
         [ 0.9426,  0.0823, -0.5413,  ..., -0.1721, -1.1168, -0.4192],
         [-0.2581,  0.2348,  0.1501,  ...,  0.7086, -0.0064, -1.1429]]],
       grad_fn=<UnsafeViewBackward0>)


## Sample Text Generation

In [42]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    At every step the last ``context_size`` tokens are fed to ``model`` and the
    highest-scoring token at the final position is appended.

    Args:
        model: Callable mapping a ``(batch, n_tokens)`` tensor of token ids to
            logits of shape ``(batch, n_tokens, vocab_size)``. This function
            does not change the model's train/eval mode; call ``model.eval()``
            first if dropout should be disabled.
        idx: ``(batch, n_tokens)`` tensor of token ids used as the prompt. It is
            not modified in place.
        max_new_tokens: Number of tokens to append; ``0`` returns ``idx`` as is.
        context_size: Maximum number of trailing tokens passed to the model.

    Returns:
        Tensor of shape ``(batch, n_tokens + max_new_tokens)`` holding the
        prompt followed by the generated ids.
    """
    # Inference only: no autograd graph is needed for any step of the loop.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Crop the running sequence to the supported context window.
            idx_cond = idx[:, -context_size:]

            # Keep only the logits for the last position: (batch, vocab_size).
            logits = model(idx_cond)[:, -1, :]

            # Softmax is strictly increasing, so the argmax of the logits is the
            # argmax of the probabilities; skipping it avoids a full pass over
            # the vocabulary on every step.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

            idx = torch.cat((idx, idx_next), dim=1)

    return idx

In [48]:
# Encode the prompt and add a leading batch dimension so it can be fed to the model.
start_context = "Once you reach your"

encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

# unsqueeze(0) turns the 1-D id list into a tensor of shape (1, n_tokens).
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print(encoded_tensor)
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [7454, 345, 3151, 534]
tensor([[7454,  345, 3151,  534]])
encoded_tensor.shape: torch.Size([1, 4])


In [49]:
# Greedily generate six tokens after the prompt. The model was only randomly
# initialized above (no training step appears in this notebook), so the output is not meaningful text.
model.eval() # disable dropout

out = generate_text_simple(
    model=model,
    idx=encoded_tensor, 
    max_new_tokens=6, 
    context_size=GPT_CONFIG["context_length"]
)

print("Output:", out)
print("Output length:", len(out[0]))

Output: tensor([[ 7454,   345,  3151,   534, 20860, 49329, 41977,  5729, 47875,  1508]])
Output length: 10


In [50]:
# Drop the batch dimension and decode the token ids back into text.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Once you reach yourcommunications Bagg66666666 apparentlyigroupailable
