<h3 style="text-align: center;"> Schematics of GPT to be implemented </h3>

<div style="display: flex; justify-content: space-around;">

  <figure style="text-align: center; margin: 10px;">
    <img src="decoder-transformer.png" alt="Simple Transformer Architecture" width="300">
    <figcaption>Simple Transformer Architecture</figcaption>
  </figure>

  <figure style="text-align: center; margin: 10px;">
    <img src="transformer-block.png" alt="Transformer Block" width="300">
    <figcaption>Transformer Block</figcaption>
  </figure>

</div>

In [1]:
# Hyperparameters shared by the model components defined in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # rows of the token embedding / columns of the output head
    context_length=1024,  # number of positions the positional embedding covers
    emb_dim=768,          # width of every token representation
    n_heads=12,           # number of attention heads
    n_layers=12,          # number of stacked transformer blocks
    drop_rate=0.1,        # dropout probability
    qkv_bias=False,       # whether the query/key/value projections carry a bias
)

In [None]:
import torch
import torch.nn as nn 

class DummyGPTModel(nn.Module):
    """Skeleton GPT model wired together from placeholder components.

    Data flow: token embedding + positional embedding -> dropout ->
    ``n_layers`` DummyTransformerBlock modules -> DummyLayerNorm -> linear
    output head projecting to the vocabulary.

    Args:
        cfg: mapping providing "vocab_size", "context_length", "emb_dim",
            "drop_rate" and "n_layers".
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        self.token_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_out = nn.Dropout(cfg["drop_rate"])

        blocks = [DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.transformer_blocks = nn.Sequential(*blocks)

        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, x):
        """Map token ids of shape (B, T) to logits of shape (B, T, vocab_size)."""
        # Unpacking into two names requires x to be exactly 2-D.
        _, seq_len = x.shape

        tok = self.token_emb(x)                                  # (B, T, emb_dim)
        positions = torch.arange(seq_len, device=x.device)       # (T,)
        pos = self.pos_emb(positions)                            # (T, emb_dim)

        hidden = self.drop_out(tok + pos)                        # broadcast over B
        hidden = self.transformer_blocks(hidden)                 # (B, T, emb_dim)
        hidden = self.final_norm(hidden)                         # (B, T, emb_dim)
        return self.out_head(hidden)                             # (B, T, vocab_size)

class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block: holds no parameters and returns its input as is."""

    def __init__(self, cfg):
        # cfg is accepted for signature compatibility only; nothing is read from it.
        super().__init__()

    def forward(self, x):
        """Identity pass-through."""
        return x

class DummyLayerNorm(nn.Module):
    """Placeholder layer norm: holds no parameters and returns its input as is."""

    def __init__(self, normalized_shape, eps=1e-5):
        # Both arguments are accepted for signature compatibility and ignored.
        super().__init__()

    def forward(self, x):
        """Identity pass-through."""
        return x

In [6]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt into a 1-D tensor of token ids.
tmp_batch = [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)]

# torch.stack needs equally sized tensors; the recorded output shows both
# prompts encode to 4 tokens, giving a (2, 4) batch.
batch = torch.stack(tmp_batch, dim=0)
batch.shape

torch.Size([2, 4])

In [8]:
# Let's feed the batch to the model
# Seed before constructing the model so its randomly initialised weights are repeatable.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
# NOTE: model.eval() is not called, so the model's nn.Dropout layer is active
# during this forward pass (nn.Module instances start in training mode).
logits = model(batch)
print("Logits shape: ", logits.shape)  # Should be (2, 4, 50257)
logits


Logits shape:  torch.Size([2, 4, 50257])


tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)

#### Layer Normalization
- Layer normalization is typically applied before and after the multi-head attention module. It helps prevent vanishing and exploding gradients,
which keeps neural network training stable and efficient.

In [9]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Each vector along the last dimension is normalized to zero mean and unit
    (population) variance, then transformed as ``scale * x_hat + shift``.

    Args:
        emb_dim: size of the last dimension of the inputs.
        eps: small constant added to the variance for numerical stability.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.eps = eps

    def forward(self, x):
        """Return the normalized tensor; the shape is the same as ``x``."""
        mean = x.mean(dim=-1, keepdim=True)
        # Population variance (no Bessel correction), as in the standard definition.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        # eps goes inside the square root: x_hat = (x - mean) / sqrt(var + eps).
        # Adding it to the standard deviation instead gives noticeably different
        # results whenever the variance is of the same order as eps.
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [11]:
# Let's test the LayerNorm class
torch.manual_seed(123)
layer_norm = LayerNorm(emb_dim=5)
x = torch.randn(2, 5)
norm_x = layer_norm(x)
# NOTE: .std() is called with its defaults here, which apply Bessel's correction
# (divide by n - 1), while LayerNorm normalises with unbiased=False (divide by n).
# With n = 5 the printed normalized std is therefore sqrt(5/4) ≈ 1.118, not 1.
# Pass unbiased=False to .std() to see a value of about 1.
print(f"Original mean: {x.mean(dim=-1)}, Original std: {x.std(dim=-1)}")
print(f"Normalized mean: {norm_x.mean(dim=-1)}, Normalized std: {norm_x.std(dim=-1)}")

Original mean: tensor([-0.3596, -0.2606]), Original std: tensor([0.5018, 0.5781])
Normalized mean: tensor([-2.9802e-08,  0.0000e+00], grad_fn=<MeanBackward1>), Normalized std: tensor([1.1180, 1.1180], grad_fn=<StdBackward0>)


## Implement a feed-forward network with GELU activations

In [12]:
class GELU(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise. The module has no parameters.
    """

    # sqrt(2 / pi) as a plain Python float, computed once. A Python scalar
    # avoids allocating a tensor on every forward call and keeps full precision
    # when the input is float64 (a float32 tensor constant would truncate it).
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation; the result has the same shape and dtype as ``x``."""
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [13]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    The hidden layer is four times as wide as ``cfg["emb_dim"]``; the second
    linear layer projects back to ``emb_dim``, so the output width matches
    the input width.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim

        expand = nn.Linear(emb_dim, hidden_dim)
        contract = nn.Linear(hidden_dim, emb_dim)
        self.layer = nn.Sequential(expand, GELU(), contract)

    def forward(self, x):
        """Apply the network along the last dimension of ``x``."""
        return self.layer(x)

In [14]:
ffn = FeedForward(GPT_CONFIG_124M)

# input shape: [batch_size, num_token, emb_size]
# The last dimension (768) has to equal GPT_CONFIG_124M["emb_dim"], which is the
# input width of the first Linear layer inside FeedForward.
x = torch.rand(2, 3, 768) 
out = ffn(x)
# FeedForward projects back to emb_dim, so the output shape equals the input shape.
print(out.shape)

torch.Size([2, 3, 768])


## Implement shortcut connections

- Let's look at an example of a deep neural network with and without shortcut connections.

In [42]:
# Example deep neural network (5 layers with the layer sizes used in this notebook)
class DeepNeuralNetwork(nn.Module):
    """Stack of ``Linear -> GELU`` layers with optional shortcut connections.

    Args:
        layer_sizes: sequence of widths; consecutive pairs define one layer
            each, so ``len(layer_sizes) - 1`` layers are created.
        use_shortcut: when True, a layer's input is added to its output
            whenever the two have identical shapes.
    """

    def __init__(self, layer_sizes, use_shortcut=False):
        super().__init__()
        self.use_shortcut = use_shortcut
        width_pairs = zip(layer_sizes[:-1], layer_sizes[1:])
        self.layers = nn.ModuleList(
            nn.Sequential(nn.Linear(n_in, n_out), GELU())
            for n_in, n_out in width_pairs
        )

    def forward(self, x):
        """Run ``x`` through every layer, adding shortcuts where applicable."""
        for layer in self.layers:
            out = layer(x)
            # A shortcut is only possible when input and output shapes agree.
            add_shortcut = self.use_shortcut and x.shape == out.shape
            x = x + out if add_shortcut else out
        return x

In [43]:
def print_gradients(model, x):
    """Print the mean absolute gradient of every weight parameter of ``model``.

    Runs one forward pass on ``x``, computes the MSE loss against an all-zero
    target, backpropagates, and prints one line per parameter whose name
    contains "weight".

    Args:
        model: an ``nn.Module`` to inspect.
        x: input tensor passed to ``model``.
    """
    # Clear gradients left over from earlier backward passes; otherwise
    # calling this function twice on the same model would print accumulated
    # (summed) gradients instead of the gradients of this pass.
    model.zero_grad()

    # Forward pass
    logits = model(x)
    # All-zero target matching the output's shape, dtype and device, so no
    # broadcasting is needed whatever the output shape is.
    target = torch.zeros_like(logits)

    # Compute loss
    loss_fn = nn.MSELoss()
    loss = loss_fn(logits, target)
    # Backward pass
    loss.backward()

    # Print gradients
    for name, param in model.named_parameters():
        # Parameters that received no gradient (e.g. frozen ones) have grad None.
        if "weight" in name and param.grad is not None:
            print(f"Gradient of {name}: {param.grad.abs().mean().item()}")

In [44]:
layer_sizes = [3, 3, 3, 3, 3, 1]
# Seed before building the model and sampling the input so the printed
# gradients are reproducible under Restart & Run All (re-run the cell to
# refresh the recorded output).
torch.manual_seed(123)
dnn_without_shortcut = DeepNeuralNetwork(layer_sizes, use_shortcut=False)
x = torch.rand(1, 3)
print_gradients(dnn_without_shortcut, x)

Gradient of layers.0.0.weight: 6.240667062229477e-06
Gradient of layers.1.0.weight: 3.732276127266232e-06
Gradient of layers.2.0.weight: 7.05161392033915e-06
Gradient of layers.3.0.weight: 1.9250619516242296e-05
Gradient of layers.4.0.weight: 0.0006308858282864094


In [45]:
# Let's test the DeepNeuralNetwork with shortcut connections
# Seed before building the model and sampling the input so the printed
# gradients are reproducible under Restart & Run All (re-run the cell to
# refresh the recorded output).
torch.manual_seed(123)
dnn_with_shortcut = DeepNeuralNetwork(layer_sizes, use_shortcut=True)
x = torch.rand(1, 3)
print_gradients(dnn_with_shortcut, x)

Gradient of layers.0.0.weight: 0.0024605130311101675
Gradient of layers.1.0.weight: 0.0014590122736990452
Gradient of layers.2.0.weight: 0.0022815661504864693
Gradient of layers.3.0.weight: 0.0008245914359577
Gradient of layers.4.0.weight: 0.018168918788433075
