In [4]:
import torch
import torch.nn as nn
import math

In [109]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width
    ``head_dim * num_heads``, split into ``num_heads`` heads, combined with
    scaled dot-product attention under a causal mask, and finally passed
    through an output projection of the same width.

    Args:
        d_in: Size of the last dimension of the input.
        head_dim: Width of each attention head.
        context_length: Longest sequence the causal mask covers.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections carry a bias term.
    """

    def __init__(self, d_in, head_dim, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        self.d_in = d_in
        self.d_out = head_dim*num_heads
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.W_query = nn.Linear(d_in, self.d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, self.d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, self.d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(self.d_out, self.d_out)
        # Ones strictly above the diagonal mark the "future" positions.
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, input_layer):
        """Apply causal self-attention.

        Args:
            input_layer: Tensor of shape [batch, seq_len, d_in].

        Returns:
            Tensor of shape [batch, seq_len, head_dim * num_heads].

        Raises:
            ValueError: If seq_len is larger than the context length the
                causal mask was built for.
        """
        batch, sequence_len, _ = input_layer.shape
        context_length = self.mask.shape[0]
        # Without this check a too-short mask is sliced to its full size and then
        # either fails to broadcast or (for a 1x1 mask) silently masks nothing.
        if sequence_len > context_length:
            raise ValueError(
                f"sequence length {sequence_len} exceeds context_length {context_length}"
            )

        queries = self.W_query(input_layer)
        keys = self.W_key(input_layer)
        values = self.W_value(input_layer)
        # q, k, v shapes are [batch, seq_len, d_out];
        # reshape them to [batch, seq_len, head, head_dim]
        queries = queries.view(batch, sequence_len, self.num_heads, self.head_dim)
        keys = keys.view(batch, sequence_len, self.num_heads, self.head_dim)
        values = values.view(batch, sequence_len, self.num_heads, self.head_dim)
        queries = queries.transpose(1,2)
        keys = keys.transpose(1,2)
        values = values.transpose(1,2)
        # q, k, v shape - [batch, head, seq_len, head_dim]
        attn_scores = queries @ keys.transpose(2,3)
        # attn_scores shape - [batch, head, seq_len, seq_len]
        causal_mask = self.mask.bool()[:sequence_len, :sequence_len]
        # -inf scores become zero weights after the softmax.
        attn_scores = attn_scores.masked_fill(causal_mask, -torch.inf)
        attn_weights = torch.softmax(attn_scores/math.sqrt(self.head_dim), dim=-1)
        attn_weights = self.dropout(attn_weights)
        context_vec = attn_weights @ values
        # context_vec shape - [batch, num_heads, seq_len, head_dim]
        # Merge the heads back into one vector of width d_out per position.
        context_vec = context_vec.transpose(1,2).contiguous().view(batch, sequence_len, self.d_out)
        return self.out_proj(context_vec)

In [58]:
# Toy dimensions for smoke-testing MultiHeadAttention.
batch = 2
seq_len = 3
d_in = 5
head_dim = 5
context_len = 5
dropout = 0.1
num_heads = 2
# d_out will be head_dim * num_heads = 5 * 2 = 10

In [59]:
# Build the attention layer from the toy dimensions.
mha = MultiHeadAttention(
    d_in=d_in,
    head_dim=head_dim,
    context_length=context_len,
    dropout=dropout,
    num_heads=num_heads,
)

In [128]:
# Add up the element counts of every parameter tensor in the attention layer.
total_params = sum(map(torch.numel, mha.parameters()))
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 260


In [60]:
# Random input of shape (batch, seq_len, d_in).
dummy_input = torch.randn((batch, seq_len, d_in))

In [61]:
# Forward pass through the attention layer.
out = mha(input_layer=dummy_input)

In [62]:
# Last dimension is head_dim * num_heads.
print(out.size())

torch.Size([2, 3, 10])


In [63]:
# Inspect the raw values produced by the attention layer.
print(out)

tensor([[[ 0.1363, -0.0252,  0.3123, -0.2153, -0.2828, -0.1511,  0.3966,
           0.0476,  0.5206,  0.5171],
         [ 0.1866, -0.0819,  0.3330, -0.1300, -0.2103, -0.1940,  0.3484,
           0.0607,  0.5241,  0.4756],
         [ 0.3216, -0.0443,  0.0479, -0.3863, -0.3033, -0.1601,  0.1961,
          -0.1051,  0.2081,  0.4516]],

        [[ 0.1345, -0.3399,  0.3063,  0.2915,  0.2007,  0.3696, -0.5283,
           0.0192,  0.5290, -0.1796],
         [ 0.0671, -0.3187, -0.1127,  0.3623,  0.0985,  0.4154, -0.7602,
           0.0150,  0.3262, -0.2289],
         [ 0.1386, -0.3822, -0.0712,  0.2497,  0.0062,  0.3670, -0.5348,
          -0.1006,  0.4150, -0.0215]]], grad_fn=<ViewBackward0>)


In [64]:
# Boolean causal mask: True strictly above the main diagonal, False elsewhere.
mask = torch.ones(context_len, context_len).triu(diagonal=1).bool()
print(mask)

tensor([[False,  True,  True,  True,  True],
        [False, False,  True,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False, False,  True],
        [False, False, False, False, False]])


In [110]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to four times the width, GELU, project back, dropout.

    Args:
        d_in: Size of the last dimension of the input (and of the output).
        dropout: Dropout probability applied to the block's output.
    """

    def __init__(self, d_in, dropout):
        super().__init__()
        hidden_dim = 4*d_in
        self.activation = nn.GELU()
        self.dropout = nn.Dropout(dropout)
        self.layer1 = nn.Linear(d_in, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, d_in)

    def forward(self, input_tensor):
        """Return a tensor with the same shape as ``input_tensor``."""
        hidden = self.activation(self.layer1(input_tensor))
        return self.dropout(self.layer2(hidden))

In [111]:
 class LayerNorm(nn.Module):
     """Layer normalisation over the last dimension with a learnable scale and shift.

     Args:
         emb_dim: Size of the last dimension of the tensors to normalise.
     """

     def __init__(self, emb_dim):
         super().__init__()
         # Added to the variance so the division stays finite for constant inputs.
         self.eps = 1e-5
         self.scale = nn.Parameter(torch.ones(emb_dim))
         self.shift = nn.Parameter(torch.zeros(emb_dim))

     def forward(self, x):
         """Normalise ``x`` along its last dimension, then apply scale and shift."""
         mu = x.mean(dim=-1, keepdim=True)
         # Biased (population) variance, i.e. divide by N rather than N - 1.
         sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
         centered = x - mu
         normalized = centered / torch.sqrt(sigma_sq + self.eps)
         return self.scale * normalized + self.shift

In [83]:
# Display what a freshly created parameter looks like (note requires_grad=True).
nn.Parameter(torch.ones(5))

Parameter containing:
tensor([1., 1., 1., 1., 1.], requires_grad=True)

In [129]:
# Feed-forward block at the full model width.
# NOTE(review): GPT_CONFIG_124M is defined in a later cell of this notebook, so
# that cell has to be executed before this one.
ff = FeedForward(
    d_in=GPT_CONFIG_124M["emb_dim"],
    dropout=GPT_CONFIG_124M["dropout"],
)

In [130]:
# Total number of scalars across the feed-forward block's weights and biases.
total_params = sum([weights.numel() for weights in ff.parameters()])
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 4,722,432


In [79]:
# `ff` is built for 768-wide inputs while `dummy_input` has d_in = 5 features, so
# calling `ff(dummy_input)` fails with a shape error on a top-to-bottom run.
# Use a feed-forward block sized for the toy input instead; its output has the
# same (batch, seq_len, d_in) shape as the input.
ff_toy = FeedForward(d_in, dropout)
ff_toy(dummy_input)

tensor([[[ 0.1552, -0.0985,  0.0164,  0.0320,  0.0066],
         [ 0.1247, -0.0298,  0.0656, -0.0014,  0.1116],
         [ 0.2474,  0.1433, -0.0559, -0.2302,  0.0719]],

        [[ 0.2712, -0.4586, -0.0041,  0.0925,  0.0307],
         [ 0.0747,  0.1404,  0.0000, -0.0840,  0.0527],
         [ 0.0000, -0.2969,  0.1940, -0.1839,  0.2872]]],
       grad_fn=<MulBackward0>)

In [125]:
class TransformerBlock(nn.Module):
    """Transformer block that normalises before each sub-layer.

    Runs layer norm -> multi-head attention -> dropout with a residual
    connection, followed by layer norm -> feed-forward with a second residual
    connection.

    Args:
        config: Mapping with the keys "emb_dim", "head_dim", "num_heads",
            "context_len", "dropout" and "qkv_bias".
    """

    def __init__(self, config):
        super().__init__()
        emb_dim = config["emb_dim"]
        # The attention output has width head_dim * num_heads and is added to the
        # block input, so the two widths have to agree.
        assert config["head_dim"] * config["num_heads"] == emb_dim
        self.layer_norm1 = LayerNorm(emb_dim)
        self.attention_block = MultiHeadAttention(
            d_in=emb_dim,
            head_dim=config["head_dim"],
            context_length=config["context_len"],
            dropout=config["dropout"],
            num_heads=config["num_heads"],
            qkv_bias=config["qkv_bias"],
        )
        self.dropout = nn.Dropout(config["dropout"])
        self.layer_norm2 = LayerNorm(emb_dim)
        self.ff = FeedForward(emb_dim, config["dropout"])

    def forward(self, input_tensor):
        """Return a tensor with the same shape as ``input_tensor``."""
        # Attention sub-layer with its residual connection.
        attended = self.attention_block(self.layer_norm1(input_tensor))
        after_attention = self.dropout(attended) + input_tensor
        # Feed-forward sub-layer with its residual connection.
        transformed = self.ff(self.layer_norm2(after_attention))
        return transformed + after_attention
        

In [126]:
# Hyperparameters shared by every model component in this notebook.
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_len": 256,    # Context length
    "head_dim": 64,        # Width of each attention head
    "emb_dim": 768,        # Embedding dimension (= head_dim * num_heads)
    "num_heads": 12,       # Number of attention heads
    "n_layers": 12,        # Number of transformer blocks
    "dropout": 0.1,        # Dropout rate
    "qkv_bias": False,     # Query-Key-Value bias
}

In [114]:
# Smoke test: a transformer block returns a tensor shaped like its input.
x = torch.rand((2, 4, 768))
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)
print("Input shape:", x.size())
print("Output shape:", output.size())

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [131]:
class GPTModel(nn.Module):
    """GPT-style model: token plus positional embeddings, a stack of
    transformer blocks, a final layer norm and a bias-free projection to
    vocabulary logits.

    Args:
        config: Mapping with the keys "vocab_size", "emb_dim", "context_len",
            "dropout" and "n_layers", plus the keys TransformerBlock reads.
    """

    def __init__(self, config):
        super().__init__()
        vocab_size, emb_dim = config["vocab_size"], config["emb_dim"]
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(config["context_len"], emb_dim)
        self.dropout = nn.Dropout(config["dropout"])
        self.trf_blocks = nn.Sequential(
            *(TransformerBlock(config) for _ in range(config["n_layers"]))
        )
        self.final_layer_norm = LayerNorm(emb_dim)
        self.output_layer = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, input_tensor):
        """Map token ids of shape [batch, seq_len] to logits of shape
        [batch, seq_len, vocab_size]."""
        _, num_tokens = input_tensor.shape
        # Position indices are created on the same device as the token ids.
        positions = torch.arange(num_tokens, device=input_tensor.device)
        hidden = self.tok_emb(input_tensor) + self.pos_emb(positions)
        hidden = self.dropout(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_layer_norm(hidden)
        return self.output_layer(hidden)

In [132]:
import tiktoken

# Encode two prompts with the "gpt2" encoding and stack them into one batch.
# torch.stack needs both prompts to encode to the same number of tokens.
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [133]:
# Run the untrained model on the token batch; the output holds one vector of
# vocabulary logits per input token.
model = GPTModel(GPT_CONFIG_124M)
out = model(input_tensor=batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.size())
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 8.6199e-03,  6.3280e-01,  2.1399e-01,  ...,  5.7252e-01,
          -1.6893e-01, -5.4216e-01],
         [ 1.1306e+00, -7.9799e-01, -1.1022e-01,  ...,  5.9146e-01,
           1.9603e-01, -2.3746e-01],
         [ 2.0163e-02,  3.6033e-02, -4.0366e-01,  ..., -1.1639e-03,
           3.4936e-01,  4.5264e-01],
         [ 4.1024e-01, -2.6074e-01,  3.2827e-01,  ...,  1.2635e+00,
           4.6138e-02,  9.8668e-01]],

        [[ 5.8559e-01,  3.7186e-01, -1.2099e-01,  ...,  4.6907e-01,
          -1.3180e+00, -4.7436e-01],
         [ 6.6893e-01, -6.7278e-01, -3.6985e-01,  ..., -1.6321e-01,
          -4.4507e-01, -2.2840e-01],
         [ 1.9243e-01,  1.0521e+00, -1.3014e-01,  ..., -2.0987e-01,
          -7.8911e-02,  1.2126e-01],
         [ 4.4798e-01,  4.1775e-01,  4.9786e-01,  ...,  7.0224e-01,
           5.1943e-01,  1.4487e-01]]], grad_fn=<UnsafeViewBackward0>)


In [134]:
# Total number of scalars across all of the model's parameter tensors.
total_params = sum(
    tensor.numel() for tensor in model.parameters()
)
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 162,419,712
