In [1]:
# Hyperparameters consumed by the model classes defined in this notebook.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # rows of the token-embedding table / width of the output logits
    "context_length": 1024,  # rows of the positional-embedding table
    "emb_dim": 768,          # embedding width used throughout the model
    "n_heads": 12,           # forwarded to the attention module as ``num_heads``
    "n_layers": 12,          # number of stacked transformer blocks
    "drop_rate": 0.1,        # probability handed to every ``nn.Dropout``
    "qkv_bias": False,       # forwarded to the attention module as ``qkv_bias``
}

## Dummy GPT Model

In [2]:
import torch
import torch.nn as nn

class DummyGPTModel(nn.Module):
    """Skeleton of the GPT data flow with no-op transformer blocks and layer norm.

    The token/positional embeddings, the embedding dropout and the output
    projection are real modules. ``DummyTransformerBlock`` and
    ``DummyLayerNorm`` return their input unchanged, so this class only
    shows how tensors move through the model.

    Args:
        cfg: Mapping providing ``vocab_size``, ``context_length``,
            ``emb_dim``, ``drop_rate`` and ``n_layers``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        # Placeholder for the transformer blocks.
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )

        # Placeholder for the final layer norm.
        self.final_norm = DummyLayerNorm(cfg['emb_dim'])

        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)

    def forward(self, in_idx):
        """Turn token ids into logits.

        Args:
            in_idx: 2-D tensor of token ids, ``(batch_size, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, vocab_size)``.
        """
        _, seq_len = in_idx.shape

        tok_embeds = self.tok_emb(in_idx)
        # Create the position ids on the same device as the input so the
        # lookup also works when the model and input are not on the CPU.
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))

        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)

        x = self.trf_blocks(x)
        x = self.final_norm(x)

        logits = self.out_head(x)
        return logits
    



class DummyTransformerBlock(nn.Module):
    """Stand-in for a transformer block that passes its input straight through."""

    def __init__(self, cfg):
        # ``cfg`` is accepted so the constructor call matches a real block,
        # but nothing is read from it.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x
    

class DummyLayerNorm(nn.Module):
    """Stand-in for layer normalization that passes its input straight through."""

    def __init__(self, normalized_shape, eps = 1e-5):
        # Both arguments are accepted for signature compatibility only;
        # neither is stored or used.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x



## Layer Norm

Layer normalization rescales the activations so that they have mean 0 and variance 1 across the last dimension (`dim = -1`), which is the embedding dimension in our case.

In [4]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs; also the length
            of the ``scale`` and ``shift`` parameter vectors.
        eps: Small constant added to the variance before the square root so
            that a zero variance does not cause a division by zero.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialised to 1 and 0 so the layer starts out as a pure normalization.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along ``dim=-1`` and apply the learnable scale and shift.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # Biased estimator (divides by N rather than N - 1).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        # Apply the trainable parameters; without this they would be dead
        # weights that never influence the output or receive a gradient.
        return self.scale * norm_x + self.shift


### Scale and shift

- Note that in addition to performing the normalization by subtracting the mean and dividing by the square root of the variance (i.e., the standard deviation), we added two trainable parameters, a scale and a shift parameter
- The initial scale (multiplying by 1) and shift (adding 0) values don't have any effect; however, scale and shift are trainable parameters that the LLM automatically adjusts during training if it is determined that doing so would improve the model's performance on its training task
- This allows the model to learn appropriate scaling and shifting that best suit the data it is processing
- Note that we also add a small value (eps) to the variance before computing its square root; this avoids division-by-zero errors if the variance is 0

## GELU

In [5]:
class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation formula."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`` element-wise."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * cubic_term)
        return 0.5 * x * gate

## FeedForward Block

In [None]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to four times ``emb_dim``, apply GELU, project back.

    Args:
        cfg: Mapping providing ``emb_dim``.
    """

    def __init__(self,cfg):
        super().__init__()

        width = cfg['emb_dim']
        hidden_width = 4 * width

        # Built in the same order as they are applied: expand -> GELU -> project.
        expand = nn.Linear(width, hidden_width)
        activation = GELU()
        project = nn.Linear(hidden_width, width)

        self.layers = nn.Sequential(expand, activation, project)

    def forward(self,x):
        """Run ``x`` through the sequential stack and return the result."""
        out = self.layers(x)
        return out

## Transformer block

In [7]:
class TransfomerBlock(nn.Module):
    """Transformer block: attention and feed-forward sub-layers with shortcuts.

    Each sub-layer is preceded by its own ``LayerNorm``, followed by dropout,
    and then added back onto its input (residual/shortcut connection).

    The class name is misspelled ("Transfomer") but is kept as-is because
    ``GPT`` instantiates it under this name.

    Args:
        cfg: Mapping providing ``emb_dim``, ``context_length``, ``n_heads``,
            ``drop_rate`` and ``qkv_bias``.

    Note:
        ``MultiHeadAttention`` is not defined in the visible part of this
        notebook; it must already be in scope when the class is instantiated.
    """

    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in = cfg['emb_dim'],
            d_out = cfg['emb_dim'],
            context_length = cfg['context_length'],
            num_heads = cfg['n_heads'],
            dropout = cfg['drop_rate'],
            qkv_bias = cfg['qkv_bias']
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg['emb_dim'])
        self.norm2 = LayerNorm(cfg['emb_dim'])
        self.drop_shortcut = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        """Apply both sub-layers to ``x`` and return a tensor of the same shape."""
        # Attention sub-layer with shortcut.
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        # nn.Dropout is not in-place: its return value has to be kept,
        # otherwise no dropout is applied on this branch.
        x = self.drop_shortcut(x)
        x = x + shortcut

        # Feed-forward sub-layer with shortcut.
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x

In [8]:
class GPT(nn.Module):
    """GPT model: embeddings, a stack of transformer blocks, final norm and output head.

    Args:
        cfg: Mapping providing ``vocab_size``, ``context_length``,
            ``emb_dim``, ``drop_rate`` and ``n_layers`` (plus whatever
            ``TransfomerBlock`` reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg['vocab_size']
        emb_dim = cfg['emb_dim']

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg['context_length'], emb_dim)
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        # One independent block per configured layer, applied in sequence.
        blocks = []
        for _ in range(cfg['n_layers']):
            blocks.append(TransfomerBlock(cfg))
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias = False)

    def forward(self, in_idx):
        """Map token ids ``(batch_size, seq_len)`` to logits ``(batch_size, seq_len, vocab_size)``."""
        _, seq_len = in_idx.shape

        # Position ids are created on the same device as the token ids.
        positions = torch.arange(seq_len, device = in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)

        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [9]:
# NOTE(review): this import rebinds ``GPT`` to the implementation in the local
# ``gpt`` module, so the ``GPT`` class defined in the cell above is no longer
# the one being instantiated -- confirm that the two are meant to be identical.
from gpt import GPT

# NOTE(review): no RNG seed is set before the weights are created, so the
# token ids generated further down will presumably differ between runs;
# consider calling ``torch.manual_seed(...)`` first.
model = GPT(GPT_CONFIG_124M)

In [11]:
# Sum the element counts of every parameter tensor registered on the model
# and print the total with thousands separators.
total_paramters = sum(p.numel() for p in model.parameters())
print(f"{total_paramters:,}")

163,009,536


In [12]:
# Shape of the token-embedding weight matrix.
model.tok_emb.weight.shape

torch.Size([50257, 768])

In [13]:
# Shape of the output head's weight matrix, for comparison with the
# token-embedding shape.
model.out_head.weight.shape

torch.Size([50257, 768])

In [14]:
# Parameter count with the output head's parameters left out.
# NOTE(review): presumably this mirrors a setup in which the output layer
# reuses the token-embedding matrix (the two have the same shape) -- confirm.
total_paramters_gpt2 = total_paramters - sum(p.numel() for p in model.out_head.parameters())
print(f"{total_paramters_gpt2:,}")

124,412,160


In [15]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily append ``max_new_tokens`` token ids to ``idx``.

    Args:
        model: Callable that maps a 2-D tensor of token ids to a 3-D tensor
            of logits whose second axis is the sequence position.
        idx: 2-D tensor of token ids to continue.
        max_new_tokens: Number of ids to append to every row.
        context_size: Maximum number of trailing ids passed to ``model``
            in each step.

    Returns:
        ``idx`` with ``max_new_tokens`` additional columns.
    """
    tokens = idx
    for _step in range(max_new_tokens):
        # Feed only the most recent ``context_size`` ids to the model.
        window = tokens[:, -context_size:]

        # No gradients are needed for generation.
        with torch.no_grad():
            all_logits = model(window)

        # Only the prediction at the last position decides the next id.
        last_probs = torch.softmax(all_logits[:, -1, :], dim=-1)
        next_token = last_probs.argmax(dim=-1, keepdim=True)

        tokens = torch.cat((tokens, next_token), dim=1)

    return tokens

In [18]:
import tiktoken

# Load the encoding that tiktoken registers under the name 'gpt2'.
tokenizer = tiktoken.get_encoding('gpt2')

In [19]:
# Turn the prompt text into a list of token ids.
encoded = tokenizer.encode("Hello, I am")
print(encoded)

[15496, 11, 314, 716]


In [21]:
# Wrap the ids in a tensor and add a leading dimension of size 1;
# generate_text_simple slices its input as idx[:, -context_size:], so it
# needs a 2-D tensor.
encode_tensor = torch.tensor(encoded).unsqueeze(0)
encode_tensor.shape

torch.Size([1, 4])

In [22]:
# Switch to evaluation mode before generating.
model.eval()

# Extend the encoded prompt by six greedily chosen token ids.
out = generate_text_simple(
    model=model,
    idx=encode_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M['context_length'],
)

print(out)

tensor([[15496,    11,   314,   716, 15017, 47745, 50100, 24652, 23567, 23607]])


In [23]:
# The 4 prompt ids plus the 6 generated ids give 10 columns.
out.shape

torch.Size([1, 10])

In [24]:
# Drop the leading dimension, convert the ids to a plain list and decode
# them back into text.
# NOTE(review): the continuation is presumably gibberish because no trained
# weights are loaded into the model in this notebook -- confirm.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I amSanmeet Ratt 153iggins loops
