#### Reading raw text

In [18]:
# Load the whole corpus into memory as one string; later cells tokenize it.
with open("the-verdict.txt", mode="r", encoding="utf-8") as file:
    raw_text = file.read()

#### Creating vocab

In [19]:
import re
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split keep each delimiter as a piece of its own.
preprocessed = re.split(r'([,.:?_!"()\']|--|\s)', raw_text)

# Drop empty and whitespace-only pieces.
result = [piece.strip() for piece in preprocessed if piece.strip()]

# Sorted unique tokens, with the two special tokens appended at the end.
allwords = sorted(set(result))
allwords += ["<|unk|>", "<|endoftext|>"]

# Token string -> integer id, following the order of `allwords`.
vocab = dict(zip(allwords, range(len(allwords))))

##### Simple tokenizer implementation (for understanding)

In [20]:
class simpleTokenizer1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Every token produced while encoding must already be a key of the
    vocabulary; an unknown token raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Keep the token->id mapping and build the inverse id->token mapping."""
        self.str_to_int = vocab
        self.int_to_str = {}
        for token, token_id in vocab.items():
            self.int_to_str[token_id] = token

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids."""
        pieces = re.split(r'([,.:?_!"()\']|--|\s)', text)
        token_ids = []
        for piece in pieces:
            # Skip empty / whitespace-only pieces produced by the split.
            if piece.split():
                token_ids.append(self.str_to_int[piece.strip()])
        return token_ids

    def decode(self, tokens):
        """Map ids back to tokens, join them with spaces, then remove the
        space that precedes punctuation, ``--`` or another whitespace."""
        words = (self.int_to_str[token_id] for token_id in tokens)
        joined = " ".join(words)
        return re.sub(r'\s([,.:?_!"()\']|--|\s)', r'\1', joined)

In [21]:
# Round-trip a short phrase through the word-level tokenizer.
tokenizer = simpleTokenizer1(vocab)
text = "thought Jack Gisburn"

encoded = tokenizer.encode(text)
print(encoded)

decode = tokenizer.decode(encoded)
print(decode)

[1014, 58, 39]
thought Jack Gisburn


In [22]:
class simpleTokenizer2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The vocabulary must contain the key ``"<|unk|>"`` for unknown tokens to
    be encodable; otherwise encoding an unknown token raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Keep the token->id mapping and build the inverse id->token mapping."""
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        The split pattern is the same one used to build the vocabulary and
        the one ``decode`` uses, so ``:`` is separated from the word it
        follows instead of being glued to it and turned into ``<|unk|>``.
        """
        pieces = re.split(r'([,.:?_!"()\']|--|\s)', text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [
            self.str_to_int[token if token in self.str_to_int else "<|unk|>"]
            for token in tokens
        ]

    def decode(self, tokens):
        """Map ids back to tokens, join them with spaces, then remove the
        space that precedes punctuation, ``--`` or another whitespace."""
        text = " ".join(self.int_to_str[token_id] for token_id in tokens)
        return re.sub(r'\s([,.:?_!"()\']|--|\s)', r'\1', text)
        

In [23]:
# Words missing from the vocabulary come back as the <|unk|> token.
tokenizer2 = simpleTokenizer2(vocab)
text = "Hello my name is Anant"

encoded = tokenizer2.encode(text)
print(encoded)

decode = tokenizer2.decode(encoded)
print(decode)

[1143, 705, 1143, 590, 1143]
<|unk|> my <|unk|> is <|unk|>


### Byte-Pair Encoding (BPE): the subword tokenization technique used by GPT

In [24]:
# Re-read the corpus so this section can run without the word-level cells.
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

In [25]:
import tiktoken
# tiktoken's "gpt2" encoding. NOTE: this rebinds `tokenizer`, which an earlier
# cell bound to a simpleTokenizer1 instance.
tokenizer= tiktoken.get_encoding("gpt2")

### Implementing dataset, Dataloader

In [26]:
from torch.utils.data import Dataset, DataLoader
import torch

In [27]:
class GPTDataset_V1(Dataset):
    """Map-style dataset of sliding-window (input, target) token-id pairs.

    The text is tokenized once. Each sample is a window of
    ``maximum_length`` token ids, and its target is the same window shifted
    one position to the right (next-token prediction).

    Args:
        text: Raw text to tokenize.
        tokenizer: Object with ``encode(text, allowed_special=...)`` that
            returns a list of integer token ids.
        maximum_length: Number of tokens in every input/target window.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, text, tokenizer, maximum_length, stride):
        super().__init__()

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window is always full.
        for start in range(0, len(token_ids) - maximum_length, stride):
            stop = start + maximum_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Number of windows extracted from the text."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]
            
            

In [28]:
def Create_Dataloader_V1(text, batch_size=4, maximum_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window samples over ``text``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDataset_V1`` using ``maximum_length`` and ``stride``. The remaining
    arguments are forwarded unchanged to ``DataLoader``.
    """
    dataset = GPTDataset_V1(
        text,
        tiktoken.get_encoding("gpt2"),
        maximum_length,
        stride,
    )
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(dataset, **loader_options)

In [29]:
# Embedding table: one row of `output_dim` values per token id.
vocab_size = 50257
output_dim = 256

embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

In [30]:
# stride == window length, so consecutive windows do not overlap.
max_length = 4
dataLoader = Create_Dataloader_V1(
    raw_text,
    batch_size=8,
    maximum_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Pull only the first batch for inspection.
data_iter = iter(dataLoader)
inputs, targets = next(data_iter)

In [31]:
# Show the token ids of the first input batch.
print("Token Ids: \n", inputs)

Token Ids: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])


In [32]:
# Show the shape of the first input batch.
print("\nInputs shape\n", inputs.shape)


Inputs shape
 torch.Size([8, 4])


## Self-Attention

In [33]:
class Self_Attention_V1(torch.nn.Module):
    """Single-head scaled dot-product self-attention with raw weight matrices.

    Computes ``softmax(Q @ K^T / sqrt(d_out)) @ V`` where ``Q``, ``K`` and
    ``V`` are projections of the input through trainable ``(d_in, d_out)``
    matrices.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output.
        qkv_biases: When true, a trainable bias vector (initialised to zero)
            is added to each of the three projections.
    """

    def __init__(self, d_in, d_out, qkv_biases=False):
        super().__init__()
        self.w_query = torch.nn.Parameter(torch.rand(d_in, d_out))
        self.w_key = torch.nn.Parameter(torch.rand(d_in, d_out))
        self.w_value = torch.nn.Parameter(torch.rand(d_in, d_out))

        # Register the biases explicitly so the attributes exist (as None)
        # even when biases are disabled.
        for bias_name in ("b_query", "b_key", "b_value"):
            bias = torch.nn.Parameter(torch.zeros(d_out)) if qkv_biases else None
            self.register_parameter(bias_name, bias)

    @staticmethod
    def _project(x, weight, bias):
        """Return ``x @ weight``, plus ``bias`` when one is present."""
        projected = x @ weight
        return projected if bias is None else projected + bias

    def forward(self, x):
        """Return context vectors for ``x`` of shape ``(..., num_tokens, d_in)``."""
        queries = self._project(x, self.w_query, self.b_query)
        keys = self._project(x, self.w_key, self.b_key)
        values = self._project(x, self.w_value, self.b_value)

        # attention = softmax(queries @ keys^T / sqrt(d_out)) @ values
        # transpose(-2, -1) instead of .T so batched inputs work as well.
        attention_scores = queries @ keys.transpose(-2, -1)
        attention_weight = torch.softmax(
            attention_scores / keys.shape[-1] ** 0.5, dim=-1
        )
        context_vec = attention_weight @ values

        return context_vec
            

In [34]:
from torch import nn

In [35]:
class Self_Attention_V2(nn.Module):
    """Single-head scaled dot-product self-attention built from ``nn.Linear``.

    Computes ``softmax(Q @ K^T / sqrt(d_out)) @ V`` where ``Q``, ``K`` and
    ``V`` are linear projections of the input.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output.
        qkv_biases: Whether the three projection layers have a bias term.
    """

    def __init__(self, d_in, d_out, qkv_biases=False):
        super().__init__()
        self.w_query = nn.Linear(d_in, d_out, bias=qkv_biases)
        self.w_key = nn.Linear(d_in, d_out, bias=qkv_biases)
        self.w_values = nn.Linear(d_in, d_out, bias=qkv_biases)

    def forward(self, x):
        """Return context vectors for ``x`` of shape ``(..., num_tokens, d_in)``."""
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_values(x)

        # attention = softmax(queries @ keys^T / sqrt(d_out)) @ values
        # transpose(-2, -1) instead of .T so batched inputs work as well.
        attention_scores = queries @ keys.transpose(-2, -1)
        attention_weight = torch.softmax(
            attention_scores / keys.shape[-1] ** 0.5, dim=-1
        )
        context_vec = attention_weight @ values

        return context_vec
            

### Causal attention

In [36]:
class Causal_Attention(nn.Module):
    """Single-head causal self-attention with dropout on the attention weights.

    Each position attends only to itself and to earlier positions: scores
    for later positions are set to ``-inf`` before the softmax.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output.
        context_length: Largest sequence length the causal mask supports.
        dropout: Dropout probability applied to the attention weights.
        qkv_biases: Whether the three projection layers have a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, qkv_biases=False):
        super().__init__()
        self.d_out = d_out
        self.w_query = nn.Linear(d_in, d_out, bias=qkv_biases)
        self.w_key = nn.Linear(d_in, d_out, bias=qkv_biases)
        self.w_values = nn.Linear(d_in, d_out, bias=qkv_biases)
        # Ones strictly above the diagonal mark the "future" positions.
        # Registered as a buffer so it moves with the module across devices.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return context vectors for ``x`` of shape ``(batch, num_tokens, d_in)``."""
        batch_size, num_tokens, d_in = x.shape
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_values(x)

        # attention = softmax(queries @ keys^T / sqrt(d_out)) @ values
        attention_scores = queries @ keys.transpose(1, 2)
        # Truncate the mask to the actual sequence length before applying it.
        attention_scores.masked_fill_(
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf
        )
        attention_weight = torch.softmax(
            attention_scores / keys.shape[-1] ** 0.5, dim=-1
        )
        attention_weight = self.dropout(attention_weight)
        context_vec = attention_weight @ values

        return context_vec
            

## Multi-head attention wrapper

In [37]:
class MultiheadAttentionWrapper(nn.Module):
    """Multi-head attention made of independent ``Causal_Attention`` heads.

    Each head produces ``d_out`` features; the head outputs are concatenated
    along the last dimension, giving ``num_heads * d_out`` features.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Output size of each individual head.
        context_length: Largest sequence length the causal masks support.
        num_heads: Number of heads to create.
        dropout: Dropout probability passed to every head.
        qkv_biases: Whether the heads' projection layers have a bias term.
    """

    def __init__(self, d_in, d_out, context_length, num_heads, dropout, qkv_biases=False):
        # Being an nn.Module makes the wrapper callable and registers the
        # heads' parameters with it.
        super().__init__()
        self.heads = nn.ModuleList(
            Causal_Attention(d_in, d_out, context_length, dropout, qkv_biases)
            for _ in range(num_heads)
        )

    def forward(self, x):
        """Run every head on ``x`` and concatenate the results on the last dim."""
        combined_context_vec = torch.cat([head(x) for head in self.heads], dim=-1)
        return combined_context_vec
    

### Normalization layer

In [38]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with trainable scale/shift.

    Note: a later cell in this notebook defines ``LayerNorm`` again; that
    later definition replaces this one once it is executed.

    Args:
        embedding_dim: Size of the last dimension of the input.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(embedding_dim))
        # Shift starts at zero so the layer initially only normalizes.
        self.shift = nn.Parameter(torch.zeros(embedding_dim))
        self.eps = 1e-5  # keeps the division stable when the variance is ~0

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return norm_x * self.scale + self.shift

## Multihead Attention with weight splits

In [80]:
class MultiheadAttention(nn.Module):
    """Causal multi-head self-attention with one shared projection per Q/K/V.

    The ``d_out``-wide projections are reshaped into ``num_heads`` heads of
    ``head_dim = d_out // num_heads`` features each, attention is computed
    per head, and the heads are merged and passed through ``out_proj``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size across all heads; must be divisible by
            ``num_heads``.
        context_length: Largest sequence length the causal mask supports.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the Q/K/V projection layers have a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        # Per-head width, so that the merged heads add up to d_out again.
        self.head_dim = d_out // num_heads

        # Layers are created in this fixed order (query, key, value, out_proj).
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # combines the head outputs
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions to hide.
        causal_mask = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", causal_mask)

    def forward(self, x):
        """Return context vectors of shape ``(b, num_tokens, d_out)``."""
        b, num_tokens, _ = x.shape
        per_head_shape = (b, num_tokens, self.num_heads, self.head_dim)

        def split_heads(projected):
            # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
            return projected.view(*per_head_shape).transpose(1, 2)

        keys = split_heads(self.W_key(x))
        queries = split_heads(self.W_query(x))
        values = split_heads(self.W_value(x))

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens)
        attn_scores = queries @ keys.transpose(2, 3)

        # Hide future positions, using the mask cut to the current length.
        attn_scores.masked_fill_(
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf
        )

        scale = keys.shape[-1] ** 0.5
        attn_weights = self.dropout(torch.softmax(attn_scores / scale, dim=-1))

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, d_out)
        merged = (
            (attn_weights @ values)
            .transpose(1, 2)
            .contiguous()
            .view(b, num_tokens, self.d_out)
        )
        return self.out_proj(merged)

### GPT-2 Configuration

In [71]:
# Model hyperparameters read by the model classes in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # Vocabulary size
    context_length=1024,  # Context length
    emb_dim=768,          # Embedding dimension
    n_heads=12,           # Number of attention heads
    n_layers=12,          # Number of layers
    drop_rate=0.1,        # Dropout rate
    qkv_bias=False,       # Query-Key-Value bias
)

### Layer-Norm, Feed Forward Neural Network, GELU Activation Function

In [82]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with trainable scale/shift.

    Args:
        emb_dim: Size of the last dimension of the input.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        centered = x - mu
        std = torch.sqrt(sigma_sq + self.eps)
        return self.scale * (centered / std) + self.shift
    
class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation:

    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))``
    """

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * cubic_term)
        return 0.5 * x * gate
            
class FeedForward(nn.Module):
    """Two-layer feed-forward network with a GELU in between.

    Expands from ``cfg["emb_dim"]`` to four times that width and projects
    back to ``cfg["emb_dim"]``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        self.layers = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            GELU(),
            nn.Linear(hidden_dim, emb_dim),
        )

    def forward(self, x):
        """Apply the expand -> GELU -> project stack to ``x``."""
        return self.layers(x)

### Transformer blocks

In [73]:
class TransformerBlocks(nn.Module):
    """One transformer block: attention and feed-forward sub-layers.

    Each sub-layer is preceded by its own LayerNorm, followed by dropout,
    and wrapped in a residual (shortcut) connection.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.attn = MultiheadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            dropout=cfg["drop_rate"],
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.ff = FeedForward(cfg)
        # The same dropout module is used after both sub-layers.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        # Attention sub-layer with residual connection.
        residual = x
        x = self.drop_shortcut(self.attn(self.norm1(x))) + residual

        # Feed-forward sub-layer with residual connection.
        residual = x
        x = self.drop_shortcut(self.ff(self.norm2(x))) + residual
        return x

### GPT Model

In [74]:
class GPTModel(nn.Module):
    """GPT-style model: embeddings, a stack of transformer blocks, and a head.

    Token and position embeddings are summed and passed through dropout,
    ``cfg["n_layers"]`` transformer blocks, and a final LayerNorm. A
    bias-free linear head then produces ``cfg["vocab_size"]`` logits per
    position.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]

        self.tok_emb = nn.Embedding(cfg["vocab_size"], emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlocks(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Return logits for the 2-D tensor of token ids ``in_idx``."""
        _, seq_length = in_idx.shape
        # Position ids are created on the same device as the input ids.
        positions = torch.arange(seq_length, device=in_idx.device)
        x = self.drop_emb(self.tok_emb(in_idx) + self.pos_emb(positions))
        x = self.final_norm(self.trf_blocks(x))
        return self.out_head(x)

In [75]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode both prompts and stack them into one tensor, one row per prompt.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(txt)) for txt in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [83]:
# Seed before building the model so the random weight initialisation is repeatable.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# NOTE(review): model.eval() is never called, so the nn.Dropout layers
# (drop_rate 0.1) are presumably active during this forward pass — confirm
# that is intended.
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.1381,  0.0077, -0.1963,  ..., -0.0222, -0.1060,  0.1717],
         [ 0.3865, -0.8408, -0.6564,  ..., -0.5163,  0.2369, -0.3357],
         [ 0.6989, -0.1829, -0.1631,  ...,  0.1472, -0.6504, -0.0056],
         [-0.4290,  0.1669, -0.1258,  ...,  1.1579,  0.5303, -0.5549]],

        [[ 0.1094, -0.2894, -0.1467,  ..., -0.0557,  0.2911, -0.2824],
         [ 0.0882, -0.3552, -0.3527,  ...,  1.2930,  0.0053,  0.1898],
         [ 0.6091,  0.4702, -0.4094,  ...,  0.7688,  0.3787, -0.1974],
         [-0.0612, -0.0737,  0.4751,  ...,  1.2463, -0.3834,  0.0609]]],
       grad_fn=<UnsafeViewBackward0>)
