#### Reading raw text

In [1]:
# Load the complete text file into one string; later cells tokenize it.
with open("the-verdict.txt", mode="r", encoding="utf-8") as file:
    raw_text = file.read()

#### Creating vocab

In [2]:
import re
# Split on punctuation, double dashes and whitespace. The capturing group makes
# re.split keep the delimiters, so punctuation marks become tokens of their own.
preprocessed = re.split(r'([,.:?_!"()\']|--|\s)', raw_text)

# Discard empty / whitespace-only pieces and trim what is left.
result = [piece.strip() for piece in preprocessed if piece.split()]

# Unique tokens in sorted order, followed by the two special tokens.
allwords = sorted(set(result))
allwords += ["<|unk|>", "<|endoftext|>"]

# Token string -> integer id, where the id is the position in `allwords`.
vocab = dict(zip(allwords, range(len(allwords))))

##### Implementation of simple tokenizer -- For understanding

In [3]:
class simpleTokenizer1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Args:
        vocab: Mapping from token string to integer id.

    Every token produced by ``encode`` must already be a key of ``vocab``;
    an unknown token raises ``KeyError``.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Reverse lookup used by decode().
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return their ids as a list."""
        token_ids = []
        for piece in re.split(r'([,.:?_!"()\']|--|\s)', text):
            # Skip pieces that are empty or consist only of whitespace.
            if not piece.split():
                continue
            token_ids.append(self.str_to_int[piece.strip()])
        return token_ids

    def decode(self, tokens):
        """Join the tokens for ``tokens`` with spaces, then drop the space before punctuation."""
        joined = " ".join(self.int_to_str[token_id] for token_id in tokens)
        return re.sub(r'\s([,.:?_!"()\']|--|\s)', r'\1', joined)

In [4]:
# Round-trip a short phrase through the basic tokenizer: text -> ids -> text.
tokenizer = simpleTokenizer1(vocab)
text = "thought Jack Gisburn"

encoded = tokenizer.encode(text)
print(encoded)

decode = tokenizer.decode(encoded)
print(decode)

[1014, 58, 39]
thought Jack Gisburn


In [5]:
class simpleTokenizer2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: Mapping from token string to integer id. It must contain the
            ``"<|unk|>"`` key for unknown tokens to be encodable.

    The split pattern used by ``encode`` is the same one used by ``decode``
    (and by the vocabulary-building cell): it includes ``:`` so that text such
    as ``"word:"`` is split into ``"word"`` and ``":"`` instead of collapsing
    into a single unknown token.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Reverse lookup (id -> token) used by decode().
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Tokens that are not in the vocabulary are replaced by ``"<|unk|>"``.

        Raises:
            KeyError: If an unknown token is met and ``"<|unk|>"`` is not in
                the vocabulary.
        """
        pieces = re.split(r'([,.:?_!"()\']|--|\s)', text)
        # Drop empty / whitespace-only pieces produced by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        known = [tok if tok in self.str_to_int else "<|unk|>" for tok in tokens]
        return [self.str_to_int[tok] for tok in known]

    def decode(self, tokens):
        """Join the tokens for ``tokens`` with spaces, then drop the space before punctuation."""
        joined = " ".join(self.int_to_str[token_id] for token_id in tokens)
        return re.sub(r'\s([,.:?_!"()\']|--|\s)', r'\1', joined)
        

In [6]:
# Encode a sentence and decode it again with the <|unk|>-aware tokenizer.
tokenizer2 = simpleTokenizer2(vocab)
text = "Hello my name is Anant"

encoded = tokenizer2.encode(text)
print(encoded)

decode = tokenizer2.decode(encoded)
print(decode)

[1143, 705, 1143, 590, 1143]
<|unk|> my <|unk|> is <|unk|>


### Byte-Pair Encoding: a subword tokenization technique used in GPT

In [7]:
# Read the text file again so this section does not rely on the earlier cell.
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

In [8]:
import tiktoken
# Load tiktoken's "gpt2" encoding. This rebinds the module-level name `tokenizer`.
tokenizer = tiktoken.get_encoding("gpt2")

### Implementing the Dataset and DataLoader

In [9]:
from torch.utils.data import Dataset, DataLoader
import torch

In [10]:
class GPTDataset_V1(Dataset):
    """Sliding-window dataset of (input, target) token-id chunks.

    The text is encoded once; a window of ``maximum_length`` tokens is then
    moved over the ids in steps of ``stride``. For each window the target is
    the same window shifted one token to the right (next-token prediction).

    Args:
        text: Raw text to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids.
        maximum_length: Number of tokens in every input/target chunk.
        stride: Step between the starts of consecutive windows.

    Raises:
        ValueError: If ``maximum_length`` or ``stride`` is not positive.

    If the text has ``maximum_length`` tokens or fewer, the dataset is empty.
    """

    def __init__(self, text, tokenizer, maximum_length, stride):
        if maximum_length <= 0:
            raise ValueError("maximum_length must be a positive integer")
        if stride <= 0:
            raise ValueError("stride must be a positive integer")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window still fits.
        for start in range(0, len(token_ids) - maximum_length, stride):
            stop = start + maximum_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]
            
            

In [11]:
def Create_Dataloader_V1(text, batch_size=4, maximum_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window token chunks of ``text``.

    The text is encoded with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDataset_V1``.

    Args:
        text: Raw text to encode.
        batch_size: Forwarded to ``DataLoader``.
        maximum_length: Window length passed to ``GPTDataset_V1``.
        stride: Window step passed to ``GPTDataset_V1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    windows = GPTDataset_V1(text, tiktoken.get_encoding("gpt2"), maximum_length, stride)
    return DataLoader(windows, **loader_options)

In [12]:
# Number of rows in the embedding table and width of each embedding vector.
vocab_size, output_dim = 50257, 256

# Lookup table with one `output_dim`-wide vector per token id.
embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)

In [13]:
max_length = 4

# stride == window length, so consecutive windows do not overlap;
# shuffle=False keeps the batches in text order.
dataLoader = Create_Dataloader_V1(
    raw_text,
    batch_size=8,
    maximum_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Pull only the first batch for inspection.
data_iter = iter(dataLoader)
inputs, targets = next(data_iter)

In [14]:
# Display the token ids of the first input batch.
print("Token Ids: \n", inputs)

Token Ids: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])


In [15]:
# Display the shape of the first input batch.
print("\nInputs shape\n", inputs.shape)


Inputs shape
 torch.Size([8, 4])


## Self-Attention

In [16]:
class Self_Attention_V1(torch.nn.Module):
    """Single-head scaled dot-product self-attention with raw weight matrices.

    Computes ``softmax(Q @ K^T / sqrt(d_out)) @ V`` where Q, K and V are
    obtained by multiplying the input with explicit ``nn.Parameter`` matrices.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the query/key/value projections (and of the output).
        qkv_biases: If True, a learnable bias vector (initialised to zero) is
            added to each of the query, key and value projections.
    """

    def __init__(self, d_in, d_out, qkv_biases=False):
        super().__init__()
        self.w_query = torch.nn.Parameter(torch.rand(d_in, d_out))
        self.w_key = torch.nn.Parameter(torch.rand(d_in, d_out))
        self.w_value = torch.nn.Parameter(torch.rand(d_in, d_out))

        # Registering None keeps the attribute present while exposing no
        # parameter when biases are disabled.
        for bias_name in ("b_query", "b_key", "b_value"):
            bias = torch.nn.Parameter(torch.zeros(d_out)) if qkv_biases else None
            self.register_parameter(bias_name, bias)

    def forward(self, x):
        """Return the context vectors for ``x``.

        Args:
            x: Tensor whose last two dimensions are ``(num_tokens, d_in)``.

        Returns:
            Tensor with the same leading dimensions and last dimension ``d_out``.
        """
        queries = x @ self.w_query
        keys = x @ self.w_key
        values = x @ self.w_value
        if self.b_query is not None:
            queries = queries + self.b_query
            keys = keys + self.b_key
            values = values + self.b_value

        # attention = softmax(Q @ K^T / sqrt(d_out)) @ V
        # transpose(-2, -1) also works for batched (3-D) input, unlike `.T`.
        attention_scores = queries @ keys.transpose(-2, -1)
        attention_weight = torch.softmax(attention_scores / keys.shape[-1] ** 0.5, dim=-1)
        return attention_weight @ values
            

In [17]:
from torch import nn

In [18]:
class Self_Attention_V2(nn.Module):
    """Single-head scaled dot-product self-attention built on ``nn.Linear``.

    Computes ``softmax(Q @ K^T / sqrt(d_out)) @ V`` where Q, K and V come from
    three linear projections of the input.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the query/key/value projections (and of the output).
        qkv_biases: Whether the three linear layers use a bias term.
    """

    def __init__(self, d_in, d_out, qkv_biases=False):
        super().__init__()
        self.w_query = nn.Linear(d_in, d_out, bias=qkv_biases)
        self.w_key = nn.Linear(d_in, d_out, bias=qkv_biases)
        self.w_values = nn.Linear(d_in, d_out, bias=qkv_biases)

    def forward(self, x):
        """Return the context vectors for ``x``.

        Args:
            x: Tensor whose last two dimensions are ``(num_tokens, d_in)``.

        Returns:
            Tensor with the same leading dimensions and last dimension ``d_out``.
        """
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_values(x)

        # attention = softmax(Q @ K^T / sqrt(d_out)) @ V
        # transpose(-2, -1) also works for batched (3-D) input, unlike `.T`.
        attention_scores = queries @ keys.transpose(-2, -1)
        attention_weight = torch.softmax(attention_scores / keys.shape[-1] ** 0.5, dim=-1)
        return attention_weight @ values
            

### Causal attention

In [19]:
class Causal_Attention(nn.Module):
    """Single-head causal (masked) self-attention with dropout.

    Each position attends only to itself and to earlier positions: scores for
    later positions are set to ``-inf`` before the softmax.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the query/key/value projections (and of the output).
        context_length: Largest sequence length supported; fixes the size of
            the stored upper-triangular mask.
        dropout: Dropout probability applied to the attention weights.
        qkv_biases: Whether the three linear layers use a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, qkv_biases=False):
        super().__init__()
        self.d_out = d_out
        self.w_query = nn.Linear(d_in, d_out, bias=qkv_biases)
        self.w_key = nn.Linear(d_in, d_out, bias=qkv_biases)
        self.w_values = nn.Linear(d_in, d_out, bias=qkv_biases)
        # Ones strictly above the diagonal mark the "future" positions.
        # A buffer follows the module across devices but is not trained.
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the context vectors for a batch of sequences.

        Args:
            x: Tensor of shape ``(batch_size, num_tokens, d_in)`` with
                ``num_tokens <= context_length``.

        Returns:
            Tensor of shape ``(batch_size, num_tokens, d_out)``.
        """
        batch_size, num_tokens, d_in = x.shape
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_values(x)

        # attention = softmax(Q @ K^T / sqrt(d_out)) @ V
        attention_scores = queries @ keys.transpose(1, 2)
        # Crop the mask to the actual sequence length; -inf becomes weight 0.
        attention_scores.masked_fill_(
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf
        )
        attention_weight = torch.softmax(attention_scores / keys.shape[-1] ** 0.5, dim=-1)
        attention_weight = self.dropout(attention_weight)
        return attention_weight @ values
            

## Multi-head attention wrapper

In [20]:
class MultiheadAttentionWrapper(nn.Module):
    """Multi-head attention built by running independent causal heads.

    ``num_heads`` separate ``Causal_Attention`` modules are applied to the same
    input and their outputs are concatenated along the last dimension, so the
    result has ``num_heads * d_out`` features.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Output size of each individual head.
        context_length: Passed to every ``Causal_Attention`` head.
        num_heads: Number of heads to create.
        dropout: Passed to every ``Causal_Attention`` head.
        qkv_biases: Passed to every ``Causal_Attention`` head.
    """

    def __init__(self, d_in, d_out, context_length, num_heads, dropout, qkv_biases=False):
        # Required so that the ModuleList is registered and the wrapper is callable.
        super().__init__()
        self.heads = nn.ModuleList(
            [Causal_Attention(d_in, d_out, context_length, dropout, qkv_biases) for _ in range(num_heads)]
        )

    def forward(self, x):
        """Apply every head to ``x`` and concatenate the results on the last dim."""
        return torch.cat([head(x) for head in self.heads], dim=-1)
    

### Normalization layer

In [21]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    The input is normalized to zero mean and unit (biased) variance along the
    last dimension, then multiplied by ``scale`` and offset by ``shift``.
    ``scale`` starts at ones and ``shift`` at zeros, so a freshly created layer
    returns the plain normalized input.

    Args:
        embedding_dim: Size of the last dimension of the input.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(embedding_dim))
        self.shift = nn.Parameter(torch.zeros(embedding_dim))
        # Small constant that keeps the division stable when the variance is ~0.
        self.eps = 1e-5

    def forward(self, x):
        """Return ``x`` normalized over its last dimension, then scaled and shifted."""
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return norm_x * self.scale + self.shift
        

## Multihead Attention with weight splits

In [None]:
class MultiheadAttention(nn.Module):
    """Causal multi-head self-attention using a single set of split projections.

    One query, key and value projection of width ``d_out`` is computed and then
    viewed as ``num_heads`` heads of width ``d_out // num_heads``. The per-head
    context vectors are concatenated and passed through an output projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size across all heads; must be divisible by
            ``num_heads``.
        context_length: Largest sequence length supported; fixes the size of
            the stored upper-triangular mask.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value linear layers use a bias term.

    Raises:
        AssertionError: If ``d_out`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        # The projection is applied to the concatenated heads, whose width is
        # d_out (not d_in).
        self.out_proj = nn.Linear(d_out, d_out)
        # Must be a module: forward() calls it on the attention weights.
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
        )

    def forward(self, x):
        """Return the context vectors for a batch of sequences.

        Args:
            x: Tensor of shape ``(b, num_tokens, d_in)`` with
                ``num_tokens <= context_length``.

        Returns:
            Tensor of shape ``(b, num_tokens, d_out)``.
        """
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Implicitly split the projections by adding a `num_heads` dimension:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)

        # (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        # transpose() returns a new tensor, so the result must be re-assigned.
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Per-head scores: (b, num_heads, num_tokens, num_tokens)
        attention_scores = queries @ keys.transpose(2, 3)

        # Crop the mask to the actual sequence length; it broadcasts over
        # the batch and head dimensions.
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attention_scores.masked_fill_(mask_bool, -torch.inf)

        attention_weights = torch.softmax(attention_scores / keys.shape[-1] ** 0.5, dim=-1)
        attention_weights = self.dropout(attention_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vec = (attention_weights @ values).transpose(1, 2)

        # Merge the heads back together: (b, num_tokens, d_out)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(context_vec)
        