In [3]:
import numpy as np
import pandas as pd
import torch

In [4]:
# Read the whole training corpus into memory as one string.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
corpus_path = "C://Users//adity//LLM//India, officially the Republic of I.txt"
with open(corpus_path, mode='r', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()

# Sanity check: total character count, then the first 100 characters.
print(len(raw_text), raw_text[:100], sep="\n")

55955
India, officially the Republic of India,[j][21] is a country in South Asia. It is the seventh-larges


In [25]:
from torch.utils.data import Dataset, DataLoader
import tiktoken

class GPTTokenizerDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer.encode``. Windows of
    ``max_length`` token ids are then taken every ``stride`` tokens; the
    target of each window is the same window shifted one token to the right.

    Args:
        txt: Text to encode.
        tokenizer: Object exposing ``encode(txt)`` that returns a sequence of ids.
        max_length: Number of token ids per input/target window.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        encoded = self.tokenizer.encode(txt)

        # Stop early enough that the shifted target window never runs past
        # the end of the encoded sequence.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True):
    """Build a DataLoader of sliding-window token batches from ``txt``.

    The text is tokenized with tiktoken's ``"gpt2"`` encoding and wrapped in a
    ``GPTTokenizerDataset``.

    Args:
        txt: Text to tokenize.
        batch_size: Number of windows per batch.
        max_length: Number of token ids per window.
        stride: Step between the start positions of consecutive windows.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` over the windowed dataset.
    """
    windowed_dataset = GPTTokenizerDataset(
        txt,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )
    return DataLoader(
        windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )

Coding up the attention model: one approach is to write a causal-attention class and instantiate it multiple times, once per head, to build the multi-head attention model.

For example, if we set the number of heads to 10, this is what happens:
--> We obtain a tensor with ten sets of context-vector matrices (one per head).
--> In each context-vector matrix, the rows are the context vectors for the individual tokens, and the columns correspond to the embedding dimension specified via d_out.
--> The heads are concatenated along the column dimension, so the final embedding dimension is num_heads x d_out (e.g. 10 x 10 = 100 if each head's d_out is 10).

IMPLEMENTING THE PARALLEL VERSION: ALL HEADS ARE COMPUTED TOGETHER INSIDE A SINGLE MODULE.

In [16]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with all heads computed in one pass.

    Queries, keys and values are each projected to ``d_out`` features and then
    split into ``num_heads`` heads of ``d_out // num_heads`` features.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size across all heads; must be divisible by ``num_heads``.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.num_heads = num_heads
        self.d_out = d_out
        self.head_dim = d_out // num_heads

        # Layers are created in the same order as before so that seeded
        # initialisation yields the same weights.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the future positions to hide.
        future_positions = torch.ones(context_length, context_length).triu(diagonal=1)
        self.register_buffer('mask', future_positions)

    def forward(self, x):
        """Return context vectors of shape ``(b, num_tokens, d_out)``.

        ``x`` is unpacked as ``(b, num_tokens, d_in)``.
        """
        b, num_tokens, _ = x.shape

        def split_heads(projected):
            # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
            return projected.view(
                b, num_tokens, self.num_heads, self.head_dim
            ).transpose(1, 2)

        k = split_heads(self.W_key(x))
        q = split_heads(self.W_query(x))
        v = split_heads(self.W_value(x))

        # Per-head dot products between every query and every key.
        scores = q @ k.transpose(2, 3)

        # Hide future tokens: masked scores become -inf, hence weight 0 after softmax.
        hide_future = self.mask.bool()[:num_tokens, :num_tokens]
        scores = scores.masked_fill(hide_future, -torch.inf)

        weights = torch.softmax(scores / k.shape[-1]**0.5, dim=-1)
        weights = self.dropout(weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, d_out)
        merged = (weights @ v).transpose(1, 2).contiguous()
        merged = merged.view(b, num_tokens, self.d_out)
        return self.out_proj(merged)

In [19]:
# Hyperparameters shared by the model components.
GPT_CONFIG = {
    # Must cover every id the tokenizer can emit. The "gpt2" tiktoken encoding
    # used by create_dataloader_v1 has 50257 tokens, so 50000 would make the
    # highest ids fall outside the embedding table.
    'vocab_size': 50257,
    'context_length': 1024,   # maximum number of positions (size of the positional embedding)
    'embedding_dim': 512,     # width of the token/positional embeddings
    'num_heads': 16,          # attention heads; embedding_dim must be divisible by this
    'n_layers': 12,           # number of transformer blocks
    'dropout': 0.1,           # dropout probability
    'qkv_bias': False,        # whether the query/key/value linear layers include a bias term
}

# The model classes look the embedding width and dropout rate up under the
# names "emb_dim" and "drop_rate"; expose both spellings so either works.
GPT_CONFIG['emb_dim'] = GPT_CONFIG['embedding_dim']
GPT_CONFIG['drop_rate'] = GPT_CONFIG['dropout']

Coding up the placeholder architecture. It acts like a mothership: a skeleton model into which the real components (transformer blocks, layer normalization) are plugged in later.

In [20]:
def _config_value(config, *keys):
    """Return the value stored under the first of ``keys`` present in ``config``.

    Raises:
        KeyError: If none of ``keys`` is present.
    """
    for key in keys:
        if key in config:
            return config[key]
    raise KeyError(f"config is missing all of the keys {keys!r}")


class GPT_Model(nn.Module):
    """Skeleton GPT model: embeddings -> blocks -> final norm -> vocabulary logits.

    The transformer blocks and the final normalization are the placeholder
    classes ``TransformerBlock`` and ``LayerNorm`` defined below, which pass
    their input through unchanged.

    Args:
        config: Mapping with ``vocab_size``, ``context_length``, ``n_layers``,
            the embedding width (``emb_dim`` or ``embedding_dim``) and the
            dropout rate (``drop_rate`` or ``dropout``).
    """

    def __init__(self, config):
        super().__init__()
        # Both key spellings are accepted because the notebook's config dict
        # and its model classes use different names for the same settings.
        emb_dim = _config_value(config, "emb_dim", "embedding_dim")
        drop_rate = _config_value(config, "drop_rate", "dropout")

        self.tok_emb = nn.Embedding(config["vocab_size"], emb_dim)
        self.pos_emb = nn.Embedding(config["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(drop_rate)
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config["n_layers"])]
        )
        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, config["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Map token ids ``(batch, seq_len)`` to logits ``(batch, seq_len, vocab_size)``."""
        _, seq_len = in_idx.shape
        token_embeddings = self.tok_emb(in_idx)
        # Positional embeddings are looked up by position 0..seq_len-1, not by
        # token id; the (seq_len, emb_dim) result broadcasts over the batch.
        positions = torch.arange(seq_len, device=in_idx.device)
        positional_embeddings = self.pos_emb(positions)
        x = token_embeddings + positional_embeddings
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


class TransformerBlock(nn.Module):
    """Placeholder transformer block: returns its input unchanged."""

    def __init__(self, config):
        super().__init__()

    def forward(self, x):
        return x


class LayerNorm(nn.Module):
    """Placeholder layer normalization: returns its input unchanged."""

    def __init__(self, normalised_shape, eps=1e-6):
        super().__init__()

    def forward(self, x):
        return x



In [26]:
class LayerNormalization(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the input.
        eps: Small constant added to the variance before the square root to
            avoid division by zero.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then apply scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # Layer norm uses the biased (population) variance. The default
        # Bessel-corrected estimator divides by n-1, which rescales the output
        # and yields NaN when the last dimension has a single element.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(variance + self.eps)
        return self.scale * norm_x + self.shift

We will use the Swish activation function, swish(x) = x * sigmoid(x).

In [28]:
class Swish(nn.Module):
    """Swish activation: multiplies the input element-wise by its sigmoid."""

    # No constructor is needed: the module has no parameters or state, so the
    # inherited nn.Module initialiser is sufficient.

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate

In [30]:
class FeedForward(nn.Module):
    """Feed-forward network: expand to 4x the embedding width, Swish, project back.

    Args:
        config: Mapping holding the embedding width under ``"emb_dim"`` or
            ``"embedding_dim"``.

    Raises:
        KeyError: If neither key is present in ``config``.
    """

    def __init__(self, config):
        super().__init__()
        # The notebook's GPT_CONFIG stores the width as "embedding_dim" while
        # this class originally read only "emb_dim"; accept either spelling.
        emb_dim = config["emb_dim"] if "emb_dim" in config else config["embedding_dim"]
        hidden_dim = 4 * emb_dim
        self.layers = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            Swish(),
            nn.Linear(hidden_dim, emb_dim),
        )

    def forward(self, x):
        """Apply the expand -> Swish -> project stack to ``x``."""
        return self.layers(x)

    #STOPPED AT PAGE NUMBER 129
