In [21]:
import torch
import torch.utils
import torch.utils.data
from torch.utils.data import Dataset , DataLoader 
import tiktoken

import torch.nn as nn
from tqdm.auto import tqdm

In [7]:


def calc_loss_batch(input_batch , target_batch , model:torch.nn.Module , device:torch.device):
    input_batch , target_batch = input_batch.to(device) , target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(   logits.flatten(0,1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader , model , device , num_batches = None):
    total_loss = 0
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches  = min(num_batches , len(data_loader))
    for i , (inputs , target) in enumerate(data_loader):
        if i < num_batches:
            loss  =  calc_loss_batch(inputs , target , model , device)

            total_loss +=loss.item()

        else:
            break

        return total_loss  / num_batches

In [2]:





def evaluate_model(model , train_dataloader , eval_dataloaer , device , eval_iter ):
    model.eval()
    with torch.no_grad():
        train_loss =  calc_loss_loader(train_dataloader , model , device , num_batches= eval_iter)
        val_loss = calc_loss_loader(eval_dataloaer , model , device , num_batches=eval_iter)
    
    model.train()
    return train_loss , val_loss 



def generate_and_print_sample(model , tokenizer , device , start_context ):
    model.eval()
    context_size  = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context , tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_and_sample(
            model = model , idx  = encoded, max_new_tokens = 50 , context_size = context_size
        )
        decoded_text = token_ids_to_text(token_ids , tokenizer)
        print(decoded_text.replace("\n" , " "))
        model.train()

     
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, 
                       eval_freq, eval_iter, start_context):
    train_losses, val_losses, track_tokens_seen = [], [], []  # A
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):  # B
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # C
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()  # D
            optimizer.step()  # E
            
            tokens_seen += input_batch.numel()  # Count the total tokens processed
            global_step += 1
            
            if global_step % eval_freq == 0:  # F
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                
                print(f"Epoch {epoch+1} (Step {global_step:06d}): "
                      f"Train Loss: {train_loss:.3f}, Val Loss: {val_loss:.3f}")
                
                # G: Generate and print a sample during evaluation
                generate_and_print_sample(
                    model, train_loader.dataset.tokenizer, device, start_context
                )
    
    return train_losses, val_losses, track_tokens_seen





In [3]:


def text_to_token_ids(text,  tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded = torch.tensor(encoded).unsqueeze(0)
    return encoded


def token_ids_to_text(tokens , tokenizer):
    flat  = tokens.squeeze(0)
    decode = tokenizer.decode(flat.tolist())
    return decode

    
def generate_and_sample(model  , idx , context_size ,max_new_tokens ):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
            print(logits.shape)
        logits  = logits[:, -1  , :]
        print(logits.shape)
        probs  = torch.softmax(logits  , dim=-1)
        print(probs)
        idx_next = torch.argmax(probs, dim=-1 , keepdim= True)
        print(idx_next)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx 

In [10]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [11]:



class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt) #A
        for i in range(0, len(token_ids) - max_length, stride): #B
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    def __len__(self): #C
        return len(self.input_ids)
    def __getitem__(self, idx): #D
         return self.input_ids[idx], self.target_ids[idx]
    




def create_dataloader_v1(txt, batch_size=4,
    max_length=256, stride=128, shuffle=True, drop_last=True):
    tokenizer = tiktoken.get_encoding("gpt2") #A
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) #B
    dataloader = DataLoader(
    dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    return dataloader

In [12]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention_V2(
        d_in=cfg["emb_dim"],
        d_out=cfg["emb_dim"],
        context_length=cfg["context_length"],
        num_heads=cfg["n_heads"],
        dropout=cfg["drop_rate"],
        qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_resid = nn.Dropout(cfg["drop_rate"])
    def forward(self, x):
    #A
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_resid(x)
        x = x + shortcut # Add the original input back
        shortcut = x #B
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_resid(x)
        x = x + shortcut #C
        return x


In [13]:

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg['emb_dim'] , 4 * cfg['emb_dim']) ,
            GELU(),
            nn.Linear(4 * cfg['emb_dim'] , cfg['emb_dim'])
        )
    def forward(self, x ):
        return self.layers(x)
    


In [14]:

class MultiHeadAttention_V2(nn.Module):
    def __init__(self, d_in , d_out , context_length  , dropout ,num_heads,qkv_bias = False):
        super().__init__()
        assert d_out % num_heads  == 0,'d_out must be divisible by the num_heads'
        self.w_query = nn.Linear(d_in , d_out ,bias=qkv_bias)
        self.w_key = nn.Linear(d_in , d_out, bias=qkv_bias)
        self.w_value = nn.Linear(d_in  , d_out,bias=qkv_bias)
        self.d_in =d_in
        self.d_out = d_out
        self.dropout = nn.Dropout(dropout)
        self.num_heads  = num_heads
        self.head_dim = d_out // num_heads
        self.out_proj  = nn.Linear(d_out , d_out)
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_length , context_length),diagonal=1)
        )

    def forward(self,x):
        b, num_tokens , d_in = x.shape
        keys = self.w_key(x)
        queries  = self.w_query(x)
        values = self.w_value(x)
        queries = queries.view(b, num_tokens , self.num_heads , self.head_dim)
        values = values.view(b , num_tokens , self.num_heads , self.head_dim)
        keys = keys.view( b, num_tokens , self.num_heads , self.head_dim)
        queries = queries.transpose(1, 2)
        keys = keys.transpose(1,2)
        values = values.transpose(1,2)
        attn_scores = queries @ keys.transpose(2, 3)
        mask_bool= self.mask.bool()[:num_tokens , :num_tokens]
        attn_scores.masked_fill(mask_bool , -torch.inf)
        attn_weights = torch.softmax(attn_scores /self.head_dim**0.5   , dim=-1 )
        attn_weights = self.dropout(attn_weights)
        context_vector = (attn_weights  @ values).transpose(1, 2)
        context_vector = context_vector.contiguous().view(b , num_tokens , self.d_out)
        context_vector = self.out_proj(context_vector)
        return context_vector





In [15]:
class LayerNorm(nn.Module):
    def __init__(self , emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale  = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    def forward(self , x):
        mean = x.mean(dim= -1, keepdim = True)
        var = x.var(dim =-1, keepdim = True)
        norm_x = (x - mean) / torch.sqrt(var +self.eps)
        return self.scale * norm_x + self.shift 


In [16]:

class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return 0.5 * x *(1+ torch.tanh(torch.sqrt(torch.tensor(2.0/ torch.pi)) * (x+0.044715 * torch.pow(x, 3))) )
      

In [17]:



class GPTModel(nn.Module):
    def __init__(self, cfg):

        super().__init__()

        self.tok_emb = nn.Embedding(cfg['vocab_size'] , cfg['emb_dim'])

        self.pos_emb = nn.Embedding(cfg['context_length'] , cfg['emb_dim'])

        self.drop_rate = nn.Dropout(cfg['drop_rate'])

        self.transformer_block  = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg['n_layers'])])

        self.final_norm  =  LayerNorm(emb_dim=cfg['emb_dim'])

        self.out_head = nn.Linear(cfg['emb_dim'] , cfg['vocab_size'] , bias = False)

    def forward(self, idx):

        batch_size , seq_len = idx.shape

        token_embs = self.tok_emb(idx)

        pos_embs = self.pos_emb(torch.arange(seq_len , device= idx.device))

        x = token_embs + pos_embs

        x  =  self.drop_rate(x)

        x = self.transformer_block(x)

        x = self.final_norm(x)

        x = self.out_head(x)

        return x 


In [19]:



torch.manual_seed(123)
GPT_CONFIG_124M = {
"vocab_size": 50257, # Vocabulary size
"context_length": 102,
# Context length
"emb_dim": 768,
# Embedding dimension
"n_heads": 12,
# Number of attention heads
"n_layers": 12,
# Number of layers
"drop_rate": 0.1,
# Dropout rate
"qkv_bias": False
# Query-Key-Value bias
}

model = GPTModel(GPT_CONFIG_124M)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
optimizer  =  torch.optim.AdamW(model.parameters() , lr = 0.0004  , weight_decay= 0.01)
num_epochs = 1
train_ratio = 0.90
filename = '/kaggle/input/verdict/the-verdict.txt'

with open(filename , 'r') as f:
    text_data = f.read()

text_data = text_data[:20000]

split = int(train_ratio * len(text_data))
print(split)
train_data= text_data[:split]
val_data = text_data[split:]
train_dataloader = create_dataloader_v1(txt= train_data , batch_size= 2 , max_length=GPT_CONFIG_124M['context_length'] , shuffle =  True , drop_last=True , stride=GPT_CONFIG_124M['context_length'])
val_dataloader = create_dataloader_v1(txt= val_data , batch_size= 2 , max_length=GPT_CONFIG_124M['context_length'] , shuffle =  False , drop_last=False , stride=GPT_CONFIG_124M['context_length'])


train_losses , val_losses  , token_seen = train_model_simple(
    model= model , train_loader= train_dataloader , 
    val_loader= val_dataloader , optimizer= optimizer , eval_freq=5 , device= device, num_epochs = 10,
    eval_iter=1 , start_context="Every effort moves you")

18000
Epoch 1 (Step 000000): Train Loss: 10.321, Val Loss: 10.282
torch.Size([1, 4, 50257])
torch.Size([1, 50257])
tensor([[1.9492e-05, 1.9975e-05, 1.9546e-05,  ..., 5.7806e-06, 1.3575e-05,
         4.3576e-05]], device='cuda:0')
tensor([[550]], device='cuda:0')
torch.Size([1, 5, 50257])
torch.Size([1, 50257])
tensor([[1.7172e-05, 1.7087e-05, 1.5286e-05,  ..., 1.5114e-05, 3.1586e-05,
         4.9194e-05]], device='cuda:0')
tensor([[284]], device='cuda:0')
torch.Size([1, 6, 50257])
torch.Size([1, 50257])
tensor([[1.2948e-05, 2.3578e-05, 2.6369e-05,  ..., 5.4167e-06, 2.4091e-05,
         5.9528e-05]], device='cuda:0')
tensor([[550]], device='cuda:0')
torch.Size([1, 7, 50257])
torch.Size([1, 50257])
tensor([[7.4493e-06, 1.5952e-05, 1.1110e-05,  ..., 1.0022e-05, 2.8316e-05,
         5.2267e-05]], device='cuda:0')
tensor([[284]], device='cuda:0')
torch.Size([1, 8, 50257])
torch.Size([1, 50257])
tensor([[1.6201e-05, 1.5952e-05, 1.6092e-05,  ..., 7.8490e-06, 1.6474e-05,
         6.0509e-05]],