In [1]:
import numpy as np
import torch 
import pandas as pd
import matplotlib.pyplot as plt
import tiktoken

In [2]:
from engine.llm_engine import createGPTModel
from engine.data_loader import create_dataloader_v1

In [3]:
# GPT-2 encoding from tiktoken; used below to count tokens and to encode/decode prompts.
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
# Hyperparameter dictionary handed to createGPTModel.
# NOTE(review): per-key meanings are inferred from the key names only —
# confirm against engine.llm_engine before relying on them.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # equals the last dimension of the logits printed later in this notebook
    "context_length": 256,   # same value the dataloaders use for context_length / stride
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "kqv_bias": False,
}

In [5]:
# Instantiate the model from the GPT_CONFIG_124M dictionary (createGPTModel is imported from engine.llm_engine).
model = createGPTModel(GPT_CONFIG_124M)

In [None]:
# def create_dataloader_v1(txt, batch_size=4 , context_length=256, 
#                          stride=256, shuffle=True, drop_last = True, 
#                          num_workers=0)

## Dummy Data

In [6]:
# Read the whole text file into memory as one string.
# NOTE: the path is relative, so it is resolved against the kernel's working directory.
file_path = "the-verdict.txt"
with open(file_path , "r" , encoding="utf-8") as file:
    text_data = file.read()

In [7]:
# Size of the corpus, measured both in characters and in tokenizer tokens.
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5145


In [8]:
# Hold out the final 10% of the text for validation.
# The cut is made on the raw string, i.e. by character count, not by token count.
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

In [9]:
# Seed torch's RNG so the shuffled training order is reproducible.
torch.manual_seed(42)

# The window length and stride are read from the model config instead of
# repeating the literal 256, so the loaders cannot drift out of sync with the
# model if the config is changed. stride == context_length means consecutive
# windows start one full window apart.
train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    context_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0,
)
# Validation: fixed order, and the final partial batch is kept (drop_last=False).
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    context_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0,
)

In [10]:
# Sanity check: walk the training loader once and show the input/target shape of every batch.
print("Train loader:")
for x,y in train_loader:
    print(x.shape , y.shape)

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [11]:
# Same sanity check for the validation loader.
print("Validation loader:")
for x,y in val_loader:
    print(x.shape , y.shape)

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


In [5]:
# Two hand-written 4-token sequences packed into a single (2, 4) batch,
# used below for a quick forward-pass check of the model.
batch = torch.tensor(
    [
        [6109, 3626, 6100, 345],
        [6109, 1110, 6622, 257],
    ]
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [6]:
# Forward pass on the toy batch: one row of scores per input position.
# Autograd is not disabled here, which is why the printed tensor carries a grad_fn.
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.1735,  0.1922, -0.4804,  ...,  0.0184,  0.5818, -0.1149],
         [-0.8016, -0.0366, -0.6047,  ...,  0.2760, -0.4721, -0.4356],
         [-0.3624, -0.3774, -0.6089,  ...,  0.8500,  0.5048,  0.1919],
         [ 0.0605, -0.4519, -0.5605,  ...,  0.3887,  0.8183,  1.4593]],

        [[ 0.4189,  0.5406, -0.9069,  ...,  0.1814,  0.7552, -0.1245],
         [-0.2921, -0.2976, -0.5443,  ..., -0.7706, -1.6711, -0.1323],
         [ 0.2992, -0.5517, -0.3342,  ...,  0.6089,  1.4291,  0.4221],
         [ 0.5685,  0.0318, -1.3728,  ...,  0.2945, -0.1885, -0.1910]]],
       grad_fn=<UnsafeViewBackward0>)


## Total Model Parameters

In [7]:
# Count every scalar element across all tensors returned by model.parameters().
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,009,536


In [14]:
# Rough memory footprint of the weights.
# The factor 4 is bytes per parameter — assumes 32-bit floats (TODO confirm the model's dtype).
total_size_bytes = total_params * 4      
# Bytes -> mebibytes (1024 * 1024 bytes each).
total_size_mb = total_size_bytes / (1024 * 1024)    
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


## Text Generation

In [15]:
def generate_text(model , idx, max_new_token, context_size):
    """Greedily append ``max_new_token`` token ids to ``idx``.

    At every step the model sees at most the last ``context_size`` columns of
    the running sequence; the scores at the final position are turned into
    probabilities and the most probable id is appended.

    Args:
        model: Callable taking a 2-D tensor of token ids and returning a 3-D
            tensor that is indexed as ``[:, -1, :]`` to get the last position.
        idx: 2-D tensor of token ids; new ids are concatenated along dim 1.
        max_new_token: Number of ids to append.
        context_size: Maximum number of trailing columns fed to ``model``.

    Returns:
        ``idx`` extended by ``max_new_token`` columns (``idx`` itself when
        ``max_new_token`` is 0).
    """
    tokens = idx
    for _step in range(max_new_token):
        window = tokens[:, -context_size:]
        # No gradients are needed for generation.
        with torch.no_grad():
            last_logits = model(window)[:, -1, :]
        next_probs = last_logits.softmax(dim=-1)
        next_token = next_probs.argmax(dim=-1, keepdim=True)
        tokens = torch.cat([tokens, next_token], dim=1)
    return tokens

In [16]:
# Re-creates the same "gpt2" tokenizer that is already built near the top of the notebook;
# redundant when that earlier cell has been run.
tokenizer = tiktoken.get_encoding("gpt2")

In [17]:
# Prompt used for the generation demo, converted to a list of token ids.
start_context = "Hii, I'm Ashish Raj, currently I'm focused on building and deploying "
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

encoded: [39, 4178, 11, 314, 1101, 7844, 680, 13308, 11, 3058, 314, 1101, 5670, 319, 2615, 290, 29682, 220]


In [18]:
# unsqueeze(0) adds a leading dimension of size 1, turning the id list into a batch of one sequence.
encoded_tensor = torch.tensor(encoded).unsqueeze(0) 
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded_tensor.shape: torch.Size([1, 18])


In [22]:
# Put the model in evaluation mode before generating.
model.eval()                 
out = generate_text(
    model=model,
    idx=encoded_tensor, 
    max_new_token=6, 
    # NOTE(review): with context_size=4 only the last 4 tokens are fed to the
    # model at each step, although the config declares context_length=256 —
    # confirm this truncation is intentional.
    context_size = 4
)
print("Output:", out)
# Prompt tokens plus the 6 newly generated ones.
print("Output length:", len(out[0]))

Output: tensor([[   39,  4178,    11,   314,  1101,  7844,   680, 13308,    11,  3058,
           314,  1101,  5670,   319,  2615,   290, 29682,   220,  9464, 14737,
         29760, 22344, 11245, 45370]])
Output length: 24


In [23]:
# Drop the batch dimension and turn the ids back into text.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hii, I'm Ashish Raj, currently I'm focused on building and deploying left Mason cryptocurrenciesosaurs003 embodies


## Training and Validation Losses

In [12]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Tensor passed to ``model`` after being moved to ``device``.
        target_batch: Tensor of target class indices, moved to ``device``.
        model: Callable producing logits from ``input_batch``.
        device: Device both tensors are moved to before the forward pass.

    Returns:
        The loss tensor returned by ``torch.nn.functional.cross_entropy``.
    """
    inputs, targets = input_batch.to(device), target_batch.to(device)
    scores = model(inputs)
    # cross_entropy wants one row of scores per prediction: merge the first
    # two dimensions of the logits and flatten the targets to match.
    flat_scores = scores.flatten(0, 1)
    flat_targets = targets.flatten()
    return torch.nn.functional.cross_entropy(flat_scores, flat_targets)

In [13]:
def cal_loss_loader(data_loader, model, device, num_batches=None):
    """Average the per-batch loss over the first batches of ``data_loader``.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate. ``None`` means
            every batch; larger values are capped at ``len(data_loader)``.

    Returns:
        Mean loss as a Python float, or ``float("nan")`` when there is nothing
        to average (empty loader, or ``num_batches`` of zero or less).
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)

    # Asking for zero (or a negative number of) batches used to end in a
    # division by zero / a meaningless -0.0; report "no data" instead.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches

In [14]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
# Only the loss values are needed here, so gradient tracking is switched off.
with torch.no_grad():                                       
    train_loss = cal_loss_loader(train_loader, model, device)   
    val_loss = cal_loss_loader(val_loader, model, device)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.99063385857476
Validation loss: 10.96821117401123


## Training the model

In [18]:
def generate_text(model , idx, max_new_token, context_size):
    """Extend ``idx`` with ``max_new_token`` greedily chosen token ids.

    NOTE: a function with this name and the same logic is also defined in the
    "Text Generation" section of this notebook; this later definition is the
    one in effect once this cell runs.

    Args:
        model: Callable mapping a 2-D id tensor to a 3-D score tensor.
        idx: 2-D tensor of token ids that is grown along dim 1.
        max_new_token: How many ids to append.
        context_size: Only the last ``context_size`` columns are shown to
            ``model`` at each step.

    Returns:
        The grown id tensor.
    """
    for _ in range(max_new_token):
        # Gradients are never needed while sampling.
        with torch.no_grad():
            logits = model(idx[:, -context_size:])
        # Keep only the scores of the final position and pick the likeliest id.
        probs = torch.softmax(logits[:, -1, :], dim=-1)
        idx = torch.cat((idx, probs.argmax(dim=-1, keepdim=True)), dim=1)
    return idx

In [19]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch dimension.

    ``<|endoftext|>`` is passed as an allowed special token to
    ``tokenizer.encode``.

    Args:
        text: String to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``.

    Returns:
        Tensor of shape ``(1, number_of_ids)``.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return torch.tensor(ids).unsqueeze(0)

In [20]:
def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into a string.

    Args:
        token_ids: Tensor of ids; a leading dimension of size 1 is squeezed
            away before decoding.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the id list.
    """
    return tokenizer.decode(token_ids.squeeze(0).tolist())

In [None]:
# start_context = "Every effort moves you "
# tokenizer = tiktoken.get_encoding("gpt2")

In [17]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Measure the average loss on the training and validation loaders.

    The model is put in eval mode for the measurement, gradients are disabled,
    and the model is switched back to train mode before returning.

    Args:
        model: Model to evaluate.
        train_loader: Loader used for the training-loss estimate.
        val_loader: Loader used for the validation-loss estimate.
        device: Device forwarded to ``cal_loss_loader``.
        eval_iter: Forwarded to ``cal_loss_loader`` as ``num_batches``.

    Returns:
        ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        losses = [
            cal_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return tuple(losses)

In [21]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate 50 new tokens from ``start_context`` and print them on one line.

    The model is switched to eval mode while generating and back to train mode
    afterwards. Newlines in the decoded text are replaced by spaces.

    Args:
        model: Model exposing ``pos_emb.weight``; its first dimension is used
            as the context size (presumably the model's maximum context
            length — confirm against the model definition).
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        device: Device the encoded prompt is moved to.
        start_context: Prompt string.
    """
    model.eval()
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        generated = generate_text(
            model=model,
            idx=prompt_ids,
            max_new_token=50,
            context_size=max_context,
        )
    sample = token_ids_to_text(generated, tokenizer)
    print(sample.replace("\n", " "))
    model.train()

In [16]:
def train_model(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """Run the training loop and collect periodic loss measurements.

    For every batch: reset gradients, compute the loss, backpropagate and take
    an optimizer step. Every ``eval_freq`` steps (starting with step 0) the
    training and validation losses are measured and printed. After each epoch
    a text sample is generated from ``start_context`` and printed.

    Args:
        model: Model to train.
        train_loader: Iterable of ``(input_batch, target_batch)`` pairs.
        val_loader: Loader used for the validation loss.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        device: Device forwarded to the loss and sampling helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate whenever ``global_step % eval_freq == 0``.
        eval_iter: Forwarded to ``evaluate_model``.
        start_context: Prompt for the per-epoch sample.
        tokenizer: Tokenizer forwarded to ``generate_and_print_sample``.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)`` — three lists with
        one entry per evaluation.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []
    # global_step starts at -1 so that the first processed batch is step 0.
    tokens_seen, global_step = 0, -1
    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            # numel() is the total number of elements in the batch tensor.
            # (The counter is `tokens_seen`; the previous `token_seen` typo
            # raised UnboundLocalError on the very first batch.)
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, "
                  f"Val loss {val_loss:.3f}")
        # One qualitative sample per epoch.
        generate_and_print_sample(
            model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen