In [None]:
from SLM_Architecture import GPTModel,GPTConfig,Transformer_Block

import importlib
import Dataset_Preprocessing
importlib.reload(Dataset_Preprocessing)
from Dataset_Preprocessing import download_dataset, get_tokenizer,  Build_Dataset,process,get_batch

import torch 
from contextlib import nullcontext
from tqdm.auto import tqdm
from torch.optim.lr_scheduler import LinearLR,SequentialLR,CosineAnnealingLR

## Download and build dataset

In [None]:
# Fetch the raw dataset, load the "gpt2" tokenizer, and pass both to Build_Dataset.
# NOTE(review): Build_Dataset's return value is discarded here, so it presumably
# persists its output itself (the training cell later reads "train.bin" and
# "validation.bin") — confirm against Dataset_Preprocessing.
dataset = download_dataset()
tokenizer = get_tokenizer("gpt2")
Build_Dataset(dataset,tokenizer)

In [None]:
# Hyperparameters of the small GPT-style language model.
config = GPTConfig(
    # Transformer shape
    n_layer=6,
    n_head=6,
    n_embed=384,
    # Sequence length and vocabulary
    # (vocab_size presumably matches the "gpt2" tokenizer loaded earlier — confirm)
    block_size=128,
    vocab_size=50257,
    # Regularisation / layer options
    dropout=0.1,
    bias=True,
)
model = GPTModel(config)

In [None]:
# Show the model's repr as the cell output for a quick visual check.
model

In [None]:
def estimate_loss(model):
    """Estimate the mean loss of ``model`` on the training and validation splits.

    The model is put in eval mode, evaluated on ``eval_iters`` batches per split
    under ``torch.inference_mode()`` and the autocast context ``ctx``, and then
    switched back to train mode (also when evaluation raises).

    Relies on notebook-level globals defined in the training-configuration cell:
    ``eval_iters``, ``block_size``, ``batch_size``, ``device`` and ``ctx``.

    Args:
        model: Module whose call ``model(X, Y)`` returns ``(logits, loss)``.

    Returns:
        dict with keys ``"train"`` and ``"val"`` mapping to the mean loss of
        each split as a Python float.
    """
    out = {}
    model.eval()
    try:
        with torch.inference_mode():
            # The training loop reads losses["val"], so the validation split is
            # keyed "val" here (it was "eval", which raised KeyError downstream).
            # NOTE(review): assumes get_batch treats "val" as the validation
            # split — confirm against Dataset_Preprocessing.get_batch.
            for split in ("train", "val"):
                losses = torch.zeros(eval_iters)
                for k in range(eval_iters):
                    # Same call signature as the one used by the training loop.
                    X, Y = get_batch(split, block_size, batch_size, device=device,
                                     train_path="train.bin", val_path="validation.bin")
                    X, Y = X.to(device), Y.to(device)
                    with ctx:
                        logits, loss = model(X, Y)
                    losses[k] = loss.item()
                # Plain float: safe to format, compare, store in lists and plot.
                out[split] = losses.mean().item()
    finally:
        model.train()
    return out

## Training configuration

In [None]:
# --- Optimisation schedule ---------------------------------------------------
lr = 1e-4             # peak learning rate (base LR handed to the optimizer)
max_iters = 20000     # number of iterations of the training loop
warmup_steps = 100    # iterations of linear warmup before the cosine decay
# Floor of the cosine decay. It has to be BELOW `lr`: the previous value (5e-4)
# was five times the peak, so the "decay" phase would have increased the LR.
# NOTE(review): 1e-5 (= lr / 10) is a reviewer-chosen floor — confirm the intended value.
min_lr = 1e-5
eval_iters = 500      # evaluation interval AND number of batches averaged per evaluation
batch_size = 32
block_size = 128      # same value as the block_size passed to GPTConfig

gradient_accumulation_steps = 32

# --- Device / mixed precision ------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use the device's own type string; comparing a torch.device with a str is always False.
device_type = device.type
dtype = "bfloat16" if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else "float16"
ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype]

# No autocast on CPU; on CUDA run the forward pass in the reduced-precision dtype.
ctx = nullcontext() if device_type == "cpu" else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
torch.set_default_device(device)
torch.manual_seed(42)

In [None]:
# AdamW over all model parameters, with decoupled weight decay.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=lr,
    betas=(0.9, 0.95),
    weight_decay=0.1,
    eps=1e-9,
)

# Learning-rate schedule: linear warmup for `warmup_steps` scheduler steps
# (start factor left at the library default), then cosine annealing towards
# `min_lr` for the remaining `max_iters - warmup_steps` steps.
scheduler_warmup = LinearLR(optimizer, total_iters=warmup_steps)
scheduler_decay = CosineAnnealingLR(
    optimizer,
    T_max=max_iters - warmup_steps,
    eta_min=min_lr,
)
scheduler = SequentialLR(
    optimizer,
    schedulers=[scheduler_warmup, scheduler_decay],
    milestones=[warmup_steps],
)

# Loss scaling is only switched on when training in float16.
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16"))


## Pretrain SLM

In [None]:
# Pre-training loop with gradient accumulation and (optional) loss scaling.
# Every `eval_iters` iterations the train/val losses are estimated, logged, and
# the weights with the lowest validation loss so far are saved to disk.
best_val_loss = float('inf')
best_model_params_path = "best_model_params.pt"
train_loss_list, val_loss_list = [], []

model = model.to(device)
# Start from clean gradients, also when this cell is re-run in the same kernel.
optimizer.zero_grad(set_to_none=True)
for epoch in tqdm(range(max_iters)):
    # --- periodic evaluation and checkpointing ---
    if epoch % eval_iters == 0 and epoch != 0:
        losses = estimate_loss(model)
        # Single quotes inside the f-strings: reusing the outer quote character
        # is a SyntaxError on Python versions before 3.12.
        print(f"Epoch {epoch}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        print(f"The current learning rate: {optimizer.param_groups[0]['lr']:.5f}")
        train_loss_list.append(losses["train"])
        val_loss_list.append(losses["val"])

        if losses["val"] < best_val_loss:
            best_val_loss = losses["val"]
            torch.save(model.state_dict(), best_model_params_path)

    # --- forward pass on one micro-batch ---
    X, y = get_batch("train", block_size, batch_size, device=device, train_path="train.bin", val_path="validation.bin")
    X, y = X.to(device), y.to(device)
    with ctx:
        logits, loss = model(X, y)
        # Scale BEFORE backward so the accumulated gradient is the mean over
        # the accumulation window rather than the sum.
        loss = loss / gradient_accumulation_steps
    # Backward runs outside the autocast context.
    scaler.scale(loss).backward()

    # --- optimizer step once per accumulation window (and on the last iteration) ---
    if ((epoch + 1) % gradient_accumulation_steps == 0) or (epoch + 1 == max_iters):
        # Clip the true gradients, not the loss-scaled ones.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        scaler.step(optimizer)
        scaler.update()
        # Without this the gradients of every previous window keep accumulating.
        optimizer.zero_grad(set_to_none=True)

    # The schedule is defined over loop iterations (T_max = max_iters - warmup_steps),
    # so it advances on every iteration, not only on optimizer steps.
    scheduler.step()
     