In [2]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
from loguru import logger
from datetime import datetime
from tqdm import tqdm
import torch.optim as optim
import numpy as np

from src.gpt_base import GPT

## Setting up Logger

In [None]:
def setup_logging(log_dir="logs"):
    """Configure loguru to log to the notebook output and to a timestamped file.

    Removes every previously registered loguru handler, then registers two
    INFO-level sinks: a console sink (via ``print``) and a file sink inside
    ``log_dir``.

    Args:
        log_dir: Directory for the log file; created if it does not exist.

    Returns:
        Path of the log file for this run (``training_<timestamp>.log``).
    """
    os.makedirs(log_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"training_{timestamp}.log")

    # Drop the handlers registered so far so that re-running this cell does
    # not make every message appear multiple times.
    logger.remove()
    logger.add(
        # loguru already terminates each formatted message with a newline, so
        # suppress print's own to avoid a blank line after every record.
        lambda msg: print(msg, end=""),
        format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | {message}",
        colorize=True,
        level="INFO"
    )

    logger.add(
        log_file,
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
        level="INFO",
        rotation="100 MB",
        retention="30 days"
    )

    logger.info(f"Logging setup complete. Logs will be saved to: {log_file}")
    return log_file

## Load Shakespeare data

In [3]:
## Load Data
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
_path = "/Users/aditya/Documents/self_learning/ERA V3/week 12/input.txt"
# Explicit encoding so the decoded text (and the vocabulary built from it)
# does not depend on the platform's default locale encoding.
with open(_path, "r", encoding="utf-8") as f:
    text = f.read()

# Preview the first 100 characters of the corpus
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [4]:
# Character-level vocabulary: every distinct character in the corpus, in
# sorted order, is assigned its position as an integer id.
chars = sorted(set(text))
vocab_size = len(chars)
stoi = dict(zip(chars, range(vocab_size)))  # character -> id
itos = dict(enumerate(chars))               # id -> character

In [25]:
# Persist the vocabulary mappings as JSON
import json

# Create the output directory up front so a fresh checkout does not fail with
# FileNotFoundError on the first write.
os.makedirs("docs", exist_ok=True)

# save stoi as json
with open("docs/stoi.json", "w") as f:
    json.dump(stoi, f)

# save itos as json
# NOTE: JSON object keys are always strings, so the integer keys of `itos`
# come back as "0", "1", ... when this file is loaded; convert them with int().
with open("docs/itos.json", "w") as f:
    json.dump(itos, f)

In [5]:
# Display the number of distinct characters found in the corpus
vocab_size

65

In [6]:
# Encoding/Decoding functions
def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Each character is looked up in the module-level ``stoi`` table; a
    character that is not in the table raises ``KeyError``.
    """
    return list(map(stoi.__getitem__, s))

def decode(l):
    """Return the string spelled by the integer ids in ``l``.

    Each id is looked up in the module-level ``itos`` table and the resulting
    characters are concatenated; an unknown id raises ``KeyError``.
    """
    return "".join(itos[idx] for idx in l)

## Model Training

In [7]:
@dataclass
class GPTConfig:
    """Hyperparameters handed to ``GPT`` when the model is constructed.

    NOTE: the default of ``vocab_size`` is read from the module-level
    ``vocab_size`` once, when this class is defined. Re-run this cell after
    rebuilding the vocabulary, or pass ``vocab_size=...`` explicitly.
    """
    block_size: int = 1024  # max sequence length
    vocab_size: int = vocab_size  # default captured from the global vocab_size
    num_layer: int = 12  # number of layers
    num_head: int = 12  # number of heads
    emb_dim: int = 768  # embedding dimension
    dropout: float = 0.1  # dropout rate

In [8]:
# Build the model from the default configuration defined above
config = GPTConfig()
model = GPT(config)

In [23]:
# Hyperparameters
batch_size = 4
epochs = 3
learning_rate = 1e-4
# seq length; taken from the model config so sampled training windows can
# never be longer than the context the model was built for
block_size = config.block_size

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to its device before constructing the optimizer, as the
# torch.optim documentation advises.
model.to(device)

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Show the model architecture as the cell output
model


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(65, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x TransformerBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (mlp): FeedForward(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, i

In [24]:
from torchinfo import summary

# Print a per-layer summary of the model. torchinfo is told the input shape
# (batch_size x block_size) and dtype instead of being handed a real batch.
summary(
    model,
    input_size=(batch_size, config.block_size),
    dtypes=[torch.long],  # integer token ids, not the default float input
    device=device,
    col_names=["input_size", "output_size", "num_params", "mult_adds"],
    depth=4,  # Show nested layers up to this depth
    row_settings=["var_names"]
)

Layer (type (var_name))                            Input Shape               Output Shape              Param #                   Mult-Adds
GPT (GPT)                                          [4, 1024]                 [4, 1024, 65]             --                        --
├─ModuleDict (transformer)                         --                        --                        --                        --
│    └─Embedding (wte)                             [4, 1024]                 [4, 1024, 768]            49,920                    199,680
│    └─Embedding (wpe)                             [1, 1024]                 [1, 1024, 768]            786,432                   786,432
│    └─Dropout (drop)                              [4, 1024, 768]            [4, 1024, 768]            --                        --
│    └─ModuleList (h)                              --                        --                        --                        --
│    │    └─TransformerBlock (0)                   [4, 1024

In [11]:
# Encode the whole corpus once into a 1-D tensor of character ids; training
# batches are sliced out of this tensor.
text_encoded = torch.tensor(encode(text), dtype=torch.long)

In [12]:
def get_batch(data):
    """Draw a random batch of training windows from ``data``.

    Picks ``batch_size`` random start offsets and cuts a window of
    ``block_size`` elements at each one. The targets are the same windows
    shifted one position to the right. ``batch_size``, ``block_size`` and
    ``device`` are read from the notebook's globals.

    Returns:
        ``(inputs, targets)``, both stacked along a new first dimension and
        moved to ``device``.
    """
    starts = torch.randint(len(data) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(data[start:stop])
        targets.append(data[start + 1:stop + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [11]:
log_dir = "/kaggle/working/"
log_file = setup_logging(log_dir=log_dir)

# Epoch index to start from: 0 for a fresh run; when resuming, set it to the
# number of epochs already completed (the loop then runs up to `epochs` total).
start_epoch = 0

# One "epoch" is as many random batches as it takes to cover the corpus once.
num_batches = len(text_encoded) // (batch_size * block_size)
if num_batches == 0:
    raise ValueError(
        "Corpus is shorter than one batch (batch_size * block_size tokens); "
        "reduce batch_size or block_size."
    )

# Tracked across epochs, so a checkpoint is written only when the average
# loss actually improves on the best epoch seen so far.
best_loss = np.inf

model.train()  # make sure dropout is active while training

# Training loop
for epoch in range(start_epoch, epochs):
    pbar = tqdm(range(num_batches))
    running_loss = 0.0

    for _ in pbar:
        # Get batch
        xb, yb = get_batch(data=text_encoded)

        # Forward pass
        optimizer.zero_grad()
        logits = model(xb)
        # Flatten (batch, seq, vocab) logits and (batch, seq) targets so that
        # every position counts as one classification example.
        loss = F.cross_entropy(logits.view(-1, vocab_size), yb.view(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update running loss
        running_loss += loss.item()
        pbar.set_description(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

    avg_loss = running_loss / num_batches
    logger.info(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")
    if avg_loss < best_loss:
        cp = {
            # Unwrap DataParallel so the saved keys carry no "module." prefix
            "model_state_dict": model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict(),
            'loss': avg_loss,
        }
        torch.save(cp, "gpt_model_and_loss.pth")
        best_loss = avg_loss
        # 1-based epoch number, consistent with the progress/log messages
        print(f"Model saved : epoch_{epoch+1}_loss_{avg_loss:.4f}")
        

  0%|          | 0/17 [00:00<?, ?it/s]

: 

## Sentence Generation

In [15]:
## Load Model Checkpoint
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
checkpoint_path = "/Users/aditya/Documents/self_learning/ERA V3/week 12/model artifacts/gpt_model_and_loss.pth"

# load model checkpoint
model = GPT(config)
model = model.to(device)
# A freshly constructed module is in training mode; switch to eval mode so
# dropout is disabled while generating text.
model.eval()
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model_state_dict = _dict["model_state_dict"]
# Kept as the last expression so the key-matching report is displayed.
model.load_state_dict(model_state_dict)


<All keys matched successfully>

In [16]:
## Using Greedy decoding
def predict_next_word(text, model, seq_len=50):
    """Greedily extend ``text`` by ``seq_len`` characters.

    Despite the name, the vocabulary is character-level, so each step appends
    one character: the one with the highest logit at the last position.

    Args:
        text: Non-empty prompt; every character must be in the vocabulary.
        model: Module that maps a ``(1, T)`` tensor of ids to per-position
            logits over the vocabulary.
        seq_len: Number of characters to generate.

    Returns:
        The prompt followed by the generated characters.

    Raises:
        ValueError: If ``text`` is empty (there is nothing to condition on).
    """
    if not text:
        raise ValueError("text must contain at least one character")

    # Run in eval mode so dropout cannot make greedy decoding random, and
    # restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for generation; skip building the graph.
        with torch.no_grad():
            for _ in range(seq_len):
                # Feed at most the last `config.block_size` characters so the
                # input never exceeds the model's maximum sequence length.
                context = text[-config.block_size:]
                xb = torch.tensor(encode(context), dtype=torch.long).unsqueeze(0).to(device)
                yb = model(xb)
                next_word = yb[0, -1].argmax().item()
                text += itos[next_word]
    finally:
        model.train(was_training)
    return text

In [22]:
# Generate 200 characters after the prompt; print() is used (rather than a
# bare expression) so the newlines in the generated text are rendered.
print(predict_next_word(text="I pray you.", model=model, seq_len=200))

I pray you.

GREMIO:
I am a presently for the presentle's face.

TRANIO:
I pray you, sir, I pray you to do me.

GREMIO:
I am you a love so love in this master master
to me so longer.

TRANIO:
I am a poor that yo
