In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass

from tqdm import tqdm
import torch.optim as optim

from src.gpt_base import GPT

## Load Shakespeare data

In [2]:
## Load Data
# The corpus location can be overridden with the SHAKESPEARE_PATH environment
# variable; when it is unset, the original local path is used unchanged.
_path = os.environ.get(
    "SHAKESPEARE_PATH",
    "/Users/aditya/Documents/self_learning/ERA V3/week 12/input.txt",
)
# Decode explicitly as UTF-8 so the resulting character set does not depend on
# the platform's default text encoding.
with open(_path, "r", encoding="utf-8") as f:
    text = f.read()

# Peek at the first 100 characters as the cell output.
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [3]:
# Character-level vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
# Two lookup tables: character -> integer id, and integer id -> character.
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))

In [49]:
# Display the number of distinct characters found in the corpus.
vocab_size

65

In [4]:
# Encoding/Decoding functions
def encode(s):
    """Return the list of integer ids for the characters of ``s``, looked up in ``stoi``.

    Raises ``KeyError`` if ``s`` contains a character that is not in ``stoi``.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids

def decode(l):
    """Return the string obtained by mapping each id in ``l`` through ``itos`` and concatenating.

    Raises ``KeyError`` if an id is not a key of ``itos``.
    """
    return ''.join(itos[idx] for idx in l)

## Model Training

In [18]:
@dataclass
class GPTConfig:
    """Hyperparameters passed to ``GPT`` when the model is constructed.

    NOTE(review): the default ``vocab_size`` (50257) is far larger than the
    character vocabulary this notebook builds from the corpus (65 symbols);
    confirm the default is intended for character-level training.
    """
    block_size: int = 1024  # max sequence length
    vocab_size: int = 50257  # number of token ids — presumably the GPT-2 BPE vocabulary size; TODO confirm
    num_layer: int = 12  # number of layers
    num_head: int = 12  # number of heads
    emb_dim: int = 768  # embedding dimension
    dropout: float = 0.1  # dropout rate

In [19]:
# Build the model from the default GPTConfig values.
# NOTE(review): the default vocab_size (50257) differs from the character
# vocabulary computed from the corpus (`vocab_size`, 65 symbols) — confirm this
# is intended, or construct the config with GPTConfig(vocab_size=vocab_size).
config = GPTConfig()
model = GPT(config)

In [20]:
# Hyperparameters
batch_size = 64  # sequences per optimisation step
epochs = 3
learning_rate = 1e-4
# Take the training window length from the model config instead of repeating the
# literal, so the sampled sequences can never exceed the model's max sequence length.
block_size = config.block_size  # seq length

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the target device BEFORE constructing the optimizer, so the
# optimizer is built over the parameters that will actually be trained.
model = model.to(device)

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Show the module tree as the cell output.
model


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x TransformerBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (mlp): FeedForward(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1

In [21]:
from torchinfo import summary

# Print a per-layer report for one batch of `batch_size` sequences of length
# `config.block_size`. The input is declared as integer (long) data so that
# torchinfo feeds token-id-like tensors into the embedding layers.
summary(
    model,
    input_size=(batch_size, config.block_size),
    dtypes=[torch.long],  # Specifically tell it to use long tensors
    device=device,
    col_names=["input_size", "output_size", "num_params", "mult_adds"],
    depth=4,  # Show nested layers up to this depth
    row_settings=["var_names"]
)

Layer (type (var_name))                            Input Shape               Output Shape              Param #                   Mult-Adds
GPT (GPT)                                          [64, 1024]                [64, 1024, 50257]         --                        --
├─ModuleDict (transformer)                         --                        --                        --                        --
│    └─Embedding (wte)                             [64, 1024]                [64, 1024, 768]           38,597,376                2,470,232,064
│    └─Embedding (wpe)                             [1, 1024]                 [1, 1024, 768]            786,432                   786,432
│    └─Dropout (drop)                              [64, 1024, 768]           [64, 1024, 768]           --                        --
│    └─ModuleList (h)                              --                        --                        --                        --
│    │    └─TransformerBlock (0)                   [6

In [9]:
# Encode the whole corpus once into a 1-D tensor of integer (long) character ids.
text_encoded = torch.tensor(encode(text), dtype=torch.long)

In [10]:
def get_batch(data):
    """Sample a random batch of (input, target) windows from ``data``.

    Draws ``batch_size`` random start offsets, cuts a window of ``block_size``
    elements at each offset for the inputs, and the same window shifted one
    position to the right for the targets. Both stacked tensors are moved to
    ``device`` before being returned.
    """
    max_start = len(data) - block_size
    starts = torch.randint(max_start, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(data[start:start + block_size])
        targets.append(data[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [11]:
# Training loop
# An "epoch" here is the number of random batches whose total token count
# roughly equals the corpus length; batches are sampled, not iterated in order.
steps_per_epoch = len(text_encoded) // (batch_size * block_size)

# Make sure dropout is active while training.
model.train()

for epoch in range(epochs):
    pbar = tqdm(range(steps_per_epoch))
    running_loss = 0.0

    for _ in pbar:
        # Get batch
        xb, yb = get_batch(data=text_encoded)

        # Forward pass
        optimizer.zero_grad()
        logits = model(xb)
        # Flatten to (tokens, classes) using the model's own output width. The
        # global `vocab_size` is the character vocabulary and need not match the
        # model's output dimension, in which case `view(-1, vocab_size)` is invalid.
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), yb.view(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update running loss
        running_loss += loss.item()
        pbar.set_description(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

    # Guard the average against a corpus too short to yield a single step.
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {running_loss/max(steps_per_epoch, 1):.4f}")

  0%|          | 0/17 [00:00<?, ?it/s]

: 