In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import requests
from torch.utils.data import Dataset, DataLoader

# Hyperparameters
# -- model --
n_layer = 4            # number of stacked transformer blocks
n_embedding = 192      # width of the token / position embeddings
n_head = 3             # attention heads per block
block_size = 128       # what is the maximum context length for predictions?
dropout_rate = 0.1     # probability used by every nn.Dropout in the model
# -- optimisation --
batch_size = 32
learning_rate = 1e-3
num_epochs = 10
# -- runtime --
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch's global generators so that weight initialisation, DataLoader
# shuffling and sampling are repeatable across "Restart & Run All".
seed = 1337
torch.manual_seed(seed)



In [None]:
# Download the Tiny Shakespeare corpus that the model is trained on.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# A timeout stops the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently training on an error page.
response.raise_for_status()
text = response.text

In [None]:
# Character-level vocabulary: every distinct character of the corpus is one token.
chars = sorted(set(text))
vocab_size = len(chars)
# Lookup tables in both directions: character -> integer id, and integer id -> character.
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))

# Encode/decode functions
def encode(s):
    """Translate the string ``s`` into a list of integer ids, one per character.

    Each character is looked up in ``stoi``; a character that is not a key of
    ``stoi`` raises ``KeyError``.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids
def decode(l):
    """Translate an iterable of integer ids back into a string.

    Each id is looked up in ``itos`` and the resulting characters are joined
    without a separator; an id that is not a key of ``itos`` raises ``KeyError``.
    """
    pieces = (itos[token_id] for token_id in l)
    return ''.join(pieces)

In [None]:
# Custom Dataset class
class TextDataset(Dataset):
    """Sliding-window next-token dataset over a 1-D sequence of token ids.

    Sample ``i`` is the pair ``(data[i : i + block_size],
    data[i + 1 : i + block_size + 1])``: a window of inputs and the same
    window shifted one step to the right as targets.

    Args:
        data: indexable sequence supporting ``len()`` and slicing
            (the notebook passes a 1-D ``torch.long`` tensor).
        block_size: number of tokens in each input/target window.
    """

    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Every sample consumes block_size + 1 tokens (the inputs plus one
        # extra target).  Clamp at zero so that a too-short sequence is an
        # empty dataset instead of making len() raise on a negative value.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(inputs, targets)`` pair for sample ``idx``.

        Negative indices count from the end.  Raises ``IndexError`` when
        ``idx`` is out of range, which also lets plain ``for`` iteration over
        the dataset terminate.
        """
        n_samples = len(self)
        if idx < 0:
            # Not in-place: idx may be a tensor owned by the caller.
            idx = idx + n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"index out of range for TextDataset of length {n_samples}")
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

# Encode the whole corpus once, then cut it in order:
# the first 90% of the tokens train the model, the remaining 10% validate it.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * data.size(0))
train_data, val_data = data[:n], data[n:]

In [None]:
# Wrap each split in a sliding-window dataset.
train_dataset = TextDataset(data=train_data, block_size=block_size)
val_dataset = TextDataset(data=val_data, block_size=block_size)

# Training batches are reshuffled every epoch; validation keeps its order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)

# The Model architecture from scratch

In [None]:
class Head(nn.Module):
  """One head of causal (masked) self-attention.

  Args:
    head_size: output width of the key, query and value projections.

  The input width (``n_embedding``) and the largest supported sequence length
  (``block_size``) are read from the notebook's hyperparameters.
  """

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embedding, head_size, bias=False)
    self.query = nn.Linear(n_embedding, head_size, bias=False)
    self.value = nn.Linear(n_embedding, head_size, bias=False)
    # Lower-triangular causal mask.  Registering it as a buffer makes it follow
    # the module through .to()/.cuda() instead of being pinned to a global
    # device; persistent=False keeps it out of state_dict, so checkpoints keep
    # exactly the same keys as before.
    self.register_buffer(
        "tril",
        torch.tril(torch.ones(block_size, block_size)),
        persistent=False,
    )

  def forward(self, x):
    """Attend over ``x`` of shape (..., T, n_embedding); returns (..., T, head_size)."""
    T = x.shape[-2]
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    # Scaled dot-product scores between every query and every key.
    w = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
    # Hide future positions: row t may only look at columns <= t.
    w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    w = F.softmax(w, dim=-1)

    # Weighted sum of the values.
    return w @ v


In [None]:
class MultiHeadAttention(nn.Module):
  """Several attention heads run in parallel, concatenated and projected.

  Args:
    num_heads: number of independent ``Head`` modules.
    head_size: output width of each head.
  """

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    # The concatenated head outputs are num_heads * head_size wide, which is
    # not n_embedding when n_embedding is not a multiple of num_heads; size the
    # projection from what it actually receives.
    self.proj = nn.Linear(num_heads * head_size, n_embedding)
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, x):
    """Run every head on ``x``, concatenate on the last dim, project, apply dropout."""
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out


In [None]:
class FeedForward(nn.Module):
  """Two-layer MLP applied along the last dimension.

  Expands to four times the input width, applies ReLU, projects back to the
  input width, then applies dropout.

  Args:
    n_embeddings: width of the input and of the output.
  """

  def __init__(self, n_embeddings):
    super().__init__()
    hidden_width = 4 * n_embeddings
    layers = [
        nn.Linear(n_embeddings, hidden_width),
        nn.ReLU(),
        nn.Linear(hidden_width, n_embeddings),
        nn.Dropout(dropout_rate),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    return self.net(x)

In [None]:
class TransformerBlock(nn.Module):
  """Transformer block: self-attention followed by a feed-forward network.

  LayerNorm is applied to the input of each sub-layer, and each sub-layer's
  output is added back onto its input (residual connection).

  Args:
    n_embedding: width of the block's input and output.
    n_head: number of attention heads; each head gets n_embedding // n_head.
  """

  def __init__(self, n_embedding, n_head):
    super().__init__()
    self.sa = MultiHeadAttention(n_head, n_embedding // n_head)
    self.ffwd = FeedForward(n_embedding)
    self.ln1 = nn.LayerNorm(n_embedding)
    self.ln2 = nn.LayerNorm(n_embedding)

  def forward(self, x):
    # Residual around attention ...
    attended = x + self.sa(self.ln1(x))
    # ... and a second residual around the feed-forward network.
    return attended + self.ffwd(self.ln2(attended))


In [None]:
class GPTLanguageModel(nn.Module):
  """Transformer language model over integer token ids.

  Token and learned position embeddings are summed, passed through ``n_layer``
  ``TransformerBlock`` modules and a final LayerNorm, then projected to
  ``vocab_size`` logits.  All sizes come from the notebook's hyperparameters.
  """

  def __init__(self):
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size, n_embedding)
    self.position_embedding = nn.Embedding(block_size, n_embedding)
    self.blocks = nn.Sequential(*[TransformerBlock(n_embedding, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embedding)
    self.lm_head = nn.Linear(n_embedding, vocab_size)

  def forward(self, x, targets=None):
    """Compute next-token logits and, optionally, the cross-entropy loss.

    Args:
      x: (B, T) tensor of token ids, with T no larger than the position table.
      targets: optional (B, T) tensor of target ids.

    Returns:
      ``(logits, loss)``.  Without ``targets``, ``logits`` is (B, T, vocab) and
      ``loss`` is ``None``.  With ``targets``, ``logits`` is flattened to
      (B*T, vocab) and ``loss`` is the scalar cross-entropy.
    """
    B, T = x.shape

    token_emb = self.token_embedding(x)
    # Build the position indices on the input's device so the model works
    # wherever it has been moved, not only on the notebook-global device.
    positions = torch.arange(T, device=x.device)
    pos_emb = self.position_embedding(positions)
    h = self.blocks(token_emb + pos_emb)
    logits = self.lm_head(self.ln_f(h))

    if targets is None:
      return logits, None

    B, T, C = logits.shape
    logits = logits.view(B * T, C)
    # reshape (unlike view) also accepts non-contiguous targets.
    loss = F.cross_entropy(logits, targets.reshape(B * T))
    return logits, loss

  @torch.no_grad()
  def generate(self, idx, max_new_tokens):
    """Sample ``max_new_tokens`` tokens autoregressively after ``idx``.

    Args:
      idx: (B, T) tensor of token ids used as the prompt.
      max_new_tokens: number of tokens to append.

    Returns:
      (B, T + max_new_tokens) tensor: the prompt followed by the samples.

    Sampling runs without gradient tracking and with the module temporarily in
    eval mode, so dropout does not perturb the predicted distribution; the
    previous train/eval mode is restored before returning.
    """
    # The position table bounds how much context the model can consume.
    max_context = self.position_embedding.num_embeddings
    was_training = self.training
    self.eval()
    try:
      for _ in range(max_new_tokens):
        idx_cond = idx[:, -max_context:]
        logits, _ = self(idx_cond)
        # Only the distribution at the final position is needed.
        probs = F.softmax(logits[:, -1, :], dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=1)
    finally:
      self.train(was_training)
    return idx

In [None]:
# Build the model, move it to the selected device, and attach an AdamW optimizer
# over all of its parameters.
model = GPTLanguageModel()
model = model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
# Training loop

@torch.no_grad()
def evaluate_loss(loader):
  """Return the mean per-batch loss of ``model`` over ``loader``.

  Runs in eval mode without gradient tracking and restores the model's previous
  train/eval mode afterwards.  Returns NaN when ``loader`` yields no batches.
  """
  was_training = model.training
  model.eval()
  try:
    running, count = 0.0, 0
    for xb, yb in loader:
      xb, yb = xb.to(device), yb.to(device)
      _, batch_loss = model(xb, yb)
      running += batch_loss.item()
      count += 1
  finally:
    model.train(was_training)
  return running / count if count else float('nan')


for epoch in range(num_epochs):
  model.train()
  total_loss = 0.0
  num_batches = 0
  for xb, yb in train_loader:
    xb, yb = xb.to(device), yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
    num_batches += 1
  # Guard against an empty loader instead of dividing by zero.
  avg_loss = total_loss / num_batches if num_batches else float('nan')
  print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

  # Report the held-out loss as well; the training loss alone cannot reveal
  # overfitting.
  val_loss = evaluate_loss(val_loader)
  print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}")




Epoch 1/10, Average Loss: 1.1867
Epoch 2/10, Average Loss: 0.9786
Epoch 3/10, Average Loss: 0.9075
Epoch 4/10, Average Loss: 0.8685
Epoch 5/10, Average Loss: 0.8437
Epoch 6/10, Average Loss: 0.8271
Epoch 7/10, Average Loss: 0.8153
Epoch 8/10, Average Loss: 0.8062
Epoch 9/10, Average Loss: 0.7990
Epoch 10/10, Average Loss: 0.7930


In [None]:
# Start from a single prompt token with id 0 (the first entry of the sorted
# vocabulary), let the model append 500 more, and print the decoded text.
context = torch.zeros(1, 1, dtype=torch.long, device=device)
generated = model.generate(idx=context, max_new_tokens=500)
print(decode(generated[0].tolist()))



ROMEO:
Come, because that a stubbary? Ratchy not the self;
For as waving and chase notes the house,
Nor all the northern stars like a lanen
To take treap up the clock breaks;
And brought deliver wrong not I draw.

PETRUCHIO:
How do you herd? then my father, good word with this young Rutland?
There shall were my welcome with a well allegiance:
So merning good here stands my looks.

KING HENRY VI:
I'll have thee still store thou shalt be bled by mine
It is, that nor reasons would callow to thee a
