# Learn GPT from scratch

In [39]:
import os

if not os.path.isfile("./datasets/corpora/shakespeare.txt"):
    !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O datasets/corpora/shakespeare.txt

In [40]:
with open("datasets/corpora/shakespeare.txt", 'r', encoding='utf-8') as f:
    text = f.read()

In [41]:
# Putting hyperparameters at the top because I learned this the hard way
# 64 * NUM_HEADS
EMBEDDING_NDIM=256
VOCAB_SIZE=128
BATCH_SIZE=64
# "Context window"
BLOCK_SIZE=256

## Tokenization and dataset creation

In [42]:
%pip install torch pandas numpy tensorboard


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np
import math

torch.manual_seed(1337)

<torch._C.Generator at 0x7fef50768610>

In [44]:
# Simple dumb ASCII character-level "encoding" since all training data is ASCII
def encode_text(text):
    return([ord(t) for t in text])

def decode_text(indices):
    return([chr(x) for x in indices])


In [45]:
# Tensorify data, put it in dataset
data = torch.tensor(encode_text(text), dtype=torch.int32)

test_split_idx = int(0.8 * len(data))
val_split_idx = int(0.9 * len(data))
train_data = data[:test_split_idx]
test_data = data[test_split_idx:val_split_idx]
val_data = data[val_split_idx:]
print(f"{len(data)} chars of data")

1115394 chars of data


We have to make a custom PyTorch dataset class to automatically generate the "context" windows at load time. This allows us to avoid keeping these windows around in memory when not in use:

In [46]:
class TextDataset(Dataset):
    def __init__(self, data_tensor, context_size):
        self.data_tensor = data_tensor
        self.context_size = context_size
    
    def __len__(self):
        return len(self.data_tensor) - self.context_size

    def __getitem__(self, index):
        x = self.data_tensor[index:index + self.context_size]
        y = self.data_tensor[index + 1:index + self.context_size + 1]
        
        return x, y

## Attention is all you need (注目こそが必要なすべて)

In [66]:
class MultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, device=None, dtype=None):
        super(MultiheadAttention, self).__init__()

        # Save variables
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.d_k = embed_dim // num_heads

        self.Q = nn.Linear(embed_dim, embed_dim, bias=False)
        self.K = nn.Linear(embed_dim, embed_dim, bias=False)
        self.V = nn.Linear(embed_dim, embed_dim, bias=False)

        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(self, query, key, value, attn_mask=None):
        batch_size = query.size(0)

        # Apply linear layers
        q = self.Q(query) # [B, C, E]
        k = self.K(key) # [B, C, E]
        v = self.V(value) # [B, C, E]

        # Mutate dimensions so the attention matmul can get rid of the inner d_k
        q = q.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)  # [batch_size, num_heads, C, d_k]
        k = k.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)  # [batch_size, num_heads, C, d_k]
        v = v.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)  # [batch_size, num_heads, C, d_k]
        
        # Get raw attention scores
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) # [B, num_heads, C, C]

        # Apply mask, if necessary
        if attn_mask is not None:
            """
            MAY BE WORTH DEBUGGING

            if key_padding_mask.dim() == 3:
                # If the mask is 3D, add an extra dimension for the num_heads
                key_padding_mask = key_padding_mask.unsqueeze(1)  # [batch_size, 1, seq_len, seq_len]
            else:
                # If the mask is 2D, add dimensions for the num_heads and the 'query' sequence length
                key_padding_mask = key_padding_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
            """
            # Apply the mask to attention scores
            scores = scores.masked_fill(attn_mask, float('-inf'))

        # Scale by sqrt(k)
        attn = F.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        out = attn @ v # [B, num_heads, C, d_k]

        # Concat and project
        # Swap C and num_heads, force memory to coalesce, then fuse back num_heads and d_k together
        out = out.transpose(1, 2).contiguous().view(batch_size, -1, self.embed_dim)
        # Project: give attention "time to think". Maybe this should be part of a different module but whatever
        out = self.out_proj(out)
        return((out, None))



In [48]:
class FeedForward(nn.Module):
    def __init__(self, embed_dim, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.GELU(),
            nn.Linear(4 * embed_dim, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return(self.net(x))

In [60]:
class Block(nn.Module):
    """Self-attention"""
    def __init__(self, embed_dim, num_heads, mask, dropout=0.2):
        super(Block, self).__init__()  
        self.register_buffer("mask", mask)
        self.head = MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)
        #self.head = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, batch_first=True)
        self.ffwd = FeedForward(embed_dim=embed_dim, dropout=dropout)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # Residual connections
        x = self.ln1(x)
        attn_output, _ = self.head(x, x, x, attn_mask=self.mask) 
        x = x + attn_output
        out = x + self.ffwd(self.ln2(x))
        return out


In [50]:
class GPT(nn.Module):
    def __init__(self, embedding_dim, vocab_size, context_size):
        super(GPT, self).__init__()

        self.embedding_dim = embedding_dim
        self.output_dim = vocab_size
        self.context_size = context_size

        NUM_HEADS=4
        NUM_LAYERS=4
        
        # Initialize layers
        self.tok_embed = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embed = nn.Embedding(context_size, embedding_dim)

        mask = torch.tril(torch.ones(self.context_size, self.context_size)).bool()
        mask = ~mask
        self.register_buffer("mask", mask)

        self.blocks = nn.Sequential(
            *[Block(embed_dim=embedding_dim, num_heads=NUM_HEADS, mask=mask, dropout=0.2) for _ in range(NUM_LAYERS)]
        )

        self.ln_f = nn.LayerNorm(self.embedding_dim)
        # Final feed-forward layer from embeddings
        self.ffwd = nn.Linear(embedding_dim, out_features=vocab_size, bias=False)

    def forward(self, x):
        tok_embed = self.tok_embed(x)
        pos_embed = self.pos_embed(
            torch.arange(0, self.context_size, device="cuda")
        )
        x = tok_embed + pos_embed

        x = self.blocks(x)
        x = self.ln_f(x)

        logits = self.ffwd(x)
        return(logits)
    
    def infer(self, x):
        with torch.no_grad():
            res = self.forward(x)
            return(res)


## Training

In [51]:
def compute_loss(model, criterion, x, y):
    logits = model(x)
    B,C,V = logits.shape
    logits = logits.view(B*C, V)
    y = y.view(B*C)
    loss = F.cross_entropy(logits, y.long())
    return loss

In [67]:
LR=3e-4

train_dataset = TextDataset(train_data, BLOCK_SIZE)
test_dataset = TextDataset(test_data, BLOCK_SIZE)

# Janky training code
model = GPT(
    embedding_dim=EMBEDDING_NDIM, 
    vocab_size=VOCAB_SIZE,
    context_size=BLOCK_SIZE,
    )

model = model.to('cuda')
optimizer = optim.AdamW(model.parameters(), lr=LR)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.1)
criterion = F.cross_entropy

global_step = 0

In [68]:
from torch.utils.tensorboard import SummaryWriter

EPOCHS = 1
STEPS = 5000
VAL_INTERVAL = 100

model.train()

train_dataloader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    num_workers=4
)

test_dataloader = DataLoader(test_dataset, batch_size=512, num_workers=4, shuffle=True)

writer = SummaryWriter()

step = 0

for epoch in range(EPOCHS):
    for data, target in train_dataloader:
        data = data.to('cuda')
        target = target.to('cuda')

        loss = compute_loss(model, criterion, data, target)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #scheduler.step()

        writer.add_scalar("Loss/train", loss.cpu().detach().numpy(), global_step)
        global_step += 1

        # TODO!!! WTF???
        if step % VAL_INTERVAL == 0:
            total_loss = 0
            total_samples = 0

            with torch.no_grad():
                model.eval()
                for x, y in test_dataloader:
                    x = x.to("cuda")
                    y = y.to("cuda")

                    batch_loss = compute_loss(model, criterion, x, y)
                    total_loss += batch_loss.item() * 512
                    total_samples += 512
                    if total_samples > 10:
                        break

            model.train()
            average_loss = total_loss / total_samples

            print(f"Step {step}; loss: {average_loss}")
            writer.add_scalar("Loss/val", average_loss, global_step)


        step += 1
        if step >= STEPS:
            break

writer.close()


Step 0; loss: 4.62758731842041
Step 100; loss: 2.5372843742370605
Step 200; loss: 2.486722946166992
Step 300; loss: 2.3916263580322266
Step 400; loss: 2.269087314605713
Step 500; loss: 2.1484358310699463
Step 600; loss: 2.057586193084717
Step 700; loss: 1.9845455884933472
Step 800; loss: 1.910020351409912
Step 900; loss: 1.8550803661346436
Step 1000; loss: 1.8193731307983398
Step 1100; loss: 1.767741322517395
Step 1200; loss: 1.7612113952636719
Step 1300; loss: 1.7009034156799316
Step 1400; loss: 1.6827564239501953
Step 1500; loss: 1.6604313850402832
Step 1600; loss: 1.633068323135376
Step 1700; loss: 1.6335963010787964
Step 1800; loss: 1.6095472574234009
Step 1900; loss: 1.6086715459823608
Step 2000; loss: 1.5876469612121582
Step 2100; loss: 1.5713247060775757
Step 2200; loss: 1.5546257495880127
Step 2300; loss: 1.5589814186096191
Step 2400; loss: 1.5507397651672363
Step 2500; loss: 1.5470337867736816
Step 2600; loss: 1.547551155090332
Step 2700; loss: 1.5338884592056274
Step 2800; lo

In [69]:
PATH = "checkpoints/model.pt"

In [70]:

# Store
torch.save({
    'steps': step,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, PATH)

In [18]:
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

Now we test for overfitting:

In [26]:
import gc
gc.collect()

841

In [57]:
model.eval()
total_loss = 0.0
total_samples = 0

val_dataset = TextDataset(val_data, BLOCK_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=512, num_workers=4)
with torch.no_grad():
    for x, y in val_dataloader:
        x = x.to("cuda")
        y = y.to("cuda")

        batch_loss = compute_loss(model, criterion, x, y)
        total_loss += batch_loss.item() * x.size(0)
        total_samples += x.size(0)
        if total_samples > 100000:
            break

    average_loss = total_loss / total_samples
    print(average_loss)

1.7774584962397206


In [71]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
num_params

3286528

Finally, we generate. NOTE: seeds shorter than 256 chars have nonsense until you reach the context window. I think it's because Karpathy jammed the whole Shakespeare into one file with no act/scene breaks and both he and I didn't split it, so there's only one padding that the model sees, ever. TODO: fix this in the data loading step

In [58]:
g_cuda = torch.Generator(device='cuda')

seed = """
Plot histograms of the gradient values during training. If you notice a significant number of gradients are near zero (vanishing gradients) or very large values (exploding gradients), it could be a problem. TensorBoard is a useful tool for visualizing these histograms.
"""

contexts = torch.tensor(encode_text(seed), dtype=torch.int32).to('cuda')
GEN_LENGTH=1024

model.eval()
for i in range(GEN_LENGTH):
    transform = nn.LogSoftmax(1)
    # What happens if GEN_LENGTH > CONTEXT? don't worry about it
    #x = F.pad(contexts[:, -BLOCK_SIZE:], (0, BLOCK_SIZE - contexts.size(0)), "constant", 0)
    x = contexts[-BLOCK_SIZE:]
    if x.size(0) < BLOCK_SIZE:
        x = F.pad(x, (0, BLOCK_SIZE - x.size(0)), "constant", 0).unsqueeze(0) # B*T
    else:
        x = x.unsqueeze(0)

    preds = model.infer(x)
    preds = preds.squeeze(0)
    probs = torch.softmax(preds, dim=-1)

    # TODO: Broken because of bug with the trailing 0s. FIX THIS
    # next_char = torch.multinomial(torch.exp(preds[(-1 if i >= BLOCK_SIZE else i), :]), num_samples=1, generator=g_cuda)
    next_char = torch.multinomial(torch.exp(preds[-1, :]), num_samples=1, generator=g_cuda)

    #context = torch.cat(context, next_char)
    contexts = torch.cat((contexts, next_char), dim=0)
    print(decode_text(next_char.cpu().numpy())[-1], end="")

#print("".join(decode_text(contexts.cpu().numpy())))

Tutus, to Marcius, noble Marcius
Made to my voices! doing and hangs upon them!
Take it to down our foes and hates with stain,
Which thus follows slay with on I meland,
What I am after her to her fearful haunt it?

PAULINA:
But you are well to hold the king.

ISABELLA:
And I will not go royalty to thy hand.

LUCIO:
Since I do not well in such goodly talk of.
I think I have a stay of it!

HENRY BOLINGBROKE:
Who say I hate been a day's mind;
Till we here and so very little and way,
And wash the city has nest seen the feast.

DUCHESS OF YORK:
No, by the matter.

ISABELLA:
Flitter than desire never yet looks so.

HENRY BOLINGBROKE:
I am not possible perceived
And both place, where I may not rafes,
And like me one air. What you'll your love day?

KING RICHARD II:
Then be thou--

GLOUCESTER:
No, Lord Hastings:
Else queen, though my trowbers grands me to-morrow
Here to Bolingbroke's match;
When the your life and spur at homely speak.

BUCKINGHAM:
My father was I follow: if you be your your kin