In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import math

In [10]:
# config

# --- input / patch geometry ---
patch_wh = 16    # side length of one square patch
n_channel = 3    # channels per input image
n_classes = 16   # output width of the final linear head

# --- transformer size ---
n_heads = 6
n_layers = 6
# Derived from the patch geometry instead of repeating the literal 16 * 16 * 3:
# PatchEmbedding computes this same product for its output width, and the
# position embeddings / blocks are all sized by n_emb, so the two must agree.
n_emb = patch_wh * patch_wh * n_channel
block_size = 256  # number of rows in the learned position-embedding table
dropout = 0.2     # NOTE(review): the model classes hard-code nn.Dropout(0.0), so this value is currently unused

# --- optimisation / evaluation ---
batch_size = 64
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
eval_iters = 200

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Attention reshapes each embedding into n_heads equal slices; fail here, next
# to the numbers, rather than with a view() error deep inside a forward pass.
assert n_emb % n_heads == 0, f"n_emb ({n_emb}) must be divisible by n_heads ({n_heads})"

In [11]:
class PatchEmbedding(nn.Module):
    """Cut a batch of images into square patches and linearly embed each patch.

    Args:
        patch_size: side length P of a patch; used as both the convolution
            kernel size and its stride.
        n_channels: number of channels C in the input images.
    """

    def __init__(self, patch_size=patch_wh, n_channels=n_channel):
        super().__init__()
        self.patch_size = patch_size

        # One output channel per value in a flattened patch (C * P^2), so an
        # embedded patch has the same size as the raw patch it came from.
        flat_patch_dim = n_channels * patch_size * patch_size
        self.proj = nn.Conv2d(
            in_channels=n_channels,
            out_channels=flat_patch_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Map images (B, C, H, W) to patch embeddings (B, H * W / P^2, C * P^2)."""
        # kernel_size == stride, so the kernel lands on each non-overlapping
        # patch exactly once: a learned linear projection applied per patch.
        patch_grid = self.proj(x)                      # (B, C * P^2, H / P, W / P)

        # Unroll the 2D grid of patches row by row into a single sequence axis...
        patch_seq = patch_grid.flatten(start_dim=2)    # (B, C * P^2, H * W / P^2)

        # ...and move that sequence axis in front of the embedding axis.
        return patch_seq.transpose(1, 2)               # (B, H * W / P^2, C * P^2)


In [12]:
# Intuition: self-attention adjusts each embedding so that it carries richer contextual information drawn from the other tokens (here, image patches) in the sequence

class SelfAttention(nn.Module):
    """Multi-head self-attention with no causal mask (every token sees every token).

    Args:
        emb_dim: embedding width C. ``None`` (the default) uses the
            notebook-level ``n_emb``.
        num_heads: number of attention heads. ``None`` (the default) uses the
            notebook-level ``n_heads``.

    Raises:
        ValueError: if the embedding width is not divisible by the head count.
    """

    def __init__(self, emb_dim=None, num_heads=None):
        super().__init__()
        # Snapshot the hyperparameters at construction time instead of reading
        # the globals inside forward(): otherwise re-running the config cell
        # with a different n_heads silently changes how an existing model
        # splits its heads.
        self.n_emb = n_emb if emb_dim is None else emb_dim
        self.n_heads = n_heads if num_heads is None else num_heads
        if self.n_emb % self.n_heads != 0:
            raise ValueError(
                f"embedding width ({self.n_emb}) must be divisible by "
                f"the number of heads ({self.n_heads})"
            )
        # Each head works on its own equal slice of the embedding; this is the
        # width of one slice.
        self.head_dim = self.n_emb // self.n_heads

        # A single linear layer produces Q, K and V side by side (hence * 3)
        self.attn = nn.Linear(self.n_emb, self.n_emb * 3)
        self.attn_dropout = nn.Dropout(0.0)
        self.resid_dropout = nn.Dropout(0.0)
        self.proj = nn.Linear(self.n_emb, self.n_emb)

        # Placeholder left over from a causal-mask variant; never read here
        # because this is an encoder and no mask is applied.
        self.bias = None

    # Multi-dimensional matrix math so all heads are computed in parallel
    def forward(self, x):
        """Apply self-attention.

        Args:
            x: tensor of shape (B, T, C) -- batch, sequence length, embedding width.

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = x.size()

        # Single head: lets a token gather context from the other tokens.
        # Multi-head: each head can look for a different kind of information.

        # The linear layer turns each embedding into its Q, K and V vectors:
        # - Q is what the token is looking for
        # - K describes the token being looked at
        # - V is the content that the token being looked at contributes
        QKV_merged = self.attn(x)
        Q, K, V = QKV_merged.split(self.n_emb, dim=2)  # undo the side-by-side packing

        # (B, T, C) -> (B, n_heads, T, head_dim): give each head its own slice
        # of the embedding and move the head axis next to the batch axis so the
        # matmuls below run once per head.
        Q = Q.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        K = K.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        V = V.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)

        # 1. Q @ K^T is a dot product between every pair of tokens, i.e. how
        #    relevant each token is to each other token.
        # 2. Scale by sqrt(head_dim) to keep the scores from growing with the
        #    head width.
        att = (Q @ K.transpose(2, 3)) * 1.0 / math.sqrt(K.size(3))

        # No causal mask: in an encoder (ViT) later tokens may influence earlier ones.

        # Normalise the scores over the key axis so each row sums to 1
        att = F.softmax(att, dim=-1)

        # Dropout on the attention weights (rate is 0.0, so currently a no-op)
        att = self.attn_dropout(att)

        # Weighted sum of the values: what each token takes from the tokens it
        # attends to.
        out = att @ V

        # transpose() returns a view whose memory layout no longer matches its
        # shape, and view() needs a compatible layout, so contiguous() copies
        # it first; then the heads are merged back into one C-wide embedding.
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        out = self.resid_dropout(self.proj(out))

        # Summary: each token asks for something (Q), is matched against what
        # the others offer (K), and adds their content (V) to itself.
        return out

In [13]:
# Intuition: the MLP is the "feedforward" part of the transformer and 
# it adjusts the embeddings differently than the self-attention layer does 
# by having non-linearities
class MLP(nn.Module):
    """Feed-forward part of a transformer block: widen 4x, GELU, project back, dropout."""

    def __init__(self):
        super().__init__()
        hidden_dim = 4 * n_emb  # width of the intermediate layer
        self.l1 = nn.Linear(n_emb, hidden_dim)
        self.gelu = nn.GELU()
        self.proj = nn.Linear(hidden_dim, n_emb)
        self.dropout = nn.Dropout(0.0)

    def forward(self, x):
        """Transform ``x`` along its last axis (width n_emb); the output has the same width."""
        # Expand, then apply the non-linearity -- this is what lets the MLP
        # adjust embeddings in ways the attention layer cannot.
        hidden = self.gelu(self.l1(x))

        # Project back to the embedding width; dropout is the regulariser
        # against memorising the training data (its rate here is 0.0).
        return self.dropout(self.proj(hidden))

In [14]:
# The block is the combination of the self-attention and the MLP

class Block(nn.Module):
    """One transformer layer: LayerNorm -> self-attention and LayerNorm -> MLP, each added back to its input."""

    def __init__(self):
        super().__init__()
        self.attn = SelfAttention()
        self.mlp = MLP()

        self.norm1 = nn.LayerNorm(n_emb)
        self.norm2 = nn.LayerNorm(n_emb)

    def forward(self, x):
        """Run both sub-layers on ``x`` and return a tensor of the same shape."""
        # Residual (skip) connection: the sub-layer only has to learn what to
        # add to the embeddings, and gradients can flow straight through the sum.
        attn_update = self.attn(self.norm1(x))
        x = x + attn_update

        # The feed-forward sub-layer is where knowledge from training is stored;
        # it is wrapped in the same normalise-then-add pattern.
        mlp_update = self.mlp(self.norm2(x))
        return x + mlp_update


In [15]:
class GPT(nn.Module):
    """Transformer encoder over image patches.

    Pipeline: patch embedding + learned position embedding -> dropout ->
    ``n_layers`` Blocks -> LayerNorm -> bias-free linear head.
    """

    def __init__(self):
        super().__init__()
        self.transformer = nn.ModuleDict(dict(
            patch_emb = PatchEmbedding(),
            pte = nn.Embedding(block_size, n_emb),  # one learned vector per patch position
            drop = nn.Dropout(0.0),
            blocks = nn.ModuleList([Block() for _ in range(n_layers)]),
            ln_f = nn.LayerNorm(n_emb),
        ))
        self.head = nn.Linear(n_emb, n_classes, bias=False)

    def forward(self, x):
        """Compute per-patch logits for a batch of images.

        Args:
            x: image batch of shape (B, C, H, W).

        Returns:
            Tensor of shape (B, T, n_classes), where T is the number of patches
            produced by the patch embedding.

        Raises:
            ValueError: if the image yields more patches than the position
                table has rows.
        """
        # Embed first: the sequence length is the number of patches, which is
        # only known after patching. (x.size(1) on the raw image is the
        # channel count, not a sequence length.)
        patch_emb = self.transformer.patch_emb(x)  # (B, T, n_emb)
        t = patch_emb.size(1)

        max_positions = self.transformer.pte.num_embeddings
        if t > max_positions:
            raise ValueError(
                f"input produces {t} patches but only {max_positions} "
                f"position embeddings are available"
            )

        # Learned position embeddings tell the model where each patch sits in
        # the sequence. Build the indices on the same device as the activations
        # so the lookup works wherever the model currently lives.
        pos = torch.arange(t, device=patch_emb.device).unsqueeze(0)  # unsqueeze adds the batch axis
        p_emb = self.transformer.pte(pos)  # (1, T, n_emb), broadcast over the batch

        # Patch embeddings + position embeddings, then dropout
        x = self.transformer.drop(patch_emb + p_emb)

        # Each block mixes information across patches (attention) and then
        # transforms every patch embedding independently (MLP).
        for block in self.transformer.blocks:
            x = block(x)

        # Final layer norm keeps the embeddings at a consistent scale
        # before the head.
        x = self.transformer.ln_f(x)

        # The head maps each embedding to n_classes raw scores (logits, not
        # probabilities -- no softmax is applied here).
        return self.head(x)

In [16]:
# Build the network, move it to the configured device, then create its optimizer
model = GPT()
model = model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=learning_rate)

In [17]:
# Report how many scalar values the model's parameter tensors hold in total
total_params = sum(map(torch.Tensor.numel, model.parameters()))
print(f'Total Parameters: {total_params}')

Total Parameters: 43328256
