In [1]:
import torch
import torch.nn as nn

In [2]:
# Hyperparameters for the 124M configuration, collected in one place.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # vocabulary size
    context_length=1024,   # context length
    emb_dim=768,           # embedding dimension
    n_heads=12,            # number of attention heads
    n_layers=12,           # number of layers (i.e. transformer blocks)
    drop_rate=0.1,         # dropout rate
    qkv_bias=False,        # query-key-value bias
)

In [None]:
# GELU: a smooth non-linear activation function, used here in place of ReLU.
class GELU(nn.Module):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Computes, element-wise,
    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    The module has no parameters or buffers.
    """

    # sqrt(2 / pi) as a plain Python float. It is computed once instead of
    # allocating a tensor on every forward pass, and a Python scalar is applied
    # in the dtype of the tensor it multiplies. The previous
    # torch.sqrt(torch.tensor(...)) form produced a float32 constant, which
    # limited the accuracy of float64 inputs to roughly 1e-8.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise.

        Args:
            x: input tensor.

        Returns:
            A tensor with the same shape as ``x``; for floating-point ``x``
            the dtype is preserved as well.
        """
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [None]:
# Feed-forward block with two linear layers: (1) Linear + GELU, then (2) another Linear.
class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4 * emb_dim, apply GELU, then narrow back to emb_dim.

    Args:
        cfg: mapping that provides the embedding width under the key ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim  # the hidden layer is four times as wide as the embedding

        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()  # non-linearity between the two linear layers
        # Project back down to emb_dim so the output is as wide as the input.
        project = nn.Linear(hidden_dim, emb_dim)

        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Pass ``x`` through the stacked layers and return the result."""
        return self.layers(x)

In [5]:
# Build the feed-forward block from the 124M config (emb_dim 768, hidden width 4 * 768).
ff = FeedForward(GPT_CONFIG_124M)

In [None]:
# Sample input: 3 token embeddings, each as wide as the model's embedding
# dimension. The width is read from the config instead of being hard-coded,
# so it always matches what `ff` was built with.
# NOTE: the name `input` shadows Python's built-in input(); it is kept because
# the later cell that calls ff(input) refers to it.
input = torch.randn(3, GPT_CONFIG_124M["emb_dim"])
input

tensor([[ 0.0625, -0.9522, -0.2944,  ...,  1.8323, -0.9015,  1.0337],
        [ 0.0315, -0.2071,  0.7770,  ...,  0.3065, -1.2143,  0.2909],
        [ 0.5598, -0.0201, -1.6299,  ..., -1.3097, -1.2106, -1.2475]])

In [7]:
# Run the sample token embeddings through the feed-forward block.
output = ff(input)

In [None]:
# Note that the output keeps the shape of the input, so it can be passed on
# to further layers that expect the same shape.
output.shape

torch.Size([3, 768])