<a href="https://colab.research.google.com/github/G0nkly/pytorch_sandbox/blob/main/GPT_dimensions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import torch
import torch.functional as F
import torch.nn as nn

In [2]:
# Lets look at each of the layers
# 1) Encoding
# 2) Embedding
# 3) Positional Encoding
# 4) Attention: key, query, value
# 5) Feedforward
# 6) Block/Layernorm
# 7) Classification / LM Head

In [3]:
################
# DATA EXAMPLE #
################

!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

with open(mode="r", file="input.txt") as f:
  text = f.read()

vocab = list(sorted(set(text)))

--2025-06-14 08:11:25--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-06-14 08:11:25 (16.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [4]:
##################
# HYPERPARAMTERS #
##################

In [5]:
vocab_size = len(vocab)
embedding_dim = 32
block_size = 8

In [6]:
############
# ENCODING #
############

In [7]:
stoi = {v:k for k,v in enumerate(vocab)}
itos = {k:v for k,v in enumerate(vocab)}
encode = lambda seq: [stoi[char] for char in seq]
decode = lambda numbers: "".join([itos[num] for num in numbers])

def get_batch(split: str):
  dataset = train if split == "train" else val

In [8]:
#############
# EMBEDDING #
#############

In [9]:
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

In [10]:
t = torch.tensor(encode("Haubi"))

In [11]:
embedding(t)

tensor([[ 0.4646, -0.0149,  0.3435,  0.5761, -1.1145, -0.7385,  1.0395, -0.2676,
          2.1257,  0.9982,  0.6423,  1.0178, -0.6761, -0.9002,  0.4569,  0.1439,
          0.5009,  0.5609,  0.0184, -0.7316,  0.8684, -0.9999,  0.1195,  1.2926,
          1.0712,  0.1777,  0.6568,  1.1869, -0.5965,  0.0420,  0.8793,  0.3581],
        [ 0.7694, -1.0650, -1.0797, -1.1844,  1.3605, -1.5668, -0.6813,  2.4511,
         -1.7009, -0.4427,  0.1543, -1.1184, -2.0370, -0.5854,  0.1013,  0.9152,
          0.5762, -0.9898, -0.5361, -0.1852,  0.0799,  0.1055, -1.3476,  0.1201,
         -1.5262, -0.9159, -2.1541, -1.2946,  0.9344, -1.3839,  0.9120,  0.9466],
        [ 1.0494, -0.8446, -0.8193,  2.0454, -0.3023, -2.3372, -0.6200, -0.3865,
         -0.9838,  0.4093,  0.3887, -1.2767,  0.6851, -0.7245,  0.7791, -0.1937,
          1.4171, -0.1410,  1.7504,  0.8271,  1.7866,  0.1745,  1.0301,  0.1924,
         -0.8051, -0.2964,  0.1315, -0.2587, -1.5750, -1.8424, -0.5006, -1.4816],
        [-1.4492,  0.1909

In [12]:
#########################
# POSITIONAL 'ENCODING' #
#########################

In [13]:
postional_embedding = nn.Embedding(num_embeddings=5, embedding_dim=32)

In [14]:
postional_embedding(torch.arange(5)).shape

torch.Size([5, 32])

In [23]:
embedded_tensor = embedding(t) + postional_embedding(torch.arange(5, dtype=torch.long))
embedded_tensor

tensor([[ 8.6069e-01,  1.8929e+00, -4.3834e-01,  9.6341e-01,  6.0632e-01,
         -4.3953e-01,  2.5188e+00, -4.3335e-01,  1.1392e+00,  1.3037e+00,
          3.0560e-01,  2.5644e+00,  9.4646e-01, -1.7907e+00, -7.3763e-01,
         -8.2333e-02,  2.1173e+00, -8.1425e-01,  6.2488e-01, -1.5842e+00,
         -2.3368e+00, -1.2851e+00, -2.2978e+00, -5.0649e-01, -5.4852e-01,
          3.6887e-01,  1.5811e+00,  2.3222e+00, -1.6001e+00, -5.1744e-01,
          6.5312e-01,  8.7307e-01],
        [ 1.7661e-01, -1.7812e+00,  5.9334e-01, -1.0513e+00,  1.6564e+00,
         -1.6683e+00, -1.2088e+00,  2.4184e+00, -3.3611e+00, -5.1567e-01,
         -1.2049e+00, -2.0708e+00, -1.2121e+00,  3.9219e-01, -5.6400e-01,
          1.1737e+00,  8.4079e-01,  5.9288e-01, -1.0628e+00,  4.0933e-01,
          4.8127e-01,  6.8801e-01, -4.4802e-01,  4.3441e-01, -4.1006e-01,
         -1.6611e+00, -2.3569e+00, -1.8499e+00,  7.6150e-01, -1.1566e+00,
          2.9548e+00,  1.2468e+00],
        [ 4.8376e-01, -1.1898e+00,  3.91

In [16]:
#############
# ATTENTION #
#############

In [20]:
# since its not MultiHeadAttention input_dim == output_dim

key_layer = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
query_layer = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
value_layer = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
tril = torch.tril(torch.ones(block_size, block_size, dtype=torch.long))

In [36]:
k = key_layer(embedded_tensor)
q = query_layer(embedded_tensor)
wei = (q @ k.transpose(-2,-1)) / (embedding_dim ** -0.5)
wei[0,:], wei

(tensor([ 6.1532,  7.2973, 24.7115, 17.0612,  5.2714], grad_fn=<SliceBackward0>),
 tensor([[  6.1532,   7.2973,  24.7115,  17.0612,   5.2714],
         [ -7.9225,   2.1976,  -1.3756,  -6.1142,  10.7823],
         [ -9.6927,  15.3615,  23.9902, -12.0375,  -6.4471],
         [  3.1362,   6.9147,  -6.5888,  24.6519,  18.4033],
         [  8.3315,   4.9903, -26.4701,  17.0638,   5.6914]],
        grad_fn=<DivBackward0>))

In [41]:
test = torch.ones(3,4)
test

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [42]:
torch.sum(test, dim=0)

tensor([3., 3., 3., 3.])