# LLM Pretraining

## Preparing arbitrary text for LLM training data

```mermaid
flowchart LR
    InputText --> TokenizedText --> TokenIDs --> TokenEmbeddings --> Transformer
```

1. The input text is the raw sequence of arbitrary characters.
2. The *tokenizer* splits that character stream into tokens.
3. Each unique token is mapped to an integer ID.
4. The token IDs are converted into token embeddings, suitable as input to an LLM for pretraining.


```mermaid

flowchart LR
    inputtext["My name is Rob."] --> tokenizedtext["['My', 'name', 'is', 'Rob', '.']"] --> tokenids["[40134, 2052, 133, 389, 12]"] --> tokenembeddings["[[0.12, -0.45, ...], [0.34, 0.88, ...], ...]"] --> transformer["Encoder/Decoder/Seq-to-Seq"]
```

```mermaid
---
title: "1 to 1 mapping of: token -> ID -> embedding vector"
---

flowchart LR
    subgraph inputtexts["Tokens"]
        direction TB
        txt1["My"]
        txt2["name"]
        txt3["is"]
        txt4["Rob"]
        txt5["."]
    end
    subgraph tokenids["Token IDs"]
        direction TB
        id1["40134"]
        id2["2052"]
        id3["133"]
        id4["389"]
        id5["12"]
    end

    subgraph embeddings["Token Embeddings (Seq × Dim)"]
        direction TB
        e1["e₁ = [0.12, -0.45, 0.67, ...]"]
        e2["e₂ = [0.34, 0.88, -0.12, ...]"]
        e3["e₃ = [0.05, 0.22, 0.91, ...]"]
        e4["e₄ = [0.77, -0.56, 0.11, ...]"]
        e5["e₅ = [0.03, 0.44, -0.78, ...]"]
    end

    txt1 --> id1
    txt2 --> id2
    txt3 --> id3
    txt4 --> id4
    txt5 --> id5
    id1 --> e1
    id2 --> e2
    id3 --> e3
    id4 --> e4
    id5 --> e5

    input["..."] --> inputtexts
    embeddings --> transformer["..."]
```

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import sys

import tiktoken

# Record the interpreter version, the PyTorch build and GPU availability for this run.
print(sys.version_info[:3])     # slicing yields a plain (major, minor, micro) tuple
print(torch.__version__)
print(f'torch.cuda.is_available() = {torch.cuda.is_available()}')

(3, 12, 4)
2.5.1+cu121
torch.cuda.is_available() = True


In [2]:
# Toy setup: a six-token vocabulary embedded into three dimensions.
vocab_size = 6    # len(vocabulary): one embedding row per token ID
output_dim = 3    # len(embedding_vector); GPT-3 uses 12,288 dimensions

# Four example token IDs, each one smaller than vocab_size.
input_token_ids = torch.tensor([2, 3, 5, 1])

In [3]:
# Fix the RNG seed so the randomly initialised weights are the same on every run.
torch.manual_seed(123)

# Lookup table with one row of output_dim weights per token ID.
embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)

# Show the layer summary, a blank line, then the underlying weight matrix.
print(embedding_layer, embedding_layer.weight, sep="\n\n")

Embedding(6, 3)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [4]:
# Look up a single token ID: the result is row 3 of the embedding weight matrix.
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [5]:
# Look up the IDs in input_token_ids one at a time; each result is the weight-matrix
# row whose index equals the token ID (rows 2, 3, 5 and 1, in that order).
[embedding_layer(x) for x in input_token_ids]

[tensor([ 1.2753, -0.2010, -0.1606], grad_fn=<EmbeddingBackward0>),
 tensor([-0.4015,  0.9666, -1.1481], grad_fn=<EmbeddingBackward0>),
 tensor([-2.8400, -0.7849, -1.4096], grad_fn=<EmbeddingBackward0>),
 tensor([0.9178, 1.5810, 1.3010], grad_fn=<EmbeddingBackward0>)]

In [6]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs for next-token prediction.

    The whole text is tokenized once and then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. Each target window
    is its input window shifted one token to the right, so position ``k`` of the
    target holds the token that follows position ``k`` of the input.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of tokens in every window; must be positive.
        stride: Number of tokens between consecutive window starts; must be
            positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive, or if the
            text tokenizes to fewer than ``max_length + 1`` tokens.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Validate with real exceptions: `assert` statements vanish under `python -O`.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            # A zero stride makes range() fail; a negative one silently yields no samples.
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text once; the end-of-text marker is allowed as a special token.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # The target window needs one token beyond the input window.
        if len(token_ids) <= max_length:
            raise ValueError(
                f"Text produced {len(token_ids)} tokens; "
                f"at least max_length + 1 = {max_length + 1} are required"
            )

        # Stop early enough that the shifted target window never runs past the end.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [7]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    The text is tokenized with tiktoken's "gpt2" encoding and windowed by
    GPTDatasetV1 using ``max_length`` and ``stride``. ``batch_size``,
    ``shuffle``, ``drop_last`` and ``num_workers`` are passed to the DataLoader
    unchanged.
    """
    windowed_dataset = GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

In [8]:
# Load the whole pretraining corpus into memory as one string.
with open("data/pretraining-text.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Rows in the token embedding table; presumably the size of the "gpt2" tiktoken
# vocabulary used by create_dataloader_v1 — TODO confirm.
vocab_size = 50257
# Length of every embedding vector.
output_dim = 256
# Rows in the positional embedding table (one per position).
context_length = 1024


# One vector per token ID, and one vector per position.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# NOTE(review): the next cell rebuilds this layer with context_length = max_length
# before it is ever applied.
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

batch_size = 8
max_length = 4
# stride == max_length, so consecutive windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=batch_size,
    max_length=max_length,
    stride=max_length
)

# Pull the first batch of (input, target) token-ID windows.
it = iter(dataloader)
inputs, targets = next(it)

# Replace every token ID in the batch with its embedding vector.
token_embeddings = token_embedding_layer(inputs)

In [9]:
# Each window holds exactly max_length tokens, so size the positional table to match.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(num_embeddings=context_length, embedding_dim=output_dim)

# One positional vector for each position 0 .. context_length - 1.
pos_embeddings = pos_embedding_layer(torch.arange(0, context_length))

# Broadcasting adds the same positional vectors to every sequence in the batch.
input_embeddings = torch.add(token_embeddings, pos_embeddings)

print(pos_embeddings.shape, input_embeddings.shape, sep="\n")

torch.Size([4, 256])
torch.Size([8, 4, 256])


```mermaid
---
title: "1 to 1 mapping of: token -> ID -> embeddings (token_embed + positional_embed = input_embed)"
---

flowchart LR
    subgraph inputtexts["Tokens"]
        direction TB
        txt1["My"]
        txt2["name"]
        txt3["is"]
        txt4["Rob"]
        txt5["."]
    end

    subgraph tokenids["Token IDs"]
        direction TB
        id1["40134"]
        id2["2052"]
        id3["133"]
        id4["389"]
        id5["12"]
    end

    subgraph tokenembeddings["Token Embeddings (Seq × Dim)"]
        direction TB
        e1["t₁ = [0.12, -0.45, 0.67, ...]"]
        e2["t₂ = [0.34, 0.88, -0.12, ...]"]
        e3["t₃ = [0.05, 0.22, 0.91, ...]"]
        e4["t₄ = [0.77, -0.56, 0.11, ...]"]
        e5["t₅ = [0.03, 0.44, -0.78, ...]"]
    end

    subgraph posembeddings["Positional Embeddings (Seq × Dim)"]
        direction TB
        p1["p₁ = [0.01, 0.03, -0.02, ...]"]
        p2["p₂ = [0.05, -0.11, 0.08, ...]"]
        p3["p₃ = [-0.07, 0.04, 0.09, ...]"]
        p4["p₄ = [0.10, -0.02, 0.05, ...]"]
        p5["p₅ = [0.00, 0.12, -0.06, ...]"]
    end

    subgraph inputembeddings["Input Embeddings (Token ⊕ Positional)"]
        direction TB
        i1["t₁ + p₁"]
        i2["t₂ + p₂"]
        i3["t₃ + p₃"]
        i4["t₄ + p₄"]
        i5["t₅ + p₅"]
    end

    %% Addition nodes
    add1(("⊕"))
    add2(("⊕"))
    add3(("⊕"))
    add4(("⊕"))
    add5(("⊕"))

    %% Flows
    txt1 --> id1 --> e1 --> add1 --> i1
    txt2 --> id2 --> e2 --> add2 --> i2
    txt3 --> id3 --> e3 --> add3 --> i3
    txt4 --> id4 --> e4 --> add4 --> i4
    txt5 --> id5 --> e5 --> add5 --> i5

    p1 --> add1
    p2 --> add2
    p3 --> add3
    p4 --> add4
    p5 --> add5

    input["..."] --> inputtexts
    inputembeddings --> transformer["Transformer"]
```

### Context Vectors

- Context vectors (AKA hidden states or contextual embeddings) are the outputs of the Transformer after self-attention has incorporated information from previous tokens.
- RAW_input_embeddings = token_embeddings + positional_embeddings.
- Transformer layers (in decoder-only models) turn RAW_input_embeddings (input_embeddings) into context vectors.

```mermaid
---
title: "1 to 1 mapping of: token -> ID -> embeddings -> context vectors"
---

flowchart LR
    subgraph inputtexts["Tokens"]
        direction TB
        txt1["My"]
        txt2["name"]
        txt3["is"]
        txt4["Rob"]
        txt5["."]
    end

    subgraph tokenids["Token IDs"]
        direction TB
        id1["40134"]
        id2["2052"]
        id3["133"]
        id4["389"]
        id5["12"]
    end

    subgraph tokenembeddings["Token Embeddings (Seq × Dim)"]
        direction TB
        e1["t₁ = [0.12, -0.45, 0.67, ...]"]
        e2["t₂ = [0.34, 0.88, -0.12, ...]"]
        e3["t₃ = [0.05, 0.22, 0.91, ...]"]
        e4["t₄ = [0.77, -0.56, 0.11, ...]"]
        e5["t₅ = [0.03, 0.44, -0.78, ...]"]
    end

    subgraph posembeddings["Positional Embeddings (Seq × Dim)"]
        direction TB
        p1["p₁ = [0.01, 0.03, -0.02, ...]"]
        p2["p₂ = [0.05, -0.11, 0.08, ...]"]
        p3["p₃ = [-0.07, 0.04, 0.09, ...]"]
        p4["p₄ = [0.10, -0.02, 0.05, ...]"]
        p5["p₅ = [0.00, 0.12, -0.06, ...]"]
    end

    subgraph inputembeddings["Input Embeddings (Token ⊕ Positional)"]
        direction TB
        i1["t₁ + p₁"]
        i2["t₂ + p₂"]
        i3["t₃ + p₃"]
        i4["t₄ + p₄"]
        i5["t₅ + p₅"]
    end

    subgraph contextvectors["Context Vectors (Seq × Dim after Transformer)"]
        direction TB
        c1["c₁ = f(i₁)"]
        c2["c₂ = f(i₁,i₂)"]
        c3["c₃ = f(i₁,i₂,i₃)"]
        c4["c₄ = f(i₁,i₂,i₃,i₄)"]
        c5["c₅ = f(i₁,i₂,i₃,i₄,i₅)"]
    end

    %% Addition nodes
    add1(("⊕"))
    add2(("⊕"))
    add3(("⊕"))
    add4(("⊕"))
    add5(("⊕"))

    %% Flows
    txt1 --> id1 --> e1 --> add1 --> i1
    txt2 --> id2 --> e2 --> add2 --> i2
    txt3 --> id3 --> e3 --> add3 --> i3
    txt4 --> id4 --> e4 --> add4 --> i4
    txt5 --> id5 --> e5 --> add5 --> i5

    p1 --> add1
    p2 --> add2
    p3 --> add3
    p4 --> add4
    p5 --> add5

    input["..."] --> inputtexts
    inputembeddings --> transformer["Transformer"]
    transformer --> contextvectors
```

In [11]:
# Compute unnormalised attention scores, the first step towards context vectors.

# Six example input vectors, one per row, with three features each.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],
    [0.55, 0.87, 0.66],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55]
])

# attention_scores[i, j] is the dot product of inputs[i] and inputs[j].
# One matrix product with the transpose computes every pair at once and works for
# any number of input rows, with no hard-coded 6 x 6 buffer or Python-level loops.
# `@` is Python's matrix-multiplication operator (PEP 465, Python 3.5+), which
# torch.Tensor implements.
attention_scores = inputs @ inputs.T


In [12]:
# Display the 6 x 6 score matrix; it is symmetric because dot(x, y) == dot(y, x).
attention_scores

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [18]:
# Softmax over the last dimension normalises each row of scores into weights,
# so every input gets its own distribution over all six inputs.
attention_weights = torch.softmax(attention_scores, dim=-1)

# Sanity check: every row should sum to 1 (up to floating-point rounding).
for row in attention_weights:
    print(sum(row))

# Display the normalised weight matrix.
attention_weights

tensor(1.0000)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.0000)
tensor(1.)


tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])

In [19]:
# Row i of the result is the sum of all input vectors weighted by attention_weights[i],
# i.e. the context vector for input i; (6 x 6) @ (6 x 3) gives a 6 x 3 result.
all_context_vectors = attention_weights @ inputs

```mermaid
---
title: "LLM forward pass: token → embeddings → attention → context → logits → probabilities"
---

flowchart LR
    subgraph inputtexts["Tokens"]
        direction TB
        txt1["My"]
        txt2["name"]
        txt3["is"]
        txt4["Rob"]
        txt5["."]
    end

    subgraph tokenids["Token IDs"]
        direction TB
        id1["40134"]
        id2["2052"]
        id3["133"]
        id4["389"]
        id5["12"]
    end

    subgraph tokenembeddings["Token Embeddings (Seq × Dim)"]
        direction TB
        e1["t₁ = [0.12, -0.45, 0.67, ...]"]
        e2["t₂ = [0.34, 0.88, -0.12, ...]"]
        e3["t₃ = [0.05, 0.22, 0.91, ...]"]
        e4["t₄ = [0.77, -0.56, 0.11, ...]"]
        e5["t₅ = [0.03, 0.44, -0.78, ...]"]
    end

    subgraph posembeddings["Positional Embeddings (Seq × Dim)"]
        direction TB
        p1["p₁ = [0.01, 0.03, -0.02, ...]"]
        p2["p₂ = [0.05, -0.11, 0.08, ...]"]
        p3["p₃ = [-0.07, 0.04, 0.09, ...]"]
        p4["p₄ = [0.10, -0.02, 0.05, ...]"]
        p5["p₅ = [0.00, 0.12, -0.06, ...]"]
    end

    subgraph inputembeddings["Input Embeddings (Token ⊕ Positional)"]
        direction TB
        i1["t₁ + p₁"]
        i2["t₂ + p₂"]
        i3["t₃ + p₃"]
        i4["t₄ + p₄"]
        i5["t₅ + p₅"]
    end

    subgraph attention["Self-Attention Mechanism"]
        direction TB
        qk["Q · Kᵀ = Attention Scores"]
        soft["Softmax → Attention Weights (αᵢⱼ)"]
        weighted["Σ αᵢⱼ Vⱼ = Weighted Values"]
    end

    subgraph contextvectors["Context Vectors (Seq × Dim after Transformer)"]
        direction TB
        c1["c₁ = f(i₁)"]
        c2["c₂ = f(i₁,i₂)"]
        c3["c₃ = f(i₁,i₂,i₃)"]
        c4["c₄ = f(i₁,i₂,i₃,i₄)"]
        c5["c₅ = f(i₁,i₂,i₃,i₄,i₅)"]
    end

    subgraph outputlogits["Output Layer"]
        direction TB
        linear["Linear Projection (Wᵒᵘᵗ) → Logits"]
        vocabsoft["Softmax over Vocab → Next-token Probabilities"]
    end

    %% Addition nodes
    add1(("⊕"))
    add2(("⊕"))
    add3(("⊕"))
    add4(("⊕"))
    add5(("⊕"))

    %% Flows
    txt1 --> id1 --> e1 --> add1 --> i1
    txt2 --> id2 --> e2 --> add2 --> i2
    txt3 --> id3 --> e3 --> add3 --> i3
    txt4 --> id4 --> e4 --> add4 --> i4
    txt5 --> id5 --> e5 --> add5 --> i5

    p1 --> add1
    p2 --> add2
    p3 --> add3
    p4 --> add4
    p5 --> add5

    input["..."] --> inputtexts
    inputembeddings --> attention
    attention --> contextvectors
    contextvectors --> linear --> vocabsoft

```