##### Master Degree in Computer Science and Data Science for Economics

# GPT

### Elisabetta Rocchetti

# Setup

In [1]:
!pip install jaxtyping
!pip install transformer_lens
!pip install circuitsvis
from dataclasses import dataclass
from typing import Tuple, List, Optional, Dict
from jaxtyping import Float, Int
import torch
from torch import Tensor
import torch.nn as nn
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformer_lens import HookedTransformer
import einops
import numpy as np
import circuitsvis as cv
from IPython.display import display, HTML

device = torch.device(
    "mps"
    if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available() else "cpu"
)
gpt2 = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
hooked_gpt2 = HookedTransformer.from_pretrained(
    "gpt2-small", fold_ln=False, center_unembed=False, center_writing_weights=False
)

Collecting circuitsvis
  Downloading circuitsvis-1.43.3-py3-none-any.whl.metadata (983 bytes)
Downloading circuitsvis-1.43.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: circuitsvis
Successfully installed circuitsvis-1.43.3


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loaded pretrained model gpt2-small into HookedTransformer


# GPT from scratch

Today we will see how to code a decoder-only transformer from scratch. This tutorial is adapted from [this](https://arena-ch1-transformers.streamlit.app/%5B1.1%5D_Transformer_from_Scratch) beautiful course, so if you are interested in getting deeper knowledge in this topic just go there and complete the whole tutorial (also, if you missed anythung during this lecture, you can go there and catch up).

## Inputs and Outputs - recap

<img src = "https://raw.githubusercontent.com/callummcdougall/computational-thread-art/master/example_images/misc/transformer-overview-new.png" width = "50%"/>

### Inputs

Tokenizers offer multiple functions that apparently do the same thing and I have to read the [documentation](https://huggingface.co/docs/transformers/v4.50.0/en/main_classes/tokenizer#transformers.PreTrainedTokenizer) each time I have to choose which one to use.

In [19]:
text = "The raccoon sat on the mat."
token_ids = tokenizer.encode(text)
print(f"Token (ids): {token_ids}")
print(f"Tokens (string): {tokenizer.tokenize(text)}")
print(f"Text string: {tokenizer.decode(token_ids, skip_special_tokens= False)}")
print(f"Stuff to input a model: {tokenizer(text, return_tensors='pt')}") #specify return type as tensor

Token (ids): [464, 3444, 20912, 3332, 319, 262, 2603, 13]
Tokens (string): ['The', 'Ġrac', 'coon', 'Ġsat', 'Ġon', 'Ġthe', 'Ġmat', '.']
Text string: The raccoon sat on the mat.
Stuff to input a model: {'input_ids': tensor([[  464,  3444, 20912,  3332,   319,   262,  2603,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


Exercises:

- try to tokenize texts beginning with a capital letter or a space: what happens?  it treats the space as part of the following word.
- try to tokenize long sequences of numbers or arithmetic operations: what happens? arithmetic symbols like +, -, *, and / are treated as individual tokens, just like any other punctuation mark. The model learns their meaning and function based on their position relative to the tokenized number chunks in the training data

In [14]:
print(f"Tokens (string): {tokenizer.tokenize('Test test')}")
print(f"Tokens (numbers): {tokenizer.tokenize('1234567 + 12345')}")

Tokens (string): ['Test', 'Ġtest']
Tokens (numbers): ['123', '45', '67', 'Ġ+', 'Ġ123', '45']


### Outputs

Let's generate text with out `gpt-2` model.

In [22]:
text = "Once upon a"
input_ids = tokenizer(text, return_tensors="pt").to(device)
#temporarily disables gradient calculation and other features used during training to improve performance for inference.
#It's the newer, recommended alternative to torch.no_grad()
with torch.inference_mode():
    output_logits = gpt2(**input_ids)["logits"]
print(f"Logits: {output_logits}")
print(f"Logits shape: {output_logits.shape}")

Logits: tensor([[[ -34.5644,  -34.4081,  -38.3079,  ...,  -41.6996,  -39.7801,
           -35.0520],
         [ -84.7256,  -82.9326,  -87.0166,  ...,  -91.6668,  -86.2355,
           -84.7094],
         [-109.0798, -105.7259, -109.9116,  ..., -114.2847, -107.6933,
          -105.3613]]], device='cuda:0')
Logits shape: torch.Size([1, 3, 50257])


Here we have:

- the batch dimension, which has 1 element, given that we have one sentence
- the sequence length dimension, which contains 3 tokens
- the vocabulary length dimension, which contains 50257 logits, one for each vocab

**The model has predicted a logit vector for each token in our sentence**. We can convert them into probabilities.

In [16]:
output_probas = output_logits.softmax(dim=-1)
print(f"Probabilites over vocabulary: {output_probas}")

Probabilites over vocabulary: tensor([[[8.9157e-04, 1.0424e-03, 2.1106e-05,  ..., 7.1022e-07,
          4.8419e-06, 5.4752e-04],
         [3.2892e-06, 1.9758e-05, 3.3276e-07,  ..., 3.1810e-09,
          7.2667e-07, 3.3427e-06],
         [4.0888e-07, 1.1700e-05, 1.7799e-07,  ..., 2.2447e-09,
          1.6359e-06, 1.6848e-05]]], device='cuda:0')


We can select which is the most probable **next** token at each position.

In [23]:
most_likely_next_tokens = tokenizer.batch_decode(output_logits.argmax(dim=-1)[0])
print(list(zip(tokenizer.tokenize(text), most_likely_next_tokens)))

[('Once', ' the'), ('Ġupon', ' a'), ('Ġa', ' time')]


Given this output, the next token will be...

In [24]:
next_token = output_logits[0, -1].argmax(dim=-1) #retrieve next token id
next_char = tokenizer.decode(next_token)
print(
    "The next token is:", repr(next_char)
)  # repr is to show special tokens and spaces
print("How the sentence becomes: ", text + next_char)

The next token is: ' time'
How the sentence becomes:  Once upon a time


This process is repeated iteratively, appending the next token prediction at the end of the original sentence, and giving the updated sentence to `gpt-2`again.

In [27]:
# Initialize text
text = "Once upon a"
# Convert text to tensor format
tokens = tokenizer(text, return_tensors="pt").to(device)
print("Generating text...\n")
# Generate 10 words iteratively
for i in range(10):
    with torch.inference_mode():
        # Get model predictions
        output_logits = gpt2(**tokens).logits
        # Select the most likely next token
        next_token = output_logits[0, -1].argmax(dim=-1)
        # Decode the token to a character
        next_char = tokenizer.decode(next_token)
    # Display the sequence so far
    current_text = tokenizer.decode(tokens["input_ids"][0])  # Reconstruct the string
    print(f"Generation step {i+1}:")
    #!r is a format specifier that calls the repr() function on the variable's value.
    #For strings, repr() includes the quotes around the string
    print(f"Sequence so far: {current_text!r}")
    print(f"{tokens['input_ids'].shape[-1]+1}th char = {next_char!r}\n")
    # Append the new character and re-tokenize
    text += next_char
    tokens = tokenizer(text, return_tensors="pt").to(device)
print("Final text:", text)

Generating text...

Generation step 1:
Sequence so far: 'Once upon a'
4th char = ' time'

Generation step 2:
Sequence so far: 'Once upon a time'
5th char = ','

Generation step 3:
Sequence so far: 'Once upon a time,'
6th char = ' the'

Generation step 4:
Sequence so far: 'Once upon a time, the'
7th char = ' world'

Generation step 5:
Sequence so far: 'Once upon a time, the world'
8th char = ' was'

Generation step 6:
Sequence so far: 'Once upon a time, the world was'
9th char = ' a'

Generation step 7:
Sequence so far: 'Once upon a time, the world was a'
10th char = ' place'

Generation step 8:
Sequence so far: 'Once upon a time, the world was a place'
11th char = ' of'

Generation step 9:
Sequence so far: 'Once upon a time, the world was a place of'
12th char = ' great'

Generation step 10:
Sequence so far: 'Once upon a time, the world was a place of great'
13th char = ' beauty'

Final text: Once upon a time, the world was a place of great beauty


Exercise: iteratively generate a sentence, and stop when the model predict the end of sequence token as next token, considering that every special token for `gpt-2`is mapped to `<|endoftext|>`.

In [73]:
eos_token_id = tokenizer.eos_token_id
text = "The quick brown fox is"
tokens = tokenizer(text, return_tensors="pt").to(device)
print(f"Initial tokens: {tokens['input_ids'][0]}")

print("Generating text...\n")
# Generate 10 words iteratively
for i in range(10):
  with torch.inference_mode():
        # Get model predictions
        output_logits = gpt2(**tokens).logits
        # Select the most likely next token
        next_token = output_logits[0, -1].argmax(dim=-1)
        # Decode the token to a character
        next_char = tokenizer.decode(next_token)

  # Check if the predicted token is the end-of-sequence token
  if next_token.item() == eos_token_id:
      print("\nEnd of sequence token predicted. Stopping generation.")
      break

  #ValueError -> doesn't change the attention mask!
  #tokens['input_ids'] = torch.cat([tokens['input_ids'][0], next_token], dim=-1).unsqueeze(0)
  text += next_char
  tokens = tokenizer(text, return_tensors="pt").to(device)
  #print(tokens)

print("Final text:", text)

Initial tokens: tensor([  464,  2068,  7586, 21831,   318], device='cuda:0')
Generating text...

Final text: The quick brown fox is a great choice for a dog that is not afraid


## GPT implementation

To understand things by coding them is pretty convenient. Here you will see reported pieces of code inspired or copied from the [same tutorial](https://arena-ch1-transformers.streamlit.app/%5B1.1%5D_Transformer_from_Scratch) I have linked above. The following code shows how to implement:

- LayerNorm (transforming the input to have zero mean and unit variance)
- Positional embedding (a lookup table from position indices to residual stream vectors)
- Attention (the method of computing attention patterns for residual stream vectors)
- MLP (the collection of linear and nonlinear transformations which operate on each residual stream vector in the same way)
- Embedding (a lookup table from tokens to residual stream vectors)
- Unembedding (a matrix for converting residual stream vectors into a distribution over tokens)

## Architecture overview

You can see a decoder-only architecture like the one in GPT as being constituted by 3 main modules:

1) Embedding module
2) Transformer block, with attention and multi layered perceptrons (MLP)
3) Unembedding module

<img src="https://raw.githubusercontent.com/callummcdougall/computational-thread-art/master/example_images/misc/transformer-new.png" width="60%">


### Tokenization and Embedding module

Think about these two modules as a 2-step lookup table:

1) mapping tokens to integers (as we saw previously by using the tokenizer)
2) mapping integer to vectors (learnt during the training phase)

### Transformer blocks

After having your tokens as vectors, you would expect to work with a tensor $x_0$ of shape `[batch, seq_len, d_model]` where :

- `batch` is the dimension referring to the number of sequences that are being processed at the same time
- `seq_len` is the length of each sequence in the batch, thus how many tokens it contains. Usually, you will have to make every sequence of the same length to allow for this batching to work.
- `d_model` is the length of embedding vectors as processed by the model, you can refer to this as `hidden_size` too

This tensor will enter a series of Transformer blocks containing attention heads followed by MLP.

**Attention**. These modules have the power of moving information from *prior* positions to the current token. Note that you are dealing with causal attention, thus a token at position $i$ *cannot* give attention to tokens at positions greater than $i$ (although it can *receive* attention from them). Each attention layer has `n_heads` attention heads with distinct attention patterns, indicating how much attention a token $i$ (*destination* token) give to previous tokens $j$ with $j<i$ (*source* tokens). Note: if this terminology is strange, ask the lecturer to explain it again!

It can be useful to know that we expect attention patters to be tensors of shape `[batch, n_head, seq_len, seq_len]`, with each patter putting into relation each token in the sequence with every other previous token.

<img src="https://raw.githubusercontent.com/callummcdougall/computational-thread-art/master/example_images/misc/transformer-attn-new.png" width="90%">

In [74]:
reference_text = "Once upon a time, there was a fox who lived in a forest."
tokens = hooked_gpt2.to_tokens(reference_text).to(device)
logits, cache = hooked_gpt2.run_with_cache(tokens)
html = cv.attention.attention_pattern(
    tokens=hooked_gpt2.to_str_tokens(reference_text),
    attention=cache["pattern", 3][0][7],
)
styled_html = f"""
<div style="width:800px; font-size:16px;">
    {html}
</div>
"""

display(HTML(styled_html))

**MLP**. These are standard neural networks with one hidden layer and nonlinear activation functions (e.g. GELU). If attention has moved information among tokens, MLPs process the moved information.

<img src="https://raw.githubusercontent.com/callummcdougall/computational-thread-art/master/example_images/misc/transformer-mlp-new-2.png" width="70%">

### Unembedding
This module is just mapping `[batch, seq_len, d_model]` tensors to `[batch, seq_len, d_vocab]`, which is the dimensionality for out outputs.

### Technicalities

- at the beginnig of each layer, there is a normalization step (each input vector will have mean 0 and standard deviation 1)
- we use positional embeddings to inform the network about the absolute positions of tokens (imagine an attention pattern which does not account for the fact that nearby tokens are more relevant!)

## Actual implementation

We need to fix the dimensionality of the tensors we are going to work with. Since we will take learnt parameters and activations from the pretrained model, we need to make sure that we have the same dimensions as `gpt-2` model.

### Configs

In [75]:
print(gpt2)
print(gpt2.config)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
GPT2Config {
  "activation_function": "gelu_new",
  "

In [76]:
sequence = "Once upon a time, "
tokenized_sequence = tokenizer.tokenize(sequence)
tokens = tokenizer(sequence, return_tensors="pt").to(device)["input_ids"]
print("Tokenized sequence:", tokenized_sequence)
print("Token IDs:", tokens)

Tokenized sequence: ['Once', 'Ġupon', 'Ġa', 'Ġtime', ',', 'Ġ']
Token IDs: tensor([[7454, 2402,  257,  640,   11,  220]], device='cuda:0')


In [77]:
batch = 1  # starting with only one batch (thus 1 sentence)
seq_len = len(tokenized_sequence)  # 6


@dataclass
class Config:
    n_ctx: int = gpt2.config.n_ctx  # 1024
    d_model: int = gpt2.config.n_embd  # hidden size, or embedding dimension
    n_heads: int = gpt2.config.n_head  # number of attention heads
    n_layers: int = gpt2.config.n_layer  # number of transformer blocks
    d_mlp: int = 4 * d_model  # MLP hidden size, 3072
    d_head: int = d_model // n_heads  # dimension of each attention head, 64
    layer_norm_eps: float = gpt2.config.layer_norm_epsilon  # layer norm epsilon
    d_vocab: int = gpt2.config.vocab_size  # number of tokens in the vocabulary
    init_range: float = (
        gpt2.config.initializer_range
    )  # initialization range for weights
    debug: bool = True


cfg = Config()
print(cfg)

Config(n_ctx=1024, d_model=768, n_heads=12, n_layers=12, d_mlp=3072, d_head=64, layer_norm_eps=1e-05, d_vocab=50257, init_range=0.02, debug=True)


### Tests

In [95]:
def rand_float_test(cls, shape):
    cfg = Config(debug=True)
    layer = cls(cfg).to(device)
    random_input = torch.randn(shape).to(device)
    print("Input shape:", random_input.shape)
    output = layer(random_input)
    if isinstance(output, tuple):
        output = output[0]
    print("Output:", output)
    print("Output shape:", output.shape, "\n")


def rand_int_test(cls, shape):
    cfg = Config(debug=True)
    layer = cls(cfg).to(device)
    random_input = torch.randint(100, 1000, shape).to(device)
    print("Input shape:", random_input.shape)
    output = layer(random_input)
    if isinstance(output, tuple):
        output = output[0]
    print("Output:", output)
    print("Output shape:", output.shape, "\n")

### Embedding layer (5-10 mins)

This layer takes as input a sequence of integers (output by the tokenizer) and has as output a tensor of shape `[batch, seq_len, d_model]`.

In [110]:
class Embed(nn.Module):
    def __init__(self, cfg: Config):
        super().__init__()
        self.cfg = cfg
        #self.W_E = nn.Parameter(torch.empty((cfg.d_vocab, cfg.d_model)))
        self.W_E = nn.Embedding(cfg.d_vocab, cfg.d_model)
        #intializes a tensor with values drawn from a normal (Gaussian) distribution
        nn.init.normal_(self.W_E.weight, std=cfg.init_range)

    def forward(
        self, int_tokens: Int[Tensor, "batch seq_len"]
    ) -> torch.Tensor: #-> Float[Tensor, "batch seq_len d_model"]:
        # just a mapping from int tokens to float vectors
        #pass
        return self.W_E(int_tokens)

In [111]:
rand_int_test(Embed, [batch, seq_len])

Input shape: torch.Size([1, 6])
Output: tensor([[[ 0.0194,  0.0280,  0.0072,  ..., -0.0065,  0.0386, -0.0521],
         [-0.0228, -0.0094,  0.0326,  ..., -0.0083,  0.0151,  0.0153],
         [ 0.0273,  0.0170, -0.0455,  ...,  0.0049, -0.0128, -0.0266],
         [ 0.0137, -0.0252, -0.0181,  ...,  0.0292,  0.0027,  0.0111],
         [-0.0075,  0.0160, -0.0111,  ..., -0.0052,  0.0076,  0.0102],
         [ 0.0168, -0.0200,  0.0109,  ..., -0.0117,  0.0048, -0.0405]]],
       device='cuda:0', grad_fn=<EmbeddingBackward0>)
Output shape: torch.Size([1, 6, 768]) 



### Positional embeddings layer (10-15 mins)

This layer is just the same as the previous ones, but the input is not a sequence of token ids but a sequence of integers representing the position of tokens in the sentence. GPT uses learnt positional embeddings.

In [None]:
class PosEmbed(nn.Module):
    def __init__(self, cfg: Config):
        super().__init__()
        self.cfg = cfg
        # complete here

    def forward(
        self, int_tokens: Int[Tensor, "batch seq_len"]
    ) -> Float[Tensor, "batch seq_len d_model"]:
        # take first seq_len learnt positional embeddings
        pass

In [None]:
rand_int_test(PosEmbed, [batch, seq_len])

### Layer Norm (10-15 mins)

The next module is the layer normalization. This module:

* Makes mean 0
* Normalizes to have variance 1
* Scales with learned weights
* Translates with learned bias

Use the PyTorch [LayerNorm documentation](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) as a reference. A few more notes:

* The layernorm implementation always has `affine=True`, i.e. you do learn parameters $\gamma$ and $\beta$.
* Remember that, after the centering and normalization, each vector of length `d_model` in your input should have mean 0 and variance 1.
* As the PyTorch documentation page says, your variance should be computed using `unbiased=False`.
* The `layer_norm_eps` argument in your config object corresponds to the $\epsilon$ term in the PyTorch documentation (it is included to avoid division-by-zero errors).

In [None]:
class LayerNorm(nn.Module):
    def __init__(self, cfg: Config):
        super().__init__()
        self.cfg = cfg
        # complete here

    def forward(
        self, embedding: Float[Tensor, "batch seq_len d_model"]
    ) -> Float[Tensor, "batch seq_len d_model"]:
        # compute mean
        # compute standard deviation + eps
        # compute normalized embedding
        pass


rand_float_test(LayerNorm, [batch, seq_len, cfg.d_model])

### Attention (30-45 mins)

* **Step 1:** Produce an attention pattern - for each destination token, probability distribution over previous tokens (including current token)
    * Linear map from input -> query, key shape `[batch, seq_len, head_index, d_head]`
    * Dot product every *pair* of queries and keys to get attn_scores `[batch, head_index, query_pos, key_pos]` (query = dest, key = source)
    * **Scale** and mask `attn_scores` to make it lower triangular, i.e. causal
    * Softmax along the `key_pos` dimension, to get a probability distribution for each query (destination) token - this is our attention pattern!
* **Step 2:** Move information from source tokens to destination token using attention pattern (move = apply linear map)
    * Linear map from input -> value `[batch, key_pos, head_index, d_head]`
    * Mix along the `key_pos` with attn pattern to get `z`, which is a weighted average of the value vectors `[batch, query_pos, head_index, d_head]`
    * Map to output, `[batch, position, d_model]` (position = query_pos, we've summed over all heads)

Note - when we say **scale**, we mean dividing by `sqrt(d_head)`. The purpose of this is to avoid vanishing gradients (which is a big problem when we're dealing with a function like softmax - if one of the values is much larger than all the others, the probabilities will be close to 0 or 1, and the gradients will be close to 0).

Below is a much larger, more detailed version of the attention head diagram from earlier. This should give you an idea of the actual tensor operations involved. A few clarifications on this diagram:

* Whenever there is a third dimension shown in the pictures, this refers to the `head_index` dimension. We can see that all operations within the attention layer are done independently for each head.
* The objects in the box are activations; they have a batch dimension (for simplicity, we assume the batch dimension is 1 in the diagram). The objects to the right of the box are our parameters (weights and biases); they have no batch dimension.
* We arrange the keys, queries and values as `(batch, seq_pos, head_idx, d_head)`, because the biases have shape `(head_idx, d_head)`, so this makes it convenient to add the biases (recall the rules of array broadcasting!).

<img src="https://raw.githubusercontent.com/callummcdougall/computational-thread-art/master/example_images/misc/transformer-attn-21.png" width="1400">

A couple of notes / hints:

* Don't forget the attention score scaling (this should come before the masking).
* You can use `torch.where`, or the `torch.masked_fill` function when masking the attention scores.

In [None]:
class Attention(nn.Module):
    def __init__(self, cfg: Config):
        super().__init__()
        self.cfg = cfg
        # complete here

    def forward(
        self, embedding: Float[Tensor, "batch seq_len d_model"]
    ) -> Float[Tensor, "batch seq_len d_model"]:
        # compute K, Q, V projections
        # compute attention scores
        # scale and mask attention scores (causal attention)
        # softmax attention scores
        # compute weighted sum of values
        # compute output projection
        pass

In [None]:
rand_float_test(Attention, [batch, seq_len, cfg.d_model])

### MLP (10-15 mins)

Next, you should implement the MLP layer, which consists of:

* A linear layer, with weight `W_in`, bias `b_in`
* A nonlinear function (we usually use GELU; the function `gelu_new` has been imported for this purpose)
* A linear layer, with weight `W_out`, bias `b_out`

In [None]:
def gelu_new(
    input: Float[torch.Tensor, "batch pos d_mlp"],
) -> Float[torch.Tensor, "batch pos d_mlp"]:
    # Implementation of GeLU used by GPT2 - subtly different from PyTorch's
    return (
        0.5
        * input
        * (
            1.0
            + torch.tanh(
                np.sqrt(2.0 / np.pi) * (input + 0.044715 * torch.pow(input, 3.0))
            )
        )
    )


class MLP(nn.Module):
    def __init__(self, cgf: Config):
        super().__init__()
        self.cfg = cfg
        # complete here

    def forward(
        self, embedding: Float[Tensor, "batch seq_len d_model"]
    ) -> Float[Tensor, "batch seq_len d_model"]:
        # compute in projection
        # apply activation
        # compute out projection
        pass

In [None]:
rand_float_test(MLP, [batch, seq_len, cfg.d_model])

### Transformer block: asssembling everything together! (10 mins)

Now, we can put together the attention, MLP and layernorms into a single transformer block.

In [None]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg: Config):
        super().__init__()
        self.cfg = cfg
        # complete here

    def forward(
        self, input_embedding: Float[Tensor, "batch seq_len d_model"]
    ) -> Float[Tensor, "batch seq_len d_model"]:
        # normalize input
        # compute attention and add skip connection
        # normalize embedding
        # compute MLP and add skip connection
        # return output
        pass

In [None]:
rand_float_test(TransformerBlock, [batch, seq_len, cfg.d_model])

### Unembedding (10 mins)

The unembedding is jus a linear layer (with weight `W_U` and bias `b_U`). This is basically a map from embeddings to logits

In [None]:
class Unembed(nn.Module):
    def __init__(self, cfg: Config):
        super().__init__()
        self.cfg = cfg
        # complete here

    def forward(
        self, embedding: Float[Tensor, "batch seq_len d_model"]
    ) -> Float[Tensor, "batch seq_len d_vocab"]:
        # compute logits
        pass

In [None]:
rand_float_test(Unembed, [batch, seq_len, cfg.d_model])

### Full Transformer (10 mins)

In [None]:
class GPT(nn.Module):
    def __init__(self, cfg: Config):
        super().__init__()
        self.cfg = cfg
        # complete here

    def forward(
        self, input_tokens: Int[Tensor, "batch seq_len"]
    ) -> Float[Tensor, "batch seq_len d_vocab"]:
        # compute embeddings + positional embeddings
        # compute transformer blocks outputs
        # normalize output
        # compute logits
        pass

    def load_gpt2_weights(self, gpt2: GPT2LMHeadModel) -> None:
        state_dict = {}

        state_dict["embed.W_E"] = gpt2.transformer.wte.weight
        state_dict["pos_embed.W_pos"] = gpt2.transformer.wpe.weight

        for l in range(cfg.n_layers):
            state_dict[f"blocks.{l}.ln1.w"] = gpt2.transformer.h[l].ln_1.weight
            state_dict[f"blocks.{l}.ln1.b"] = gpt2.transformer.h[l].ln_1.bias

            # In GPT-2, q,k,v are produced by one big linear map, whose output is
            # concat([q, k, v])
            W = gpt2.transformer.h[l].attn.c_attn.weight
            W_Q, W_K, W_V = torch.tensor_split(W, 3, dim=1)
            W_Q = einops.rearrange(W_Q, "m (i h)->i m h", i=cfg.n_heads)
            W_K = einops.rearrange(W_K, "m (i h)->i m h", i=cfg.n_heads)
            W_V = einops.rearrange(W_V, "m (i h)->i m h", i=cfg.n_heads)

            state_dict[f"blocks.{l}.attn.W_Q"] = W_Q
            state_dict[f"blocks.{l}.attn.W_K"] = W_K
            state_dict[f"blocks.{l}.attn.W_V"] = W_V

            qkv_bias = gpt2.transformer.h[l].attn.c_attn.bias
            qkv_bias = einops.rearrange(
                qkv_bias,
                "(qkv index head)->qkv index head",
                qkv=3,
                index=cfg.n_heads,
                head=cfg.d_head,
            )
            state_dict[f"blocks.{l}.attn.b_Q"] = qkv_bias[0]
            state_dict[f"blocks.{l}.attn.b_K"] = qkv_bias[1]
            state_dict[f"blocks.{l}.attn.b_V"] = qkv_bias[2]

            W_O = gpt2.transformer.h[l].attn.c_proj.weight
            W_O = einops.rearrange(W_O, "(i h) m->i h m", i=cfg.n_heads)
            state_dict[f"blocks.{l}.attn.W_O"] = W_O
            state_dict[f"blocks.{l}.attn.b_O"] = gpt2.transformer.h[l].attn.c_proj.bias

            state_dict[f"blocks.{l}.ln2.w"] = gpt2.transformer.h[l].ln_2.weight
            state_dict[f"blocks.{l}.ln2.b"] = gpt2.transformer.h[l].ln_2.bias

            W_in = gpt2.transformer.h[l].mlp.c_fc.weight
            state_dict[f"blocks.{l}.mlp.W_in"] = W_in
            state_dict[f"blocks.{l}.mlp.b_in"] = gpt2.transformer.h[l].mlp.c_fc.bias

            W_out = gpt2.transformer.h[l].mlp.c_proj.weight
            state_dict[f"blocks.{l}.mlp.W_out"] = W_out
            state_dict[f"blocks.{l}.mlp.b_out"] = gpt2.transformer.h[l].mlp.c_proj.bias
        state_dict["unembed.W_U"] = gpt2.lm_head.weight.T

        state_dict["ln_final.w"] = gpt2.transformer.ln_f.weight
        state_dict["ln_final.b"] = gpt2.transformer.ln_f.bias
        self.load_state_dict(state_dict)

In [None]:
rand_int_test(GPT, [batch, seq_len])

## Try GPT out!

In [None]:
demo_gpt2 = GPT(Config(debug=False)).to(device)
# demo_gpt2.load_gpt2_weights(gpt2)
demo_gpt2.load_state_dict(hooked_gpt2.state_dict(), strict=False)

In [None]:
# Initialize text
text = "Once upon a"
# Convert text to tensor format
tokens = tokenizer(text, return_tensors="pt").to(device)
print("Generating text...\n")
# Generate 10 characters iteratively
for i in range(20):
    with torch.inference_mode():
        # Get model predictions
        output_logits = demo_gpt2(tokens["input_ids"])
        # Select the most likely next token
        next_token = output_logits[0, -1].argmax(dim=-1)
        # Decode the token to a character
        next_char = tokenizer.decode(next_token)
    # Display the sequence so far
    current_text = tokenizer.decode(tokens["input_ids"][0])  # Reconstruct the string
    print(f"Generation step {i+1}:")
    print(f"Sequence so far: {current_text!r}")
    print(f"{tokens['input_ids'].shape[-1]+1}th char = {next_char!r}\n")
    # Append the new character and re-tokenize
    text += next_char
    tokens = tokenizer(text, return_tensors="pt").to(device)
print("Final text:", text)