# Let's implement GPT-2 using only NumPy!

### Clone the repository containing external utility code

In [None]:
# Clone the picoGPT repository into a scratch directory, then move its
# top-level (non-hidden) contents into the current working directory.
!git clone https://github.com/jaymody/picoGPT.git temp_dir
!mv temp_dir/* .
# Remove the scratch directory and anything the glob above left behind.
!rm -rf temp_dir
# Install the `fire` package (presumably needed by the cloned script's
# command-line entry point -- confirm against the repository).
!pip install fire

Cloning into 'temp_dir'...
remote: Enumerating objects: 56, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 56 (delta 26), reused 14 (delta 14), pack-reused 23[K
Receiving objects: 100% (56/56), 17.79 KiB | 1.98 MiB/s, done.
Resolving deltas: 100% (31/31), done.
Collecting fire
  Downloading fire-0.6.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.4/88.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.6.0-py2.py3-none-any.whl size=117029 sha256=b7f6c9eb2f450ebe8efff14b65708d12c72c023ef840cbaee11b326d471ef0e9
  Stored in directory: /root/.cache/pip/wheels/d6/6d/5d/5b73fa0f46d01a793713f8859201361e9e581ced8c75e5c6a3
Successfully built fire
Installing collected packages: fire
Succ

# GELU activation function for non-linearity in neural networks.

In [None]:
import numpy as np

def gelu(x):
    """Apply the tanh-based approximation of the GELU activation element-wise.

    Args:
        x: Scalar or NumPy array of pre-activation values.

    Returns:
        ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))`` with
        the same shape as ``x``.
    """
    # Argument of the tanh: a cubic polynomial in x scaled by sqrt(2 / pi).
    tanh_arg = np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)
    return 0.5 * x * (1 + np.tanh(tanh_arg))

# Softmax function for converting logits to probabilities.

In [None]:
def softmax(x):
    """Compute a numerically stable softmax over the last axis of ``x``.

    Args:
        x: Array of logits.

    Returns:
        Array of the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    # Subtracting the per-row maximum leaves the result unchanged but keeps
    # the exponentials from overflowing for large logits.
    row_max = np.max(x, axis=-1, keepdims=True)
    exponentials = np.exp(x - row_max)
    normalizer = np.sum(exponentials, axis=-1, keepdims=True)
    return exponentials / normalizer

# Normalizes the input 'x' to have 0 mean and unit variance, then scales and shifts it.

In [None]:
def layer_norm(x, g, b, eps: float = 1e-5):
    """Normalize ``x`` over its last axis, then apply a scale and a shift.

    Args:
        x: Input array; statistics are computed along the last axis.
        g: Multiplicative scale applied to the normalized values.
        b: Additive shift applied after scaling.
        eps: Small constant added to the variance before the square root so
            the division stays finite when the variance is zero.

    Returns:
        ``g * (x - mean) / sqrt(var + eps) + b``.
    """
    mu = np.mean(x, axis=-1, keepdims=True)
    sigma_sq = np.var(x, axis=-1, keepdims=True)
    centered = x - mu
    normalized = centered / np.sqrt(sigma_sq + eps)
    return g * normalized + b

# Linear transformation (matrix multiplication and bias addition).

In [None]:
def linear(x, w, b):
    """Apply an affine map: matrix-multiply ``x`` by ``w`` and add ``b``.

    Args:
        x: Input array (left operand of the matrix product).
        w: Weight matrix (right operand of the matrix product).
        b: Bias added to the product; broadcast by NumPy's usual rules.

    Returns:
        ``x @ w + b``.
    """
    projected = x @ w
    return projected + b

# A two-layer feed-forward network with GELU non-linearity.

In [None]:
def ffn(x, c_fc, c_proj):
    """Run the position-wise feed-forward network: linear -> GELU -> linear.

    Args:
        x: Input activations.
        c_fc: Keyword arguments for the first ``linear`` call (unpacked with
            ``**``, so its keys must match ``linear``'s parameter names).
        c_proj: Keyword arguments for the second ``linear`` call, unpacked
            the same way.

    Returns:
        Output of the second linear layer.
    """
    hidden = linear(x, **c_fc)
    activated = gelu(hidden)
    return linear(activated, **c_proj)

# Computes scaled dot-product attention with an additive mask.

In [None]:
def attention(q, k, v, mask):
    """Compute scaled dot-product attention with an additive mask.

    Args:
        q: Query array; its last-axis size sets the scaling factor.
        k: Key array; transposed with ``.T`` before the product.
        v: Value array.
        mask: Array added to the scaled scores before the softmax. Large
            negative entries suppress the corresponding positions.

    Returns:
        ``softmax(q @ k.T / sqrt(d) + mask) @ v`` where ``d = q.shape[-1]``.
    """
    scale = np.sqrt(q.shape[-1])
    scores = q @ k.T / scale + mask
    weights = softmax(scores)
    return weights @ v

# Splits inputs for multi-head attention, applies attention, and recombines outputs.

In [None]:
def mha(x, c_attn, c_proj, n_head):
    """Apply causal multi-head self-attention.

    Args:
        x: Input activations; the size of the first axis is used as the
            side length of the causal mask.
        c_attn: Keyword arguments for the ``linear`` call that produces the
            fused query/key/value projection.
        c_proj: Keyword arguments for the ``linear`` call applied to the
            re-joined head outputs.
        n_head: Number of equal chunks each of q, k and v is split into
            along the last axis.

    Returns:
        Output of the final linear projection.
    """
    # One projection yields q, k and v side by side on the last axis.
    fused = linear(x, **c_attn)
    q_full, k_full, v_full = np.split(fused, 3, axis=-1)

    # Cut each of q, k, v into per-head slices along the feature axis.
    q_heads = np.split(q_full, n_head, axis=-1)
    k_heads = np.split(k_full, n_head, axis=-1)
    v_heads = np.split(v_full, n_head, axis=-1)

    # Zero on and below the diagonal, -1e10 above it, so that after the
    # softmax each position puts (almost) no weight on later positions.
    seq_len = fused.shape[0]
    causal_mask = (1 - np.tri(seq_len, dtype=fused.dtype)) * -1e10

    head_outputs = [
        attention(q, k, v, causal_mask)
        for q, k, v in zip(q_heads, k_heads, v_heads)
    ]

    # Stitch the heads back together and mix them with the output projection.
    merged = np.hstack(head_outputs)
    return linear(merged, **c_proj)

# Applies a transformer block with multi-head attention and a feed-forward network.

In [None]:
def transformer_block(x, mlp, attn, ln_1, ln_2, n_head):
    """Apply one pre-norm transformer block with two residual connections.

    Args:
        x: Input activations.
        mlp: Keyword arguments unpacked into ``ffn``.
        attn: Keyword arguments unpacked into ``mha``.
        ln_1: Keyword arguments for the layer norm before attention.
        ln_2: Keyword arguments for the layer norm before the feed-forward
            network.
        n_head: Number of attention heads, forwarded to ``mha``.

    Returns:
        ``x`` plus the attention branch, plus the feed-forward branch
        computed on that intermediate sum.
    """
    # Attention sub-layer: normalize first, then add the result back to x.
    attn_branch = mha(layer_norm(x, **ln_1), **attn, n_head=n_head)
    x = x + attn_branch

    # Feed-forward sub-layer, using the same normalize-then-add pattern.
    ffn_branch = ffn(layer_norm(x, **ln_2), **mlp)
    return x + ffn_branch

# Runs the GPT-2 forward pass: embeds the input tokens, applies the stack of transformer blocks, and projects the result to vocabulary logits.

In [None]:
def gpt2(inputs, wte, wpe, blocks, ln_f, n_head):
    """Run the GPT-2 forward pass and return one row of logits per input.

    Args:
        inputs: Sequence of integer indices used to look up rows of ``wte``.
        wte: Embedding matrix indexed by ``inputs``; its transpose is reused
            as the output projection.
        wpe: Embedding matrix indexed by position ``0 .. len(inputs) - 1``.
        blocks: Iterable of per-block keyword-argument dicts, each unpacked
            into ``transformer_block``.
        ln_f: Keyword arguments for the final layer norm.
        n_head: Number of attention heads, forwarded to every block.

    Returns:
        ``layer_norm(hidden) @ wte.T``.
    """
    # Sum of token embeddings and position embeddings.
    token_emb = wte[inputs]
    position_emb = wpe[range(len(inputs))]
    hidden = token_emb + position_emb

    for block_params in blocks:
        hidden = transformer_block(hidden, **block_params, n_head=n_head)

    # Final normalization, then project back through the token embeddings.
    normalized = layer_norm(hidden, **ln_f)
    return normalized @ wte.T

# Generates text by repeatedly applying the model and selecting the highest-probability token (greedy decoding).

In [None]:
def generate(inputs, params, n_head, n_tokens_to_generate):
    """Greedily extend ``inputs`` one token at a time and return the new ids.

    Note that ``inputs`` is modified in place: each chosen id is appended to
    the caller's list, so it must support ``append``.

    Args:
        inputs: Mutable list of token ids to continue from.
        params: Keyword arguments unpacked into ``gpt2``.
        n_head: Number of attention heads, forwarded to ``gpt2``.
        n_tokens_to_generate: How many ids to append.

    Returns:
        The tail of ``inputs`` holding only the newly appended ids.
    """
    # Remember where the prompt ends so the new ids can be sliced off later.
    prompt_length = len(inputs)

    for _step in range(n_tokens_to_generate):
        # Re-run the model on everything so far (no caching between steps).
        all_logits = gpt2(inputs, **params, n_head=n_head)
        # Only the last row is used; pick its highest-scoring index.
        best_id = np.argmax(all_logits[-1])
        inputs.append(int(best_id))

    return inputs[prompt_length:]

# The main function in the script will:

- Load model parameters (GPT2 Weights)
- Encode the input
- Generate text
- Decode the output.

# Try it yourself below!

The first run will take a while because it has to download the GPT-2 weights, which were originally trained and released by OpenAI.

In [None]:
# Run gpt2.py in a subprocess, passing the prompt as its command-line argument.
!python gpt2.py "Hello my friend I am going to"

2024-04-02 13:58:14.581714: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-02 13:58:14.581764: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-02 13:58:14.583739: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-02 13:58:14.594902: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Fetching checkpoint: 1.00kb [00:00, 5.27Mb/s]        