In [2]:
# (B, T, C) = (batch size, sequence length in tokens, embedding dimension)
# The result ends in the same projection shape as before: (4, 16, 8)
# Per-batch shapes of Q, K, V: (8, 16), (16, 8), (8, 16)

In [3]:
import torch

# Batched matrix-product shape check:
#   (4, 8, 16) @ (4, 16, 8) -> (4, 8, 8), then (4, 8, 8) @ (4, 8, 16) -> (4, 8, 16).
q = torch.randn((4, 8, 16))
k = torch.randn((4, 16, 8))
v = torch.randn((4, 8, 16))

attention_scores = torch.matmul(q, k)       # shape (4, 8, 8)
output = torch.matmul(attention_scores, v)  # shape (4, 8, 16)
# Only the last expression of a cell is echoed, so just the final shape is shown.
output.shape

torch.Size([4, 8, 16])

In [4]:
from dataclasses import dataclass


@dataclass
class GPTConfig:
    """Hyperparameters for the toy GPT model built in this notebook.

    Being a dataclass, individual fields can be overridden at construction
    time (e.g. ``GPTConfig(n_embd=16)``). The defaults also remain readable
    as class attributes, so passing the class itself where only attribute
    access is needed keeps working.
    """

    block_size: int = 256  # side length of the causal mask built by MultiHeadSelfAttention
    vocab_size: int = 65   # presumably the tokenizer vocabulary size (not used in the visible cells)
    n_layer: int = 6       # presumably the number of transformer blocks (not used in the visible cells)
    n_head: int = 4        # number of attention heads
    n_embd: int = 8        # embedding dimension; must be divisible by n_head

import torch.nn as nn 
from torch.nn import functional as F
import math

class MultiHeadSelfAttention(nn.Module):
    """Causal multi-head self-attention layer.

    Args:
        config: object exposing ``n_embd`` (embedding dimension), ``n_head``
            (number of heads) and ``block_size`` (side length of the causal
            mask) as attributes.

    Raises:
        ValueError: if ``config.n_embd`` is not divisible by ``config.n_head``.
    """

    def __init__(self, config):
        super().__init__()
        if config.n_embd % config.n_head != 0:
            raise ValueError(
                f"n_embd ({config.n_embd}) must be divisible by n_head ({config.n_head})"
            )
        # One fused projection produces queries, keys and values together.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Output projection applied after the heads are merged back.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd

        # Lower-triangular mask of shape (1, 1, block_size, block_size); registered
        # as a buffer so it follows the module across devices without being trained.
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                             .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape (B, T, C) with C == n_embd; the mask is sliced to
                the first T rows/columns, so T must not exceed block_size.

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = x.size()
        head_dim = C // self.n_head
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)

        # The number of tokens stays the same; only the embedding axis is split
        # across the heads. Each head sees its own slice of the embedding, a bit
        # like applying several different filters (kernels in a CNN).
        # (B, T, n_head, head_dim) -> (B, n_head, T, head_dim)
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)

        # Scale by 1/sqrt(head_dim). The key tensor is transposed only inside the
        # product so the scale cannot accidentally pick up the sequence length.
        attention_scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
        # Block attention to future positions before normalising.
        attention_scores = attention_scores.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
        attention_scores = F.softmax(attention_scores, dim=-1)

        y = attention_scores @ v  # (B, n_head, T, head_dim)
        # Merge the heads back: (B, n_head, T, head_dim) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)

        return y

In [5]:
# Smoke test: run a random (4, 16, 8) batch through the attention layer and
# show the resulting shape. GPTConfig is passed as a class; the layer only
# reads its attributes.
attention = MultiHeadSelfAttention(GPTConfig)
y = attention(torch.randn(size=(4, 16, 8)))
y.shape

torch.Size([4, 16, 8])

In [6]:
from transformers import GPT2LMHeadModel
# Load the pretrained "gpt2" checkpoint and list every entry of its state dict
# together with its shape.
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd_hf = model_hf.state_dict()

# Use `name`/`param` rather than `k`/`v` so the notebook-level `k` and `v`
# tensors defined in an earlier cell are not silently overwritten.
for name, param in sd_hf.items():
    print(name, param.shape)

  from .autonotebook import tqdm as notebook_tqdm


transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [7]:
# The integers 0 through 9 as a 1-D tensor.
tensor = torch.arange(10)
tensor

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
# Lookup table with 4 entries, each a 4-dimensional embedding vector.
embedding = nn.Embedding(num_embeddings=4, embedding_dim=4)
# Seven identical rows of indices [1, 2, 3]; every index selects one table row.
embd_tensor = torch.tensor([[1, 2, 3]] * 7, dtype=torch.long)
embedding(embd_tensor)

tensor([[[-0.6464,  0.9868,  0.6733, -0.1397],
         [ 0.6725, -0.8907,  1.2728,  0.6364],
         [ 1.6949,  0.4887, -0.3030,  0.2740]],

        [[-0.6464,  0.9868,  0.6733, -0.1397],
         [ 0.6725, -0.8907,  1.2728,  0.6364],
         [ 1.6949,  0.4887, -0.3030,  0.2740]],

        [[-0.6464,  0.9868,  0.6733, -0.1397],
         [ 0.6725, -0.8907,  1.2728,  0.6364],
         [ 1.6949,  0.4887, -0.3030,  0.2740]],

        [[-0.6464,  0.9868,  0.6733, -0.1397],
         [ 0.6725, -0.8907,  1.2728,  0.6364],
         [ 1.6949,  0.4887, -0.3030,  0.2740]],

        [[-0.6464,  0.9868,  0.6733, -0.1397],
         [ 0.6725, -0.8907,  1.2728,  0.6364],
         [ 1.6949,  0.4887, -0.3030,  0.2740]],

        [[-0.6464,  0.9868,  0.6733, -0.1397],
         [ 0.6725, -0.8907,  1.2728,  0.6364],
         [ 1.6949,  0.4887, -0.3030,  0.2740]],

        [[-0.6464,  0.9868,  0.6733, -0.1397],
         [ 0.6725, -0.8907,  1.2728,  0.6364],
         [ 1.6949,  0.4887, -0.3030,  0.2740]]],

In [23]:
# torch.gather demo: pick one value per row.
# NOTE: the names are inverted relative to their contents — `props` holds the
# integer column indices and `indices` holds the random values gathered from.
props = torch.randint(low=0, high=10, size=(5, 1))  # one column index in [0, 10) per row
indices = torch.rand(5, 10)                         # 5 rows of 10 uniform random values

output = indices.gather(-1, props)  # output[i, 0] == indices[i, props[i, 0]]
output

tensor([[0.8868],
        [0.8982],
        [0.6461],
        [0.1346],
        [0.7476]])

In [24]:
import torch
from torch.nn import functional as F

import tiktoken

# Sampling configuration.
NUM_RETURN_SEQUENCES = 5  # number of continuations generated in parallel
MAX_LENGTH = 30           # total sequence length (prompt + generated tokens)
TOP_K = 50                # huggingface pipeline default
SEED = 42

# Fall back to the CPU so the cell still runs on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GPT2LMHeadModel.from_pretrained("gpt2") # 124M
model.eval()
model.to(device)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
tokens = [15496, 11, 314, 1101, 257, 3303, 2746, 11] # "Hello, I'm a language model,"
tokens = torch.tensor(tokens, dtype=torch.long) # (8,)
tokens = tokens.unsqueeze(0).repeat(NUM_RETURN_SEQUENCES, 1) # (5, 8)
x = tokens.to(device)

# generate!
# Gradients are never needed while sampling, so disable them for the whole loop.
with torch.no_grad():
    while x.size(1) < MAX_LENGTH:
        # forward the model to get the logits
        logits = model(x)[0] # (B, T, vocab_size)
        # take the logits at the last position
        logits = logits[:, -1, :] # (B, vocab_size)
        # get the probabilities
        probs = F.softmax(logits, dim=-1)
        # do top-k sampling: topk_probs and topk_indices are both (B, TOP_K)
        topk_probs, topk_indices = torch.topk(probs, TOP_K, dim=-1)
        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        ix = torch.multinomial(topk_probs, 1) # (B, 1)
        # map the sampled positions back to vocabulary ids
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # append to the sequence
        x = torch.cat((x, xcol), dim=1)

# print the generated text
enc = tiktoken.get_encoding('gpt2')
for i in range(NUM_RETURN_SEQUENCES):
    # Decode straight from the row instead of rebinding `tokens` to a Python list.
    decoded = enc.decode(x[i, :MAX_LENGTH].tolist())
    print(">", decoded)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


> Hello, I'm a language model, not a program.

So this morning I started studying for the interview in the lab. This was not
> Hello, I'm a language model, and one of the main things that bothers me when they create languages is how easy it becomes to create something that
> Hello, I'm a language model, and I wrote it off on the grounds that a language model would make me more fluent. But I'm not
> Hello, I'm a language model, I really like languages. I like languages because like, they're good. And the way we talk about languages
> Hello, I'm a language model, a language model I'm using for data modelling. All I did was test the results and then I wrote some
