# Build GPT from scratch

In [15]:
# Seed passed to torch.manual_seed in later cells so that random draws are repeatable
MANUAL_SEED = 1337

## Preprocess dataset

In [1]:
# Load the full training corpus into memory as one string
with open('../data/input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

print("Length of dataset in characters: ", len(text))

Length of dataset in characters:  1115394


In [2]:
# Preview the first 100 characters of the corpus
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
# Collect every distinct character of the corpus in sorted order;
# the number of distinct characters is the vocabulary size
chars = sorted(set(text))
vocab_size = len(chars)
print(*chars, sep="")
print("Vocabulary size: ", vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary size:  65


In [4]:
# Create a mapping of characters to integers, and the inverse mapping
stoi = {c: i for i, c in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(msg):
    """Encode a string as a list of integer token ids.

    Each character is looked up in ``stoi``; a character that is not in the
    vocabulary raises ``KeyError``.
    """
    return [stoi[c] for c in msg]


def decode(e_msg):
    """Decode an iterable of integer token ids back into a string.

    Each id is looked up in ``itos``; an id that is not in the vocabulary
    raises ``KeyError``.
    """
    return "".join(itos[i] for i in e_msg)


print(encode("Hello, world!"))
print(decode(encode("Hello, world!")))

[20, 43, 50, 50, 53, 6, 1, 61, 53, 56, 50, 42, 2]
Hello, world!


## Tokenize the entire dataset

In [6]:
import torch 
# Encode the whole corpus into a 1-D tensor of int64 token ids
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:10])

  from .autonotebook import tqdm as notebook_tqdm


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


## Split dataset

In [7]:
# Train on the first 90% of the tokens and hold out the last 10% for validation
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

## Get a chunk of data for training

In [8]:
# A chunk of block_size + 1 tokens yields block_size (context, target) training pairs
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [12]:
# Every prefix of x is a context; its target is the token that follows that prefix
x, y = train_data[:block_size], train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"When input is {context} the target is: {target}")


When input is tensor([18]) the target is: 47
When input is tensor([18, 47]) the target is: 56
When input is tensor([18, 47, 56]) the target is: 57
When input is tensor([18, 47, 56, 57]) the target is: 58
When input is tensor([18, 47, 56, 57, 58]) the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is: 58


In [13]:
torch.manual_seed(MANUAL_SEED)
batch_size = 4  # how many independent sequences are processed in parallel
block_size = 8  # maximum context length for predictions


def get_batch(split):
    """Sample a random batch of input and target sequences.

    ``split == 'train'`` draws from ``train_data``; any other value draws from
    ``val_data``. Reads the globals ``batch_size`` and ``block_size`` at call
    time. Returns ``(x, y)``, each stacking ``batch_size`` windows of
    ``block_size`` tokens, where ``y`` is the window starting one position
    after the corresponding window in ``x``.
    """
    source = val_data if split != 'train' else train_data
    # One random start offset per sequence; the upper bound leaves room for the shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets


In [14]:
# Draw one training batch and show the inputs followed by their shifted targets
xb, yb = get_batch('train')
print("inputs:", xb, sep="\n")
print("targets:", yb, sep="\n")


inputs:
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


## Bigram language model

- B: Batch_size
- T: Length of the input sequence
- C: Number of features per token (size of the embedding)

In [67]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(MANUAL_SEED)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The model is a single embedding table with one row per token id; the row
    looked up for a token is used directly as the logits over the vocabulary
    for the token that follows it.
    """

    def __init__(self):
        super().__init__()

        # Learnable (vocab_size, vocab_size) table, randomly initialised and
        # optimised during training
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, C) and loss
            is ``None``. With targets, logits is flattened to (B*T, C) and loss
            is the cross-entropy between the flattened logits and targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy is given (B*T, C) logits and (B*T,) targets
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by ``max_new_tokens`` sampled tokens.

        Args:
            idx: (B, T) integer tensor of token indices forming the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: the input followed by the samples.

        Runs under ``torch.no_grad()``: sampling never needs gradients, so no
        autograd graph is built while generating.
        """
        for _ in range(max_new_tokens):
            # predictions for every position of the running sequence
            logits, _ = self(idx)

            # keep only the last time step: (B, T, C) -> (B, C)
            logits = logits[:, -1, :]

            # turn logits into a probability distribution over the vocabulary
            probs = F.softmax(logits, dim=-1)  # (B, C)

            # sample one next token per sequence
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # append the sampled token to the running sequence
            idx = torch.cat([idx, idx_next], dim=1)

        return idx

# Instantiate the model and run one forward pass on the sample batch.
# Targets are passed, so the returned logits are flattened to (B*T, C).
model = BigramLanguageModel()
logits, loss = model(xb, yb)
print(logits.shape)
print(loss.item())

# Generate 100 tokens from the untrained model, starting from a single token
idx = torch.zeros((1,1), dtype=torch.long) # Contains the character corresponding to index 0 (newline)
print(decode(model.generate(idx, max_new_tokens=100)[0].tolist())) # decode the first sequence in the batch of generated completions


torch.Size([256, 65])
4.611330509185791

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


### Samples from model

- Generation on initialization (loss 4.8786): "SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ"

- Generation after ~20000 steps of training (loss 2.5770): "Fouthe f Goversthy harmarend t:
Musthee aved tef t thaphapayeeraie ce. t, ndedigetlot;
W:


A ityove
"

## Train the Model

In [26]:
# Create a PyTorch AdamW optimizer over all of the model's parameters (learning rate 1e-3)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [34]:
# get_batch reads this global, so every batch drawn from here on has 32 sequences
batch_size = 32
for steps in range(10000):
    # clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # forward pass on a freshly sampled training batch
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)

    # backpropagate and update the parameters
    loss.backward()
    optimizer.step()

# loss of the final training batch only
print(loss.item())

2.576992988586426


## Better estimate the loss

Since the loss is calculated on a single batch, it may not represent the loss over the whole dataset well.

To get a better estimate, we'll calculate the loss over many batches and return the mean.

In [49]:
@torch.no_grad()
def estimate_loss(eval_iters=100):
    """Estimate the mean loss of ``model`` on the train and validation splits.

    For each split, draws ``eval_iters`` random batches with ``get_batch`` and
    averages their losses, which is less noisy than a single-batch loss.

    The model is put in eval mode for the duration of the estimate and is then
    returned to whichever mode (train or eval) it was in before the call, even
    if evaluation raises.

    Args:
        eval_iters: number of batches averaged per split.

    Returns:
        dict with keys ``'train'`` and ``'val'`` mapping to the mean loss (float).
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                xb, yb = get_batch(split)
                _, loss = model(xb, yb)
                losses[k] = loss.item()

            out[split] = losses.mean().item()
    finally:
        # Restore the caller's mode instead of unconditionally switching to train
        model.train(was_training)
    return out

print(estimate_loss())

{'train': 2.4555540084838867, 'val': 2.4908673763275146}


In [41]:
# Sample 200 new tokens from the model, starting from the context held in idx
print(decode(model.generate(idx, max_new_tokens=200)[0].tolist())) # decode the first sequence in the batch of generated completions



hintheth miouro oulof I and s l SS:
Iffof; hat t-asoresere ashath fover;

AUMENGHave ie? nds to wisus, athal
Fiotha her owa
Fouidif toury aris dor yoress ane hit in,
O:
LETAUns.
Isat t ust far thas s 


## Increase attention

Every token will receive the context of the previous tokens.

For now, we'll average the values of the features from the previous tokens

In [50]:
torch.manual_seed(MANUAL_SEED)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

# xbow ("x bag of words"): xbow[b, t] is the mean of x[b, 0..t],
# i.e. of the current token and every earlier token in the same sequence
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]  # (t+1, C)
        xbow[b, t] = torch.mean(xprev, dim=0)

In [53]:
# Row t of xbow is the average of rows 0..t of x, so the first rows match
# and the last row of xbow is the average of every row of x
x[0], xbow[0]

(tensor([[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

### Optimize calculation with matrix multiplication

In [54]:
# Lower-triangular matrix of ones: row i has ones in columns 0..i and zeros after
torch.tril(torch.ones(3,3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [60]:
# Multiplying by a lower-triangular matrix of ones produces running sums:
# row i of c is the sum of rows 0..i of b
torch.manual_seed(MANUAL_SEED)
a = torch.ones(3, 3).tril()
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(
    "---------A---------\n", a,
    "\n---------B---------\n", b,
    "\n---------C---------\n", c,
    sep="\n",
)

---------A---------

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

---------B---------

tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])

---------C---------

tensor([[ 5.,  7.],
        [ 7.,  7.],
        [12., 10.]])


In [61]:
# Normalising each row of the triangular matrix to sum to 1 turns the running
# sums into running means: row i of c is the mean of rows 0..i of b
torch.manual_seed(MANUAL_SEED)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(
    "---------A---------\n", a,
    "\n---------B---------\n", b,
    "\n---------C---------\n", c,
    sep="\n",
)

---------A---------

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

---------B---------

tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])

---------C---------

tensor([[5.0000, 7.0000],
        [3.5000, 3.5000],
        [4.0000, 3.3333]])


In [64]:
# Same averaging as the explicit loops, expressed as a single matrix multiply
wei = torch.ones(T, T).tril()
wei = wei / torch.sum(wei, dim=1, keepdim=True)  # normalise so each row sums to 1
xbow2 = torch.matmul(wei, x)  # (T,T) @ (B,T,C): wei is broadcast over the batch -> (B,T,C)
torch.allclose(xbow, xbow2)

True

### Using softmax

Even though we initialize wei as zeros here, it could instead be a matrix of
weights expressing how strongly the tokens relate to each other.

Then, we set the entries for future tokens to -inf so they don't interfere.



In [66]:
tril = torch.ones(T, T).tril()

# Start from all-zero affinities and mask out the future: every position that
# is 0 in tril (above the diagonal) is set to -inf, so softmax gives it zero weight
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))

# Row-wise softmax; with equal (zero) scores each row is uniform over the unmasked positions
wei = F.softmax(wei, dim=-1)
xbow3 = torch.matmul(wei, x)
torch.allclose(xbow, xbow3)

True

### Apply to the model