In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import math
import tqdm

In [14]:
# Read the whole training corpus into memory as one string.
with open('wiki.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
# Load the token vocabulary from disk and build the reverse (id -> string) lookup.
import json

# Pass the encoding explicitly so the vocabulary is decoded the same way as the
# corpus (which is read as utf-8) regardless of the platform's default locale.
with open('vocab_wiki.json', 'r', encoding='utf-8') as f:
    vocab = json.load(f)  # token string -> id (ids end up as entries of a long tensor)

# Inverse mapping used by decode(): id -> token string.
token_to_string = {token_id: token for token, token_id in vocab.items()}

print('vocab size: ', len(vocab))

def tokenize(text):
    """Greedily encode `text` into ids from the module-level `vocab`.

    The current piece is extended one character at a time for as long as the
    extended string is still a vocabulary entry. When an extension is not in
    the vocabulary, the piece collected so far is emitted and a new piece is
    started from the current character. The piece still pending when the input
    ends is emitted as well, so the final token is never dropped.

    Args:
        text: string to encode.

    Returns:
        (encoded, tokens_raw): two parallel lists holding the id of each
        emitted piece and the piece itself. A piece that is not a vocabulary
        key gets the id `None` (the result of `vocab.get`).
    """
    encoded = []
    tokens_raw = []

    def emit(piece):
        # An empty piece only arises when the very first character is unknown;
        # it is never a real token, so skip it.
        if piece:
            encoded.append(vocab.get(piece))
            tokens_raw.append(piece)

    cur = ""
    for c in tqdm.tqdm(text):
        candidate = cur + c
        if vocab.get(candidate) is None:
            # Cannot extend any further: flush what we have and restart at c.
            emit(cur)
            cur = c
        else:
            cur = candidate

    # Flush the piece that was still being built when the text ended.
    emit(cur)

    print('chars in text: ', len(text))
    print('tokens after encoding: ', len(tokens_raw))

    return encoded, tokens_raw

def decode(encoded):
    """Map each token id back to its string via `token_to_string` and concatenate them."""
    return "".join(token_to_string[token_id] for token_id in encoded)

# Encode the full corpus once; `tokens_raw` holds the string piece behind each id.
encoded, tokens_raw = tokenize(text)
print(encoded[:100])

# Decode again and print the first 100 characters for a visual round-trip check.
decoded = decode(encoded)
print(decoded[:100])

# Tensor view of the corpus plus the vocabulary size the model is built with.
vocab_size = len(vocab)
data = torch.tensor(encoded, dtype=torch.long)
print(data[:100])

# First 90% of the tokens are used for training, the remainder for validation.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [16]:
# config
# -- model shape --
n_layers = 8        # number of stacked transformer blocks
n_heads = 8         # attention heads per block
n_emb = 512         # embedding width
block_size = 256    # context length in tokens (also the size of the position table)
dropout = 0.2       # NOTE: the modules in this notebook build their Dropout layers with a literal 0.0

# -- optimisation --
batch_size = 64
learning_rate = 3e-4
max_iters = 5000

# -- evaluation --
eval_interval = 500  # run estimate_loss() every this many training iterations
eval_iters = 200     # batches averaged per split inside estimate_loss()

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Keep both splits resident on the compute device.
train_data = train_data.to(device)
val_data = val_data.to(device)

In [17]:
# Intuition: self-attention adjusts each token's embedding so that it carries richer contextual information drawn from the other tokens in the sequence

class SelfAttention(nn.Module):
    """Causal multi-head self-attention.

    Input and output are both (B, T, n_emb). A single linear layer produces
    the queries, keys and values for all heads at once; each position may only
    attend to itself and to earlier positions.

    Intuition:
      - Q is what a token is looking for.
      - K describes what each token offers to be matched against.
      - V is the content that gets mixed into the querying token.
      - Several heads let a token gather several different kinds of context
        from the sequence in parallel.
    """

    def __init__(self):
        super().__init__()
        # Fail fast: forward() reshapes the embedding into n_heads equal slices.
        if n_emb % n_heads != 0:
            raise ValueError("n_emb must be divisible by n_heads")

        self.attn = nn.Linear(n_emb, n_emb * 3)   # fused Q, K, V projection
        self.attn_dropout = nn.Dropout(0.0)
        self.resid_dropout = nn.Dropout(0.0)
        self.proj = nn.Linear(n_emb, n_emb)       # output projection after the heads are merged

        # Lazily built lower-triangular mask; see _causal_mask().
        self.bias = None

    def _causal_mask(self, T, device):
        """Return a (1, 1, T, T) lower-triangular mask of ones on `device`.

        The cached mask is rebuilt only when it is missing, too small for T, or
        lives on a different device. Shorter sequences reuse a slice of the
        existing mask instead of triggering a rebuild.
        """
        cached = self.bias
        if cached is None or cached.size(-1) < T or cached.device != device:
            # Allocate directly on the target device (no CPU round trip).
            cached = torch.tril(torch.ones(T, T, device=device)).view(1, 1, T, T)
            self.bias = cached
        return cached[:, :, :T, :T]

    # Multi-dimensional matrix math so that all heads are computed in parallel.
    def forward(self, x):
        B, T, C = x.size()  # B = batch size, T = sequence length, C = embedding dimension
        # Each head works on its own C // n_heads wide slice of the embedding.
        head_dim = C // n_heads
        mask = self._causal_mask(T, x.device)

        # One matmul yields Q, K and V side by side; split them apart again.
        Q, K, V = self.attn(x).split(n_emb, dim=2)

        # (B, T, C) -> (B, n_heads, T, head_dim): heads become a batch-like dimension.
        Q = Q.view(B, T, n_heads, head_dim).transpose(1, 2)
        K = K.view(B, T, n_heads, head_dim).transpose(1, 2)
        V = V.view(B, T, n_heads, head_dim).transpose(1, 2)

        # Dot products between queries and keys score how relevant each token is
        # to every other token; scaling by sqrt(head_dim) keeps the scores from
        # growing with the head width.
        att = (Q @ K.transpose(2, 3)) * 1.0 / math.sqrt(K.size(3))

        # Don't allow later tokens to influence earlier tokens: masked scores
        # become -inf and therefore get zero weight after the softmax.
        att = att.masked_fill(mask == 0, float('-inf'))

        # Normalise each row of scores into attention weights.
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)

        # Weighted sum of the values: what each token takes from the tokens it attends to.
        out = att @ V

        # transpose() returns a non-contiguous view and view() needs contiguous
        # memory, hence contiguous() before merging the heads back into C.
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        out = self.resid_dropout(self.proj(out))

        return out

In [18]:
# Intuition: the MLP is the "feedforward" part of the transformer and 
# it adjusts the embeddings differently than the self-attention layer does 
# by having non-linearities
class MLP(nn.Module):
    """Position-wise feed-forward network: widen to 4 * n_emb, GELU, project back to n_emb."""

    def __init__(self):
        super().__init__()
        hidden_width = 4 * n_emb
        self.l1 = nn.Linear(n_emb, hidden_width)
        self.gelu = nn.GELU()
        self.proj = nn.Linear(hidden_width, n_emb)
        self.dropout = nn.Dropout(0.)

    def forward(self, x):
        # Expand and apply the non-linearity ...
        hidden = self.gelu(self.l1(x))
        # ... then project back to the embedding width and apply dropout.
        return self.dropout(self.proj(hidden))

In [19]:
# The block is the combination of the self-attention and the MLP

class Block(nn.Module):
    """One transformer layer: LayerNorm -> attention and LayerNorm -> MLP, each added back residually."""

    def __init__(self):
        super().__init__()
        self.attn = SelfAttention()
        self.mlp = MLP()

        self.norm1 = nn.LayerNorm(n_emb)
        self.norm2 = nn.LayerNorm(n_emb)

    def forward(self, x):
        # Residual (skip) connection around attention: the sub-layer only has
        # to learn what to add to the embeddings.
        attended = self.attn(self.norm1(x))
        x = x + attended

        # Same pattern around the feed-forward network.
        transformed = self.mlp(self.norm2(x))
        return x + transformed


In [20]:
class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    `n_layers` Blocks and a final LayerNorm, then projected by a bias-free
    linear head to one score per vocabulary entry.

    forward(x):
        x: (B, T) tensor of token ids; T is used to index the position table,
           which has `block_size` rows.
        returns: (B, T, vocab_size) logits, i.e. unnormalised scores (apply a
           softmax to obtain probabilities).
    """

    def __init__(self):
        super().__init__()
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_emb),   # token embeddings
            pte = nn.Embedding(block_size, n_emb),   # learned position embeddings
            drop = nn.Dropout(0.0),
            blocks = nn.ModuleList([Block() for _ in range(n_layers)]),
            ln_f = nn.LayerNorm(n_emb),
        ))
        self.head = nn.Linear(n_emb, vocab_size, bias=False)

    def forward(self, x):
        # t is the length of the input sequence.
        t = x.size(1)

        # Position indices 0..t-1, created on the same device as the input so
        # the model works wherever it and its inputs live; unsqueeze adds the
        # batch dimension.
        pos = torch.arange(t, device=x.device).unsqueeze(0)

        # Token embeddings + position embeddings: the sum tells the model both
        # which token it sees and where in the sequence it sits.
        t_emb = self.transformer.wte(x)
        p_emb = self.transformer.pte(pos)
        x = self.transformer.drop(t_emb + p_emb)

        # Each block mixes in context (attention) and then transforms every
        # position independently (MLP).
        for block in self.transformer.blocks:
            x = block(x)

        # Final layer norm before the output projection.
        x = self.transformer.ln_f(x)

        # Project every position onto the vocabulary to obtain the logits.
        return self.head(x)

In [21]:
# Get a batch of data
def get_batch(split):
    """Sample `batch_size` random windows of `block_size` tokens and their next-token targets.

    `split == 'train'` reads from `train_data`; any other value reads from
    `val_data`. Returns (x, y), both moved to `device`, where each row of y is
    the matching row of x shifted one token to the right in the source data.
    """
    source = train_data if split == 'train' else val_data

    # One random start offset per example, leaving room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + 1 + block_size])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [22]:
# Build the network, move it to the compute device, then give its parameters to AdamW.
model = GPT()
model = model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=learning_rate)

In [23]:
@torch.no_grad()
def estimate_loss():
    """Average the cross-entropy loss over `eval_iters` random batches per split.

    Puts the global `model` in eval mode for the measurement and back in train
    mode afterwards. Returns a dict with keys 'train' and 'val' whose values
    are scalar tensors holding the mean loss.
    """
    model.eval()
    out = {}
    for split in ('train', 'val'):
        losses = torch.zeros(eval_iters)
        for k in tqdm.tqdm(range(eval_iters)):
            # Fresh random batch for this split, then a forward pass.
            X, Y = get_batch(split)
            logits = model(X)

            # cross_entropy expects (N, classes) scores and (N,) targets, so
            # fold the batch and time dimensions together.
            batch, steps, n_classes = logits.shape
            flat_logits = logits.view(batch * steps, n_classes)
            flat_targets = Y.view(batch * steps)
            losses[k] = F.cross_entropy(flat_logits, flat_targets).item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
# Training loop
# `step` (rather than `iter`) avoids shadowing the builtin for the rest of the notebook.
for step in tqdm.tqdm(range(max_iters)):
    # Periodically report the averaged train/validation loss.
    if step % eval_interval == 0:
        print("Estimate")
        losses = estimate_loss()
        print(f"Step {step}: Train Loss {losses['train']:.4f}, Validation Loss {losses['val']:.4f}")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Forward pass: flatten (B, T, C) logits and (B, T) targets for cross_entropy.
    logits = model(xb)
    B, T, C = logits.shape
    loss = F.cross_entropy(logits.view(B * T, C), yb.view(B * T))

    # Backward pass
    optimizer.zero_grad(set_to_none=True)
    loss.backward()

    # Clip gradients BEFORE the update so the optimizer only ever sees the
    # clipped gradients, then apply exactly one optimizer step per batch.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

# Save the trained model
torch.save(model.state_dict(), "gpt_model.pth")
print("Model saved.")

In [None]:
# Sum the element counts of every parameter tensor to report the model size.
total_params = sum(map(lambda weights: weights.numel(), model.parameters()))
print(f'Total Parameters: {total_params}')

In [None]:
@torch.no_grad()
def generate(model, start_text, max_new_tokens=1000):
    """Sample a continuation of `start_text` from `model`, one token at a time.

    Args:
        model: callable mapping a (1, T) tensor of token ids to (1, T, vocab) logits.
        start_text: prompt string; encoded with `tokenize`.
        max_new_tokens: number of tokens to sample and append.

    Returns:
        The decoded string for the prompt tokens followed by the sampled tokens.

    Raises:
        ValueError: if tokens are requested but the prompt encodes to no tokens,
            since there is then no context to predict from.
    """
    model.eval()

    prompt_ids = tokenize(start_text)[0]
    if max_new_tokens > 0 and len(prompt_ids) == 0:
        raise ValueError("start_text produced no tokens; cannot sample without a context")

    generated = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(device)
    for _ in tqdm.trange(max_new_tokens):

        # Feed at most the last block_size tokens: the model's context window.
        logits = model(generated[:, -block_size:])

        # Logits are (batch, time, vocab); keep only the prediction made at the
        # last position, which is the distribution over the next token.
        logits = logits[:, -1, :]

        # Sample from the distribution
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        generated = torch.cat((generated, next_token), dim=1)

    # Index the single batch row instead of squeeze(): squeeze() would collapse
    # a one-token sequence to a scalar, and tolist() would then return an int.
    return decode(generated[0].tolist())

# Restore the saved weights for inference.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(state_dict=torch.load(f="gpt_model.pth", map_location=device))
model = model.to(device)

# Example usage: sample a continuation of a short whitespace prompt.
start_text = "   "
print(generate(model, start_text=start_text))