# Mini-LLMs
Harvard AI Bootcamp

## Make a copy of this notebook! Editing directly will not be saved.

Let's build GPT with a Shakespeare dataset!

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

Hyperparameters are adjustable settings or configurations that determine a machine learning model's structure, learning rate, or optimization process. They are set before training and remain constant throughout the learning process. Properly tuning hyperparameters can significantly affect a model's performance, generalization ability, and convergence speed.

In [None]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 50 # number of iterations the training loop runs
eval_interval = 100 # estimate train/val loss every this many iterations (and on the last one)
learning_rate = 1e-3 # step size to hand to the optimizer
device = None # placeholder; to be replaced by the device-selection TODO cell
eval_iters = 200 # number of batches averaged per split in estimate_loss()
n_embd = 64 # embedding dimension used throughout the model
n_head = 4 # number of attention heads in each transformer block
n_layer = 4 # number of transformer blocks stacked in the model
dropout = 0.0 # probability passed to every nn.Dropout layer (0.0 disables dropout)

1. **batch_size:** It defines the number of independent sequences processed in parallel during each iteration of model training, influencing the efficiency and resource utilization of the training process.

2. **block_size:** This hyperparameter sets the maximum context length for predictions, determining the size of input sequences and the range of dependencies the model can capture.

3. **max_iters:** It specifies the maximum number of iterations (training steps) during model training, controlling the duration of the training process.

4. **eval_interval:** This hyperparameter sets how often (in training iterations) the model's loss is estimated on the training and validation sets, which helps monitor training progress and detect overfitting.

5. **learning_rate:** It determines the step size in updating the model's parameters during optimization, influencing the convergence and stability of the training process.

6. **device:** It specifies the computing device (GPU or CPU) on which the model and the data batches are placed. It starts out as `None` and should be set according to the available hardware in the TODO cell below.

7. **eval_iters:** It sets the number of iterations used for estimating the loss during evaluation, impacting the reliability of performance assessment.

8. **n_embd:** This hyperparameter defines the dimensionality of the model's embeddings, influencing the model's capacity to represent and learn complex patterns in the data.

9. **n_head:** It specifies the number of self-attention heads in the model, determining the diversity and parallel processing capability of attention mechanisms.

10. **n_layer:** This hyperparameter sets the number of transformer blocks or layers in the model, affecting its depth and capacity to capture hierarchical features.

11. **dropout:** It controls the probability with which activations are randomly zeroed during training (dropout regularization), which improves generalization and helps prevent overfitting.

In [None]:
# TODO: change device to use GPU if GPU is available, otherwise use CPU

In [None]:
# getting the Shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
# seed PyTorch's random number generator so that runs are repeatable
torch.manual_seed(1337)

# TODO: open the file, read the input data input.txt and store that in a variable called text
# placeholder: text must be a string before the later cells run, since encode(text) iterates over its characters
text = None

In [None]:
# TODO: compute all the unique characters that occur in this text, store that in chars
# NOTE: chars must be an ordered sequence, because enumerate(chars) in the next cell
# assigns each character its integer id; len(None) below raises TypeError until this is filled in
chars = None
# number of distinct characters; sizes the token embedding table and the output layer of the model
vocab_size = len(chars)

In [None]:
# character-level tokenizer: map every character to an integer id and back
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, output the list of integer ids of its characters.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, output the corresponding string.

    Raises KeyError for an id that is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)


# Train and validation splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val

In [None]:
# TODO: split in train_data and val_data based on n
# placeholders: get_batch() slices these tensors, so both must be filled in before it is called
train_data = None  # expected: the first n tokens of data
val_data = None  # expected: the remaining tokens of data

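The `get_batch` function samples a random batch of input-target pairs from the training or validation split. Each input is a window of `block_size` consecutive tokens, and its target is the same window shifted one position ahead, so the model learns to predict the next character at every position.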

In [None]:
# data loading
def get_batch(split):
    """Sample a random batch of training examples.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        A pair (x, y) of (batch_size, block_size) tensors moved to `device`,
        where y holds the same windows as x shifted one token ahead.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def _windows(shift):
        # stack the block_size-long slices beginning at start + shift
        return torch.stack([source[s + shift:s + shift + block_size] for s in starts])

    inputs = _windows(0)
    targets = _windows(1)
    return inputs.to(device), targets.to(device)

In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the current loss on the 'train' and 'val' splits.

    For each split, eval_iters random batches are drawn with get_batch() and
    their losses are averaged. The model is switched to eval mode for the
    measurement and back to train mode before returning.

    Returns:
        dict mapping 'train' and 'val' to the mean loss (a 0-d tensor).
    """
    def _mean_loss(split):
        per_batch = torch.zeros(eval_iters)
        for slot in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            per_batch[slot] = batch_loss.item()
        return per_batch.mean()

    model.eval()
    averages = {split: _mean_loss(split) for split in ('train', 'val')}
    model.train()
    return averages

One-head self-attention refers to a specific instance where a single attention mechanism or "head" is employed. Self-attention mechanisms are a key component in transformer architectures, allowing the model to weigh different parts of the input sequence differently, based on learned attention scores.



In [None]:
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: output dimension of the key, query and value projections.

    The projections read n_embd input features, the causal mask supports
    sequences of up to block_size tokens, and the attention weights go through
    nn.Dropout(dropout); all three are module-level hyperparameters.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask, registered as a buffer: it follows the module
        # across devices but is not a trainable parameter
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: tensor of shape (B, T, n_embd) with T <= block_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)   # (B, T, hs)
        q = self.query(x) # (B, T, hs)
        # compute attention scores ("affinities"); scale by 1/sqrt(head_size),
        # the dimension the dot product sums over, not by the embedding size
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # a position may only attend to itself and to earlier positions
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B, T, hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

The purpose of using multiple attention heads is to allow the model to attend to different parts of the input sequence simultaneously, enabling the capture of diverse patterns and dependencies.

In [None]:
class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel.

    Args:
        num_heads: number of Head modules to run side by side.
        head_size: output dimension of each head.

    The head outputs are concatenated along the last dimension, projected to
    n_embd features and passed through nn.Dropout(dropout).
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # the concatenation has num_heads * head_size features, which equals
        # n_embd only when n_embd is divisible by num_heads
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (B, T, n_embd) to a tensor of shape (B, T, n_embd)."""
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, num_heads * head_size)
        out = self.dropout(self.proj(out))
        return out

The FeedForward module consists of linear layers with a non-linear activation function between them. Its purpose is to introduce non-linearity into the network so that it can learn complex mappings from input to output.



In [None]:
class FeedFoward(nn.Module):
    """Two-layer MLP applied to every position independently.

    Expands n_embd features to 4 * n_embd, applies ReLU, projects back to
    n_embd and finishes with nn.Dropout(dropout).
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out

In [None]:
class Block(nn.Module):
    """Transformer block: self-attention (communication), then a feed-forward net (computation).

    Each sub-layer is applied to a layer-normalized copy of its input and the
    result is added back to that input (residual connection).

    Args:
        n_embd: embedding dimension.
        n_head: number of attention heads; each head gets n_embd // n_head features.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

Bigram models are a type of statistical language model used for predicting the likelihood of a token (a word, or here a character) based on the occurrence of its preceding token in a sequence. These models assume that the probability of a token depends only on the previous token, capturing local dependencies in the data. Bigram models are simple but effective, commonly used in tasks like text generation, machine translation, and information retrieval. However, they have limitations in handling long-range dependencies and contextual nuances compared to more advanced models like neural language models. Note that the class in the next cell keeps the name `BigramLanguageModel`, but it adds position embeddings and a stack of transformer blocks on top of the token embedding table, so it is a small character-level GPT rather than a pure bigram model.

In [None]:
# character-level transformer language model (the class keeps its original "bigram" name)
class BigramLanguageModel(nn.Module):
    """Decoder-only transformer over integer token ids.

    Token and position embeddings are summed, passed through n_layer
    transformer blocks and a final LayerNorm, and mapped to vocab_size logits
    per position. Sizes come from the module-level hyperparameters
    vocab_size, n_embd, block_size, n_head and n_layer.
    """

    def __init__(self):
        super().__init__()
        # learned embedding per token id and per position (0 .. block_size - 1)
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids, with T <= block_size.
            targets: optional (B, T) tensor of token ids to predict.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy over all positions.
        """
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # create the positions on the input's own device so the model does not
        # depend on the value of the global `device`
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, classes) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample max_new_tokens tokens after the context idx.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [None]:
# build the model and move it to the selected device
model = BigramLanguageModel()
m = model.to(device)
# report the model size in millions of parameters
total_params = sum(p.numel() for p in m.parameters())
print(total_params/1e6, 'M parameters')


In [None]:
# TODO: create a PyTorch optimizer with AdamW and pass in the model parameters and learning rate
# placeholder: the training loop calls optimizer.zero_grad() and optimizer.step(), so this must be filled in first
optimizer = None

In [None]:
# training loop; the counter is called `step` so the builtin iter() is not shadowed
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass to get the loss, then clear old gradients, backpropagate and update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [None]:
# generate from the model, starting from a single token with id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = m.generate(context, max_new_tokens=2000)
# decode the first (and only) sequence of the batch back into text
print(decode(generated[0].tolist()))