In [2]:
from importlib.metadata import version

import matplotlib
import tiktoken
import torch
import torch.nn as nn
import numpy as np

# Define Model

In [None]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [4]:
# Overall structure
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.dropout = nn.Dropout(cfg["drop_rate"])
        
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )
        self.tok_emb.weight = self.out_head.weight # weight_tying, Raschka suggests that the model is easier to train without weight tying

        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/((2 * cfg["n_layers"]))**0.5)

    def _init_weights(self, module, std = 0.02):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.dropout(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [6]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [10]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"], 
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.dropout = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
        x = self.dropout(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x + shortcut  # Add the original input back

        return x

In [12]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, num_heads, context_length, dropout=0.0, qkv_bias=False):
        super().__init__()

        assert d_out % num_heads == 0, "embed_dim is indivisible by num_heads"

        self.num_heads = num_heads
        self.context_length = context_length
        self.head_dim = d_out // num_heads

        self.qkv = nn.Linear(d_in, 3 * d_out, bias=qkv_bias)
        self.proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        self.register_buffer(
            "mask", torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        batch_size, num_tokens, embed_dim = x.shape

        # (b, num_tokens, embed_dim) --> (b, num_tokens, 3 * embed_dim)
        qkv = self.qkv(x)
        qkv = qkv.view(batch_size, num_tokens, 3, self.num_heads, self.head_dim)
        # (b, num_heads, num_tokens, head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4)

        # (b, num_heads, num_tokens, num_tokens)
        attn_scores = torch.einsum('bhij,bhkj->bhik', q, k)
        attn_scores = attn_scores.masked_fill(
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf
        )
        attn_weights = torch.softmax(attn_scores / k.shape[-1]**-0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)
        # (b, num_heads, num_tokens, head_dim)
        context_vec = torch.einsum('bhij,bhjk->bhik', attn_weights, v)

        # (b, num_tokens, num_heads, head_dim)
        context_vec = context_vec.transpose(1, 2)
        context_vec = context_vec.contiguous().view(batch_size, num_tokens, embed_dim)
        context_vec = self.proj(context_vec)
        return context_vec

In [14]:
# MLP in Transformer Block
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.fc = nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"])
        self.gelu = nn.GELU()
        self.proj = nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"])

    def forward(self, x):
        return self.proj(self.gelu(self.fc(x)))

In [24]:
tokenizer = tiktoken.get_encoding("gpt2")
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch.shape)
print(batch)

torch.Size([2, 4])
tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [18]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
out = model(batch)
print("\nOutput shape:", out.shape)
print(out[0,:3,:])


Output shape: torch.Size([2, 4, 50257])
tensor([[ 0.8852,  0.0329, -0.4911,  ...,  0.2467, -0.5240, -0.0369],
        [-0.0906,  0.3250, -0.2124,  ..., -0.0765, -0.1040, -0.0471],
        [-0.0909,  0.3702,  0.3636,  ...,  0.6508,  0.0633,  0.0326]],
       grad_fn=<SliceBackward0>)


# Load Pre-Trained Weights

In [29]:
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})
gpt = GPTModel(NEW_CONFIG)
gpt.eval();

In [31]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

In [33]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
    
    for b in range(len(params["blocks"])):
        gpt.trf_blocks[b].att.qkv.weight = assign(
            gpt.trf_blocks[b].att.qkv.weight, 
            (params["blocks"][b]["attn"]["c_attn"])["w"].T)
        gpt.trf_blocks[b].att.qkv.bias = assign(
            gpt.trf_blocks[b].att.qkv.bias, 
            ((params["blocks"][b]["attn"]["c_attn"])["b"]))
        gpt.trf_blocks[b].att.proj.weight = assign(
            gpt.trf_blocks[b].att.proj.weight, 
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.proj.bias = assign(
            gpt.trf_blocks[b].att.proj.bias, 
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.fc.weight = assign(
            gpt.trf_blocks[b].ff.fc.weight, 
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.fc.bias = assign(
            gpt.trf_blocks[b].ff.fc.bias, 
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.proj.weight = assign(
            gpt.trf_blocks[b].ff.proj.weight, 
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.proj.bias = assign(
            gpt.trf_blocks[b].ff.proj.bias, 
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale, 
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift, 
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale, 
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift, 
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

NameError: name 'params' is not defined

In [63]:
import sys
import os
sys.path.append('/Users/erict/Desktop/ML-Review/dl/17_nlp/raschka_files')
from gpt_download import download_and_load_gpt2

In [65]:
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

checkpoint: 100%|███████████████████████████████████████████████████████████| 77.0/77.0 [00:00<00:00, 45.1kiB/s]
encoder.json: 100%|███████████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 3.17MiB/s]
hparams.json: 100%|█████████████████████████████████████████████████████████| 90.0/90.0 [00:00<00:00, 52.1kiB/s]
model.ckpt.data-00000-of-00001: 100%|███████████████████████████████████████| 498M/498M [00:23<00:00, 21.2MiB/s]
model.ckpt.index: 100%|███████████████████████████████████████████████████| 5.21k/5.21k [00:00<00:00, 1.42MiB/s]
model.ckpt.meta: 100%|██████████████████████████████████████████████████████| 471k/471k [00:00<00:00, 1.58MiB/s]
vocab.bpe: 100%|████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 1.89MiB/s]


In [67]:
load_weights_into_gpt(gpt, params)
device = torch.device("mps") if torch.mps.is_available() else torch.device("cpu")
gpt.to(device);

# Generate

In [None]:
# Function for top_p, taken from https://github.com/johnma2006/candle/blob/main/candle/nlp/generation.py
def nucleus_sample(probs: np.array,
                   top_p: int):
    """Nucleus sampling. Filter to top probs such that the sum prob is just less than top_p.
    
    References:
        [1] Ari Holtzman, Jan Buys, Li Du, Maxwell Forbes, Yejin Choi.
            The Curious Case of Neural Text Degeneration. arXiv:1904.09751, 2019
    
    Args:
        probs (np.array): Array of probabilities with shape (vocab_size, batch).
            Modifies probs in place.
        top_p (int): Filter to the top `k` probs such that the sum probs is <= top_p and k is largest.
    
    """
    sorted_probs = np.sort(probs, axis=0)[::-1]
    cum_probs = sorted_probs.cumsum(axis=0)
    top_k = (cum_probs <= top_p).sum(axis=0)

    ranking = probs.shape[0] - np.argsort(np.argsort(probs, axis=0), axis=0)
    mask = (ranking <= top_k) | (ranking == 1)  # | (ranking == 1) accounts for when the edge case if highest prob > top_p

    probs[~mask] = 0
    probs /= probs.sum(axis=0)

    return probs

In [114]:
def generate(model, token_ids, max_new_tokens, context_size, temperature = 1, top_k = None, eos_id = None):
    # idx has shape (batch, n_tokens) 
    # We will continually append to idx
    for _ in range(max_new_tokens):
        # Crop current context to supported context size
        idx_cond = token_ids[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        # (batch, n_tokens, vocab_size)
        logits = logits[:, -1, :]  
        if top_k is not None: # relevant when temperature is high
            # Returns the top k for last dim, in sorted order
            top_logits, _ = torch.topk(logits, top_k)
            # this is threshold
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
        logits /= temperature
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples = 1)  # (batch, 1)
        if next_token == eos_id:
            break
        token_ids = torch.cat((token_ids, next_token), dim=1)  # (batch, n_tokens+1)
    return token_ids

In [116]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.mps.manual_seed(seed)

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [118]:
set_seed(123)

token_ids = generate(
    model=gpt,
    token_ids=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.5
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))