# Configura tokenização 🔣

In [1]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [2]:
import tiktoken
# Shared tokenizer instance using tiktoken's "gpt2" encoding; it is passed to
# text_to_token_ids / token_ids_to_text further down in this notebook.
tokenizer = tiktoken.get_encoding("gpt2")

# Text -> token ids
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch axis.

    Args:
        text: String to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``.

    Returns:
        Tensor built from the encoded ids with one extra dimension inserted
        at position 0 (i.e. a batch of one sequence).
    """
    # '<|endoftext|>' is explicitly allowed so it may appear in the input text.
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return torch.unsqueeze(torch.tensor(ids), dim=0)

# Token ids -> text
def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into a string.

    Args:
        token_ids: Tensor of ids; a leading dimension of size 1 is squeezed
            away before decoding.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flattened id list.
    """
    return tokenizer.decode(token_ids.squeeze(0).tolist())

# Baixa e configura modelo *GPT* (livro base) ↪

Baixa os arquivos auxiliares do livro base (`gpt_download.py` e `previous_chapters.py`), caso ainda não existam localmente

In [3]:
import os
import urllib.request

# Directory in the book's repository that hosts the helper modules used by
# this notebook.
BOOK_CODE_URL = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/"
)

# gpt_download.py     -> provides download_and_load_gpt2
# previous_chapters.py -> provides GPTModel
# Each file is fetched only when it is not already present locally.
for filename in ("gpt_download.py", "previous_chapters.py"):
    if not os.path.exists(filename):
        url = BOOK_CODE_URL + filename
        # Download with the standard library instead of shelling out to wget:
        # this works where wget is not installed and raises on failure
        # instead of silently continuing.
        partial_path = filename + ".part"
        urllib.request.urlretrieve(url, partial_path)
        # Rename only after a complete download so an interrupted transfer
        # cannot leave a truncated file that passes the existence check.
        os.replace(partial_path, filename)


Baixa os pesos pré-treinados do GPT-2 (124M)

In [4]:
from gpt_download import download_and_load_gpt2

# Fetch the 124M GPT-2 files into the "gpt2" directory and load them;
# returns the model settings and the parameter dictionary used later on.
settings, params = download_and_load_gpt2(models_dir="gpt2", model_size="124M")

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 138kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.47MiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 78.8kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [00:29<00:00, 16.8MiB/s]
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 8.15MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 1.72MiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 1.79MiB/s]


Configuração do modelo

In [5]:
import torch
from previous_chapters import GPTModel

# Base configuration (reduced context length); kept untouched as a reference.
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Reduced context length (original: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

# Architecture overrides for each available model size
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Final configuration for the model being loaded ("gpt2-small (124M)" here).
# A NEW dict is built so that GPT_CONFIG_124M is not mutated: plain assignment
# would alias the same object and the overrides below would silently rewrite
# the base configuration (and make this cell non-idempotent).
model_name = "gpt2-small (124M)"
NEW_CONFIG = {
    **GPT_CONFIG_124M,
    **model_configs[model_name],
    # Settings required to match the pretrained weights loaded later in this
    # notebook: full 1024-token context and biases on the Q/K/V projections.
    "context_length": 1024,
    "qkv_bias": True,
}

# Build the model from the final configuration.
gpt = GPTModel(NEW_CONFIG)
# Switch to evaluation mode (assuming GPTModel is a torch.nn.Module, this
# disables dropout); the trailing ';' suppresses the cell's output.
gpt.eval();

Carrega pesos do GPT com os devidos ajustes

In [6]:
import numpy as np

# Helper that turns a loaded array into a model parameter
def assign(left, right):
    """Return ``right`` wrapped as a ``torch.nn.Parameter`` after a shape check.

    Args:
        left: Existing parameter/tensor; only its ``.shape`` is read.
        right: Array-like holding the new values (must expose ``.shape``).

    Returns:
        A new ``torch.nn.Parameter`` created from ``torch.tensor(right)``.

    Raises:
        ValueError: If ``left.shape`` and ``right.shape`` differ.
    """
    if left.shape == right.shape:
        return torch.nn.Parameter(torch.tensor(right))
    raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

# Copies the loaded GPT-2 parameters onto the model, adjusting layout as needed
def load_weights_into_gpt(gpt, params):
    """Overwrite the parameters of ``gpt`` with the arrays found in ``params``.

    Every assignment goes through ``assign``, so a shape mismatch between a
    model parameter and its source array raises ``ValueError``.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Mapping with keys ``'wpe'``, ``'wte'``, ``'g'``, ``'b'`` and
            ``'blocks'`` (one entry per transformer block, each holding
            ``attn``, ``mlp``, ``ln_1`` and ``ln_2``).

    Note:
        Weight matrices of the linear layers are transposed (``.T``) before
        assignment — presumably the source stores them as (in, out) while the
        target layers expect (out, in); confirm against GPTModel.
    """
    # Positional and token embedding tables.
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for layer in range(len(params["blocks"])):
        src = params["blocks"][layer]
        dst = gpt.trf_blocks[layer]

        # --- Attention: fused c_attn array is split into query/key/value ---
        attention = dst.att
        c_attn = src["attn"]["c_attn"]
        qkv_layers = (attention.W_query, attention.W_key, attention.W_value)

        for linear, weight in zip(qkv_layers, np.split(c_attn["w"], 3, axis=-1)):
            linear.weight = assign(linear.weight, weight.T)

        for linear, bias in zip(qkv_layers, np.split(c_attn["b"], 3, axis=-1)):
            linear.bias = assign(linear.bias, bias)

        # Attention output projection.
        attn_proj = src["attn"]["c_proj"]
        attention.out_proj.weight = assign(attention.out_proj.weight, attn_proj["w"].T)
        attention.out_proj.bias = assign(attention.out_proj.bias, attn_proj["b"])

        # --- Feed-forward: layers[0] <- c_fc, layers[2] <- c_proj ---
        ff_layers = dst.ff.layers
        mlp = src["mlp"]
        ff_layers[0].weight = assign(ff_layers[0].weight, mlp["c_fc"]["w"].T)
        ff_layers[0].bias = assign(ff_layers[0].bias, mlp["c_fc"]["b"])
        ff_layers[2].weight = assign(ff_layers[2].weight, mlp["c_proj"]["w"].T)
        ff_layers[2].bias = assign(ff_layers[2].bias, mlp["c_proj"]["b"])

        # --- Layer norms: "g" -> scale, "b" -> shift ---
        ln_1 = src["ln_1"]
        dst.norm1.scale = assign(dst.norm1.scale, ln_1["g"])
        dst.norm1.shift = assign(dst.norm1.shift, ln_1["b"])
        ln_2 = src["ln_2"]
        dst.norm2.scale = assign(dst.norm2.scale, ln_2["g"])
        dst.norm2.shift = assign(dst.norm2.shift, ln_2["b"])

    # Final layer norm.
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # The output head is loaded from the same array as the token embedding.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

# Copy the downloaded parameters into the model
load_weights_into_gpt(gpt, params)

# Prefer the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
gpt.to(device);

# Geração de texto 🖊

In [7]:
# Text-generation function
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Extend ``idx`` autoregressively by at most ``max_new_tokens`` tokens.

    Args:
        model: Callable taking a (batch, seq) tensor of token ids and
            returning logits indexed as ``[batch, position, vocab]``; only the
            last position is used.
        idx: (batch, seq) tensor of token ids used as the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        context_size: Only the last ``context_size`` ids are fed to the model.
        temperature: If > 0, logits are divided by it and the next token is
            sampled with ``torch.multinomial``; otherwise the argmax is taken.
        top_k: If given, logits below the k-th largest value of each row are
            set to -inf before sampling. Values larger than the vocabulary
            are clamped to the vocabulary size.
        eos_id: Optional end-of-sequence id. A row that emits it is considered
            finished; finished rows are padded with ``eos_id`` while other
            rows are still generating. Generation stops as soon as every row
            is finished, and the tokens of that final step are not appended
            (for a single sequence this means the EOS token itself is not
            part of the result, as before).

    Returns:
        (batch, seq + n) tensor: the prompt followed by the generated ids.
    """
    # Tracks which rows already produced eos_id; shape (batch, 1) so it lines
    # up with idx_next.
    finished = torch.zeros(idx.shape[0], 1, dtype=torch.bool, device=idx.device)

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # Top-k filtering: keep only the k largest logits of every row.
        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing dimension -> (batch, 1), so each row is
            # compared against its OWN threshold. A (batch,) threshold would
            # broadcast along the vocabulary axis and fail for batch > 1.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            # Temperature scaling, then sample from the softmax distribution.
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            # Greedy decoding: pick the highest-scoring vocabulary entry.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Early stop on end-of-sequence, only when eos_id was provided.
        if eos_id is not None:
            # Rows that finished earlier keep emitting eos_id as padding.
            idx_next = torch.where(finished, torch.full_like(idx_next, eos_id), idx_next)
            finished = finished | (idx_next == eos_id)
            if bool(finished.all()):
                break

        # Append the new token ids to the running sequences.
        idx = torch.cat((idx, idx_next), dim=1)

    return idx

Gera texto

In [8]:
# Fix the torch RNG seed before sampling
torch.manual_seed(78)

# Sample up to 50 new tokens, restricted to the 50 most likely candidates per
# step and with temperature 1.5
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    context_size=NEW_CONFIG["context_length"],
    max_new_tokens=50,
    temperature=1.5,
    top_k=50,
)

print(f"Texto de saída:\n\n{token_ids_to_text(token_ids, tokenizer)}")

Texto de saída:

Every effort moves you

as one, like God and earth

as waters, on

a plane to move. Each

spirit makes every motion

the entire physical force of existence. No

place is the centre where the movements unfold -
