In [24]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import pandas as pd
import altair as alt

# Seed PyTorch's global random generator so batch sampling and weight
# initialisation are repeatable when the notebook is run top to bottom.
torch.manual_seed(2025)

<torch._C.Generator at 0x79ec376f1a10>

In [2]:
# Read the whole training corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Report the corpus size in characters.
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [3]:
class Tokenizer:
    """Character-level tokenizer.

    The vocabulary is the sorted list of distinct characters of the text given
    to the constructor; a character's id is its position in that list.
    """

    def __init__(self, text):
        """Build the vocabulary and both lookup tables from ``text``."""
        vocabulary = list(set(text))
        vocabulary.sort()
        self.chars = vocabulary
        # Fill the forward (char -> id) and reverse (id -> char) maps together.
        self.stoi = {}
        self.itos = {}
        for index, symbol in enumerate(vocabulary):
            self.stoi[symbol] = index
            self.itos[index] = symbol

    def encode(self, s):
        """Return the list of ids for the characters of ``s``.

        Raises KeyError for a character that is not in the vocabulary.
        """
        lookup = self.stoi
        return [lookup[symbol] for symbol in s]

    def decode(self, l):
        """Return the string spelled by the ids in ``l``.

        Raises KeyError for an id that is not in the vocabulary.
        """
        return ''.join(map(self.itos.__getitem__, l))

In [4]:
#batch_size = 4 # how many independent sequences will we process in parallel?
#block_size = 8 # what is the maximum context length for predictions?
class DataLoader:
    """
    Holds the train/validation split of a tokenized corpus and serves random
    batches of fixed-length sequences together with their next-token targets.
    """
    def __init__(self, data, train_size=0.9, block_size=8, batch_size=4):
        """
        Split the corpus into training and validation subsets.

        :param data: sequence of token ids for the whole corpus (a Python list
            or a 1-D tensor); it is converted to a ``torch.long`` tensor
        :param train_size: fraction of the corpus used for training; the
            leading part is the training split, the remainder is validation
        :param block_size: number of tokens in each sequence of a batch
        :param batch_size: number of sequences per batch
        """
        # as_tensor accepts lists and tensors alike; torch.tensor() on an
        # existing tensor emits a copy-construct UserWarning.
        data = torch.as_tensor(data, dtype=torch.long)
        n = int(train_size * len(data))  # leading part is train, rest is val
        self.train_data = data[:n]
        self.val_data = data[n:]
        self.block_size = block_size
        self.batch_size = batch_size

    def get_batch(self, split):
        """
        Sample a random batch of inputs ``x`` and next-token targets ``y``.

        :param split: ``'train'`` or ``'val'``
        :return: tuple ``(x, y)`` of ``(batch_size, block_size)`` long tensors
            where ``y`` is ``x`` shifted one position to the right in the corpus
        :raises ValueError: if ``split`` is not a known split name, or the
            split holds no more than ``block_size`` tokens
        """
        if split == 'train':
            data = self.train_data
        elif split == 'val':
            data = self.val_data
        else:
            # A misspelled split name used to fall through to validation data.
            raise ValueError(f"split must be 'train' or 'val', got {split!r}")

        # Every start index needs block_size inputs plus one extra target token.
        max_start = len(data) - self.block_size
        if max_start <= 0:
            raise ValueError(
                f"'{split}' split has {len(data)} tokens; "
                f"need more than block_size={self.block_size}"
            )

        ix = torch.randint(max_start, (self.batch_size,))
        # Gather all windows at once: row b holds data[ix[b] : ix[b] + block_size].
        offsets = torch.arange(self.block_size)
        windows = ix[:, None] + offsets
        x = data[windows]
        y = data[windows + 1]
        return x, y

In [5]:
# Build the character vocabulary from the full corpus.
tokenizer = Tokenizer(text)

# Round-trip check: decoding the encoded ids should print the original string.
print(tokenizer.encode("tokenizer test"))
print(tokenizer.decode(tokenizer.encode("tokenizer test")))

[58, 53, 49, 43, 52, 47, 64, 43, 56, 1, 58, 43, 57, 58]
tokenizer test


In [6]:
# Tokenize the corpus and split it into train/val (default train_size=0.9);
# batches will hold 64 sequences of 32 tokens each.
data_loader = DataLoader(tokenizer.encode(text), batch_size=64, block_size=32)
train_data = data_loader.train_data
val_data = data_loader.val_data

# Peek at the first 100 token ids of each split.
print(train_data[:100])
print(val_data[:100])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])
tensor([12,  0,  0, 19, 30, 17, 25, 21, 27, 10,  0, 19, 53, 53, 42,  1, 51, 53,
        56, 56, 53, 61,  6,  1, 52, 43, 47, 45, 46, 40, 53, 59, 56,  1, 14, 39,
        54, 58, 47, 57, 58, 39,  8,  0,  0, 14, 13, 28, 32, 21, 31, 32, 13, 10,
         0, 19, 53, 53, 42,  1, 51, 53, 56, 56, 53, 61,  6,  1, 52, 43, 47, 45,
        46, 40, 53, 59, 56,  1, 19, 56, 43, 51, 47, 53,  8,  0, 19, 53, 42,  1,
        57, 39, 60, 43,  1, 63, 53, 59,  6,  1])


In [8]:
# Draw one random training batch: xb are the inputs, yb the next-token targets.
batch = data_loader.get_batch('train')
xb, yb = batch

# Both tensors have shape (batch_size, block_size).
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

inputs:
torch.Size([64, 32])
tensor([[58, 43, 56,  ..., 58, 46, 43],
        [58, 46, 39,  ..., 57,  1, 57],
        [58, 46, 10,  ..., 40, 43, 61],
        ...,
        [ 6,  1, 39,  ..., 52, 42,  1],
        [ 6,  1, 39,  ..., 58, 47, 57],
        [54, 50, 43,  ..., 46, 43,  1]])
targets:
torch.Size([64, 32])
tensor([[43, 56,  5,  ..., 46, 43, 43],
        [46, 39, 58,  ...,  1, 57, 61],
        [46, 10,  0,  ..., 43, 61, 39],
        ...,
        [ 1, 39, 52,  ..., 42,  1, 42],
        [ 1, 39, 52,  ..., 47, 57,  1],
        [50, 43, 39,  ..., 43,  1, 41]])


In [None]:
# Show how one sequence yields several training examples: every prefix of the
# input row is a context whose target is the token that follows it.
# Only the first 8 positions of the first row of xb/yb are printed.
for b in range(1): # 1 batch (batch dimension)
    for t in range(8): # 8 tokens (time dimension)
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is [58] the target: 43
when input is [58, 43] the target: 56
when input is [58, 43, 56] the target: 5
when input is [58, 43, 56, 5] the target: 57
when input is [58, 43, 56, 5, 57] the target: 1
when input is [58, 43, 56, 5, 57, 1] the target: 15
when input is [58, 43, 56, 5, 57, 1, 15] the target: 46
when input is [58, 43, 56, 5, 57, 1, 15, 46] the target: 59


In [27]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a ``(batch, seq, d_model)`` input.

    The table is computed once for ``max_len`` positions and stored as the
    buffer ``pe`` with shape ``(1, max_len, d_model)``; it is not a parameter.
    Even feature columns hold sines, odd feature columns hold cosines.
    """

    def __init__(self, d_model, max_len=5000):
        """
        :param d_model: feature width of the inputs (even or odd)
        :param max_len: longest sequence length the table supports
        """
        super().__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so the cosine block is trimmed to fit instead of raising.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        :raises ValueError: if the sequence is longer than ``max_len``
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len={self.pe.size(1)}"
            )
        # Buffers never require grad, so no explicit requires_grad_(False) is needed.
        return x + self.pe[:, :seq_len]

In [30]:
def show_example(fn, args=()):
    """Call ``fn(*args)`` and return its result when running as the main module.

    :param fn: callable that builds the example
    :param args: positional arguments forwarded to ``fn`` (any iterable)
    :return: the value returned by ``fn``, or None when this module's
        ``__name__`` is not ``"__main__"`` (``fn`` is then not called)
    """
    if __name__ == "__main__":
        return fn(*args)
    return None

def example_positional():
    """Chart the positional-encoding values of feature dimensions 4 to 7.

    An all-zero ``(1, 100, 20)`` tensor is passed through
    ``PositionalEncoding(20)``, so the result is the encoding itself; one line
    per dimension is then drawn over the 100 positions.

    :return: an interactive Altair line chart
    """
    # This PositionalEncoding variant has no dropout; it only needs d_model.
    encoder = PositionalEncoding(20)
    encoded = encoder.forward(torch.zeros(1, 100, 20))

    # One long-format frame per plotted dimension, stacked afterwards.
    frames = []
    for dim in [4, 5, 6, 7]:
        frame = pd.DataFrame(
            {
                "embedding": encoded[0, :, dim],
                "dimension": dim,
                "position": list(range(100)),
            }
        )
        frames.append(frame)
    data = pd.concat(frames)

    chart = alt.Chart(data).mark_line().properties(width=800)
    chart = chart.encode(x="position", y="embedding", color="dimension:N")
    return chart.interactive()


# Build the positional-encoding chart; as the cell's last expression it is
# rendered as the cell output.
show_example(example_positional)

In [35]:
def plot_embedding_heatmap(seq_len=64, d_model=32):
    """
    Draw a heat map of the embedding components for the first ``seq_len``
    tokens of the training split.

    A new ``nn.Embedding`` is created on every call, so the values shown are
    freshly initialised weights, not weights of a trained model. Reads the
    notebook-level ``data_loader`` and ``tokenizer``.

    :param seq_len: maximum number of leading training tokens to show; fewer
        are shown if the training split is shorter
    :param d_model: width of the embedding vectors
    :return: an Altair chart with position on x, dimension on y and the
        embedding value as colour
    """
    token_ids = data_loader.train_data[:seq_len]
    vocab_size = len(tokenizer.chars)

    # Plain embedding layer (the global seed is set at the top of the notebook).
    embed = nn.Embedding(vocab_size, d_model)
    emb = embed(token_ids).detach().cpu().numpy()  # (n_tokens, d_model)

    df = pd.DataFrame(emb)
    # Use the number of rows actually produced: the slice above may return
    # fewer than seq_len tokens, and range(seq_len) would then not fit.
    df["position"] = range(len(df))
    df_long = df.melt(id_vars="position", var_name="dimension", value_name="value")
    df_long["dimension"] = df_long["dimension"].astype(int)

    return (
        alt.Chart(df_long)
        .mark_rect()
        .encode(
            x=alt.X("position:O", title="Posición"),
            y=alt.Y("dimension:O", title="Dimensión"),
            color=alt.Color("value:Q", title="Valor"),
            tooltip=["position", "dimension", "value"],
        )
        .properties(
            width=700,
            height=300,
            title="Mapa de calor de embeddings de entrada",
        )
    )

# Render the heat map for the first 64 training tokens with 32-dimensional embeddings.
plot_embedding_heatmap(seq_len=64, d_model=32)

In [None]:
def attention(query, key, value, mask=False, dropout_layer=None):
    """Scaled dot-product attention.

    Computes ``softmax(query @ key^T / sqrt(d_k)) @ value`` over the last two
    dimensions. Any number of leading batch dimensions is accepted, e.g.
    ``(B, T, d_k)`` or ``(B, heads, T, d_k)``.

    :param query: tensor ``(..., T, d_k)``
    :param key: tensor ``(..., T, d_k)``
    :param value: tensor ``(..., T, d_v)``
    :param mask: when True, apply a causal mask so position ``t`` attends only
        to positions ``<= t`` (used by decoder-only text generation)
    :param dropout_layer: optional module applied to the attention weights
    :return: tensor ``(..., T, d_v)``
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask:
        # Take T from the second-to-last axis so extra leading dimensions
        # (for example a heads axis) do not break the mask construction.
        T = query.size(-2)
        causal = torch.ones(T, T, dtype=torch.bool, device=query.device).tril()
        # -inf scores become exactly zero weight after the softmax.
        scores = scores.masked_fill(~causal, float('-inf'))

    weights = F.softmax(scores, dim=-1)
    if dropout_layer is not None:
        weights = dropout_layer(weights)
    return torch.matmul(weights, value)


In [None]:
class Head(nn.Module):
    """A single self-attention head.

    Projects the input to key, query and value with bias-free linear layers and
    combines them with ``attention``; dropout is applied to the attention
    weights.
    """

    def __init__(self, n_embd, head_size, dropout=0.0):
        super().__init__()

        def projection():
            # Bias-free map from the embedding width to this head's width.
            return nn.Linear(n_embd, head_size, bias=False)

        # Created in key, query, value order.
        self.key = projection()
        self.query = projection()
        self.value = projection()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=False):
        """Attend over ``x``; ``mask=True`` selects causal attention
        (decoder-only model for text generation)."""
        q, k, v = self.query(x), self.key(x), self.value(x)
        return attention(q, k, v, mask, dropout_layer=self.dropout)

In [None]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side.

    The head outputs are concatenated along the feature axis, projected back to
    ``n_embd`` features and passed through dropout.
    """

    def __init__(self, n_embd, num_heads, dropout=0.0):
        super().__init__()
        assert n_embd % num_heads == 0, "n_embd must be divisible by num_heads"
        head_size = n_embd // num_heads

        heads = []
        for _ in range(num_heads):
            heads.append(Head(n_embd, head_size, dropout=dropout))
        self.heads = nn.ModuleList(heads)

        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=False):
        """Apply every head to ``x`` and merge the results; ``mask`` is passed
        through to each head."""
        per_head = [head(x, mask=mask) for head in self.heads]
        concatenated = torch.cat(per_head, dim=-1)
        projected = self.proj(concatenated)
        return self.dropout(projected)
        

In [79]:
# Smoke test: 4 heads over a 512-wide embedding (head size 128).
multihead_attn = MultiHeadAttention(n_embd=512, num_heads=4)

# Usage example
query = torch.randn(1, 64, 512)  # (batch_size, seq_len, d_model)
# NOTE: key and value are not passed to the call below (the module derives all
# three projections from its single input); they are kept because creating
# them still draws from the global random generator.
key = torch.randn(1, 64, 512)
value = torch.randn(1, 64, 512)

output = multihead_attn(query, mask=True) # mask=True, decoder-only model for text generation
print(output.shape)  # (batch_size, seq_len, d_model)

torch.Size([1, 64, 512])


In [None]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to four times the width, ReLU, project back, then dropout."""

    def __init__(self, n_embd, dropout=0.0):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension must be ``n_embd``."""
        out = self.net(x)
        return out


class Block(nn.Module):
    """ Transformer block: communication followed by computation

    Pre-norm residual layout: each sub-layer reads a LayerNorm of the running
    activation and its output is added back to that activation.
    """

    def __init__(self, n_embd, n_heads, mask=False, dropout=0.0):
        """
        :param n_embd: embedding dimension
        :param n_heads: number of heads in the self-attention layer
        :param mask: when True the attention is causal (decoder-only model)
        :param dropout: dropout probability passed to both sub-layers
        """
        super().__init__()
        self.mask = mask
        # The per-head size is derived inside MultiHeadAttention, which also
        # checks that n_embd is divisible by n_heads.
        self.msa = MultiHeadAttention(n_embd, n_heads, dropout=dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ffwd = FeedForward(n_embd, dropout=dropout)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention then the feed-forward layer, each with a residual connection."""
        x = x + self.msa(self.ln1(x), mask=self.mask)
        x = x + self.ffwd(self.ln2(x))
        return x

In [None]:
class GPTLanguageModel(nn.Module):
    """Decoder-only Transformer language model over token ids.

    Token embeddings plus sinusoidal position encodings go through ``n_layer``
    causally masked Transformer blocks, a final LayerNorm and a linear head
    that produces one logit per vocabulary entry.
    """

    def __init__(self, vocab_size, n_embd, n_head, n_layer, dropout=0.0):
        """
        :param vocab_size: number of distinct token ids
        :param n_embd: embedding / model width
        :param n_head: attention heads per block
        :param n_layer: number of Transformer blocks
        :param dropout: dropout probability used inside the blocks
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = PositionalEncoding(n_embd)
        self.blocks = nn.ModuleList([Block(n_embd, n_head, mask=True, dropout=dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, if ``targets`` is given, the cross-entropy loss.

        :param idx: ``(B, T)`` tensor of token ids
        :param targets: optional ``(B, T)`` tensor of next-token ids
        :return: ``(logits, loss)``. Without targets, logits are ``(B, T, C)``
            and loss is None; with targets, logits are flattened to
            ``(B*T, C)`` and loss is a scalar tensor.
        """
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        x = self.position_embedding_table(tok_embd)

        for block in self.blocks:
            x = block(x)

        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) class indices.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape also accepts non-contiguous targets, which view rejects.
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, block_size=32):
        """Sample ``max_new_tokens`` tokens autoregressively.

        Runs without gradient tracking and with the model temporarily in eval
        mode, so dropout does not perturb the sampling distribution; the
        previous train/eval mode is restored before returning.

        :param idx: ``(B, T)`` tensor of token ids used as the starting context
        :param max_new_tokens: number of tokens to append
        :param block_size: number of most recent tokens fed to the model
        :return: ``(B, T + max_new_tokens)`` tensor of token ids
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop idx to the last block_size tokens
                idx_cond = idx[:, -block_size:]
                # get the predictions
                logits, _ = self(idx_cond)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx


class GPTConfig:
    """Hyperparameter container for GPTLanguageModel."""

    # Class-level defaults, visible as GPTConfig.<name>.
    n_embd = 384
    n_heads = 6
    n_layers = 6
    # Evaluated once, when this class is defined, from the notebook-level tokenizer.
    vocab_size = len(tokenizer.chars)
    dropout = 0.1

    def __init__(self, vocab_size, n_embd, n_head, n_layer, dropout=0.1):
        """
        :param vocab_size: number of distinct token ids
        :param n_embd: embedding / model width
        :param n_head: attention heads per block
        :param n_layer: number of Transformer blocks
        :param dropout: dropout probability
        """
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        # Store the head and layer counts under both spellings. The class-level
        # defaults use the plural names, so without this an instance's
        # `n_heads` / `n_layers` would keep reporting the class default (6)
        # whatever was passed to the constructor.
        self.n_head = self.n_heads = n_head
        self.n_layer = self.n_layers = n_layer
        self.dropout = dropout

In [None]:
# Training hyperparameters.
# learning_rate: step size given to AdamW.
learning_rate = 3e-4
# max_iters: number of optimisation steps in the training loop.
max_iters = 5000
# eval_interval: the loss is estimated every this many steps.
eval_interval = 100
# eval_iters: number of batches averaged per split in each estimate.
eval_iters = 200
dropout = 0.1
# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Rebuild the tokenizer and the data loader from `text`
# (batches of 64 sequences, 32 tokens each).
tokenizer = Tokenizer(text)
data_loader = DataLoader(tokenizer.encode(text), batch_size=64, block_size=32)

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    Averages the loss over ``eval_iters`` random batches per split, with
    gradients disabled and the model in eval mode. Reads the notebook-level
    ``model``, ``data_loader``, ``eval_iters`` and ``device``.

    :return: dict with keys ``'train'`` and ``'val'`` mapping to 0-dim tensors
    """
    out = {}
    # Remember the current mode so it can be restored afterwards, rather than
    # forcing the model into training mode regardless of what the caller set.
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = data_loader.get_batch(split)
                X, Y = X.to(device), Y.to(device)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out


# Positional arguments are (vocab_size, n_embd, n_head, n_layer).
config = GPTConfig(len(tokenizer.chars), 384, 6, 6, dropout=dropout)

model = GPTLanguageModel(config.vocab_size, config.n_embd, config.n_head, config.n_layer, dropout=config.dropout)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# Move the model to the selected device; `m` is the handle used for generation below.
m = model.to(device)

# The loop variable is named `step` rather than `iter`: at notebook top level,
# `iter` would shadow the builtin for every cell run afterwards.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = data_loader.get_batch('train')
    xb, yb = xb.to(device), yb.to(device)

    # evaluate the loss
    logits, loss = model(xb, yb)
    # clear the gradients of the previous step before back-propagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
# Start from a single token with id 0 (the first character of the sorted
# vocabulary), sample 2000 new tokens and decode them back to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(tokenizer.decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

step 0: train loss 4.3443, val loss 4.3551
step 100: train loss 2.2703, val loss 2.2998
step 200: train loss 2.0058, val loss 2.0720
step 300: train loss 1.8625, val loss 1.9787
step 400: train loss 1.7618, val loss 1.8993
step 500: train loss 1.7004, val loss 1.8530
step 600: train loss 1.6501, val loss 1.8302
step 700: train loss 1.6242, val loss 1.7947
step 800: train loss 1.5941, val loss 1.7681
step 900: train loss 1.5670, val loss 1.7511
step 1000: train loss 1.5362, val loss 1.7222
step 1100: train loss 1.5202, val loss 1.7185
step 1200: train loss 1.5085, val loss 1.7065
step 1300: train loss 1.5025, val loss 1.6995
step 1400: train loss 1.4807, val loss 1.6804
step 1500: train loss 1.4755, val loss 1.6804
step 1600: train loss 1.4596, val loss 1.6600
step 1700: train loss 1.4480, val loss 1.6554
step 1800: train loss 1.4357, val loss 1.6598
step 1900: train loss 1.4328, val loss 1.6445
step 2000: train loss 1.4277, val loss 1.6478
step 2100: train loss 1.4138, val loss 1.6395
