In [6]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

<torch._C.Generator at 0x1d141e92fd0>

In [1]:
with open('harrypotterbooks.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
print(text[:500])

THE BOY WHO LIVED Mr and Mrs Dursley of number four Privet Drive were proud to say that they were perfectly normal thank you very much .They were the last people youd expect to be involved in anything strange or mysterious because they just didnt hold with such nonsense .Mr Dursley was the director of a firm called Grunnings which made drills .He was a big beefy man with hardly any neck although he did have a very large mustache .Mrs Dursley was thin and blonde and had nearly twice the usual amo


In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size) # veamos el vocabulario

 !.0123456789?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz~‘•■□
71


In [4]:
stoi = { ch:i for i,ch in enumerate(chars)} # token:id
itos = { i:ch for i,ch in enumerate(chars)} # id:token
encode = lambda s: [stoi[c] for c in s] # devuelve lista de ids en base a tokens proporcionados
decode = lambda l: ''.join([itos[i] for i in l]) # devuelve el texto en base a los ids proporcionados
print(encode('Hi Harry'))
print(decode(encode('Hi Harry')))

[21, 48, 0, 21, 40, 57, 57, 64]
Hi Harry


In [7]:
data = torch.tensor(encode(text), dtype = torch.long) # transformamos el libro en ids
print(data[:100]) # primeros 100 carácteres del libro

tensor([33, 21, 18,  0, 15, 28, 38,  0, 36, 21, 28,  0, 25, 22, 35, 18, 17,  0,
        26, 57,  0, 40, 53, 43,  0, 26, 57, 58,  0, 17, 60, 57, 58, 51, 44, 64,
         0, 54, 45,  0, 53, 60, 52, 41, 44, 57,  0, 45, 54, 60, 57,  0, 29, 57,
        48, 61, 44, 59,  0, 17, 57, 48, 61, 44,  0, 62, 44, 57, 44,  0, 55, 57,
        54, 60, 43,  0, 59, 54,  0, 58, 40, 64,  0, 59, 47, 40, 59,  0, 59, 47,
        44, 64,  0, 62, 44, 57, 44,  0, 55, 44])


In [8]:
n = int(0.9*len(data)) # ocuparemos el 90% del libro para entrenar
train_data = data[:n]
val_data = data[n:] # 10% de validación

In [21]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, )) # tomamos ids del texto según el tamaño del batch
    x = torch.stack([data[i:i+block_size] for i in ix]) # extraemos parrafos según la ventana de contexto
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # desplazamos una posición respecto a x
    return x,y

xb, yb = get_batch('train') # (B, T)

for b in range(batch_size):
    for t in range(block_size): # para cada token
        context = xb[b, :t+1] # ventana contexto
        target = yb[b,t]
        print(f'Cuando el input es {context.tolist()}, el target: {target}')

Cuando el input es [55], el target: 51
Cuando el input es [55, 51], el target: 54
Cuando el input es [55, 51, 54], el target: 43
Cuando el input es [55, 51, 54, 43], el target: 44
Cuando el input es [55, 51, 54, 43, 44], el target: 43
Cuando el input es [55, 51, 54, 43, 44, 43], el target: 0
Cuando el input es [55, 51, 54, 43, 44, 43, 0], el target: 59
Cuando el input es [55, 51, 54, 43, 44, 43, 0, 59], el target: 47
Cuando el input es [55, 51, 54, 43, 44, 43, 0, 59, 47], el target: 57
Cuando el input es [55, 51, 54, 43, 44, 43, 0, 59, 47, 57], el target: 54
Cuando el input es [55, 51, 54, 43, 44, 43, 0, 59, 47, 57, 54], el target: 60
Cuando el input es [55, 51, 54, 43, 44, 43, 0, 59, 47, 57, 54, 60], el target: 46
Cuando el input es [55, 51, 54, 43, 44, 43, 0, 59, 47, 57, 54, 60, 46], el target: 47
Cuando el input es [55, 51, 54, 43, 44, 43, 0, 59, 47, 57, 54, 60, 46, 47], el target: 0
Cuando el input es [55, 51, 54, 43, 44, 43, 0, 59, 47, 57, 54, 60, 46, 47, 0], el target: 59
Cuando 

In [22]:
@torch.no_grad() # no almacenamos variables intermedias
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out # retorna diccionario con la media de losses para train y val según las iteraciones que definamos en eval_iters

In [23]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        # el buffer es como un tensor, pero no se modifica durante el entrenamiento

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5 # agregamos escalamiento 1/sqrt(C)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim = -1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

In [24]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # Multiples cabezales en paralelo
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1) # concatenamos en la última dimensióm
        out = self.proj(out) # proyección del resultado
        out = self.dropout(out)
        return out
        # cada cabezal tiene como salida (B, T, n_embd/num_heads), después concatenamos en un solo vector

In [25]:
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd), # proyección 
            # multiplicamos por 4, ya que en el paper "All Atenttion is You Need" ocupa 4 veces la dimensión de embeddings
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

In [26]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x)) # agregamos conexión residual para que no se pierda información
        # a diferencia del paper original, el batch norm se está aplicando actualmente antes del feed forward
        x = x + self.ffwd(self.ln2(x))
        return x

In [27]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x)) # agregamos conexión residual para que no se pierda información
        # a diferencia del paper original, el batch norm se está aplicando antes del feed forward
        x = x + self.ffwd(self.ln2(x))
        return x

In [28]:
# construimos un decoder, cómo es un modelo de generación de texto no es necesario ocupar el encoder
class PotterGPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head = n_head) for _ in range(n_layer)])
        # son varios bloques de multihead y feedforward
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets = None):
        B, T = idx.shape
        
        tok_emb = self.token_embedding_table(idx) # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        x = self.blocks(x) # (B, T, C)
        x = self.ln_f(x) # (B, T, C)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) 
            targets = targets.view(-1)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):

            idx_cond = idx[:, -block_size:] # recortamos el contexto en caso de que idx supere el block_size
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] # último token o embedding de cada fila (mantiene dimensiones B y C), (B, C)
            probs = F.softmax(logits, dim = 1) # softmax en la última dimensión (vocab_size)
            idx_next = torch.multinomial(probs, num_samples = 1) # obtenemos id desde distribución multinomial
            idx = torch.cat((idx, idx_next), dim = 1)
        return idx

In [29]:
# Hyperparametros
batch_size = 32
block_size = 50 # ventana de contexto utilizada en la función get_batch()
max_iters = 5000
eval_interval = 500
learning_rate = 1e-4
eval_iters = 200 # iteraciones ocupadas en la validación
n_embd = 64 # cada head tiene 64/4 dimensiones
n_head = 4 # 4 cabezales
n_layer = 3 # número de blocks
dropout = 0.2
# ---------------

In [30]:
model = PotterGPT()

In [31]:
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [32]:
for iter in range(max_iters):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none = True) 
    loss.backward() 
    optimizer.step()

step 0: train loss 4.4685, val loss 4.4683
step 500: train loss 2.6340, val loss 2.6161
step 1000: train loss 2.4822, val loss 2.4601
step 1500: train loss 2.4179, val loss 2.3975
step 2000: train loss 2.3762, val loss 2.3575
step 2500: train loss 2.3424, val loss 2.3239
step 3000: train loss 2.3200, val loss 2.3012
step 3500: train loss 2.2871, val loss 2.2638
step 4000: train loss 2.2556, val loss 2.2362
step 4500: train loss 2.2267, val loss 2.1964


In [33]:
idx = torch.zeros((1, 1), dtype = torch.long) 
print(decode(model.generate(idx, max_new_tokens=2000)[0].tolist()))

 pooiedit ing anig th ce creathulitw pey .Thow brerer wof .Whit yousherd fame arave tinsle wat faryankenst herak to becoverbe he to ack Whanor ionetle hat thar Dein nott id wang wetrces any us itwot he whariow thastelly grehabbe wamed sooord Harry Pind yon sleru faryat las woup wis winde shilroy ed llupald atts Cadd hoitunc and 3rot Wiole to .Ho .Thiry Dange avechn Cryednt .Theoll Hant pland of coferpeds eas and noaGlores Tavedow tho .Gustould thour le e ..Criurt wailfornd latdok dund .Rmuron the roo gvit He waine th ate Fo why higgokenther istom ered latigh touusar lugt ay dli yof cerontsh thillikce homau d warye to thar an ucteund as gendfor hi tt yronxppacielde ogely therle me herte Haw ed hawss cerhe bumepperet thedy meaw garkilos te yast thagrouir andtk theedvo Dithe the rome caghe wass hre .Thor dowerabo eas tint bull that shaved his Proor ownemtes buplitand abrofim gom clonche CacheMrte Ror fre Deromppelld .Cre belesanis duvee .Buint wofor on estere meanrtcurmand iaifded thas ev