## Direitos Autorais Modelo ChatGPT em R:
Modelo apresentado na aula 27 do curso de estat√≠stica e Machine Learning do IME-USP.

Autoria: Prof. Alexandre Galv√£o Patriota

## üîπ Par√¢metros de Entrada do Modelo GPT

A fun√ß√£o de inicializa√ß√£o do modelo recebe os seguintes argumentos:

| **Par√¢metro** | **Significado** | **Interpreta√ß√£o** |
|----------------|-----------------|-------------------------------|
| **block_size** | Tamanho da janela de contexto (n√∫mero de tokens consecutivos considerados no bloco attention). | Define o comprimento da janela de condicionamento ‚Äî quantos tokens anteriores o modelo utiliza para prever o pr√≥ximo. |
| **n_embd** | Dimens√£o do vetor de embedding. | Tamanho do espa√ßo latente cont√≠nuo onde cada token √© representado por um vetor real. |
| **N_Layers** | N√∫mero de camadas do Transformer (profundidade da rede). | Quantas vezes o bloco ‚ÄúAten√ß√£o + Feed-Forward + Normaliza√ß√£o‚Äù √© repetido ao longo da rede. |
| **nvoc** | Tamanho do vocabul√°rio. | N√∫mero de categorias poss√≠veis no modelo multinomial. Cada predi√ß√£o escolhe uma entre `nvoc` op√ß√µes (No m√©todo Greedy). |
| **head** | N√∫mero de *heads* de aten√ß√£o. | Quantas proje√ß√µes paralelas de aten√ß√£o s√£o calculadas ‚Äî cada *head* modela um tipo distinto de depend√™ncia contextual. |
| **p0** | Taxa de *dropout* (padr√£o 0.1). | Probabilidade de zerar aleatoriamente algumas ativa√ß√µes durante o treino, reduzindo o *overfitting*. |

---

## Par√¢metros de configura√ß√£o 

In [131]:
config <- list(
  #Corpus for training (global)
  file_name = "ABC.txt",
  train = !TRUE,
  run = TRUE,
  read_weights = !TRUE,

  #gpt parameters (global)
  block_size = 16,   #Maximum context
  n_embd = 128,      #Embedding dimension
  N_Layers = 2,      #Number of layers
  Head = 2,          #Number of heads

  #Training parameters (global)
  lr = 0.003,        #Learning rate
  batch_size = 64,   #Batch size
  p0 = 0.2,          #Dropout proportion
  epochs = 400,        #Number of epochs
  num_workers = 6,  #Number of cpu workers

  max_new_tokens = 700
)

In [132]:
library(torch)

GPT <- torch::nn_module(
  initialize = function(block_size, n_embd, N_Layers, nvoc, Head, p0 = 0.1) {

    self$N   <- N_Layers
    self$wpe <- torch::nn_embedding(block_size, n_embd)
    self$wte <- torch::nn_embedding(nvoc, n_embd, padding_idx = 1)

    self$MM  <- torch::nn_module_list(lapply(
      1:N_Layers,
      function(x) torch::nn_multihead_attention(n_embd, Head, dropout = p0, batch_first = TRUE)
    ))

    self$scale1 <- torch::nn_module_list(lapply(
      1:N_Layers,
      function(x) torch::nn_layer_norm(n_embd)
    ))

    self$scale2 <- torch::nn_module_list(lapply(
      1:N_Layers,
      function(x) torch::nn_layer_norm(n_embd)
    ))

    self$scale3 <- torch::nn_layer_norm(n_embd, elementwise_affine = TRUE)

    self$FFN <- torch::nn_module_list(lapply(
      1:N_Layers,
      function(x) {
        torch::nn_sequential(
          torch::nn_linear(n_embd, 4 * n_embd),
          torch::nn_gelu(),
          torch::nn_linear(4 * n_embd, n_embd),
          torch::nn_dropout(p0)
        )
      }
    ))

    # cabe√ßa linear de sa√≠da (mantive seu nome ln_f)
    self$ln_f  <- torch::nn_linear(n_embd, nvoc, bias = FALSE)
    self$drop0 <- torch::nn_dropout(p = p0)
  },

  forward = function(x, return_intermediates = FALSE) {
    # x: (B, T)
    B <- x$size(1)
    T <- x$size(2)

    # posi√ß√µes 1..T (long)
    x1 <- torch::torch_arange(1, T,
      dtype = torch::torch_long(),
      device = x$device
    )

    # m√°scara causal (pro√≠be olhar para o futuro)
    wei <- torch::torch_triu(torch::torch_ones(T, T, device = x$device), diagonal = 1)$to(
      dtype = torch::torch_bool()
    )

    # embeddings
    output <- self$wte(x) + self$wpe(x1)$unsqueeze(1)  # (B, T, E)
    output <- self$drop0(output)

    # (opcional) inspe√ß√£o r√°pida
    # cat("wei shape:", as.character(wei$size()), "\n"); print(wei$to(dtype = torch_int()))
    # cat("x1 shape:", as.character(x1$size()), "\n"); print(x1)

    for (j in 1:self$N) {
      # pr√©-norm + aten√ß√£o multihead
      QKV <- self$scale1[[j]](output)  # (B, T, E) pois batch_first = TRUE
      attn_out <- self$MM[[j]](
        query = QKV, key = QKV, value = QKV,
        attn_mask = wei, need_weights = FALSE
      )[[1]]
      output <- output + attn_out

      # feed-forward com pr√©-norm
      output <- output + self$FFN[[j]](self$scale2[[j]](output))
    }

    # norm final + cabe√ßa linear ‚Üí logits (B, T, nvoc)
    output <- self$scale3(output)
    logits <- self$ln_f(output)

    if (return_intermediates) {
      return(list(
        x1     = x1$cpu(),
        wei    = wei$to(dtype = torch_int())$cpu(),
        out    = output$cpu(),
        logits = logits$cpu()
      ))
    }
    logits
  }
)

## Visualizando todas as estruturas que compoem o modelo:

In [133]:
library(torch)

# instanciar um modelo simples
model <- GPT(
  block_size = 8,
  n_embd = 16,
  N_Layers = 2,
  nvoc = 32,
  Head = 2
)

# entrada: 1 batch, T=8, tipo long (√≠ndices de tokens)
x <- torch_tensor(matrix(1:8, nrow = 1))

# executar e inspecionar intermedi√°rios
res <- model(x, return_intermediates = TRUE)
res$x1      # posi√ß√µes 1..T
res$wei     # m√°scara causal (T x T)

torch_tensor
 1
 2
 3
 4
 5
 6
 7
 8
[ CPULongType{8} ]

torch_tensor
 0  1  1  1  1  1  1  1
 0  0  1  1  1  1  1  1
 0  0  0  1  1  1  1  1
 0  0  0  0  1  1  1  1
 0  0  0  0  0  1  1  1
 0  0  0  0  0  0  1  1
 0  0  0  0  0  0  0  1
 0  0  0  0  0  0  0  0
[ CPUIntType{8,8} ]

## üß† Fluxo Computacional do Modelo `GPT` (Fun√ß√£o `forward()`)

```text
Entrada x  (tokens inteiros)
      ‚îÇ
      ‚ñº
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë  üîπ Embeddings                               ‚ïë
‚ïë  self$wte(x)  ‚Üí embedding sem√¢ntico          ‚ïë
‚ïë  self$wpe(x1) ‚Üí embedding posicional         ‚ïë
‚ïë  output = wte(x) + wpe(x1)                   ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
      ‚îÇ
      ‚ñº
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë  üîπ M√°scara causal (wei)                     ‚ïë
‚ïë  Matriz (T√óT) triangular superior = 1        ‚ïë
‚ïë  ‚Üí impede que o token veja o "futuro"        ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
      ‚îÇ
      ‚ñº
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë  üîÅ N vezes (para cada camada j)              ‚ïë
‚ïë  ‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê ‚ïë
‚ïë  ‚îÇ 1. Normaliza√ß√£o:  self$scale1[[j]](x)   ‚îÇ ‚ïë
‚ïë  ‚îÇ 2. Multi-Head Attention:                ‚îÇ ‚ïë
‚ïë  ‚îÇ    Q,K,V = output                       ‚îÇ ‚ïë
‚ïë  ‚îÇ    attn_out = self$MM[[j]](Q,K,V,mask)  ‚îÇ ‚ïë
‚ïë  ‚îÇ 3. Res√≠duo:  output ‚Üê output + attn_out ‚îÇ ‚ïë
‚ïë  ‚îÇ 4. Normaliza√ß√£o:  self$scale2[[j]](x)   ‚îÇ ‚ïë
‚ïë  ‚îÇ 5. Feed-Forward: self$FFN[[j]](...)     ‚îÇ ‚ïë
‚ïë  ‚îÇ 6. Res√≠duo:  output ‚Üê output + FFN_out  ‚îÇ ‚ïë
‚ïë  ‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
      ‚îÇ
      ‚ñº
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë  üîπ Normaliza√ß√£o final + Cabe√ßa Linear        ‚ïë
‚ïë  output ‚Üê self$scale3(output)                 ‚ïë
‚ïë  logits ‚Üê self$ln_f(output)                   ‚ïë
‚ïë  (dimens√µes: [batch, seq_len, vocabul√°rio])   ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
      ‚îÇ
      ‚ñº
Sa√≠da: `logits`  ‚Üí pontua√ß√µes para cada token poss√≠vel

## üß© Interpreta√ß√£o 

| **Etapa** | **Significado Matem√°tico** | **Interpreta√ß√£o Estat√≠stica** |
|------------|----------------------------|--------------------------------|
| **wte(x)** | embedding de palavras | converte cada token discreto em vetor cont√≠nuo ( x‚Çú ‚àà ‚Ñù·µê ) |
| **wpe(x1)** | embedding posicional | injeta informa√ß√£o de ordem (posi√ß√£o temporal) |
| **wei** | m√°scara causal | implementa  P(X‚Çú‚Çä‚ÇÅ ‚à£ X‚ÇÅ:‚Çú),  proibindo olhar para  X‚Çç‚Çä‚Çú‚Çé |
| **MM (multihead)** | autoaten√ß√£o | estima depend√™ncias condicionais entre tokens |
| **FFN** | feed-forward | mistura n√£o linear ‚Äî ajusta as representa√ß√µes locais |
| **ln_f** | camada linear final | converte o espa√ßo latente em logits para o vocabul√°rio |
| **logits** | sa√≠da final | aproxima  PÃÇ_Œ∏(X‚Çú‚Çä‚ÇÅ = v·µ¢ ‚à£ X‚ÇÅ:‚Çú) |

---

### üìò **Resumo**

O `forward()` implementa o c√°lculo da **verossimilhan√ßa condicional**  
f_Œ∏(X‚Çú‚Çä‚ÇÅ ‚à£ X‚ÇÅ:‚Çú)  
atrav√©s de uma sequ√™ncia de transforma√ß√µes:

‚û°Ô∏è **embeddings ‚Üí aten√ß√£o ‚Üí normaliza√ß√µes ‚Üí logits**

---

### Gerando o texto referencia de treino para o modelo

In [134]:
# ============================================================
# Gera√ß√£o de dados ABC ‚Äî sequ√™ncias Markovianas
# ============================================================

file_name="ABC.txt"
# Defini√ß√£o do vocabul√°rio
voc <- c("AABCBBC", "BCABCCA", "CAAACB\n\n")
#p <- rbind(c(0.4,0.4,0.2), c(0.6, 0.2, 0.2), c(0.7, 0.2, 0.1))
p <- list()
p[[1]] <- cbind(c(0.4, 0.4, 0.2))
set.seed(1)
M = 150000
# for( i in 2:M)
#    p[[t]] = p%*%p[[i - 1]]

aux = function(s) voc[sample(1:3, 1, prob=p[[1]])]
ABC <- paste(sapply(1:M, aux), collapse="")
write(ABC[1], file=file_name)



## Extra√ß√£o do vocabul√°rio do texto

In [135]:
#encoding into token ids

file <- base::readChar(config$file_name, file.info(config$file_name)$size)
voc <- c("<PAD>", sort(unique(unlist(strsplit(file, "")))))
print(voc)

[1] "<PAD>" "\n"    "A"     "B"     "C"    


In [136]:
Encoder = function(file = file0, vocabulary = voc){
  file = unlist(strsplit(file, ""))
  filex = numeric(length(file))
  for(i in 1:length(vocabulary)){
    filex[file == vocabulary[i]] <- i
  }
  return(filex)
}

Decoder = function(file = file1, vocabulary = voc){
  filex = file
  for(i in 1:length(vocabulary)){
    filex[file == i] <- vocabulary[i]
  }
  return(filex)
}

In [137]:
encoded <- Encoder(file = file, vocabulary = voc)
nvoc <- length(voc)
print(nvoc)

[1] 5


In [138]:
# texto de exemplo
file0 <- "ABA C\nC"
file0 <- gsub(" ", "", file0)   # (se quiser tirar espa√ßos)

# encoder ‚Üí √≠ndices
enc <- Encoder(file = file0, vocabulary = voc)
enc
# ex.: 3 4 3 5 2 5

# decoder ‚Üí volta para s√≠mbolos
dec <- Decoder(file = enc, vocabulary = voc)
dec
paste(dec, collapse = "")
# deve reconstruir exatamente 'file0'

### Defini√ß√£o e Treino do modelo

In [139]:
Model <- GPT(block_size = config$block_size,
                n_embd = config$n_embd,
                N_Layers = config$N_Layers,
                nvoc = nvoc,
                Head = config$Head)
                

In [76]:
params <- Model$named_parameters()

for (p in names(params)) {
  cat(sprintf("%-20s %s ‚Üí %d par√¢metros\n",
              p,
              paste(dim(params[[p]]), collapse = " x "),
              prod(dim(params[[p]]))))
}

total_params <- sum(sapply(params, function(x) prod(dim(x))))
cat("Total de par√¢metros no modelo:", total_params, "\n")

wpe.weight           16 x 128 ‚Üí 2048 par√¢metros
wte.weight           5 x 128 ‚Üí 640 par√¢metros
MM.0.out_proj.weight 128 x 128 ‚Üí 16384 par√¢metros
MM.0.out_proj.bias   128 ‚Üí 128 par√¢metros
MM.0.in_proj_weight  384 x 128 ‚Üí 49152 par√¢metros
MM.0.in_proj_bias    384 ‚Üí 384 par√¢metros
MM.1.out_proj.weight 128 x 128 ‚Üí 16384 par√¢metros
MM.1.out_proj.bias   128 ‚Üí 128 par√¢metros
MM.1.in_proj_weight  384 x 128 ‚Üí 49152 par√¢metros
MM.1.in_proj_bias    384 ‚Üí 384 par√¢metros
scale1.0.weight      128 ‚Üí 128 par√¢metros
scale1.0.bias        128 ‚Üí 128 par√¢metros
scale1.1.weight      128 ‚Üí 128 par√¢metros
scale1.1.bias        128 ‚Üí 128 par√¢metros
scale2.0.weight      128 ‚Üí 128 par√¢metros
scale2.0.bias        128 ‚Üí 128 par√¢metros
scale2.1.weight      128 ‚Üí 128 par√¢metros
scale2.1.bias        128 ‚Üí 128 par√¢metros
scale3.weight        128 ‚Üí 128 par√¢metros
scale3.bias          128 ‚Üí 128 par√¢metros
FFN.0.0.weight       512 x 128 ‚Üí 65536 par√¢metros
FFN.0

## Sanity check

Aplicacao da softmax no modelo n√£o treinado ainda.


In [140]:
# 1) modelo
Model <- GPT(
  block_size = config$block_size,
  n_embd     = config$n_embd,
  N_Layers   = config$N_Layers,
  nvoc       = nvoc,            # tamanho do vocabul√°rio que voc√™ calculou
  Head       = config$Head,
  p0         = config$p0
)

# 2) entrada v√°lida
T <- 8
x <- torch::torch_tensor(matrix(sample(1:nvoc, T, replace = TRUE), nrow = 1),
                         dtype = torch::torch_long())

# 3) forward -> logits (B, T, nvoc)
logits <- Model(x)

# 4) softmax no √∫ltimo eixo -> probas
p <- torch::nnf_softmax(logits, dim = -1)

# 5) verifique que cada distribui√ß√£o em (b, t, :) soma 1
torch::torch_sum(p[1, 1, ])    # deve imprimir ~1
torch::torch_sum(p, dim = -1)  # vetor (B, T) todo de 1's

torch_tensor
1
[ CPUFloatType{} ][ grad_fn = <SumBackward0> ]

torch_tensor
 1.0000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000
[ CPUFloatType{1,8} ][ grad_fn = <SumBackward1> ]

### Defini√ß√£o do modelo de treino e do modelo de teste

In [141]:
file0 <- readChar("ABC.txt", file.info("ABC.txt")$size)
voc <- c("<PAD>", sort(unique(unlist(strsplit(file0, "")))))
Encoded <- Encoder(file = file0, vocabulary = voc)

In [142]:
p_train = 0.8

# Divis√£o dos dados
n <- length(Encoded)

BD.train <- torch_tensor(Encoded[1:round(p_train*n)],dtype=torch_int())
BD.test <- torch_tensor(Encoded[round(p_train*n+1):n], dtype = torch_int())


### Treino do modelo

In [143]:
for(i in 1:config$epochs){

    # 1) posi√ß√µes iniciais aleat√≥rias (garante que caiba um bloco + 1)
    idx = sample(1:(round(p_treino*n) - config$batch_size), config$batch_size)

    # 2) para cada idx, empilha a sequ√™ncia idx + 0..block_size
    #    (forma: (batch_size) x (block_size+1)) e depois "flatten"
    idx2 = as.integer(c(t(pmin(outer(as.integer(idx), 0:config$block_size, '+'), n))))

    Z <- BD.train[idx2, drop = FALSE]$view(c(length(idx), config$block_size + 1))
    X <- Z[,1:config$block_size]
    Y <- Z[,2:(config$block_size+1)]
    
    Model$train()          # ativa modo de treino (dropout etc.)
    FIT <- Model(X)        # forward pass
    loss0 <- loss(FIT$view(c(-1, FIT$size(-1))), v$view(-1))

    print(loss0$item())

}

ERROR: Error in loss(FIT$view(c(-1, FIT$size(-1))), v$view(-1)): n√£o foi poss√≠vel encontrar a fun√ß√£o "loss"


In [144]:
optimizer <- torch::optim_adamw(Model$parameters, lr= config$lr)
loss0 <- torch::nn_cross_entropy_loss()
loss_store <- numeric(config$epochs)

for (i in 1:config$epochs) {

  # 1) posi√ß√µes iniciais aleat√≥rias (garante que caiba um bloco+1)
  idx  <- sample(1:(round(p_treino * n) - config$batch_size), config$batch_size)

  # 2) para cada idx, empilha a sequ√™ncia idx + 0..block_size  (flatten depois)
  idx2 <- as.integer(c(t(pmin(outer(as.integer(idx), 0:config$block_size, `+`), n))))

  # 3) monta os pares (X, Y) com janela deslizante de tamanho block_size
  Z <- BD.train[idx2, drop = FALSE]$view(c(length(idx), config$block_size + 1))
  X <- Z[, 1:config$block_size]
  Y <- Z[, 2:(config$block_size + 1)]

  # 4) forward + perda
  FIT  <- Model$train()(X)   # chama o forward
  loss = loss0(FIT$flatten(end_dim=2), Y$flatten())
  optimizer$zero_grad()
  loss$backward()
  optimizer$step()
  loss_store[i] = loss$item()
  cli::cli_progress_message(paste("Epoca: ", i, "Train loss: ", loss_store[i]))
}

Epoca:  1 Train loss:  1.75601923465729

Epoca:  2 Train loss:  1.13897490501404

Epoca:  3 Train loss:  1.05270624160767

Epoca:  4 Train loss:  1.00460243225098

Epoca:  5 Train loss:  0.943425834178925

Epoca:  6 Train loss:  0.990464210510254

Epoca:  7 Train loss:  0.965478360652924

Epoca:  8 Train loss:  0.957518935203552

Epoca:  9 Train loss:  0.955720484256744

Epoca:  10 Train loss:  0.923735499382019

Epoca:  11 Train loss:  0.990632891654968

Epoca:  12 Train loss:  0.959781885147095

Epoca:  13 Train loss:  0.935571908950806

Epoca:  14 Train loss:  0.924338757991791

Epoca:  15 Train loss:  0.937005639076233

Epoca:  16 Train loss:  0.911421239376068

Epoca:  17 Train loss:  0.84186840057373

Epoca:  18 Train loss:  0.929946541786194

Epoca:  19 Train loss:  0.895896375179291

Epoca:  20 Train loss:  0.89698988199234

Epoca:  21 Train loss:  0.84135639667511

Epoca:  22 Train loss:  0.875084400177002

Epoca:  23 Train loss:  0.848853588104248

Epoca:  24 Train loss:  0.8

# Prevendo o pr√≥ximo token

In [145]:
prompt = "A"
x = Encoder(prompt)
x = torch_tensor(x, dtype=torch_int())$unsqueeze(1)
nnf_softmax(Model$eval()(x), -1)

#Proximo token mais prov√°vel = B P(B)=0.4691

torch_tensor
(1,.,.) = 
  0.0000  0.0010  0.3372  0.5416  0.1202
[ CPUFloatType{1,1,5} ][ grad_fn = <SoftmaxBackward0> ]

### N√£o precisa nem aplicar a softmax, basta pegar a saida maxima da funcao logitus.

In [146]:
prompt = "A"
x = Encoder(prompt)
x = torch_tensor(x, dtype=torch_int())$unsqueeze(1)
next_token = torch_argmax(Model$eval()(x), -1)
print(next_token)

torch_tensor
 4
[ CPULongType{1,1} ]


In [147]:
prompt = "A"
x = Encoder(prompt)
x = torch_tensor(x, dtype=torch_int())$unsqueeze(1)
next_token = torch_argmax(Model$eval()(x), -1)
print("token 2, dado token 1")
print(next_token)
x <- torch_cat(list(x, next_token), -1)
next_token = torch_argmax(Model$eval()(x)[,-1], -1)
print("token 3, dado token 2")
print(next_token)

[1] "token 2, dado token 1"
torch_tensor
 4
[ CPULongType{1,1} ]
[1] "token 3, dado token 2"
torch_tensor
 5
[ CPULongType{1} ]


In [148]:
data.frame(
  indice = seq_along(voc),
  token = voc
)

indice,token
<int>,<chr>
1,<PAD>
2,
3,A
4,B
5,C


In [149]:
prompt = "A"
x = Encoder(prompt)
x = torch_tensor(x, dtype=torch_int())$unsqueeze(1)
next_token = torch_argmax(Model$eval()(x), -1)
print("token 2, dado token 1")
print(next_token)
x <- torch_cat(list(x, next_token), -1)
next_token = torch_argmax(Model$eval()(x)[,-1], -1)
print("token 3, dado token 2")
print(next_token)



[1] "token 2, dado token 1"
torch_tensor
 4
[ CPULongType{1,1} ]
[1] "token 3, dado token 2"
torch_tensor
 5
[ CPULongType{1} ]


In [150]:
prompt <- "A"
x <- Encoder(prompt)
x <- torch_tensor(x, dtype = torch_int())$unsqueeze(1)  # [1, T]

# gerar mais 2 passos (exemplo)
Model$eval()
# passo 1
logits <- Model(x)[, -1, ]                 # [1, V] (√∫ltima posi√ß√£o de tempo)
next_token <- torch_argmax(logits, dim = -1)$unsqueeze(2)  # [1, 1]
x <- torch_cat(list(x, next_token), dim = 2)               # [1, T+1]


# passo 2
logits <- Model(x)[, -1, ]                 # [1, V]
next_token <- torch_argmax(logits, dim = -1)$unsqueeze(2)  # [1, 1]
x <- torch_cat(list(x, next_token), dim = 2)               # [1, T+2]

print(x)

torch_tensor
 3  4  5
[ CPULongType{1,3} ]


## Gerando um texto, considerando a janela de contexto para previ√£o do pr√≥ximo token

M√©tod Greedy

In [151]:
# gera texto a partir de um prompt
generate <- function(prompt, max_new = config$max_new_tokens) {
  Model$eval()  # modo avalia√ß√£o (dropout desligado)
  with_no_grad({
    # encode ‚Üí tensor [B=1, T]
    x <- torch_tensor(Encoder(prompt), dtype = torch_int())$unsqueeze(1)

    for (i in 1:max_new) {
      # 1) recorte de contexto (janela causal)
      if (x$size(2) <= config$block_size) {
        ctx <- x
      } else {
        T <- x$size(2)
        ctx <- x[, (T - config$block_size + 1):T]
      }

      # 2) forward e pega o √∫ltimo passo temporal
      logits <- Model(ctx)            # [B, T_ctx, V]
      last_logits <- logits[, -1, ]   # [B, V]

      # 3) escolhe pr√≥ximo token e concatena no dim=2
      next_token <- torch_argmax(last_logits, dim = -1)$unsqueeze(2)  # [B,1]
      x <- torch_cat(list(x, next_token), dim = 2)                     # [B,T+1]
    }

    # decode
    generated_idx <- as.integer(as_array(x$squeeze(1)))
    paste(voc[generated_idx], collapse = "")
  })
}

# exemplo
cat(generate("A"), "\n")

ABCBBCBCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABCABCCABC 


## Gerando texto com m√©todo topK(2)
Corre√ß√£o: colocar uma entropia na escolha do proximo token, entre os dois mais prov√°veis.

In [152]:
for (i in 1:config$max_new_tokens) {

  if (x$size(2) <= config$block_size) {
    logits   <- Model$eval()(x)[, -1, ]
    logits   <- logits$topk(k_top)                       # [[1]] valores, [[2]] √≠ndices
    vals     <- logits[[1]]$to(dtype = torch_float())    # <<< garante float
    probs    <- torch::nnf_softmax(vals, dim = -1)       # <<< funcional (corre√ß√£o)
    selected <- torch_multinomial(probs, num_samples = 1)
    next_token <- logits[[2]][, selected$item()]$unsqueeze(1)
    x <- torch_cat(list(x, next_token), -1)
  } else {
    xx       <- x[, (x$size(2) - config$block_size + 1):x$size(2)]
    logits   <- Model$eval()(xx)[, -1, ]
    logits   <- logits$topk(k_top)
    vals     <- logits[[1]]$to(dtype = torch_float())    # <<< idem
    probs    <- torch::nnf_softmax(vals, dim = -1)       # <<< funcional
    selected <- torch_multinomial(probs, num_samples = 1)
    next_token <- logits[[2]][, selected$item()]$unsqueeze(1)
    x <- torch_cat(list(x, next_token), -1)
  }
cat(Decoder(as.numeric(next_token)))
}



BBCBCABCCABCABCCAAABCBBCBCABCCAAABCBBCBCABCCAAABCBBCBCABCCABCABCCAAABCBBCBCABCCAAABCBBCBCABCCAAABCBBCAABCBBCAABCBBCBCABCCAAABCBBCAABCBBCAABCBBCAABCBBCBCABCCAAABCBBCAABCBBCAABCBBCBCABCCABCABCCAAABCBBCBCABCCABCABCCABCABCCAAABCBBCBCABCCAAABCBBCBCABCCAAABCBBCBCABCCABCABCCABCABCCAAABCBBCBCABCCABCABCCAAABCBBCAABCBBCBCABCCAAABCBBCBCABCCABCABCCABCABCCAAABCBBCBCABCCABCABCCABCABCCABCABCCAAABCBBCBCABCCABCABCCAAABCBBCBCABCCABCABCCABCABCCAAABCBBCBCABCCAAABCBBCBCABCCAAABCBBCAABCBBCBCABCCAAABCBBCBCABCCAAABCBBCAABCBBCAABCBBCAABCBBCBCABCCABCABCCAAABCBBCAABCBBCAABCBBCBCABCCABCABCCAAABCBBCBCABCCAAABCBBCBCABCCAAABCBBCBCABCCAAABCBBCBCABCCABCABCCAAABCBBCBCABCCABCABCCAAABCBBCBCABCCABCABCCAAABCBBCBCABCCAAABCBBCBCAB