## Direitos Autorais Modelo ChatGPT em R:
Modelo apresentado na aula 27 do curso de estat√≠stica e Machine Learning do IME-USP.

Autoria: Prof. Alexandre Galv√£o Patriota

## üîπ Par√¢metros de Entrada do Modelo GPT

A fun√ß√£o de inicializa√ß√£o do modelo recebe os seguintes argumentos:

| **Par√¢metro** | **Significado** | **Interpreta√ß√£o** |
|----------------|-----------------|-------------------------------|
| **block_size** | Tamanho da janela de contexto (n√∫mero de tokens consecutivos considerados no bloco attention). | Define o comprimento da janela de condicionamento ‚Äî quantos tokens anteriores o modelo utiliza para prever o pr√≥ximo. |
| **n_embd** | Dimens√£o do vetor de embedding. | Tamanho do espa√ßo latente cont√≠nuo onde cada token √© representado por um vetor real. |
| **N_Layers** | N√∫mero de camadas do Transformer (profundidade da rede). | Quantas vezes o bloco ‚ÄúAten√ß√£o + Feed-Forward + Normaliza√ß√£o‚Äù √© repetido ao longo da rede. |
| **nvoc** | Tamanho do vocabul√°rio. | N√∫mero de categorias poss√≠veis no modelo multinomial. Cada predi√ß√£o escolhe uma entre `nvoc` op√ß√µes (No m√©todo Greedy). |
| **head** | N√∫mero de *heads* de aten√ß√£o. | Quantas proje√ß√µes paralelas de aten√ß√£o s√£o calculadas ‚Äî cada *head* modela um tipo distinto de depend√™ncia contextual. |
| **p0** | Taxa de *dropout* (padr√£o 0.1). | Probabilidade de zerar aleatoriamente algumas ativa√ß√µes durante o treino, reduzindo o *overfitting*. |

---

In [5]:
library(torch)

GPT <- torch::nn_module(
  initialize = function(block_size, n_embd, N_Layers, nvoc, Head, p0 = 0.1) {

    self$N   <- N_Layers
    self$wpe <- torch::nn_embedding(block_size, n_embd)
    self$wte <- torch::nn_embedding(nvoc, n_embd, padding_idx = 1)

    self$MM  <- torch::nn_module_list(lapply(
      1:N_Layers,
      function(x) torch::nn_multihead_attention(n_embd, Head, dropout = p0, batch_first = TRUE)
    ))

    self$scale1 <- torch::nn_module_list(lapply(
      1:N_Layers,
      function(x) torch::nn_layer_norm(n_embd)
    ))

    self$scale2 <- torch::nn_module_list(lapply(
      1:N_Layers,
      function(x) torch::nn_layer_norm(n_embd)
    ))

    self$scale3 <- torch::nn_layer_norm(n_embd, elementwise_affine = TRUE)

    self$FFN <- torch::nn_module_list(lapply(
      1:N_Layers,
      function(x) {
        torch::nn_sequential(
          torch::nn_linear(n_embd, 4 * n_embd),
          torch::nn_gelu(),
          torch::nn_linear(4 * n_embd, n_embd),
          torch::nn_dropout(p0)
        )
      }
    ))

    # cabe√ßa linear de sa√≠da (mantive seu nome ln_f)
    self$ln_f  <- torch::nn_linear(n_embd, nvoc, bias = FALSE)
    self$drop0 <- torch::nn_dropout(p = p0)
  },

  forward = function(x, return_intermediates = FALSE) {
    # x: (B, T)
    B <- x$size(1)
    T <- x$size(2)

    # posi√ß√µes 1..T (long)
    x1 <- torch::torch_arange(1, T,
      dtype = torch::torch_long(),
      device = x$device
    )

    # m√°scara causal (pro√≠be olhar para o futuro)
    wei <- torch::torch_triu(torch::torch_ones(T, T, device = x$device), diagonal = 1)$to(
      dtype = torch::torch_bool()
    )

    # embeddings
    output <- self$wte(x) + self$wpe(x1)$unsqueeze(1)  # (B, T, E)
    output <- self$drop0(output)

    # (opcional) inspe√ß√£o r√°pida
    # cat("wei shape:", as.character(wei$size()), "\n"); print(wei$to(dtype = torch_int()))
    # cat("x1 shape:", as.character(x1$size()), "\n"); print(x1)

    for (j in 1:self$N) {
      # pr√©-norm + aten√ß√£o multihead
      QKV <- self$scale1[[j]](output)  # (B, T, E) pois batch_first = TRUE
      attn_out <- self$MM[[j]](
        query = QKV, key = QKV, value = QKV,
        attn_mask = wei, need_weights = FALSE
      )[[1]]
      output <- output + attn_out

      # feed-forward com pr√©-norm
      output <- output + self$FFN[[j]](self$scale2[[j]](output))
    }

    # norm final + cabe√ßa linear ‚Üí logits (B, T, nvoc)
    output <- self$scale3(output)
    logits <- self$ln_f(output)

    if (return_intermediates) {
      return(list(
        x1    = x1$to("cpu"),
        wei   = wei$to(dtype = torch_int())$to("cpu"),
        out   = output$to("cpu"),
        logits = logits$to("cpu")
      ))
    }
    logits
  }
)

In [10]:
# Criar uma inst√¢ncia do modelo
model <- GPT(
  block_size = 4, #8
  n_embd = 8, #16
  N_Layers = 2, #2
  nvoc = 4, #20
  Head = 2 #2
)

In [11]:
# Criar uma entrada fict√≠cia (batch de tokens)
x <- torch::torch_tensor(matrix(c(1,2,3,4,5,6,7,8), nrow = 1))

# Executar o modelo
res <- model(x, return_intermediates = TRUE)

# Visualizar vari√°veis intermedi√°rias
res$x1
res$wei

ERROR: Error in (function (weight, indices, padding_idx, scale_grad_by_freq, : Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got CPUFloatType instead (while checking arguments for embedding)
Exception raised from checkScalarTypes at /Users/runner/work/libtorch-mac-m1/libtorch-mac-m1/pytorch/aten/src/ATen/TensorUtils.cpp:203 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>) + 52 (0x10456c55c in libc10.dylib)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) + 140 (0x1045691ac in libc10.dylib)
frame #2: at::checkScalarTypes(char const*, at::TensorArg const&, c10::ArrayRef<c10::ScalarType>) + 480 (0x3000c36a0 in libtorch_cpu.dylib)
frame #3: at::native::embedding_symint(at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool) + 120 (0x3004cdf7c in libtorch_cpu.dylib)
frame #4: at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeExplicitAutograd__embedding(at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool) + 84 (0x3016905d8 in libtorch_cpu.dylib)
frame #5: c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeExplicitAutograd__embedding(at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool)>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool>>, at::Tensor (at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool) + 52 (0x30173fccc in libtorch_cpu.dylib)
frame #6: at::Tensor c10::Dispatcher::redispatch<at::Tensor, at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool>(c10::TypedOperatorHandle<at::Tensor (at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool)> const&, c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool) const + 132 (0x3012f16e4 in libtorch_cpu.dylib)
frame #7: at::_ops::embedding::redispatch(c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool) + 156 (0x30123e268 in libtorch_cpu.dylib)
frame #8: c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool), &torch::autograd::VariableType::(anonymous namespace)::embedding(c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool)>, at::Tensor, c10::guts::typelist::typelist<c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool>>, at::Tensor (c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool) + 1140 (0x303a22ef8 in libtorch_cpu.dylib)
frame #9: at::_ops::embedding::call(at::Tensor const&, at::Tensor const&, c10::SymInt, bool, bool) + 364 (0x30123dc24 in libtorch_cpu.dylib)
frame #10: at::embedding(at::Tensor const&, at::Tensor const&, long long, bool, bool) + 120 (0x10aaa652c in liblantern.dylib)
frame #11: _lantern_embedding_tensor_tensor_intt_bool_bool + 180 (0x10aaa5ea8 in liblantern.dylib)
frame #12: cpp_torch_namespace_embedding_weight_Tensor_indices_Tensor(XPtrTorchTensor, XPtrTorchIndexTensor, XPtrTorchint64_t, XPtrTorchbool, XPtrTorchbool) + 104 (0x109f99528 in torchpkg.so)
frame #13: _torch_cpp_torch_namespace_embedding_weight_Tensor_indices_Tensor + 620 (0x109a76aac in torchpkg.so)
frame #14: R_doDotCall + 1612 (0x1016eff4c in libR.dylib)
frame #15: bcEval_loop + 128100 (0x10174c2a4 in libR.dylib)
frame #16: bcEval + 684 (0x10171f46c in libR.dylib)
frame #17: Rf_eval + 556 (0x10171eb6c in libR.dylib)
frame #18: R_execClosure + 812 (0x10172172c in libR.dylib)
frame #19: applyClosure_core + 164 (0x101720824 in libR.dylib)
frame #20: Rf_eval + 1224 (0x10171ee08 in libR.dylib)
frame #21: do_docall + 644 (0x1016bd484 in libR.dylib)
frame #22: bcEval_loop + 40204 (0x101736b4c in libR.dylib)
frame #23: bcEval + 684 (0x10171f46c in libR.dylib)
frame #24: Rf_eval + 556 (0x10171eb6c in libR.dylib)
frame #25: R_execClosure + 812 (0x10172172c in libR.dylib)
frame #26: applyClosure_core + 164 (0x101720824 in libR.dylib)
frame #27: Rf_eval + 1224 (0x10171ee08 in libR.dylib)
frame #28: do_begin + 396 (0x101723fcc in libR.dylib)
frame #29: Rf_eval + 1012 (0x10171ed34 in libR.dylib)
frame #30: R_execClosure + 812 (0x10172172c in libR.dylib)
frame #31: applyClosure_core + 164 (0x101720824 in libR.dylib)
frame #32: Rf_eval + 1224 (0x10171ee08 in libR.dylib)
frame #33: Rf_evalList + 204 (0x10171f80c in libR.dylib)
frame #34: Rf_eval + 1312 (0x10171ee60 in libR.dylib)
frame #35: do_set + 360 (0x101724fe8 in libR.dylib)
frame #36: Rf_eval + 1012 (0x10171ed34 in libR.dylib)
frame #37: do_begin + 396 (0x101723fcc in libR.dylib)
frame #38: Rf_eval + 1012 (0x10171ed34 in libR.dylib)
frame #39: R_execClosure + 812 (0x10172172c in libR.dylib)
frame #40: applyClosure_core + 164 (0x101720824 in libR.dylib)
frame #41: Rf_eval + 1224 (0x10171ee08 in libR.dylib)
frame #42: do_set + 360 (0x101724fe8 in libR.dylib)
frame #43: Rf_eval + 1012 (0x10171ed34 in libR.dylib)
frame #44: do_eval + 1352 (0x101726048 in libR.dylib)
frame #45: bcEval_loop + 40204 (0x101736b4c in libR.dylib)
frame #46: bcEval + 684 (0x10171f46c in libR.dylib)
frame #47: Rf_eval + 556 (0x10171eb6c in libR.dylib)
frame #48: forcePromise + 232 (0x10171f6a8 in libR.dylib)
frame #49: Rf_eval + 660 (0x10171ebd4 in libR.dylib)
frame #50: do_withVisible + 64 (0x101726380 in libR.dylib)
frame #51: do_internal + 400 (0x10178f190 in libR.dylib)
frame #52: bcEval_loop + 40764 (0x101736d7c in libR.dylib)
frame #53: bcEval + 684 (0x10171f46c in libR.dylib)
frame #54: Rf_eval + 556 (0x10171eb6c in libR.dylib)
frame #55: R_execClosure + 812 (0x10172172c in libR.dylib)
frame #56: applyClosure_core + 164 (0x101720824 in libR.dylib)
frame #57: Rf_eval + 1224 (0x10171ee08 in libR.dylib)
frame #58: do_begin + 396 (0x101723fcc in libR.dylib)
frame #59: Rf_eval + 1012 (0x10171ed34 in libR.dylib)
frame #60: Rf_eval + 1012 (0x10171ed34 in libR.dylib)
frame #61: Rf_eval + 1012 (0x10171ed34 in libR.dylib)
frame #62: do_begin + 396 (0x101723fcc in libR.dylib)

