In [1]:
import $file.^.Magic

[32mimport [39m[36m$file.$[39m

In [2]:
case class GPTConfig(
  vocabularySize: Int,
  contextLength: Int,
  embeddingDimension: Int,
  attentionHeadsCount: Int,
  layersCount: Int,
  dropoutRate: Double,
  queryKeyValueBias: Boolean
)

val gptConfig = GPTConfig(
  vocabularySize = 50257,
  contextLength = 1024,
  embeddingDimension = 768,
  attentionHeadsCount = 12,
  layersCount = 12,
  dropoutRate = 0.1,
  queryKeyValueBias = true
)

defined [32mclass[39m [36mGPTConfig[39m
[36mgptConfig[39m: [32mGPTConfig[39m = [33mGPTConfig[39m(
  vocabularySize = [32m50257[39m,
  contextLength = [32m1024[39m,
  embeddingDimension = [32m768[39m,
  attentionHeadsCount = [32m12[39m,
  layersCount = [32m12[39m,
  dropoutRate = [32m0.1[39m,
  queryKeyValueBias = [32mtrue[39m
)

In [3]:
Magic.!("pip", "install", "torch==2.4.*")




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: pip install --upgrade pip


In [4]:
import $ivy.`dev.scalapy::scalapy-core:0.5.3`

import me.shadaj.scalapy.py

val torch = py.module("torch")

[32mimport [39m[36m$ivy.$[39m
[32mimport [39m[36mme.shadaj.scalapy.py[39m
[36mtorch[39m: [32mpy[39m.[32mModule[39m = <module 'torch' from '/usr/local/lib/python3.12/site-packages/torch/__init__.py'>

In [5]:
import py.PyQuote

type TorchTensor = py.Dynamic

// Workaround to define a class that inherits from a Python class
py.exec {
  s"""import torch.nn as nn
     |
     |class MultiHeadAttention(nn.Module):
     |  def __init__(self, init):
     |    super().__init__()
     |    init(self)
     |""".stripMargin
}
def MultiHeadAttention(
  inputDimension: Int,
  outputDimension: Int,
  dropoutProbability: Double,
  contextLength: Int,
  headsCount: Int,
  queryKeyValueBias: Boolean
): py.Dynamic = {
  assert(outputDimension % headsCount == 0, "Output dimension must be a multiple of heads count")
  val headDimension = outputDimension / headsCount
    
  val init = (self: py.Dynamic) => {
    self.weightsQuery = torch.nn.Linear(inputDimension, outputDimension, bias = queryKeyValueBias)
    self.weightsKey = torch.nn.Linear(inputDimension, outputDimension, bias = queryKeyValueBias)
    self.weightsValue = torch.nn.Linear(inputDimension, outputDimension, bias = queryKeyValueBias)
    self.outputProjection = torch.nn.Linear(outputDimension, outputDimension)
    self.dropout = torch.nn.Dropout(dropoutProbability)
    self.register_buffer("mask", torch.triu(torch.ones(contextLength, contextLength), diagonal = 1))
      
    val forward = (batchedInputs: TorchTensor) => {
      val (batchesCount, tokensCount, tokenDimension) = batchedInputs.shape.as[(Int, Int, Int)]
      val queries = self.weightsQuery(batchedInputs)
        .view(batchesCount, tokensCount, headsCount, headDimension)
        .transpose(1, 2)
      val keys = self.weightsKey(batchedInputs)
        .view(batchesCount, tokensCount, headsCount, headDimension)
        .transpose(1, 2)
      val values = self.weightsValue(batchedInputs)
        .view(batchesCount, tokensCount, headsCount, headDimension)
        .transpose(1, 2)
      val attentionScores = py"$queries @ $keys.transpose(2, 3)"
      attentionScores.masked_fill_(py"${self.mask}.bool()[:$tokensCount, :$tokensCount]", -torch.inf)
      val attentionWeights = self.dropout(torch.softmax(py"$attentionScores / $headDimension**0.5", dim = -1))
      self.outputProjection(
        py"$attentionWeights @ $values"
          .transpose(1, 2)
          .reshape(batchesCount, tokensCount, outputDimension)
      )
    }
    self.forward = forward
  }
  py.Dynamic.global.MultiHeadAttention(init)
}

[32mimport [39m[36mpy.PyQuote[39m
defined [32mtype[39m [36mTorchTensor[39m
defined [32mfunction[39m [36mMultiHeadAttention[39m

In [6]:
// Workaround to define a class that inherits from a Python class
// Because it mostly uses Python operators, it's implemented fully in Python
py.exec {
  s"""import torch
     |import torch.nn as nn
     |
     |class GELU(nn.Module):
     |  def __init__(self):
     |    super().__init__()
     |
     |  def forward(self, inputs):
     |    return 0.5 * inputs * (
     |      1 + torch.tanh(
     |        torch.sqrt(torch.tensor(2.0 / torch.pi)) * (inputs + 0.044715 * torch.pow(inputs, 3))
     |      )
     |    )
     |""".stripMargin
}
def GELU() = py.Dynamic.global.GELU()

defined [32mfunction[39m [36mGELU[39m

In [7]:
// Workaround to define a class that inherits from a Python class
py.exec {
  s"""import torch.nn as nn
     |
     |class FeedForward(nn.Module):
     |  def __init__(self, init):
     |    super().__init__()
     |    init(self)
     |""".stripMargin
}
def FeedForward(
  embeddingDimension: Int
): py.Dynamic = {
  val init = (self: py.Dynamic) => {
    self.layers = torch.nn.Sequential(
      torch.nn.Linear(embeddingDimension, 4 * embeddingDimension),
      GELU(),
      torch.nn.Linear(4 * embeddingDimension, embeddingDimension)
    )
      
    val forward = (inputs: TorchTensor) => self.layers(inputs)
    self.forward = forward
  }
  py.Dynamic.global.FeedForward(init)
}

defined [32mfunction[39m [36mFeedForward[39m

In [8]:
// Workaround to define a class that inherits from a Python class
py.exec {
  s"""import torch.nn as nn
     |
     |class NormalizationLayer(nn.Module):
     |  def __init__(self, init):
     |    super().__init__()
     |    init(self)
     |""".stripMargin
}
def NormalizationLayer(
  embeddingDimension: Int
): py.Dynamic = {
  val epsilon = 1e-5
  val init = (self: py.Dynamic) => {
    self.scale = torch.nn.Parameter(torch.ones(embeddingDimension))
    self.shift = torch.nn.Parameter(torch.zeros(embeddingDimension))
      
    val forward = (inputs: TorchTensor) => {
      val mean = inputs.mean(dim = -1, keepdim = true)
      val variance = inputs.`var`(dim = -1, keepdim = true, unbiased = false)
      val normalizedInputs = py"($inputs - $mean) / torch.sqrt($variance + $epsilon)"
      py"${self.scale} * $normalizedInputs + ${self.shift}"
    }
    self.forward = forward
  }
  py.Dynamic.global.NormalizationLayer(init)
}

defined [32mfunction[39m [36mNormalizationLayer[39m

In [9]:
import scala.util.chaining._

py.exec {
  s"""import torch.nn as nn
     |
     |class TransformerBlock(nn.Module):
     |  def __init__(self, init):
     |    super().__init__()
     |    init(self)
     |""".stripMargin
}
def TransformerBlock(
  config: GPTConfig
): py.Dynamic = {
  val init = (self: py.Dynamic) => {
    self.multiHeadAttention = MultiHeadAttention(
      inputDimension = config.embeddingDimension,
      outputDimension = config.embeddingDimension,
      dropoutProbability = config.dropoutRate,
      contextLength = config.contextLength,
      headsCount = config.attentionHeadsCount,
      queryKeyValueBias = config.queryKeyValueBias
    )
    self.feedForward = FeedForward(config.embeddingDimension)
    self.normalization1 = NormalizationLayer(config.embeddingDimension)
    self.normalization2 = NormalizationLayer(config.embeddingDimension)
    self.dropoutShortcut = torch.nn.Dropout(config.dropoutRate)
    
    val forward = (inputs: TorchTensor) => {
      val shortcut = inputs
      val newShortcut = inputs
        .pipe(self.normalization1(_))
        .pipe(self.multiHeadAttention(_))
        .pipe(self.dropoutShortcut(_))
        .pipe(o => py"$o + $shortcut")
      newShortcut
        .pipe(self.normalization2(_))
        .pipe(self.feedForward(_))
        .pipe(self.dropoutShortcut(_))
        .pipe(o => py"$o + $newShortcut")
    }
    self.forward = forward
  }
  py.Dynamic.global.TransformerBlock(init)
}

[32mimport [39m[36mscala.util.chaining._[39m
defined [32mfunction[39m [36mTransformerBlock[39m

In [10]:
// Workaround to define a class that inherits from a Python class
py.exec {
  s"""import torch.nn as nn
     |
     |class GPTModel(nn.Module):
     |  def __init__(self, init):
     |    super().__init__()
     |    init(self)
     |""".stripMargin
}
type Model = py.Dynamic
def GPTModel(
  config: GPTConfig
): Model = {
  val transformerBlocks = Seq.fill(config.layersCount)(TransformerBlock(config))
  val init = (self: py.Dynamic) => {
    self.tokenEmbeddingLayer = torch.nn.Embedding(config.vocabularySize, config.embeddingDimension)
    self.positionEmbeddingLayer = torch.nn.Embedding(config.contextLength, config.embeddingDimension)
    self.dropoutEmbeddingLayer = torch.nn.Dropout(config.dropoutRate)
    self.transformerBlocksLayer = py"nn.Sequential(*${transformerBlocks.toPythonProxy})"
    self.finalNormalizationLayer = NormalizationLayer(config.embeddingDimension)
    self.outputLayer = torch.nn.Linear(config.embeddingDimension, config.vocabularySize, bias = false)
      
    val forward = (batchedInputs: TorchTensor) => {
      val (_, sequenceLength) = batchedInputs.shape.as[(Int, Int)]
      val tokenEmbeddings = self.tokenEmbeddingLayer(batchedInputs)
      val positionEmbeddings = self.positionEmbeddingLayer(torch.arange(sequenceLength, device = batchedInputs.device))
      py"$tokenEmbeddings + $positionEmbeddings"
        .pipe(self.dropoutEmbeddingLayer(_))
        .pipe(self.transformerBlocksLayer(_))
        .pipe(self.finalNormalizationLayer(_))
        .pipe(self.outputLayer(_))
    }
    self.forward = forward
  }
  py.Dynamic.global.GPTModel(init)
}

defined [32mtype[39m [36mModel[39m
defined [32mfunction[39m [36mGPTModel[39m

In [11]:
val model = GPTModel(gptConfig)
val device = torch.device(if (torch.cuda.is_available().as[Boolean]) "cuda" else "cpu")
model.to(device)
val checkpoint = torch.load("model_and_optimizer.pth", map_location = device)
val modelStateKey = "model"
model.load_state_dict(checkpoint.bracketAccess(modelStateKey))

[36mmodel[39m: [32mModel[39m = GPTModel(
  (tokenEmbeddingLayer): Embedding(50257, 768)
  (positionEmbeddingLayer): Embedding(1024, 768)
  (dropoutEmbeddingLayer): Dropout(p=0.1, inplace=False)
  (transformerBlocksLayer): Sequential(
    (0): TransformerBlock(
      (multiHeadAttention): MultiHeadAttention(
        (weightsQuery): Linear(in_features=768, out_features=768, bias=True)
        (weightsKey): Linear(in_features=768, out_features=768, bias=True)
        (weightsValue): Linear(in_features=768, out_features=768, bias=True)
        (outputProjection): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feedForward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (normalization1): NormalizationLayer()
      (normalization2): Normaliza

In [12]:
Magic.!("pip", "install", "tiktoken==0.7.*")




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: pip install --upgrade pip


In [13]:
val tiktoken = py.module("tiktoken")
val tokenizer = tiktoken.get_encoding("gpt2")

[36mtiktoken[39m: [32mpy[39m.[32mModule[39m = <module 'tiktoken' from '/usr/local/lib/python3.12/site-packages/tiktoken/__init__.py'>
[36mtokenizer[39m: [32mpy[39m.[32mDynamic[39m = <Encoding 'gpt2'>

In [32]:
import py.SeqConverters

type Tokenizer = py.Dynamic

def generateTextSimple(
  model: Model,
  maxNewTokens: Int,
  contextLength: Int
)(
  encodedInput: Vector[Int]
): Vector[Int] =
  LazyList.iterate(encodedInput) { currentEncodedOutput =>
    py.local {
      val croppedInput = currentEncodedOutput.takeRight(contextLength)
      val logits = py.`with`(torch.no_grad()) { _ =>
        val inputTensor = torch.tensor(croppedInput.toPythonProxy).unsqueeze(0)
        model(inputTensor)
      }
      py"$logits[:, -1, :]"
        .pipe(torch.softmax(_, dim = -1))
        .pipe(torch.argmax(_, dim = -1, keepdim = true))
        .pipe(_.squeeze(0).tolist().as[Seq[Int]].head)
        .pipe(nextEncodedOutput => currentEncodedOutput :+ nextEncodedOutput)
    }
  }.drop(maxNewTokens).head

def textToTokenIds(
  text: String, 
  tokenizer: Tokenizer
): Vector[Int] = {
  val allowedSpecial = py.Dynamic.global.set(Seq("<|endoftext|>").toPythonProxy)
  val encodedText = tokenizer.encode(text, allowed_special = allowedSpecial)
  torch.tensor(encodedText).tolist().as[Vector[Int]]
}
    
def tokenIdsToText(
  tokenIds: Vector[Int], 
  tokenizer: Tokenizer
): String =
  tokenizer.decode(tokenIds.toPythonProxy).as[String]

[32mimport [39m[36mpy.SeqConverters[39m
defined [32mtype[39m [36mTokenizer[39m
defined [32mfunction[39m [36mgenerateTextSimple[39m
defined [32mfunction[39m [36mtextToTokenIds[39m
defined [32mfunction[39m [36mtokenIdsToText[39m

In [16]:
case class InstructionDataRecord(
  instruction: String,
  input: String,
  output: String
) {
  lazy val alpacaFormat: String = {
    val formattedInput = if (input.nonEmpty) s"\n### Input:\n$input\n" else ""
    s"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
       |
       |### Instruction:
       |$instruction
       |$formattedInput
       |### Response:
       |$output
       |""".stripMargin
  }
}

defined [32mclass[39m [36mInstructionDataRecord[39m

In [17]:
val testInputRecords = List(
  InstructionDataRecord(
    instruction = "Evaluate the following phrase by transforming it into the spelling given.",
    input = "freind --> friend",
    output = "The spelling of the given phrase \"freind\" is incorrect, the correct spelling is \"friend\"."
  ),
  InstructionDataRecord(
    instruction = "Edit the following sentence for grammar.",
    input = "He go to the park every day.",
    output = "He goes to the park every day."
  ),
  InstructionDataRecord(
    instruction = "Convert 45 kilometers to meters.",
    input = "",
    output = "45 kilometers is 45000 meters."
  )
)

[36mtestInputRecords[39m: [32mList[39m[[32mInstructionDataRecord[39m] = [33mList[39m(
  [33mInstructionDataRecord[39m(
    instruction = [32m"Evaluate the following phrase by transforming it into the spelling given."[39m,
    input = [32m"freind --> friend"[39m,
    output = [32m"The spelling of the given phrase \"freind\" is incorrect, the correct spelling is \"friend\"."[39m
  ),
  [33mInstructionDataRecord[39m(
    instruction = [32m"Edit the following sentence for grammar."[39m,
    input = [32m"He go to the park every day."[39m,
    output = [32m"He goes to the park every day."[39m
  ),
  [33mInstructionDataRecord[39m(
    instruction = [32m"Convert 45 kilometers to meters."[39m,
    input = [32m""[39m,
    output = [32m"45 kilometers is 45000 meters."[39m
  )
)

In [36]:
testInputRecords.foreach { record =>
  val inputText = record.alpacaFormat
  val decodedOutputText = py.local {
    val outputTextIds = generateTextSimple(
      model = model, 
      maxNewTokens = 256,
      contextLength = gptConfig.contextLength
    )(
      encodedInput = textToTokenIds(inputText, tokenizer)
    )
    tokenIdsToText(outputTextIds, tokenizer)
  }
  val responseText = decodedOutputText.drop(inputText.length)
  println(s"Correct response: ${record.output}")
  println(s"Model response: $responseText")
  println("---------------------------------------\n")
}

Correct response: The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".
Model response: <|endoftext|>, and the following sentence.
<|endoftext|>, and the following sentence.
<|endoftext|>, the word 'The the following sentence.
<|endoftext|>, and the word ', and the following sentence.
<|endoftext|>, and the following.

<|endoftext|>, the sentence.
<|endoftext|>.
<|endoftext|>, and the following sentence.

<|endoftext|>, and a, the word ', and the following sentence.
<|endoftext|>.
<|endoftext|>.

<|endoftext|> the following sentence.
<|endoftext|>
<|endoftext|> is a sentence.
<|endoftext|>, the following the following, the word ', the sentence.

<|endoftext|>, and the following the following sentence.
<|endoftext|>, and, and, the sentence.
<|endoftext|>
<|endoftext|>
<|endoftext|>, and, and the following
<|endoftext|>, the following sentence.
<|endoftext|>, the sentence.
<|endoftext|>, the following sentence.
<|endoftext|>, and the following sentence