In [5]:
import scala.io.Source

// Path of the plain-text corpus, relative to the notebook's working directory.
val filePath = "data/the_verdict.txt"

// Source.fromFile opens a file handle that mkString alone never releases,
// so the read is wrapped in try/finally to close it even if reading fails.
val rawText = {
  val source = Source.fromFile(filePath)
  try source.mkString
  finally source.close()
}

println(s"Read ${rawText.length} characters from $filePath")

Read 20479 characters from data/the_verdict.txt


[32mimport [39m[36mscala.io.Source[39m
[36mfilePath[39m: [32mString[39m = [32m"data/the_verdict.txt"[39m
[36mrawText[39m: [32mString[39m = [32m"""I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last 

In [6]:
/** Splits `text` into word and punctuation tokens.
  *
  * The text is cut immediately before and after every delimiter (one of
  * `, . : ; ? _ ! " ( ) '`, a double dash `--`, or a whitespace character),
  * so each delimiter becomes a token of its own. Pieces that are blank
  * (whitespace only or empty) are dropped.
  *
  * @param text the text to split
  * @return the non-blank tokens in their original order
  */
def tokenize(text: String): Vector[String] = {
  val delimiter = """[,.:;?_!"()\']|--|\s"""
  // Zero-width lookbehind/lookahead: split around delimiters without consuming them.
  val boundary = "(?<=" + delimiter + ")|(?=" + delimiter + ")"
  val pieces = text.split(boundary)
  pieces.iterator.filterNot(_.isBlank).toVector
}

// Quick check on a sentence that mixes commas, periods, a double dash and a question mark.
println(tokenize(text = "Hello, world. Is this-- a test?"))

Vector(Hello, ,, world, ., Is, this, --, a, test, ?)


defined [32mfunction[39m [36mtokenize[39m

In [7]:
// Tokenize the full text that was read from disk.
val tokenizedText: Vector[String] = tokenize(rawText)

println(s"Extracted ${tokenizedText.size} tokens")

Extracted 4690 tokens


[36mtokenizedText[39m: [32mVector[39m[[32mString[39m] = [33mVector[39m(
  [32m"I"[39m,
  [32m"HAD"[39m,
  [32m"always"[39m,
  [32m"thought"[39m,
  [32m"Jack"[39m,
  [32m"Gisburn"[39m,
  [32m"rather"[39m,
  [32m"a"[39m,
  [32m"cheap"[39m,
  [32m"genius"[39m,
  [32m"--"[39m,
  [32m"though"[39m,
  [32m"a"[39m,
  [32m"good"[39m,
  [32m"fellow"[39m,
  [32m"enough"[39m,
  [32m"--"[39m,
  [32m"so"[39m,
  [32m"it"[39m,
  [32m"was"[39m,
  [32m"no"[39m,
  [32m"great"[39m,
  [32m"surprise"[39m,
  [32m"to"[39m,
  [32m"me"[39m,
  [32m"to"[39m,
  [32m"hear"[39m,
  [32m"that"[39m,
  [32m","[39m,
  [32m"in"[39m,
  [32m"the"[39m,
  [32m"height"[39m,
  [32m"of"[39m,
  [32m"his"[39m,
  [32m"glory"[39m,
  [32m","[39m,
  [32m"he"[39m,
  [32m"had"[39m,
...

In [9]:
// Unique tokens in ascending String order; de-duplicating first means only
// the unique values need to be sorted.
val sortedDistinctTokens: Vector[String] = tokenizedText.distinct.sorted

println(s"${sortedDistinctTokens.size} distinct tokens in total")

1130 distinct tokens in total


[36msortedDistinctTokens[39m: [32mVector[39m[[32mString[39m] = [33mVector[39m(
  [32m"!"[39m,
  [32m"\""[39m,
  [32m"'"[39m,
  [32m"("[39m,
  [32m")"[39m,
  [32m","[39m,
  [32m"--"[39m,
  [32m"."[39m,
  [32m":"[39m,
  [32m";"[39m,
  [32m"?"[39m,
  [32m"A"[39m,
  [32m"Ah"[39m,
  [32m"Among"[39m,
  [32m"And"[39m,
  [32m"Are"[39m,
  [32m"Arrt"[39m,
  [32m"As"[39m,
  [32m"At"[39m,
  [32m"Be"[39m,
  [32m"Begin"[39m,
  [32m"Burlington"[39m,
  [32m"But"[39m,
  [32m"By"[39m,
  [32m"Carlo"[39m,
  [32m"Chicago"[39m,
  [32m"Claude"[39m,
  [32m"Come"[39m,
  [32m"Croft"[39m,
  [32m"Destroyed"[39m,
  [32m"Devonshire"[39m,
  [32m"Don"[39m,
  [32m"Dubarry"[39m,
  [32m"Emperors"[39m,
  [32m"Florence"[39m,
  [32m"For"[39m,
  [32m"Gallery"[39m,
  [32m"Gideon"[39m,
...

In [27]:
// Each distinct token gets its position in the sorted token list as its integer id.
val vocabulary: Map[String, Int] = sortedDistinctTokens.iterator.zipWithIndex.toMap

/** Word-level tokenizer that converts text to integer ids and back using a
  * fixed vocabulary.
  *
  * @param vocabulary mapping from every known token to its integer id
  */
class SimpleTokenizerV1(
  vocabulary: Map[String, Int]
) {
  /** Reverse lookup (id -> token) derived from `vocabulary`. */
  val inverseVocabulary: Map[Int, String] = vocabulary.map(_.swap)

  // Both patterns are compiled once per instance instead of on every call.
  private val splitBy = """[,.:;?_!"()\']|--|\s"""
  // Zero-width lookbehind/lookahead: split around delimiters without consuming them.
  private val tokenBoundary = s"(?<=$splitBy)|(?=$splitBy)".r
  // Whitespace run followed by a punctuation mark that decode re-attaches to
  // the preceding token. Includes ':' and ';' because tokenize splits on them too.
  private val spaceBeforePunctuation = """\s+([,.:;?!"()\'])""".r

  /** Tokenizes `text` and maps every token to its vocabulary id.
    *
    * @param text the text to encode
    * @return the ids of the tokens, in order
    * @throws NoSuchElementException if a token is not in the vocabulary
    */
  def encode(text: String): Vector[Int] =
    tokenize(text).map { token =>
      vocabulary.getOrElse(
        token,
        throw new NoSuchElementException(s"token not in vocabulary: '$token'")
      )
    }

  /** Splits `text` into word and punctuation tokens, dropping blank pieces.
    *
    * Delimiters are `, . : ; ? _ ! " ( ) '`, a double dash `--`, and
    * whitespace; each non-blank delimiter becomes its own token.
    *
    * @param text the text to split
    * @return the non-blank tokens in their original order
    */
  def tokenize(text: String): Vector[String] =
    tokenBoundary.split(text).filter(!_.isBlank).toVector

  /** Maps ids back to tokens and joins them into a single string.
    *
    * Tokens are joined with single spaces, then the whitespace in front of
    * `, . : ; ? ! " ( ) '` is removed. The removal applies to every such
    * mark, including opening quotes/parentheses and apostrophes, so the
    * result is not guaranteed to equal the originally encoded text
    * (e.g. "It's" comes back as "It' s").
    *
    * @param ids the token ids to decode
    * @return the reconstructed text
    * @throws NoSuchElementException if an id is not in the vocabulary
    */
  def decode(ids: Vector[Int]): String = {
    val joined = ids
      .map { id =>
        inverseVocabulary.getOrElse(
          id,
          throw new NoSuchElementException(s"id not in vocabulary: $id")
        )
      }
      .mkString(" ")
    spaceBeforePunctuation.replaceAllIn(joined, "$1")
  }
}

[36mvocabulary[39m: [32mMap[39m[[32mString[39m, [32mInt[39m] = [33mHashMap[39m(
  [32m"inevitable"[39m -> [32m571[39m,
  [32m"Monte"[39m -> [32m64[39m,
  [32m"down"[39m -> [32m362[39m,
  [32m"economy"[39m -> [32m377[39m,
  [32m"interesting"[39m -> [32m578[39m,
  [32m"luxury"[39m -> [32m652[39m,
  [32m"serious"[39m -> [32m870[39m,
  [32m"forgotten"[39m -> [32m463[39m,
  [32m"muscles"[39m -> [32m695[39m,
  [32m"beneath"[39m -> [32m215[39m,
  [32m"used"[39m -> [32m1057[39m,
  [32m"eye"[39m -> [32m415[39m,
  [32m"straining"[39m -> [32m934[39m,
  [32m"At"[39m -> [32m18[39m,
  [32m"hooded"[39m -> [32m554[39m,
  [32m"murmur"[39m -> [32m694[39m,
  [32m"adulation"[39m -> [32m133[39m,
  [32m"gloried"[39m -> [32m495[39m,
  [32m"widow"[39m -> [32m1102[39m,
  [32m"panel"[39m -> [32m752[39m,
  [32m"sitters"[39m -> [32m898[39m,
  [32m"quality"[39m -> [32m808[39m,
  [32m"On"[39m -> [32m75[39m,
  [32m

In [28]:
// Round-trip a short two-line excerpt through encode and decode.
val tokenizer: SimpleTokenizerV1 = new SimpleTokenizerV1(vocabulary)

val textToEncode: String = """It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""
val ids: Vector[Int] = tokenizer.encode(textToEncode)
val decodedText: String = tokenizer.decode(ids)

println(decodedText)

It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


[36mtokenizer[39m: [32mSimpleTokenizerV1[39m = ammonite.$sess.cmd27$Helper$SimpleTokenizerV1@23dc8abe
[36mtextToEncode[39m: [32mString[39m = [32m"""It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""[39m
[36mids[39m: [32mVector[39m[[32mInt[39m] = [33mVector[39m(
  [32m56[39m,
  [32m2[39m,
  [32m850[39m,
  [32m988[39m,
  [32m602[39m,
  [32m533[39m,
  [32m746[39m,
  [32m5[39m,
  [32m1126[39m,
  [32m596[39m,
  [32m5[39m,
  [32m1[39m,
  [32m67[39m,
  [32m7[39m,
  [32m38[39m,
  [32m851[39m,
  [32m1108[39m,
  [32m754[39m,
  [32m793[39m,
  [32m7[39m
)
[36mdecodedText[39m: [32mString[39m = [32m"It' s the last he painted, you know,\" Mrs. Gisburn said with pardonable pride."[39m