In [1]:
import $file.^.Magic

[32mimport [39m[36m$file.$[39m

In [2]:
// Name of the archive as published by the UCI repository; it is also the local file name after download.
val zipName: String = "sms+spam+collection.zip"

// Full download URL: the repository's static path for this dataset followed by the archive name.
val datasetUrl: String = s"https://archive.ics.uci.edu/static/public/228/${zipName}"

// Local directory that receives both the downloaded archive and its extracted contents.
val outputDir: String = "data/sms-spam-raw"

[36mzipName[39m: [32mString[39m = [32m"sms+spam+collection.zip"[39m
[36msmsSpamUrl[39m: [32mString[39m = [32m"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"[39m
[36moutputDir[39m: [32mString[39m = [32m"data/sms-spam-raw"[39m

In [3]:
// Download the dataset archive into outputDir (created on demand by --create-dirs).
// --fail makes curl report an HTTP error instead of saving the server's error page under
// the archive's name, and --location follows redirects in case the archive URL moves.
Magic.!("curl", "--fail", "--location", "--create-dirs", "-O", "--output-dir", outputDir, datasetUrl)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 36789    0 36789    0     0  48504      0 --:--:-- --:--:-- --:--:-- 48470
100  198k    0  198k    0     0   176k      0 --:--:--  0:00:01 --:--:--  176k


In [4]:
// Extract the archive into the same directory it was downloaded to.
// -o overwrites already-extracted files without asking, so re-running this cell never
// reaches unzip's interactive "replace?" prompt.
Magic.!("unzip", "-o", s"$outputDir/$zipName", "-d", outputDir)

Archive:  data/sms-spam-raw/sms+spam+collection.zip
  inflating: data/sms-spam-raw/SMSSpamCollection  
  inflating: data/sms-spam-raw/readme  


In [17]:
import scala.io.Source

// Read the whole dataset file into memory. Using.resource closes the Source once the text
// has been read, and the charset is pinned to UTF-8 instead of the JVM's platform default
// so non-ASCII characters in the messages decode the same way on every machine.
val datasetRaw: String =
  scala.util.Using.resource(Source.fromFile(s"$outputDir/SMSSpamCollection", "UTF-8"))(_.mkString)

/**
 * A single labelled message from the SMS Spam Collection file.
 *
 * @param text   the message body (everything after the label and the tab)
 * @param isSpam `true` for lines labelled "spam", `false` for lines labelled "ham"
 */
case class SmsSpamRecord(text: String, isSpam: Boolean)

/** A collection of labelled messages. */
type Dataset = Vector[SmsSpamRecord]

/**
 * Parses the raw contents of the SMS Spam Collection file.
 *
 * Every non-blank line must consist of a label (`spam` or `ham`), a tab character and the
 * message text. A trailing carriage return left over from CRLF line endings is removed,
 * and blank lines are skipped.
 *
 * @param raw the full text of the dataset file
 * @return one record per non-blank line, in file order
 * @throws IllegalArgumentException if a non-blank line does not start with a known label
 *                                  followed by a tab; the message names the offending line
 */
def parseSmsSpamRecords(raw: String): Dataset =
  raw
    .split("\n")
    .iterator
    .map(_.stripSuffix("\r"))
    .zipWithIndex
    .filter { case (line, _) => line.trim.nonEmpty }
    .map {
      case (s"spam\t$text", _) => SmsSpamRecord(text, isSpam = true)
      case (s"ham\t$text", _)  => SmsSpamRecord(text, isSpam = false)
      case (line, index) =>
        // Fail loudly with context instead of a bare MatchError on unexpected input.
        throw new IllegalArgumentException(
          s"Line ${index + 1} is not of the form 'spam<TAB>text' or 'ham<TAB>text': $line"
        )
    }
    .toVector

val smsSpamRecords: Dataset = parseSmsSpamRecords(datasetRaw)

// Split by label so the class sizes can be compared.
val (spamRecords, notSpamRecords) = smsSpamRecords.partition(_.isSpam)
println(s"Spam count: ${spamRecords.size}")
println(s"Not spam count: ${notSpamRecords.size}")

Spam count: 747
Not spam count: 4827


[32mimport [39m[36mscala.io.Source[39m
[36mdatasetRaw[39m: [32mString[39m = [32m"""ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea

In [18]:
import scala.collection.mutable
import scala.util.Random

/**
 * Class-balanced dataset: every record of the smaller class, followed by a random sample of
 * the larger class containing at most as many records as the smaller class has.
 */
val balancedDataset: Dataset = {

  /**
   * Draws up to `targetSize` records with pairwise distinct texts, uniformly at random.
   *
   * If `records` holds fewer than `targetSize` distinct texts, all of them are returned
   * rather than retrying forever in search of records that do not exist.
   *
   * @param records    pool to sample from; may contain repeated texts
   * @param targetSize maximum number of records to return
   * @return the sampled records, in random order
   */
  def sample(records: Vector[SmsSpamRecord], targetSize: Int): Vector[SmsSpamRecord] =
    Random.shuffle(records.distinctBy(_.text)).take(targetSize)

  if (spamRecords.size < notSpamRecords.size)
    spamRecords ++ sample(notSpamRecords, targetSize = spamRecords.size)
  else
    notSpamRecords ++ sample(spamRecords, targetSize = notSpamRecords.size)
}

[32mimport [39m[36mscala.collection.mutable[39m
[32mimport [39m[36mscala.util.Random[39m
[36mbalancedDataset[39m: [32mDataset[39m = [33mVector[39m(
  [33mSmsSpamRecord[39m(
    text = [32m"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"[39m,
    isSpam = [32mtrue[39m
  ),
  [33mSmsSpamRecord[39m(
    text = [32m"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"[39m,
    isSpam = [32mtrue[39m
  ),
  [33mSmsSpamRecord[39m(
    text = [32m"WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."[39m,
    isSpam = [32mtrue[39m
  ),
  [33mSmsSpamRecord[39m(
    text = [32m"Had your mobile 11 months or more? U R entitled to Update to the latest colour m

In [22]:
/** The training portion of a split dataset. */
type Training = Dataset
/** The validation portion of a split dataset. */
type Validation = Dataset
/** The test portion of a split dataset. */
type Test = Dataset

/**
 * Shuffles `dataset` and cuts it into training, validation and test portions.
 *
 * The training and validation sizes are the dataset size multiplied by the respective
 * fraction, rounded down; the test portion receives every remaining record.
 *
 * @param dataset            records to split
 * @param trainingFraction   share of records for training, between 0 and 1
 * @param validationFraction share of records for validation, between 0 and 1
 * @return the `(training, validation, test)` portions, which together contain every record once
 * @throws IllegalArgumentException if a fraction is negative (or NaN) or the two sum to more than 1
 */
def randomSplit(dataset: Vector[SmsSpamRecord], trainingFraction: Double, validationFraction: Double): (Training, Validation, Test) = {
  require(
    trainingFraction >= 0.0 && validationFraction >= 0.0,
    s"fractions must be non-negative, got training=$trainingFraction, validation=$validationFraction"
  )
  // Small tolerance so that pairs such as 0.7 + 0.3 are not rejected over floating-point rounding.
  require(
    trainingFraction + validationFraction <= 1.0 + 1e-9,
    s"fractions must sum to at most 1, got training=$trainingFraction, validation=$validationFraction"
  )

  val shuffledDataset = Random.shuffle(dataset)
  val trainingSize = (shuffledDataset.size * trainingFraction).floor.toInt
  val validationSize = (shuffledDataset.size * validationFraction).floor.toInt

  val (training, remainingRecords) = shuffledDataset.splitAt(trainingSize)
  val (validation, test) = remainingRecords.splitAt(validationSize)
  (training, validation, test)
}

// 70% of the balanced records go to training and 10% to validation;
// whatever remains after those two cuts becomes the test set.
val (training, validation, test) =
  randomSplit(
    balancedDataset,
    trainingFraction = 0.7,
    validationFraction = 0.1
  )

defined [32mtype[39m [36mTraining[39m
defined [32mtype[39m [36mValidation[39m
defined [32mtype[39m [36mTest[39m
defined [32mfunction[39m [36mrandomSplit[39m
[36mtraining[39m: [32mTraining[39m = [33mVector[39m(
  [33mSmsSpamRecord[39m(
    text = [32m"* FREE* POLYPHONIC RINGTONE Text SUPER to 87131 to get your FREE POLY TONE of the week now! 16 SN PoBox202 NR31 7ZS subscription 450pw"[39m,
    isSpam = [32mtrue[39m
  ),
  [33mSmsSpamRecord[39m(text = [32m"Do you know when the result."[39m, isSpam = [32mfalse[39m),
  [33mSmsSpamRecord[39m(text = [32m"Aight, we'll head out in a few"[39m, isSpam = [32mfalse[39m),
  [33mSmsSpamRecord[39m(
    text = [32m"Hi, Mobile no.  &lt;#&gt;  has added you in their contact list on www.fullonsms.com It s a great place to send free sms to people For more visit fullonsms.com"[39m,
    isSpam = [32mfalse[39m
  ),
  [33mSmsSpamRecord[39m(text = [32m"You will go to walmart. I.ll stay."[39m, isSpam = [32mfals

In [23]:
import $ivy.`com.github.tototoshi::scala-csv:2.0.0`

import scala.util.Using
import com.github.tototoshi.csv.CSVWriter

/**
 * Writes `dataset` to a CSV file at `path`.
 *
 * The first row is the header `Text,Label`; each following row holds a record's text and
 * its label, encoded as "1" for spam and "0" otherwise.
 *
 * @param path    destination file
 * @param dataset records to write, one row each
 */
def writeToCsv(path: String, dataset: Dataset): Unit = {
  val headerRow = Vector("Text", "Label")

  val recordRows = dataset.map { record =>
    val label = if (record.isSpam) "1" else "0"
    Vector(record.text, label)
  }

  // Using.resource closes the writer when the block finishes, even if writing throws.
  Using.resource(CSVWriter.open(path)) { csv =>
    csv.writeAll(headerRow +: recordRows)
  }
}

// Persist each split as its own CSV file, in the same order as before.
Seq(
  "data/training.csv" -> training,
  "data/validation.csv" -> validation,
  "data/test.csv" -> test
).foreach { case (path, split) => writeToCsv(path, split) }

[32mimport [39m[36m$ivy.$[39m
[32mimport [39m[36mscala.util.Using[39m
[32mimport [39m[36mcom.github.tototoshi.csv.CSVWriter[39m
defined [32mfunction[39m [36mwriteToCsv[39m