In [14]:
%%configure -f
{"jars":["/user/livy/repl_jars/dl4j-assembly-0.6.0.jar"],"driverMemory":"3g","executorMemory":"2g","conf":{"spark.driver.extraClassPath":"/home/livy/dl4j-assembly-0.6.0.jar","spark.serializer":"org.apache.spark.serializer.KryoSerializer","spark.kryo.registrator":"org.nd4j.Nd4jRegistrator"}}

In [15]:
// Read the training TSV as an RDD with one String per line of the file.
val rawData: org.apache.spark.rdd.RDD[String] = sc.textFile("data/mls/ch05/train.tsv")

Creating SparkContext as 'sc'


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1476777146650_0020,spark,idle,Link,Link,✔


Creating HiveContext as 'sqlContext'
SparkContext and HiveContext created. Executing user code ...
rawData: org.apache.spark.rdd.RDD[String] = data/mls/ch05/train.tsv MapPartitionsRDD[1] at textFile at <console>:27

In [16]:
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Remove every line that mentions "hasDomainLink" (presumably just the header row).
val rawDataNoHeader = rawData.filter(!_.contains("hasDomainLink"))

// Break each remaining line into its tab-separated fields.
val records = rawDataNoHeader.map(_.split("\t"))

// Turn each record into a LabeledPoint:
//   - the last column is parsed as the integer class label,
//   - columns 4 .. n-2 become the numeric features, with "?" replaced by 0.0.
val data = records.map { fields =>
  // Strip the double quotes that wrap the raw field values.
  val unquoted = fields.map(_.replaceAll("\"", ""))
  val lastIdx = fields.length - 1
  val label = unquoted(lastIdx).toInt
  val numeric = unquoted.slice(4, lastIdx).map {
    case "?"   => 0.0
    case value => value.toDouble
  }
  LabeledPoint(label.toDouble, Vectors.dense(numeric))
}

// Mark the parsed points for caching, then force evaluation by counting them.
data.cache()
println(data.count())

7395

In [17]:
// Map each distinct raw value of column 3 (quotes left intact) to a dense integer index.
val categories = {
  val distinctValues = records.map(_(3)).distinct().collect()
  distinctValues.zipWithIndex.toMap
}
val numCategories = categories.size
println(categories)
println(numCategories)

// Build LabeledPoints whose feature vector is the 1-of-k encoding of column 3
// followed by the numeric columns 4 .. n-2 ("?" replaced by 0.0).
val dataCategories = records.map { fields =>
  val unquoted = fields.map(_.replaceAll("\"", ""))
  val lastIdx = fields.length - 1
  val label = unquoted(lastIdx).toInt
  // Look up with the raw (still quoted) field, which is how the keys of `categories` were built.
  val hotIdx = categories(fields(3))
  val oneHot = Array.tabulate(numCategories)(i => if (i == hotIdx) 1.0 else 0.0)
  val numeric = unquoted.slice(4, lastIdx).map {
    case "?"   => 0.0
    case value => value.toDouble
  }
  LabeledPoint(label.toDouble, Vectors.dense(oneHot ++ numeric))
}

import org.apache.spark.mllib.feature.StandardScaler
// Fit a StandardScaler (mean-centering and std-scaling both enabled) on the feature vectors,
// then rescale every point's features while leaving its label unchanged.
val scalerCats = {
  val featureVectors = dataCategories.map(_.features)
  new StandardScaler(withMean = true, withStd = true).fit(featureVectors)
}
val scaledDataCats = dataCategories.map { point =>
  LabeledPoint(point.label, scalerCats.transform(point.features))
}
// Category-only dataset: each point's features are just the 1-of-k encoding of column 3.
val dataNB = records.map { fields =>
  val unquoted = fields.map(_.replaceAll("\"", ""))
  val label = unquoted(fields.length - 1).toInt
  // Look up with the raw (still quoted) field, which is how the keys of `categories` were built.
  val hotIdx = categories(fields(3))
  val oneHot = Array.tabulate(numCategories)(i => if (i == hotIdx) 1.0 else 0.0)
  LabeledPoint(label.toDouble, Vectors.dense(oneHot))
}

dataNB: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[13] at map at <console>:40

In [18]:
// Randomly split the scaled points with weights 0.6 / 0.4 using a fixed seed,
// then name the two parts: the first is the training set, the second the test set.
val trainTestSplit = scaledDataCats.randomSplit(Array(0.6, 0.4), seed = 123L)
val Array(train, test) = trainTestSplit

test: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[15] at randomSplit at <console>:46

In [19]:
// Network shape.
val numInputs: Int = 36        // nIn of the first (dense) layer
val numHiddenNodes: Int = 30   // nOut of the dense layer and nIn of the output layer
val numOutputs: Int = 2        // nOut of the output layer

// Optimisation settings passed to the network configuration builder.
val seed: Int = 123
val iterations: Int = 10
val learningRate: Double = 0.005

// Training-loop settings.
val nEpochs: Int = 20            // number of fit calls made by the training loop
val batchSizePerWorker: Int = 16 // batch size handed to the training master and to evaluation

batchSizePerWorker: Int = 16

In [20]:
import org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator
import org.deeplearning4j.nn.api.OptimizationAlgorithm
import org.deeplearning4j.nn.conf.layers.{ DenseLayer, OutputLayer }
import org.deeplearning4j.nn.conf.{ NeuralNetConfiguration, Updater }
import org.deeplearning4j.nn.weights.WeightInit
import org.deeplearning4j.spark.api.{ Repartition, RepartitionStrategy }
import org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer
import org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster
import org.deeplearning4j.spark.stats.StatsUtils
import org.nd4j.linalg.dataset.DataSet
import org.nd4j.linalg.lossfunctions.LossFunctions._

// Configuration of a two-layer network: a ReLU dense layer feeding a softmax output layer,
// optimised with stochastic gradient descent and the Nesterov updater (momentum 0.9).
val nnconf = {
  // Layer 0: numInputs -> numHiddenNodes, Xavier weight init, ReLU activation.
  val hiddenLayer = new DenseLayer.Builder().
    nIn(numInputs).
    nOut(numHiddenNodes).
    weightInit(WeightInit.XAVIER).
    activation("relu").
    build()

  // Layer 1: numHiddenNodes -> numOutputs, softmax with negative log-likelihood loss.
  val outputLayer = new OutputLayer.Builder(LossFunction.NEGATIVELOGLIKELIHOOD).
    weightInit(WeightInit.XAVIER).
    activation("softmax").
    nIn(numHiddenNodes).nOut(numOutputs).
    build()

  new NeuralNetConfiguration.Builder().
    seed(seed).
    iterations(iterations).
    optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).
    learningRate(learningRate).
    updater(Updater.NESTEROVS).momentum(0.9).
    list().
    layer(0, hiddenLayer).
    layer(1, outputLayer).
    pretrain(false).backprop(true).build()
}


nnconf: org.deeplearning4j.nn.conf.MultiLayerConfiguration = 
{
  "backprop" : true,
  "backpropType" : "Standard",
  "confs" : [ {
    "extraArgs" : [ 0 ],
    "l1ByParam" : { },
    "l2ByParam" : { },
    "layer" : {
      "dense" : {
        "activationFunction" : "relu",
        "adamMeanDecay" : "NaN",
        "adamVarDecay" : "NaN",
        "biasInit" : 0.0,
        "biasL1" : 0.0,
        "biasL2" : 0.0,
        "biasLearningRate" : 0.005,
        "dist" : null,
        "dropOut" : 0.0,
        "epsilon" : "NaN",
        "gradientNormalization" : "None",
        "gradientNormalizationThreshold" : 1.0,
        "l1" : 0.0,
        "l2" : 0.0,
        "layerName" : null,
        "learningRate" : 0.005,
        "learningRateSchedule" : null,
        "momentum" : 0.9,
        "momentu...

In [21]:
// Training master that averages parameters across workers every 10 minibatches,
// keeps updater state, prefetches 2 batches per worker and always repartitions the data.
// NOTE(review): the constructor argument is reported as rddDataSetNumExamples in the builder's
// output; confirm that 16 matches the DataSet objects produced by fitLabeledPoint.
val tm = {
  val builder = new ParameterAveragingTrainingMaster.Builder(batchSizePerWorker)
  builder.
    averagingFrequency(10).
    saveUpdater(true).
    workerPrefetchNumBatches(2).
    batchSizePerWorker(batchSizePerWorker).
    repartionData(Repartition.Always).
    repartitionStrategy(RepartitionStrategy.SparkDefault).
    build()
}

tm: org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster = ParameterAveragingTrainingMaster(saveUpdater=true, numWorkers=null, rddDataSetNumExamples=16, batchSizePerWorker=16, averagingFrequency=10, prefetchNumBatches=2, collectTrainingStats=false, stats=null, listeners=null, iterationCount=0, repartition=Always, repartitionStrategy=SparkDefault, storageLevel=StorageLevel(false, true, false, false, 1), storageLevelStreams=StorageLevel(false, true, false, true, 1), rddTrainingApproach=Export, exportDirectory=null, rng=java.util.Random@f6f6e0e, lastExportedRDDId=-2147483648, lastRDDExportPath=null, trainingMasterUID=1476803550950_-3ad7dce)

In [22]:
// Combine the Spark context, the network configuration and the training master into a Spark network.
val sparkNet: SparkDl4jMultiLayer = new SparkDl4jMultiLayer(sc, nnconf, tm)

sparkNet: org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer = org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer@5dfb69b1

In [23]:
// Call fitLabeledPoint on the training set nEpochs times; the value returned by each call is discarded.
(0 until nEpochs).foreach { _ =>
  sparkNet.fitLabeledPoint(train)
}

In [24]:
import org.deeplearning4j.spark.util._

// Convert the held-out LabeledPoints into batched DataSets and evaluate the trained network on them.
// The number of possible labels is taken from numOutputs (instead of a repeated literal 2) so that
// evaluation always agrees with the width of the network's output layer.
val evaluation = sparkNet.evaluate(MLLibUtil.fromLabeledPoint(test, numOutputs, batchSizePerWorker))
// Print the confusion counts and the accuracy / precision / recall / F1 summary.
println(evaluation.stats())

Examples labeled as 0 classified by model as 0: 884 times
Examples labeled as 0 classified by model as 1: 600 times
Examples labeled as 1 classified by model as 0: 401 times
Examples labeled as 1 classified by model as 1: 1085 times


 Accuracy:  0.663
 Precision: 0.6659
 Recall:    0.6629
 F1 Score:  0.6644