In [1]:
%%configure -f
{"jars":["/user/livy/repl_jars/dl4j-assembly-0.6.0.jar"],"driverMemory":"3g","executorMemory":"2g","conf":{"spark.driver.extraClassPath":"/home/livy/dl4j-assembly-0.6.0.jar","spark.serializer":"org.apache.spark.serializer.KryoSerializer","spark.kryo.registrator":"org.nd4j.Nd4jRegistrator"}}

In [2]:
val rawData = sc.textFile("data/mls/ch05/train.tsv")

Creating SparkContext as 'sc'


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
4,application_1476777146650_0012,spark,idle,Link,Link,✔


Creating HiveContext as 'sqlContext'
SparkContext and HiveContext created. Executing user code ...
rawData: org.apache.spark.rdd.RDD[String] = data/mls/ch05/train.tsv MapPartitionsRDD[1] at textFile at <console>:27

In [3]:
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

val rawDataNoHeader = rawData.filter(line => !line.contains("hasDomainLink"))

val records = rawDataNoHeader.map(line => line.split("\t"))

val data = records.map { r =>
  val trimmed = r.map(_.replaceAll("\"", ""))
  val label = trimmed(r.size - 1).toInt
  val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
  LabeledPoint(label.toDouble, Vectors.dense(features))
}

data.cache
println(data.count)

7395

In [4]:
val trainTestSplit = data.randomSplit(Array(0.6, 0.4), 123)
val train = trainTestSplit(0)
val test = trainTestSplit(1)

test: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[6] at randomSplit at <console>:37

In [5]:
val iterations = 10
val seed = 123
val listenerFreq = iterations/5
val learningRate = 0.005
val nEpochs = 10

val numInputs = 22
val numOutputs = 2
val numHiddenNodes = 20
val batchSizePerWorker = 16

batchSizePerWorker: Int = 16

In [6]:
import org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator
import org.deeplearning4j.nn.api.OptimizationAlgorithm
import org.deeplearning4j.nn.conf.layers.{ DenseLayer, OutputLayer }
import org.deeplearning4j.nn.conf.{ NeuralNetConfiguration, Updater }
import org.deeplearning4j.nn.weights.WeightInit
import org.deeplearning4j.spark.api.{ Repartition, RepartitionStrategy }
import org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer
import org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster
import org.deeplearning4j.spark.stats.StatsUtils
import org.nd4j.linalg.dataset.DataSet
import org.nd4j.linalg.lossfunctions.LossFunctions._

val nnconf = new NeuralNetConfiguration.Builder().
                seed(seed).
                iterations(iterations).
                optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).
                learningRate(learningRate).
                updater(Updater.NESTEROVS).momentum(0.9).
                list().
                layer(0, new DenseLayer.Builder().
                               nIn(numInputs).
                               nOut(numHiddenNodes).
                               weightInit(WeightInit.XAVIER).
                               activation("relu").
                               build()).
                layer(1, new OutputLayer.Builder(LossFunction.NEGATIVELOGLIKELIHOOD).
                               weightInit(WeightInit.XAVIER).
                               activation("softmax").
                               nIn(numHiddenNodes).nOut(numOutputs).
                               build()).
                pretrain(false).backprop(true).build()


nnconf: org.deeplearning4j.nn.conf.MultiLayerConfiguration = 
{
  "backprop" : true,
  "backpropType" : "Standard",
  "confs" : [ {
    "extraArgs" : [ 0 ],
    "l1ByParam" : { },
    "l2ByParam" : { },
    "layer" : {
      "dense" : {
        "activationFunction" : "relu",
        "adamMeanDecay" : "NaN",
        "adamVarDecay" : "NaN",
        "biasInit" : 0.0,
        "biasL1" : 0.0,
        "biasL2" : 0.0,
        "biasLearningRate" : 0.005,
        "dist" : null,
        "dropOut" : 0.0,
        "epsilon" : "NaN",
        "gradientNormalization" : "None",
        "gradientNormalizationThreshold" : 1.0,
        "l1" : 0.0,
        "l2" : 0.0,
        "layerName" : null,
        "learningRate" : 0.005,
        "learningRateSchedule" : null,
        "momentum" : 0.9,
        "momentu...

In [7]:
val tm = new ParameterAveragingTrainingMaster.Builder(batchSizePerWorker).
    averagingFrequency(10).
    saveUpdater(true).
    workerPrefetchNumBatches(2).
    batchSizePerWorker(batchSizePerWorker).
    repartionData(Repartition.Always).
    repartitionStrategy(RepartitionStrategy.SparkDefault).
    build()

tm: org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster = ParameterAveragingTrainingMaster(saveUpdater=true, numWorkers=null, rddDataSetNumExamples=16, batchSizePerWorker=16, averagingFrequency=10, prefetchNumBatches=2, collectTrainingStats=false, stats=null, listeners=null, iterationCount=0, repartition=Always, repartitionStrategy=SparkDefault, storageLevel=StorageLevel(false, true, false, false, 1), storageLevelStreams=StorageLevel(false, true, false, true, 1), rddTrainingApproach=Export, exportDirectory=null, rng=java.util.Random@31e6a053, lastExportedRDDId=-2147483648, lastRDDExportPath=null, trainingMasterUID=1476788962269_-438f453)

In [8]:
val sparkNet = new SparkDl4jMultiLayer(sc, nnconf, tm)

sparkNet: org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer = org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer@684e5144

In [9]:
for (i <- 0 until nEpochs) {
    val _ = sparkNet.fitLabeledPoint(train)
}

In [12]:
import org.deeplearning4j.spark.util._

val evaluation = sparkNet.evaluate(MLLibUtil.fromLabeledPoint(test, 2, batchSizePerWorker))
println(evaluation.stats())

Examples labeled as 0 classified by model as 1: 1484 times
Examples labeled as 1 classified by model as 1: 1486 times


 Accuracy:  0.5003
 Precision: 0.5003
 Recall:    0.5
 F1 Score:  0.5002

In [13]:
train.take(10).foreach(println(_))

(0.0,[0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])
(0.0,[0.719157,2.676470588,0.5,0.222222222,0.12345679,0.043209877,0.446143274,0.0,0.0,0.024908425,0.0,0.228887247,0.050473186,1.0,1.0,14.0,0.0,12032.0,162.0,10.0,0.098765432,0.082568807])
(0.0,[0.0,119.0,0.745454545,0.581818182,0.290909091,0.018181818,0.434639175,0.0,0.0,0.01984127,0.0,0.298299595,0.038636364,0.0,0.0,12.0,0.0,4368.0,55.0,3.0,0.054545455,0.087356322])
(1.0,[0.22111,0.773809524,0.215053763,0.053763441,0.043010753,0.043010753,0.579596413,0.0,0.0,0.039568345,0.0,0.218978009,0.311377246,1.0,0.0,21.0,0.0,1287.0,93.0,3.0,0.548387097,0.064327485])
(0.0,[0.0,1.883333333,0.71969697,0.265151515,0.113636364,0.015151515,0.49934811,0.0,0.0,0.02661597,0.0,0.173745927,0.025830258,0.0,0.0,5.0,0.0,27656.0,132.0,4.0,0.068181818,0.148550725])
(1.0,[0.0,2.41011236,0.469325153,0.101226994,0.018404908

In [14]:
test.take(10).foreach(println(_))

(1.0,[0.574147,3.677966102,0.50802139,0.288770053,0.213903743,0.144385027,0.468648998,0.0,0.0,0.098707403,0.0,0.203489628,0.088652482,1.0,1.0,40.0,0.0,4973.0,187.0,9.0,0.181818182,0.125448029])
(1.0,[0.996526,2.382882883,0.562015504,0.321705426,0.120155039,0.042635659,0.525448029,0.0,0.0,0.072447859,0.0,0.22640177,0.120535714,1.0,1.0,55.0,0.0,2240.0,258.0,11.0,0.166666667,0.057613169])
(1.0,[0.801248,1.543103448,0.4,0.1,0.016666667,0.0,0.480724749,0.0,0.0,0.095860566,0.0,0.265655744,0.035343035,1.0,0.0,24.0,0.0,2737.0,120.0,5.0,0.041666667,0.100858369])
(1.0,[0.0,0.471502591,0.190721649,0.036082474,0.0,0.0,0.383199079,0.0,0.0,0.021705426,0.0,0.11496229,1.136645963,1.0,0.0,17.0,0.0,2471.0,194.0,7.0,0.644329897,0.125])
(0.0,[0.816604,2.506527415,0.637755102,0.293367347,0.091836735,0.048469388,0.592321755,0.0,0.0,0.056497175,0.0,0.223003543,0.511363636,1.0,1.0,53.0,0.0,4401.0,392.0,0.0,0.160714286,0.073684211])
(0.0,[0.548963,0.990430622,0.522522523,0.108108108,0.009009009,0.0,0.414154653