# Spylon Test

For XGBoost: Spark 2.4.5 only, with Scala 2.11.

Make sure the Spark nodes have libgomp installed first; it is required in order to train models successfully.

In [1]:
%%init_spark
# Maven coordinates of the XGBoost4J core and Spark-integration artifacts
# (the _2.11 builds, version 1.0.0) to make available to the session.
launcher.packages = ["ml.dmlc:xgboost4j_2.11:1.0.0", "ml.dmlc:xgboost4j-spark_2.11:1.0.0"]
# Spark master URL the session connects to.
launcher.master = "spark://spark-master:7077"
# Application name, set through the spark.app.name property.
launcher.conf.spark.app.name = "XGBoost Test App"
# Per-executor resources: 4 cores and 4g of memory.
launcher.conf.spark.executor.cores = 4
launcher.conf.spark.executor.memory = "4g"

In [2]:
// The import starts a Spark session on local.
// Need to change this so that we can create the session later.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

Intitializing Scala interpreter ...

Spark Web UI available at http://0fe5e027a5ec:4040
SparkContext available as 'sc' (version = 2.4.5, master = spark://spark-master:7077, app id = app-20200620142151-0003)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}


In [3]:
// Explicit schema for the iris CSV: four nullable double-valued measurement
// columns followed by a nullable string column holding the species label.
val schema = new StructType()
  .add("sepal_length", DoubleType, nullable = true)
  .add("sepal_width", DoubleType, nullable = true)
  .add("petal_length", DoubleType, nullable = true)
  .add("petal_width", DoubleType, nullable = true)
  .add("species", StringType, nullable = true)

schema: org.apache.spark.sql.types.StructType = StructType(StructField(sepal_length,DoubleType,true), StructField(sepal_width,DoubleType,true), StructField(petal_length,DoubleType,true), StructField(petal_width,DoubleType,true), StructField(species,StringType,true))


In [4]:
// Load the iris CSV with the explicit schema; the file's first line is a
// header row and is not read as data.
val rawInput = spark.read.
  schema(schema).
  option("header", "true").
  csv("/opt/spark-data/iris.csv")

rawInput: org.apache.spark.sql.DataFrame = [sepal_length: double, sepal_width: double ... 3 more fields]


In [5]:
// Pull the first six rows back to the driver as a quick sanity check of the parsed data.
rawInput.head(6)

res0: Array[org.apache.spark.sql.Row] = Array([5.1,3.5,1.4,0.2,setosa], [4.9,3.0,1.4,0.2,setosa], [4.7,3.2,1.3,0.2,setosa], [4.6,3.1,1.5,0.2,setosa], [5.0,3.6,1.4,0.2,setosa], [5.4,3.9,1.7,0.4,setosa])


In [6]:
import org.apache.spark.ml.feature.StringIndexer
// Fit an indexer that maps each "species" string to a numeric "classIndex" value.
val stringIndexer = new StringIndexer()
  .setInputCol("species")
  .setOutputCol("classIndex")
  .fit(rawInput)
// Append the index column, then discard the original string label.
val labelTransformed = stringIndexer
  .transform(rawInput)
  .drop("species")

import org.apache.spark.ml.feature.StringIndexer
stringIndexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_cb694769d9bb
labelTransformed: org.apache.spark.sql.DataFrame = [sepal_length: double, sepal_width: double ... 3 more fields]


In [7]:
import org.apache.spark.ml.feature.VectorAssembler
// Pack the four measurement columns into a single vector column named "features".
val vectorAssembler = new VectorAssembler()
  .setInputCols(Array("sepal_length", "sepal_width", "petal_length", "petal_width"))
  .setOutputCol("features")
// Keep only the feature vector and the indexed label for training.
val xgbInput = vectorAssembler
  .transform(labelTransformed)
  .select("features", "classIndex")

import org.apache.spark.ml.feature.VectorAssembler
vectorAssembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_287f3b2d2715
xgbInput: org.apache.spark.sql.DataFrame = [features: vector, classIndex: double]


In [8]:
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import ml.dmlc.xgboost4j.scala.spark.TrackerConf

import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import ml.dmlc.xgboost4j.scala.spark.TrackerConf


In [9]:
// Parameter map handed to the XGBoost classifier.
val xgbParam = Map(
  // Learning task.
  "objective" -> "multi:softprob",
  "num_class" -> 3,
  // Booster settings.
  "eta" -> 0.1f,
  "missing" -> -999,
  "num_round" -> 3,
  // Distributed training: worker count, plus a tracker using the "scala"
  // implementation with a timeout of 60 * 60 * 1000 ms (one hour).
  "num_workers" -> 3,
  "tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala")
)
// Classifier reading the assembled feature vector and the indexed label column.
val xgbClassifier = new XGBoostClassifier(xgbParam)
  .setFeaturesCol("features")
  .setLabelCol("classIndex")

xgbParam: scala.collection.immutable.Map[String,Any] = Map(num_workers -> 3, num_class -> 3, objective -> multi:softprob, num_round -> 3, missing -> -999, tracker_conf -> TrackerConf(3600000,scala), eta -> 0.1)
xgbClassifier: ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier = xgbc_05eebe01d4a3


In [None]:
// Train on the assembled input and bind the fitted model to a name so later
// cells can use it (e.g. for transform/prediction) instead of relying on an
// auto-numbered REPL result variable.
val xgbClassificationModel = xgbClassifier.fit(xgbInput)

[INFO] [06/20/2020 14:22:10.091] [RabitTracker-akka.actor.default-dispatcher-2] [akka://RabitTracker/user/Handler] Tracker listening @ 172.20.0.3:51166
[INFO] [06/20/2020 14:22:10.091] [RabitTracker-akka.actor.default-dispatcher-2] [akka://RabitTracker/user/Handler] Worker connection timeout is 1 hour.


In [None]:
// Uncomment to shut down the Spark session when finished:
// spark.stop()