In [1]:
%classpath add mvn com.salesforce.transmogrifai transmogrifai-core_2.11 0.7.0

In [2]:
%classpath add mvn org.apache.spark spark-mllib_2.11 2.4.5

In [3]:
/**
 * One record of the Iris data set as read from the CSV file.
 *
 * @param sepalLength sepal length measurement
 * @param sepalWidth  sepal width measurement
 * @param petalLength petal length measurement
 * @param petalWidth  petal width measurement
 * @param irisClass   class label of the record, kept as raw text
 */
case class Iris(
  sepalLength: Double,
  sepalWidth: Double,
  petalLength: Double,
  petalWidth: Double,
  irisClass: String
)

defined class Iris


In [4]:
import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

// Raw feature definitions extracted from the Iris case class: the four numeric
// measurements become Real predictors and the class label becomes the Text response.
// NOTE(review): the Feature printed for irisClass carries `name = irisClass`, i.e. the
// feature name matches the val name. Presumably it is derived from the val, so keep
// these val names stable unless the feature names are meant to change.
val sepalLength = FeatureBuilder.Real[Iris].extract(_.sepalLength.toReal).asPredictor
val sepalWidth = FeatureBuilder.Real[Iris].extract(_.sepalWidth.toReal).asPredictor
val petalLength = FeatureBuilder.Real[Iris].extract(_.petalLength.toReal).asPredictor
val petalWidth = FeatureBuilder.Real[Iris].extract(_.petalWidth.toReal).asPredictor
// The only response feature; every other feature above is a predictor.
val irisClass = FeatureBuilder.Text[Iris].extract(_.irisClass.toText).asResponse

Feature(name = irisClass, uid = Text_000000000005, isResponse = true, originStage = FeatureGeneratorStage_000000000005, parents = [], distributions = [])

In [5]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.udf

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.udf


In [6]:
// Spark configuration for this notebook: in-process master, app name matching the
// Iris example (the previous "TitanicPrediction" label was a copy-paste leftover).
val conf = new SparkConf().setMaster("local[*]").setAppName("IrisPrediction")

// Implicit session picked up by the later cells that need one. The type is spelled out
// so implicit resolution does not rely on the inferred type.
implicit val spark: SparkSession = SparkSession.builder.config(conf).getOrCreate()

org.apache.spark.sql.SparkSession@4b7715f2

In [7]:
import com.salesforce.op._
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.DataReaders
import com.salesforce.op.stages.impl.classification.MultiClassificationModelSelector
import com.salesforce.op.stages.impl.tuning.DataCutter
import org.apache.spark.sql.Encoders

import com.salesforce.op._
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.DataReaders
import com.salesforce.op.stages.impl.classification.MultiClassificationModelSelector
import com.salesforce.op.stages.impl.tuning.DataCutter
import org.apache.spark.sql.Encoders


In [8]:
// Spark encoder for Iris rows. Implicit definitions get an explicit type so that
// resolution does not depend on type inference; the fully qualified name avoids
// needing an extra import in this cell.
implicit val irisEncoder: org.apache.spark.sql.Encoder[Iris] = Encoders.product[Iris]

class[sepalLength[0]: double, sepalWidth[0]: double, petalLength[0]: double, petalWidth[0]: double, irisClass[0]: string]

In [9]:
// CSV reader for Iris rows, created without a path argument.
// NOTE(review): no later cell visible here references irisReader (the workflow is wired
// to trainDataReader instead) — confirm whether this definition is still needed.
val irisReader = DataReaders.Simple.csvCase[Iris]()

com.salesforce.op.readers.CSVProductReader@7aa785f9

In [10]:
// Response column: the Text class label passed through indexed(); later cell output
// lists the resulting column as a RealNN feature.
val labels = irisClass.indexed()
// Predictor column: the four measurements combined by transmogrify() into a single
// feature (printed below as an OPVector).
val features = Seq(sepalLength, sepalWidth, petalLength, petalWidth).transmogrify()


Feature(name = petalLength-petalWidth-sepalLength-sepalWidth_2-stagesApplied_OPVector_000000000008, uid = OPVector_000000000008, isResponse = false, originStage = VectorsCombiner_000000000008, parents = [OPVector_000000000007], distributions = [])

In [11]:
// Single seed value, reused by the cutter here and by the model selector cell.
val randomSeed = 42L
// Data cutter configured with reserveTestFraction = 0.2 and the shared seed; it is
// handed to the model selector as its splitter.
val cutter = DataCutter(reserveTestFraction = 0.2, seed = randomSeed)

DataCutter_000000000009

In [12]:
// Multi-class model selector using cross-validation. The cutter is supplied as the
// splitter and the shared seed is reused; the selector's output feature is the prediction.
val prediction = MultiClassificationModelSelector
  .withCrossValidation(
    splitter = Some(cutter),
    seed = randomSeed
  )
  .setInput(labels, features)
  .getOutput()

Feature(name = irisClass-petalLength-petalWidth-sepalLength-sepalWidth_4-stagesApplied_Prediction_000000000011, uid = Prediction_000000000011, isResponse = true, originStage = ModelSelector_000000000011, parents = [RealNN_000000000006,OPVector_000000000008], distributions = [])

In [13]:
// Multi-class F1 evaluator wired to the indexed label column and the selector's prediction.
val evaluator = Evaluators.MultiClassification
  .f1()
  .setLabelCol(labels)
  .setPredictionCol(prediction)

OpMultiClassificationEvaluator_000000000012

In [14]:
// The implicit `spark` session created in an earlier cell is reused here. Re-declaring it
// in this cell only shadowed that binding with the result of the same getOrCreate() call.
import spark.implicits._ // Spark SQL implicits (encoders) used with the Iris case class
import com.salesforce.op.readers.DataReaders

// NOTE(review): absolute path specific to the BeakerX environment — adjust when the
// notebook is run elsewhere.
val trainFilePath = "/home/beakerx/helloworld/src/main/resources/IrisDataset/iris.data"

// CSV reader that parses the training file into Iris instances. Iris has no `id` field,
// so no custom key function is supplied.
val trainDataReader = DataReaders.Simple.csvCase[Iris](
  path = Option(trainFilePath)
)

org.apache.spark.sql.SparkSession$implicits$@36575b59

In [15]:
// Workflow producing the prediction and label features, fed by the training CSV reader.
val workflow = new OpWorkflow()
  .setResultFeatures(prediction, labels)
  .setReader(trainDataReader)

com.salesforce.op.OpWorkflow@3021d783

In [16]:
// Train the workflow, then print the formatted summary of the fitted model.
val fittedWorkflow = workflow.train()
println(s"Summary:\n${fittedWorkflow.summaryPretty()}")

Summary:
Evaluated OpLogisticRegression, OpRandomForestClassifier models using Cross Validation and error metric.
Evaluated 8 OpLogisticRegression models with error metric between [0.047397682477246225, 0.08995087396660793].
Evaluated 18 OpRandomForestClassifier models with error metric between [0.06100312465411695, 0.6725498710122346].
+--------------------------------------------------------+
|         Selected Model - OpLogisticRegression          |
+--------------------------------------------------------+
| Model Param      | Value                               |
+------------------+-------------------------------------+
| aggregationDepth | 2                                   |
| elasticNetParam  | 0.1                                 |
| family           | auto                                |
| fitIntercept     | true                                |
| maxIter          | 50                                  |
| modelType        | OpLogisticRegression                |
| name      

null

In [17]:
// Score with the fitted workflow and evaluate using the F1 evaluator defined earlier.
println("Scoring the model:\n=================")
val (dataframe, metrics) = fittedWorkflow.scoreAndEvaluate(evaluator = evaluator)

// List every column of the scored dataframe, one per line.
println("Transformed dataframe columns:\n--------------------------")
for (columnName <- dataframe.columns) println(columnName)

// Print the evaluation metrics returned alongside the dataframe.
println("Metrics:\n------------")
println(metrics)

Scoring the model:
Transformed dataframe columns:
--------------------------
key
irisClass_1-stagesApplied_RealNN_000000000006
irisClass-petalLength-petalWidth-sepalLength-sepalWidth_4-stagesApplied_Prediction_000000000011
Metrics:
------------
{
  "Precision" : 0.9600000000000001,
  "Recall" : 0.9600000000000001,
  "F1" : 0.9600000000000001,
  "Error" : 0.040000000000000036,
  "ThresholdMetrics" : {
    "topNs" : [ 1, 3 ],
    "thresholds" : [ 0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.9

null