In [1]:
%classpath add mvn com.salesforce.transmogrifai transmogrifai-core_2.11 0.7.0

In [2]:
%classpath add mvn org.apache.spark spark-mllib_2.11 2.4.5

In [3]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.udf

import com.salesforce.op._
import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.evaluators.Evaluators

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.udf
import com.salesforce.op._
import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.evaluators.Evaluators


In [4]:
import com.salesforce.op.OpWorkflow
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.DataReaders

import com.salesforce.op.OpWorkflow
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.DataReaders


In [5]:
// Spark configuration for this notebook: master "local[*]", application name
// "HousingPricesPrediction".
val conf: SparkConf = new SparkConf()
  .setMaster("local[*]")
  .setAppName("HousingPricesPrediction")

// Declared implicit, presumably so later calls that take an implicit
// SparkSession resolve it -- confirm against the TransmogrifAI API in use.
// The explicit type annotation keeps implicit resolution independent of inference.
implicit val spark: SparkSession = SparkSession.builder.config(conf).getOrCreate()

org.apache.spark.sql.SparkSession@1de3380f

In [6]:
/**
 * One record of the housing-prices data set used by this notebook.
 *
 * The two `Integer` fields are boxed `java.lang.Integer` values rather than
 * Scala `Int`.
 *
 * @param lotFrontage   lot frontage, as a floating-point number
 * @param area          area, as a boxed integer
 * @param lotShape      lot-shape code (e.g. "IR1")
 * @param yrSold        year sold, as a boxed integer
 * @param saleType      sale-type code
 * @param saleCondition sale-condition code
 * @param salePrice     sale price; used as the response variable
 */
case class HousingPrices(
  lotFrontage:   Double,
  area:          Integer,
  lotShape:      String,
  yrSold:        Integer,
  saleType:      String,
  saleCondition: String,
  salePrice:     Double
)

defined class HousingPrices


In [7]:
import org.apache.spark.sql.{Encoder, Encoders}

// Implicit Spark encoder for the HousingPrices case class, derived from its
// product (field) structure. The type is spelled out explicitly, as is
// recommended for every implicit definition.
implicit val srEncoder: Encoder[HousingPrices] = Encoders.product[HousingPrices]

class[lotFrontage[0]: double, area[0]: int, lotShape[0]: string, yrSold[0]: int, saleType[0]: string, saleCondition[0]: string, salePrice[0]: double]

In [8]:
// Numeric predictors taken directly from the record fields.
// Each feature must stay directly assigned to its val: the cell output shows
// the feature name matching the val name.
val lotFrontage = FeatureBuilder
  .Real[HousingPrices]
  .extract(house => house.lotFrontage.toReal)
  .asPredictor

val area = FeatureBuilder
  .Integral[HousingPrices]
  .extract(house => house.area.toIntegral)
  .asPredictor

Feature(name = area, uid = Integral_000000000002, isResponse = false, originStage = FeatureGeneratorStage_000000000002, parents = [], distributions = [])

In [9]:
// Indicator-style predictor: 1 when the lot shape is exactly "IR1", 0 for any
// other value (including a missing one).
val lotShape = FeatureBuilder
  .Integral[HousingPrices]
  .extract(house => (if (house.lotShape == "IR1") 1 else 0).toIntegral)
  .asPredictor

Feature(name = lotShape, uid = Integral_000000000003, isResponse = false, originStage = FeatureGeneratorStage_000000000003, parents = [], distributions = [])

In [10]:
// Year of sale as an integral predictor.
val yrSold = FeatureBuilder
  .Integral[HousingPrices]
  .extract(_.yrSold.toIntegral)
  .asPredictor

Feature(name = yrSold, uid = Integral_000000000004, isResponse = false, originStage = FeatureGeneratorStage_000000000004, parents = [], distributions = [])

In [11]:
// Sale type is read as text and then passed through indexed(); the cell output
// shows the resulting feature is a RealNN produced by a string-indexer stage.
val saleType = FeatureBuilder
  .Text[HousingPrices]
  .extract(_.saleType.toText)
  .asPredictor
  .indexed()

Feature(name = saleType_1-stagesApplied_RealNN_000000000006, uid = RealNN_000000000006, isResponse = false, originStage = OpStringIndexerNoFilter_000000000006, parents = [Text_000000000005], distributions = [])

In [12]:
// Sale condition, handled the same way as a text predictor followed by indexed().
val saleCondition = FeatureBuilder
  .Text[HousingPrices]
  .extract(_.saleCondition.toText)
  .asPredictor
  .indexed()

Feature(name = saleCondition_1-stagesApplied_RealNN_000000000008, uid = RealNN_000000000008, isResponse = false, originStage = OpStringIndexerNoFilter_000000000008, parents = [Text_000000000007], distributions = [])

In [13]:
// Response (label) feature: the sale price as a non-nullable real.
val salePrice = FeatureBuilder
  .RealNN[HousingPrices]
  .extract(_.salePrice.toRealNN)
  .asResponse

Feature(name = salePrice, uid = RealNN_000000000009, isResponse = true, originStage = FeatureGeneratorStage_000000000009, parents = [], distributions = [])

In [14]:
 // Absolute path of the training CSV file read by the data reader.
 val trainFilePath: String =
   "/home/beakerx/helloworld/src/main/resources/HousingPricesDataset/train_lf_la_ls_ys_st_sc.csv"

/home/beakerx/helloworld/src/main/resources/HousingPricesDataset/train_lf_la_ls_ys_st_sc.csv

Create a training data reader for the CSV file at `trainFilePath` using `DataReaders.Simple.csvCase`.

In [15]:
// CSV reader that yields HousingPrices records from the training file.
// Option(...) rather than Some(...) so a null path would become None.
val trainDataReader =
  DataReaders.Simple.csvCase[HousingPrices](path = Option(trainFilePath))

com.salesforce.op.readers.CSVProductReader@11173c5f

In [16]:
import com.salesforce.op.stages.impl.tuning.{DataCutter, DataSplitter}
// Combine all predictors into a single feature vector.
// NOTE: keep the statement order -- stage UIDs appear to be assigned
// sequentially (see the cell outputs), so reordering would rename stages.
val features = Seq(
  lotFrontage,
  area,
  lotShape,
  yrSold,
  saleType,
  saleCondition
).transmogrify()

// Fixed seed shared by the splitter and the model selector for repeatability.
val randomSeed: Long = 42L
val splitter = DataSplitter(seed = randomSeed)

DataSplitter_00000000000e

In [17]:
import com.salesforce.op.stages.impl.regression.RegressionModelSelector
import com.salesforce.op.stages.impl.regression.RegressionModelsToTry.{OpGBTRegressor, OpRandomForestRegressor}

// Cross-validated model selection over gradient-boosted trees and random
// forest regressors; the label is salePrice and the input is the feature vector.
val prediction1 = RegressionModelSelector
  .withCrossValidation(
    dataSplitter = Some(splitter),
    seed = randomSeed,
    modelTypesToUse = Seq(OpGBTRegressor, OpRandomForestRegressor)
  )
  .setInput(salePrice, features)
  .getOutput()

Feature(name = area-lotFrontage-lotShape-saleCondition-salePrice-saleType-yrSold_7-stagesApplied_Prediction_000000000017, uid = Prediction_000000000017, isResponse = true, originStage = ModelSelector_000000000017, parents = [RealNN_000000000009,OPVector_00000000000d], distributions = [])

In [18]:
// Regression evaluator comparing the salePrice label with the model prediction.
val evaluator = Evaluators
  .Regression()
  .setLabelCol(salePrice)
  .setPredictionCol(prediction1)

OpRegressionEvaluator_000000000018

In [19]:
// Workflow producing the prediction and the label, fed by the training reader.
val workflow = new OpWorkflow()
  .setResultFeatures(prediction1, salePrice)
  .setReader(trainDataReader)

// Fit the workflow on the data supplied by the reader.
val workflowModel = workflow.train()

com.salesforce.op.OpWorkflowModel@47167e30

In [20]:
// Score with the fitted model and compute the evaluator's metrics in one call.
val (scores, metrics) = workflowModel.scoreAndEvaluate(evaluator)

// Print the scored rows without truncating long column values.
scores.show(truncate = false)

+--------------------+---------+---------------------------------------------------------------------------------------------------------+
|key                 |salePrice|area-lotFrontage-lotShape-saleCondition-salePrice-saleType-yrSold_7-stagesApplied_Prediction_000000000017|
+--------------------+---------+---------------------------------------------------------------------------------------------------------+
|2587754930100963237 |208500.0 |[prediction -> 164822.27126980046]                                                                       |
|-6679898534360193130|181500.0 |[prediction -> 184666.61988456224]                                                                       |
|-9172575567373448966|223500.0 |[prediction -> 208644.10371143607]                                                                       |
|-3279579769696099579|140000.0 |[prediction -> 162722.31447600614]                                                                       |
|9018456301635309930 |25000

null

In [21]:
// Render the evaluation metrics as text so the notebook displays them as the cell result.
metrics.toString

{
  "RootMeanSquaredError" : 59471.929439328924,
  "MeanSquaredError" : 3.536910391236518E9,
  "R2" : 0.49094715177074066,
  "MeanAbsoluteError" : 42083.20232575104
}

In [22]:
// Print a short header followed by the metrics, one println per entry.
List("Metrics:\n------------", metrics.toString).foreach(println)

Metrics:
------------
{
  "RootMeanSquaredError" : 59471.929439328924,
  "MeanSquaredError" : 3.536910391236518E9,
  "R2" : 0.49094715177074066,
  "MeanAbsoluteError" : 42083.20232575104
}


null