# 0 - Load the modules

In [16]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, StringIndexerModel, VectorAssembler, VectorIndexer}
import org.apache.spark.ml.Pipeline

import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.evaluation.RegressionEvaluator

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, StringIndexerModel, VectorAssembler, VectorIndexer}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.evaluation.RegressionEvaluator


# 1 - Start a Spark session and load the data

In [5]:
// ##### Start Spark session
val spark = SparkSession
.builder
.appName("auto")
.config("spark.master", "local")
.getOrCreate()

import spark.implicits._


// ##### load data
val workingDir = "data/"
val data = spark.read.format("csv").option("header","true").load(workingDir+"regtree.csv")
data.show()
data.printSchema()

+--------+----+--------+-----+-----+
| outlook|temp|humidity|windy|hours|
+--------+----+--------+-----+-----+
|   rainy| hot|    high|FALSE|   25|
|   rainy| hot|    high| TRUE|   30|
|overcast| hot|    high|FALSE|   46|
|   sunny|mild|    high|FALSE|   45|
|   sunny|cool|  normal|FALSE|   52|
|   sunny|cool|  normal| TRUE|   23|
|overcast|cool|  normal| TRUE|   43|
|   rainy|mild|    high|FALSE|   35|
|   rainy|cool|  normal|FALSE|   38|
|   sunny|mild|  normal|FALSE|   46|
|   rainy|mild|  normal| TRUE|   48|
|overcast|mild|    high| TRUE|   52|
|overcast| hot|  normal|FALSE|   44|
|   sunny|mild|    high| TRUE|   30|
+--------+----+--------+-----+-----+

root
 |-- outlook: string (nullable = true)
 |-- temp: string (nullable = true)
 |-- humidity: string (nullable = true)
 |-- windy: string (nullable = true)
 |-- hours: string (nullable = true)



spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@17ed7250
import spark.implicits._
workingDir: String = data/
data: org.apache.spark.sql.DataFrame = [outlook: string, temp: string ... 3 more fields]


# 2 - Features indexation, Target indexation, Training/Test Split

## 2.1 Target indexation

In [3]:
// ##### index the label attribute */
val label = "hours"

val labelIndexer = new StringIndexer()
    .setInputCol(label)
    .setOutputCol("indexed_" + label) 

label: String = hours
labelIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_26389e8035fb


## 2.2 - Features String encoding

In [25]:
val attributes= data.columns.filterNot(_.contains(label))

val catFeatIndexer= attributes.map{
    att => 
    new StringIndexer()
    .setInputCol(att)
    .setOutputCol("indexed_" + att)   
} 

//println(attributes)

attributes: Array[String] = Array(outlook, temp, humidity, windy)
catFeatIndexer: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_402e3f9f83a1, strIdx_8ce220626c0b, strIdx_6dc2640d133c, strIdx_ee7f4de8d18a)


## 2.3 - Fit the string indexer to the data and extract the labels

In [28]:
val indcatFeatIndexer = catFeatIndexer.map(x=>x.fit(data).labels.zipWithIndex)

indcatFeatIndexer: Array[Array[(String, Int)]] = Array(Array((rainy,0), (sunny,1), (overcast,2)), Array((mild,0), (cool,1), (hot,2)), Array((high,0), (normal,1)), Array((FALSE,0), (TRUE,1)))


In [6]:
/*val attributes= data.columns.filterNot(_.contains(label))
val catFeatIndexer= attributes.map{
    att => 
    new StringIndexer()
    .setInputCol(att)
    .setOutputCol("indexed_" + att)   
} */


attributes: Array[String] = Array(outlook, temp, humidity, windy)
catFeatIndexer: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_6515ff294c1b, strIdx_605404e8b5bc, strIdx_6d46ce77927f, strIdx_4775172b654e)


In [19]:
/*val att_outlook = "outlook"
val outlook_indexer = new StringIndexer()
    .setInputCol(att_outlook)
    .setOutputCol("indexed_" + att) 
    .fit(data)


outlook_indexer.labels.zipWithIndex*/

att_outlook: String = outlook
outlook_indexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_f23748d4e994
res12: Array[(String, Int)] = Array((rainy,0), (sunny,1), (overcast,2))


In [25]:
/*val att_temp = "temp"
val temp_indexer = new StringIndexer()
    .setInputCol(att_temp)
    .setOutputCol("indexed_" + att) 
    .fit(data)

//val data_1 = temp_indexer.fit(data).transform(data)

temp_indexer.labels.zipWithIndex*/


att_temp: String = temp
temp_indexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_52fdaf4e6fef
res18: Array[(String, Int)] = Array((mild,0), (cool,1), (hot,2))


In [17]:
/*val att_hum = "humidity"
val hum_indexer = new StringIndexer()
    .setInputCol(att_hum)
    .setOutputCol("indexed_" + att) 
    .fit(temp_indexer)
hum_indexer.labels.zipWithIndex*/

att_hum: String = humidity
hum_indexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_7dbac1adf9f0
res10: Array[(String, Int)] = Array((high,0), (normal,1))


In [9]:
/*val att = "windy"
val hstrmodel = new StringIndexer()
    .setInputCol(att)
    .setOutputCol("indexed_" + att) 
    .fit(data)
hstrmodel.labels.zipWithIndex*/

att: String = humidity
hstrmodel: org.apache.spark.ml.feature.StringIndexerModel = strIdx_384e7db14bb4
res4: Array[(String, Int)] = Array((high,0), (normal,1))


## 2.4 - Assemble the indexed features

In [7]:
val features = catFeatIndexer.map(_.getOutputCol)

// assemble with the rest of the features
val vectorAssemb = new VectorAssembler()
.setInputCols(features)
.setOutputCol("assembled")

features: Array[String] = Array(indexed_outlook, indexed_temp, indexed_humidity, indexed_windy)
vectorAssemb: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_271c434a6e40, handleInvalid=error, numInputCols=4
maxCat: Int = 4
vecIndexer: org.apache.spark.ml.feature.VectorIndexer = vecIdx_60631d41fc12


## 2.5 - Index the vectors

In [None]:
// index the vector
val maxCat = 4
val vecIndexer = new VectorIndexer()
.setInputCol(vectorAssemb.getOutputCol)
.setOutputCol("features")
.setMaxCategories(maxCat)

# 3 - Build and fit the pipeline on data

In [9]:
val pipeline = new Pipeline()
.setStages(Array(labelIndexer)++catFeatIndexer++Array(vectorAssemb,vecIndexer))


pipeline: org.apache.spark.ml.Pipeline = pipeline_fdd1b3ec06cf


In [11]:
import org.apache.spark.sql.types.DoubleType
val ftdata = pipeline.fit(data).transform(data)
.withColumn("label",col(label).cast(DoubleType))
.select("features","label")


import org.apache.spark.sql.types.DoubleType
ftdata: org.apache.spark.sql.DataFrame = [features: vector, label: double]


## Check final dataset

In [12]:
ftdata.printSchema
ftdata.show()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|    (4,[1],[2.0])| 25.0|
|[0.0,2.0,0.0,1.0]| 30.0|
|[2.0,2.0,0.0,0.0]| 46.0|
|    (4,[0],[1.0])| 45.0|
|[1.0,1.0,1.0,0.0]| 52.0|
|[1.0,1.0,1.0,1.0]| 23.0|
|[2.0,1.0,1.0,1.0]| 43.0|
|        (4,[],[])| 35.0|
|[0.0,1.0,1.0,0.0]| 38.0|
|[1.0,0.0,1.0,0.0]| 46.0|
|[0.0,0.0,1.0,1.0]| 48.0|
|[2.0,0.0,0.0,1.0]| 52.0|
|[2.0,2.0,1.0,0.0]| 44.0|
|[1.0,0.0,0.0,1.0]| 30.0|
+-----------------+-----+



# 4 - Train/Test Split

In [13]:
val Array(trainingData,testData)= ftdata.randomSplit(Array(0.7,0.3))

trainingData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [features: vector, label: double]
testData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [features: vector, label: double]


# 5 - Decision Tree : build, train, evaluate

In [17]:
// train the decision tree
val dt = new DecisionTreeRegressor()
.setLabelCol("label")
.setFeaturesCol("features")
.setMinInstancesPerNode(2)

val model = dt.fit(trainingData)


dt: org.apache.spark.ml.regression.DecisionTreeRegressor = dtr_f1ffe28775f8
model: org.apache.spark.ml.regression.DecisionTreeRegressionModel = DecisionTreeRegressionModel: uid=dtr_f1ffe28775f8, depth=2, numNodes=7, numFeatures=4


In [18]:
val treeModel=model.asInstanceOf[DecisionTreeRegressionModel]
print(s"Learned classification tree model :\n ${treeModel.toDebugString}")

Learned classification tree model :
 DecisionTreeRegressionModel: uid=dtr_f1ffe28775f8, depth=2, numNodes=7, numFeatures=4
  If (feature 2 in {0.0})
   If (feature 0 in {0.0})
    Predict: 30.0
   Else (feature 0 not in {0.0})
    Predict: 40.333333333333336
  Else (feature 2 not in {0.0})
   If (feature 0 in {0.0})
    Predict: 43.0
   Else (feature 0 not in {0.0})
    Predict: 47.5


treeModel: org.apache.spark.ml.regression.DecisionTreeRegressionModel = DecisionTreeRegressionModel: uid=dtr_f1ffe28775f8, depth=2, numNodes=7, numFeatures=4


## Model evaluation on Test Data

In [19]:
// ####### make predictions

val predictions = model.transform(testData)

// ####### Select (prediction, true label) and compute error

val evaluator = new RegressionEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")

// ####### get RMSE
evaluator.setMetricName("rmse")
val mse = evaluator.evaluate(predictions)
println(s"Root Mean Squared Error (MSE) on test data = $mse")

// ####### get RMSE
evaluator.setMetricName("mae")
val mae = evaluator.evaluate(predictions)
println(s"Root Mean Squared Error (MAE) on test data = $mae")

Root Mean Squared Error (MSE) on test data = 13.700922515574554
Root Mean Squared Error (MAE) on test data = 10.291666666666666


predictions: org.apache.spark.sql.DataFrame = [features: vector, label: double ... 1 more field]
evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = RegressionEvaluator: uid=regEval_8feebce69d33, metricName=mae, throughOrigin=false
mse: Double = 13.700922515574554
mae: Double = 10.291666666666666


## Features importance

In [20]:
val featureImportances = model.featureImportances
val res = attributes.zip(featureImportances.toArray).sortBy(-_._2).foreach(println)

(humidity,0.5749234273148526)
(outlook,0.42507657268514737)
(temp,0.0)
(windy,0.0)


featureImportances: org.apache.spark.ml.linalg.Vector = (4,[0,2],[0.42507657268514737,0.5749234273148526])
