# 0 - Load the modules

In [4]:
import org.apache.spark.sql.SparkSession

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import spark.implicits._
import org.apache.spark.ml.feature.VectorIndexer

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import spark.implicits._
import org.apache.spark.ml.feature.VectorIndexer


In [2]:
/*!wget "http://webia.lip6.fr/~baazizi/tc/fc/psl/20/data/DTClass.zip"
%cd
!unzip DTClass.zip
!ls -hal DTClass
!cd DTClass;
!ls /home/moi/Decision_Tree*/

# 1 - Start a Spark session and load the data

In [5]:
val spark = SparkSession
.builder
.appName("credit")
.config("spark.master", "local")
.getOrCreate()

import spark.implicits._

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@11b2e092
import spark.implicits._


In [7]:
val workingDir = "data/"
val data = spark.read.format("csv").option("header","true").load(workingDir+"credit.csv")
data.show()

+------+------+-------+-------------+-----+
|   age|income|student|credit_rating|label|
+------+------+-------+-------------+-----+
| young|  high|     no|         fair|   no|
| young|  high|     no|    excellent|   no|
|middle|  high|     no|         fair|  yes|
|senior|medium|     no|         fair|  yes|
|senior|   low|    yes|         fair|  yes|
|senior|   low|    yes|    excellent|   no|
|middle|   low|    yes|    excellent|  yes|
| young|medium|     no|         fair|   no|
| young|   low|    yes|         fair|  yes|
|senior|medium|    yes|         fair|  yes|
| young|medium|    yes|    excellent|  yes|
|middle|medium|     no|    excellent|  yes|
|middle|  high|    yes|         fair|  yes|
|senior|medium|     no|    excellent|   no|
+------+------+-------+-------------+-----+



workingDir: String = data/
data: org.apache.spark.sql.DataFrame = [age: string, income: string ... 3 more fields]


# 2 - Features indexation, Target indexation, Training/Test Split

## 2.1 Target indexation

In [8]:
// ##### index the label attribute
val label = "label"

val labelIndexer = new StringIndexer()
    .setInputCol(label)
    .setOutputCol("indexed_" + label) 

label: String = label
labelIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_0661e82caa8d


## 2.2 - Features String encoding

In [9]:
val attributes= data.columns.filterNot(_.contains(label))

val catFeatIndexer= attributes.map{
    att => 
    new StringIndexer()
    .setInputCol(att)
    .setOutputCol("indexed_" + att)   
} 


attributes: Array[String] = Array(age, income, student, credit_rating)
catFeatIndexer: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_c3d4e1a7cea4, strIdx_0c2df3884339, strIdx_b8136b405779, strIdx_6e9afff0a6c5)


## 2.3 - Fit the string indexer to the data and extract the labels

In [10]:
val indcatFeatIndexer = catFeatIndexer.map(x=>x.fit(data).labels.zipWithIndex)

indcatFeatIndexer: Array[Array[(String, Int)]] = Array(Array((senior,0), (young,1), (middle,2)), Array((medium,0), (high,1), (low,2)), Array((no,0), (yes,1)), Array((fair,0), (excellent,1)))


## 2.4 - Assemble the indexed features

In [11]:
val features = catFeatIndexer.map(_.getOutputCol)

val vectorAssemb = new VectorAssembler()
.setInputCols(features)
.setOutputCol("assembled")



features: Array[String] = Array(indexed_age, indexed_income, indexed_student, indexed_credit_rating)
vectorAssemb: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_a5116cf48202, handleInvalid=error, numInputCols=4


## 2.5 - Index the vectors

In [13]:
val maxCat = 4
val vecIndexer = new VectorIndexer()
.setInputCol(vectorAssemb.getOutputCol)
.setOutputCol("features")
.setMaxCategories(maxCat)

maxCat: Int = 4
vecIndexer: org.apache.spark.ml.feature.VectorIndexer = vecIdx_0bfcdc13b472


## 3 - Build and fit the pipeline on data

In [14]:
val pipeline = new Pipeline()
.setStages(Array(labelIndexer)++catFeatIndexer++Array(vectorAssemb,vecIndexer))

pipeline: org.apache.spark.ml.Pipeline = pipeline_176af8fbb618


In [19]:
val ftdata = pipeline.fit(data)
                         .transform(data)
                         .select("features","indexed_label")


ftdata: org.apache.spark.sql.DataFrame = [features: vector, indexed_label: double]


## Check final dataset

In [20]:
ftdata.printSchema
ftdata.show()

root
 |-- features: vector (nullable = true)
 |-- indexed_label: double (nullable = false)

+-----------------+-------------+
|         features|indexed_label|
+-----------------+-------------+
|[1.0,1.0,0.0,0.0]|          1.0|
|[1.0,1.0,0.0,1.0]|          1.0|
|[2.0,1.0,0.0,0.0]|          0.0|
|        (4,[],[])|          0.0|
|[0.0,2.0,1.0,0.0]|          0.0|
|[0.0,2.0,1.0,1.0]|          1.0|
|[2.0,2.0,1.0,1.0]|          0.0|
|    (4,[0],[1.0])|          1.0|
|[1.0,2.0,1.0,0.0]|          0.0|
|    (4,[2],[1.0])|          0.0|
|[1.0,0.0,1.0,1.0]|          0.0|
|[2.0,0.0,0.0,1.0]|          0.0|
|[2.0,1.0,1.0,0.0]|          0.0|
|    (4,[3],[1.0])|          1.0|
+-----------------+-------------+



# 4 - Train/Test Split

In [32]:
val Array(trainingData,testData)= ftdata.randomSplit(Array(0.7,0.3))

trainingData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [features: vector, indexed_label: double]
testData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [features: vector, indexed_label: double]


# 5 - Decision Tree : build, train, evaluate

In [55]:
val dt = new DecisionTreeClassifier()
.setLabelCol("indexed_label")
.setFeaturesCol("features")
.setMinInstancesPerNode(2)

val dtModel = dt.fit(trainingData)
print(s"Learned classification tree model :\n ${dtModel.toDebugString}")

Learned classification tree model :
 DecisionTreeClassificationModel: uid=dtc_2fcb9be31b62, depth=3, numNodes=7, numClasses=2, numFeatures=4
  If (feature 2 in {1.0})
   Predict: 0.0
  Else (feature 2 not in {1.0})
   If (feature 0 in {2.0})
    Predict: 0.0
   Else (feature 0 not in {2.0})
    If (feature 0 in {0.0})
     Predict: 0.0
    Else (feature 0 not in {0.0})
     Predict: 1.0


dt: org.apache.spark.ml.classification.DecisionTreeClassifier = dtc_2fcb9be31b62
dtModel: org.apache.spark.ml.classification.DecisionTreeClassificationModel = DecisionTreeClassificationModel: uid=dtc_2fcb9be31b62, depth=3, numNodes=7, numClasses=2, numFeatures=4


# 6 - Model evaluation on Train and Test Data

In [56]:
val predictions = dtModel.transform(testData)
predictions.show()

+-----------------+-------------+-------------+-----------+----------+
|         features|indexed_label|rawPrediction|probability|prediction|
+-----------------+-------------+-------------+-----------+----------+
|    (4,[0],[1.0])|          1.0|    [0.0,2.0]|  [0.0,1.0]|       1.0|
|[0.0,2.0,1.0,0.0]|          0.0|    [4.0,0.0]|  [1.0,0.0]|       0.0|
|[0.0,2.0,1.0,1.0]|          1.0|    [4.0,0.0]|  [1.0,0.0]|       0.0|
|[2.0,2.0,1.0,1.0]|          0.0|    [4.0,0.0]|  [1.0,0.0]|       0.0|
+-----------------+-------------+-------------+-----------+----------+



predictions: org.apache.spark.sql.DataFrame = [features: vector, indexed_label: double ... 3 more fields]


## Helper function for performance metrics

In [52]:
def performance(model : org.apache.spark.ml.classification.DecisionTreeClassificationModel,model_name : String): Any = {

            val training_predictions = model.transform(trainingData)
                                  .select("prediction", "indexed_label")
                                  .cache()

            val test_predictions = model.transform(testData)
                                  .select("prediction", "indexed_label")
                                   .cache()
    
            val predictions = Array(training_predictions, test_predictions)
            val names = Array(" Training ", " Test ")
    
    println(s"\n##################### ${model_name} Performance #########################")
    
    for (i <- 0 until predictions.length)
    {        
         var data = predictions(i)
         var name = names(i)
    
            // Select (prediction, true label) and compute test error.
            var evaluator = new MulticlassClassificationEvaluator()
              .setLabelCol("indexed_label")
              .setPredictionCol("prediction")
              .setMetricName("accuracy")

            var accuracy = evaluator.evaluate(data)


    
            evaluator = new MulticlassClassificationEvaluator()
              .setLabelCol("indexed_label")
              .setPredictionCol("prediction")
              .setMetricName("f1")

            var f1 = evaluator.evaluate(data)
            
            
            println(s"\n---------------------- ${name} performance metrics----------------------\n")
            println(s"\t- Accuracy = ${(accuracy * 100)}")
            println(s"\t- Error = ${(1.0 - accuracy)*100}")
            println(s"\t- F1 score = ${(f1)}")
    }
      
                }

performance: (model: org.apache.spark.ml.classification.DecisionTreeClassificationModel, model_name: String)Any


In [57]:
performance(dtModel,"Decision Tree")


##################### Decision Tree Performance #########################

----------------------  Training  performance metrics----------------------

	- Accuracy = 90.0
	- Error = 9.999999999999998
	- F1 score = 0.8933333333333333

----------------------  Test  performance metrics----------------------

	- Accuracy = 75.0
	- Error = 25.0
	- F1 score = 0.7333333333333334


res32: Any = ()


In [54]:
println(dtModel.getClass)

class org.apache.spark.ml.classification.DecisionTreeClassificationModel


## Prediction on synthetic data

In [23]:

val test_df = Seq(("young","high","no","fair"),
                 ("senior","high","yes","excellent"))
.toDF("age","income","student","credit_rating")

test_df.show()

+------+------+-------+-------------+
|   age|income|student|credit_rating|
+------+------+-------+-------------+
| young|  high|     no|         fair|
|senior|  high|    yes|    excellent|
+------+------+-------+-------------+



test_df: org.apache.spark.sql.DataFrame = [age: string, income: string ... 2 more fields]


In [24]:
val predictionPipeline = new Pipeline().setStages(pipeline.getStages.slice(1,pipeline.getStages.size))

predictionPipeline: org.apache.spark.ml.Pipeline = pipeline_f99a232c44c9


In [25]:
val test_data = predictionPipeline.fit(test_df).transform(test_df).select("features")
test_data.show()

+-----------------+
|         features|
+-----------------+
|[1.0,0.0,0.0,1.0]|
|    (4,[2],[1.0])|
+-----------------+



test_data: org.apache.spark.sql.DataFrame = [features: vector]


In [26]:
val predictions = dtModel.transform(test_data)
predictions.show()

+-----------------+-------------+-----------+----------+
|         features|rawPrediction|probability|prediction|
+-----------------+-------------+-----------+----------+
|[1.0,0.0,0.0,1.0]|    [0.0,2.0]|  [0.0,1.0]|       1.0|
|    (4,[2],[1.0])|    [3.0,0.0]|  [1.0,0.0]|       0.0|
+-----------------+-------------+-----------+----------+



predictions: org.apache.spark.sql.DataFrame = [features: vector, rawPrediction: vector ... 2 more fields]
