In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
import pyspark.sql.functions as f
import os, sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
conf = SparkConf()
conf.set("spark.app.name","trees")
conf.set("spark.master","local[*]")
conf.set("spark.driver.memory","8g")

<pyspark.conf.SparkConf at 0x1d1d8aeba00>

In [4]:
spark = SparkSession.builder\
                    .config(conf=conf)\
                    .getOrCreate()

In [6]:
data_without_header = spark.read\
                           .format("csv")\
                           .option("header",False)\
                           .option("inferSchema",True)\
                           .load(r"C:\Users\blais\Documents\ML\data\covertype\covtype.data")

In [7]:
data_without_header.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- _c8: integer (nullable = true)
 |-- _c9: integer (nullable = true)
 |-- _c10: integer (nullable = true)
 |-- _c11: integer (nullable = true)
 |-- _c12: integer (nullable = true)
 |-- _c13: integer (nullable = true)
 |-- _c14: integer (nullable = true)
 |-- _c15: integer (nullable = true)
 |-- _c16: integer (nullable = true)
 |-- _c17: integer (nullable = true)
 |-- _c18: integer (nullable = true)
 |-- _c19: integer (nullable = true)
 |-- _c20: integer (nullable = true)
 |-- _c21: integer (nullable = true)
 |-- _c22: integer (nullable = true)
 |-- _c23: integer (nullable = true)
 |-- _c24: integer (nullable = true)
 |-- _c25: integer (nullable = true)
 |-- _c26: integer (nullable = true)
 |-- _

Code reads the input as csv and doesn't attempt to parse the first line as a header of column names. It also requests that the type of each column be inferred by examining the data. It correctly infers that all of the columns are numbers, and, more specifically integers. 

We can look at the covtype.info file for the column names:

In any event before proceeding, it is useful to add column names to this DataFrame to make it easier to work with:

In [9]:
from pyspark.sql.types import DoubleType

In [10]:
colnames = ["Elevation", "Aspect", "Slope", \
            "Horizontal_Distance_To_Hydrology", \
            "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways", \
            "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm", \
            "Horizontal_Distance_To_Fire_Points"] + \
           [f"Wilderness_Area_{i}" for i in range(4)] + \
           [f"Soil_Type_{i}" for i in range(40)] + \
           ["Cover_Type"]

In [13]:
data = data_without_header.toDF(*colnames)\
                           .withColumn("Cover_Type", f.col("Cover_Type").cast(DoubleType()))

In [14]:
data.head()

Row(Elevation=2596, Aspect=51, Slope=3, Horizontal_Distance_To_Hydrology=258, Vertical_Distance_To_Hydrology=0, Horizontal_Distance_To_Roadways=510, Hillshade_9am=221, Hillshade_Noon=232, Hillshade_3pm=148, Horizontal_Distance_To_Fire_Points=6279, Wilderness_Area_0=1, Wilderness_Area_1=0, Wilderness_Area_2=0, Wilderness_Area_3=0, Soil_Type_0=0, Soil_Type_1=0, Soil_Type_2=0, Soil_Type_3=0, Soil_Type_4=0, Soil_Type_5=0, Soil_Type_6=0, Soil_Type_7=0, Soil_Type_8=0, Soil_Type_9=0, Soil_Type_10=0, Soil_Type_11=0, Soil_Type_12=0, Soil_Type_13=0, Soil_Type_14=0, Soil_Type_15=0, Soil_Type_16=0, Soil_Type_17=0, Soil_Type_18=0, Soil_Type_19=0, Soil_Type_20=0, Soil_Type_21=0, Soil_Type_22=0, Soil_Type_23=0, Soil_Type_24=0, Soil_Type_25=0, Soil_Type_26=0, Soil_Type_27=0, Soil_Type_28=1, Soil_Type_29=0, Soil_Type_30=0, Soil_Type_31=0, Soil_Type_32=0, Soil_Type_33=0, Soil_Type_34=0, Soil_Type_35=0, Soil_Type_36=0, Soil_Type_37=0, Soil_Type_38=0, Soil_Type_39=0, Cover_Type=5.0)

**Our First Decision Tree:**

In [15]:
(train_data, test_data) = data.randomSplit([0.9,0.1])
train_data.cache()
test_data.cache()

DataFrame[Elevation: int, Aspect: int, Slope: int, Horizontal_Distance_To_Hydrology: int, Vertical_Distance_To_Hydrology: int, Horizontal_Distance_To_Roadways: int, Hillshade_9am: int, Hillshade_Noon: int, Hillshade_3pm: int, Horizontal_Distance_To_Fire_Points: int, Wilderness_Area_0: int, Wilderness_Area_1: int, Wilderness_Area_2: int, Wilderness_Area_3: int, Soil_Type_0: int, Soil_Type_1: int, Soil_Type_2: int, Soil_Type_3: int, Soil_Type_4: int, Soil_Type_5: int, Soil_Type_6: int, Soil_Type_7: int, Soil_Type_8: int, Soil_Type_9: int, Soil_Type_10: int, Soil_Type_11: int, Soil_Type_12: int, Soil_Type_13: int, Soil_Type_14: int, Soil_Type_15: int, Soil_Type_16: int, Soil_Type_17: int, Soil_Type_18: int, Soil_Type_19: int, Soil_Type_20: int, Soil_Type_21: int, Soil_Type_22: int, Soil_Type_23: int, Soil_Type_24: int, Soil_Type_25: int, Soil_Type_26: int, Soil_Type_27: int, Soil_Type_28: int, Soil_Type_29: int, Soil_Type_30: int, Soil_Type_31: int, Soil_Type_32: int, Soil_Type_33: int, S

The data needs a little more preparation to be used with a classifier in MLlib. The input dataframe contains many columns, each holding one feature that could be used to predict the target column. MLlib requires all of the inputs to be collected into one column - whose value is a vector. PySpark's VectorAssembler class is an abstraction for vectors in the linear algebra sense and contains only numbers. For most intents and purposes, they work like a simple array of double values (floating-point numbers). Of course, some of the input features are conceptually categorical, even if they are represented with numbers in the input. 

In [16]:
from pyspark.ml.feature import VectorAssembler

In [18]:
input_cols = colnames[:-1]
vector_assembler = VectorAssembler(inputCols=input_cols, outputCol="featureVector")

In [19]:
assembled_train_data = vector_assembler.transform(train_data)

In [22]:
assembled_train_data.select("featureVector").show(truncate=False)

+-----------------------------------------------------------------------------------------------------+
|featureVector                                                                                        |
+-----------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1863.0,37.0,17.0,120.0,18.0,90.0,217.0,202.0,115.0,769.0,1.0,1.0])  |
|(54,[0,1,2,5,6,7,8,9,13,18],[1874.0,18.0,14.0,90.0,208.0,209.0,135.0,793.0,1.0,1.0])                 |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1879.0,28.0,19.0,30.0,12.0,95.0,209.0,196.0,117.0,778.0,1.0,1.0])   |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1888.0,33.0,22.0,150.0,46.0,108.0,209.0,185.0,103.0,735.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,14],[1889.0,28.0,22.0,150.0,23.0,120.0,205.0,185.0,108.0,759.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1889.0,353.0,30.0,95.0,39.0,67.0,153.0,172.0,146.0,600.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1896.0,337.0,12.0,30.0,6.0,175

In [23]:
assembled_train_data.show()

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+----------------------------------+-----------------+-----------------+-----------------+-----------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+----------+--------------------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm|Horizontal_Distance_To_F

VectorAssembler is an example of a Transformer within the current MLlib Pipelines API. It transforms the input DataFrame into another DataFrame based on some logic, and is composable with other transformations into a pipeline. 

In [24]:
from pyspark.ml.classification import DecisionTreeClassifier

In [25]:
classifier = DecisionTreeClassifier(seed=1234, labelCol="Cover_Type",
                                    featuresCol="featureVector",
                                    predictionCol="prediction")

In [26]:
model = classifier.fit(assembled_train_data)

In [27]:
print(model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_a5c5340a56e9, depth=5, numNodes=41, numClasses=8, numFeatures=54
  If (feature 0 <= 3046.5)
   If (feature 0 <= 2483.5)
    If (feature 3 <= 15.0)
     If (feature 12 <= 0.5)
      If (feature 23 <= 0.5)
       Predict: 4.0
      Else (feature 23 > 0.5)
       Predict: 3.0
     Else (feature 12 > 0.5)
      Predict: 6.0
    Else (feature 3 > 15.0)
     If (feature 16 <= 0.5)
      Predict: 3.0
     Else (feature 16 > 0.5)
      If (feature 9 <= 1303.0)
       Predict: 3.0
      Else (feature 9 > 1303.0)
       Predict: 4.0
   Else (feature 0 > 2483.5)
    If (feature 17 <= 0.5)
     If (feature 15 <= 0.5)
      Predict: 2.0
     Else (feature 15 > 0.5)
      Predict: 3.0
    Else (feature 17 > 0.5)
     If (feature 0 <= 2702.5)
      Predict: 3.0
     Else (feature 0 > 2702.5)
      If (feature 5 <= 1232.0)
       Predict: 5.0
      Else (feature 5 > 1232.0)
       Predict: 2.0
  Else (feature 0 > 3046.5)
   If (feature 0 <= 3

Again - the essential configuration for the classifier consists of column names.
- the column containing the input feature vectors and the column containing the target value to predict
- Because the model will later be used to predict new values of the target, it is given the name of the column to store predictions. 
- Decision trees are able to assess the importance of input features as part of their building process. That is, they can estimate how much each input feature contributes to making correct predictions. This information is simple to access from the model:

In [28]:
import pandas as pd

In [30]:
pd.DataFrame(model.featureImportances.toArray(),
            index=input_cols,columns=['importance']).sort_values(by="importance",ascending=False)

Unnamed: 0,importance
Elevation,0.827774
Soil_Type_3,0.040477
Soil_Type_1,0.03238
Hillshade_Noon,0.027039
Horizontal_Distance_To_Hydrology,0.023549
Soil_Type_31,0.018779
Wilderness_Area_2,0.015569
Horizontal_Distance_To_Roadways,0.004642
Soil_Type_2,0.00378
Hillshade_9am,0.002619


The resulting DecisionTreeClassificationModel is itself a transformer because it can transform a dataframe containing feature vectors into a dataframe also containing predictions. For example, it might be interesting to see what the model predicts on the training data and compare its predictions with the known correct cover type:

In [31]:
predictions = model.transform(assembled_train_data)

In [32]:
predictions.select("Cover_Type","prediction","probability").show(10, truncate=False)

+----------+----------+--------------------------------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                                         |
+----------+----------+--------------------------------------------------------------------------------------------------------------------+
|6.0       |3.0       |[0.0,0.0,0.052882690511143735,0.6171426664887228,0.021119711730948884,3.3364473508608036E-5,0.30882156679567596,0.0]|
|6.0       |4.0       |[0.0,0.0,0.02650429799426934,0.24426934097421205,0.6396848137535817,0.0,0.08954154727793696,0.0]                    |
|6.0       |3.0       |[0.0,0.0,0.052882690511143735,0.6171426664887228,0.021119711730948884,3.3364473508608036E-5,0.30882156679567596,0.0]|
|6.0       |3.0       |[0.0,0.0,0.052882690511143735,0.6171426664887228,0.021119711730948884,3.3364473508608036E-5,0.30882156679567596,0.0]|
|6.0       |3

Based on the above snippet, it looks like the model could use some work. Its predictions look like they are often wrong. The DecisionTreeClassifier implementation has several hyperparameters for which a value must be chosen, and they've all been left to defaults here. Here, the test set can be used to produce an unbiased evaluation of the expected accuracy of a model built with these default hyperparameters. We will now use MulticlassClassificationEvaluator to compute accuracy and other metrics that evaluate the quality of the model's predictions. It is an example of an evaluator in MLlib, which is responsible for assessing the qualuty of an output DataFrame in some way:

In [33]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [34]:
evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type",predictionCol="prediction")

In [36]:
evaluator.setMetricName("accuracy").evaluate(predictions)

0.7033842577724764

In [37]:
evaluator.setMetricName("f1").evaluate(predictions)

0.688107077334873

Can be useful to also look at the confusion matrix. rows - actual correct columns, columns - predicted values.

In [40]:
confusion_matrix = predictions.groupBy("Cover_Type")\
                              .pivot("prediction", range(1,8)).count()\
                              .na.fill(0.0)\
                              .orderBy("Cover_Type")

In [41]:
confusion_matrix.show()

+----------+------+------+-----+----+---+---+-----+
|Cover_Type|     1|     2|    3|   4|  5|  6|    7|
+----------+------+------+-----+----+---+---+-----+
|       1.0|131002| 54687|   90|   0| 31|  4| 4829|
|       2.0| 53386|197594| 3035|  37|363| 41|  672|
|       3.0|     0|  4367|27269| 351| 34| 98|    0|
|       4.0|     0|     6| 1313|1158|  0|  0|    0|
|       5.0|     0|  7882|  279|   0|420|  0|    0|
|       6.0|     0|  4892|10251| 125| 11|445|    0|
|       7.0|  8307|    77|    0|   0|  0|  0|10072|
+----------+------+------+-----+----+---+---+-----+



Although 70% accuracy sounds decent, its not immediately clear whether it is outstanding or poor. How well would a simplistic approach do to establish a baseline. We could construct a random classifier by picking a class at random in proportion to its prevalence in the training set. 

In [42]:
from pyspark.sql import DataFrame

In [43]:
def class_probabilities(data):
    total = data.count()
    return data.groupBy("Cover_Type").count()\
                                     .orderBy("Cover_Type")\
                                     .select(f.col("count").cast(DoubleType()))\
                                     .withColumn("count_proportion", f.col("count")/total)\
                                     .select("count_proportion").collect()

In [44]:
train_prior_probabilities = class_probabilities(train_data)
test_prior_probabilities = class_probabilities(test_data)

In [45]:
train_prior_probabilities

[Row(count_proportion=0.36442897340612623),
 Row(count_proportion=0.48769708369653314),
 Row(count_proportion=0.061397975256533774),
 Row(count_proportion=0.004734978819715251),
 Row(count_proportion=0.01640325121194048),
 Row(count_proportion=0.030057653193864598),
 Row(count_proportion=0.03528008441528651)]

In [46]:
train_prior_probabilities = [p[0] for p in train_prior_probabilities]
test_prior_probabilities = [p[0] for p in test_prior_probabilities]

In [47]:
sum([train_p * cv_p for train_p, cv_p in zip(train_prior_probabilities, test_prior_probabilities)])

0.37706379578644733

So, random guessing achieves 37% accuracy then, which makes the 70% achieved earlier seem like a good result after all. But, the later was achieved with default hyperparameters. We can do better by exploring what the hyperparameters actually mean for the tree-building process. 

**Decision Tree Hyperparameters:**
- Maximum Depth:
Simply limits the number of levels in the decision tree. It is the maximum numver of chained decisions that the classifier will make to classify an example. Useful to limit this to avoid overfitting the training data. The decision tree is responsible for coming up with potential decision rules to try at each level. Decisons are always of the same form for numeric features. decisions are of the form feature >= value, and for categorical features are of the form feature in (value1, value2, ...). So, the set of decision rules to try is really a set of valies to plug in to the decision rule. These are referred to as bins in the PySpark MLlib implementation. A larger number of bins requires more processing time but might lead to finding a more optimal decision rule. What makes a good rule? - Good rules divide the training data's target values into relatively homogenous or pure subsets. Picking a best rule means minimizing the impurity of the 2 subsets it induces. there are 2 commonly used measures of impurity: Gini impurity and entropy.

Gini impurity is directly related to the accuracy of the random guess classifier. within a subset - it is the probability that a randomly chosen classification of a random

- Finally minimum information gain - hyperparameter that imposes a minimum onformation gain or decrease in impurity for candidate decision rules. Rules that do not improve the subset's impurity enough are rejected. Like a lower maximum depth, this can help the model resist overfitting because decisions that barely help divide the training input may infact not helpfully divide the data at all. **Increasing the minimum information gain hyperparameter increases regularization**





**Tuning Decision Trees:**

Not obvious looking at the data - which impurity measure leads to better accuracy, or what maximum depth, maxBins or minimum information gain hyperparameter is best. 

In [48]:
from pyspark.ml import Pipeline

In [49]:
assembler = VectorAssembler(inputCols=input_cols, outputCol="featureVector")
classifier = DecisionTreeClassifier(seed=1234, labelCol="Cover_Type",featuresCol="featureVector",
                                    predictionCol="prediction")

In [50]:
pipeline = Pipeline(stages=[assembler,classifier])

Now, we can also define the combinations of hyperparameters that should be tested using the PySpark ML API's built-in support, ParamGridBuilder - plus the evaluation metric that will be used to pock the best hyperparameters, i.e. MulticlassClassificationEvaluator:

In [51]:
from pyspark.ml.tuning import ParamGridBuilder

In [52]:
paramGrid = ParamGridBuilder().\
                              addGrid(classifier.impurity, ["gini","entropy"])\
                              .addGrid(classifier.maxDepth, [1,20])\
                              .addGrid(classifier.maxBins,[40,300])\
                              .addGrid(classifier.minInfoGain, [0.0, 0.05])\
                              .build()

In [53]:
multiclassEval = MulticlassClassificationEvaluator()\
                                                    .setLabelCol("Cover_Type")\
                                                    .setPredictionCol("prediction")\
                                                    .setMetricName("accuracy")

From the param grid - we'll have a total of 16 models evaluated. strategy is the same as with gridsearchcv. 

Finally TrainValidationSplit brings these components together-the pipeline that makes models, model eval metrics, and hyperparameters to try- and can run the eval on training data. 

In [54]:
from pyspark.ml.tuning import TrainValidationSplit

In [55]:
validator = TrainValidationSplit(seed=1234,
                                 estimator=pipeline,
                                 evaluator=multiclassEval,
                                 estimatorParamMaps=paramGrid,
                                 trainRatio=0.9)

In [56]:
validator_model = validator.fit(train_data)

In [57]:
from pprint import pprint

In [58]:
best_model = validator_model.bestModel

In [59]:
pprint(best_model.stages[1].extractParamMap())

{Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='featuresCol', doc='features column name.'): 'featureVector',
 Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,
 Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',
 Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'entropy',
 Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,
 Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='cacheNodeIds', doc='If false, the algorithm will pass trees to 

May be wondering if it is possible to see the accuracy that each of the models achieved for each combination of hyperparameters. The hyperparameters and evaluations are exposed by the getEstimatorParamMaps and validationMetrics respectively. They can be combined to display all of the parameter combinations sorted by metric value:

In [60]:
metrics = validator_model.validationMetrics

In [61]:
params = validator_model.getEstimatorParamMaps()

In [62]:
metrics_and_params = list(zip(metrics, params))

In [63]:
metrics_and_params.sort(key=lambda x: x[0], reverse=True)

In [64]:
metrics_and_params

[(0.9123498354885607,
  {Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'entropy',
   Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 20,
   Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 300,
   Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0}),
 (0.9099012931364298,
  {Param(parent='DecisionTreeClassifier_91ca7e7d7de8', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entrop

In [65]:
multiclassEval.evaluate(best_model.transform(test_data))

0.9114781286711354

**categorical Features Revisited:**

The categorical features in our dataset are one-hot encoded as several binary 0/1 values. Treating these individual features as numeric turns out to be fine, because any decision rule on numeric features will choose thresholds between 0 and 1 and all are equivalent since all values are 0 or 1.  This encoding forces the decision tree algorithm to consider the values of the underlying categorical features individually. Because features like soil type are broken down into many features and because decision trees treat features individually, its harder to relate information about related soil types. For example, 9 different soil types are actually part of the Leighton family, and they may be related in ways that the decision tree can exploit. If soil type were encoded as a single categorical feature with 40 soil values, then the tree could express rules like if the soil type is one of the 9 leighton family types directly, however when encoded as 40 features, the tree would have to learn a sequence of 9 decsion son soil type to do the same.  

Undoing the onehot encoding

In [70]:
def unencode_one_hot(data):
    wilderness_cols = ['Wilderness_Area_'+str(i) for i in range(4)]
    wilderness_assembler = VectorAssembler()\
                                            .setInputCols(wilderness_cols)\
                                            .setOutputCol("wilderness")
    unhot_udf = f.udf(lambda v: v.toArray().tolist().index(1))
    with_wilderness = wilderness_assembler.transform(data)\
                                          .drop(*wilderness_cols)\
                                          .withColumn("wilderness", unhot_udf(f.col("wilderness")))
    soil_cols = ['Soil_Type_'+str(i) for i in range(40)]
    soil_assembler = VectorAssembler()\
                                      .setInputCols(soil_cols)\
                                      .setOutputCol("soil")
    with_soil = soil_assembler.transform(with_wilderness)\
                              .drop(*soil_cols)\
                              .withColumn("soil", unhot_udf(f.col("soil")))
    return with_soil


In [71]:
unenc_train_data = unencode_one_hot(train_data)
unenc_train_data.printSchema()

root
 |-- Elevation: integer (nullable = true)
 |-- Aspect: integer (nullable = true)
 |-- Slope: integer (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: integer (nullable = true)
 |-- Vertical_Distance_To_Hydrology: integer (nullable = true)
 |-- Horizontal_Distance_To_Roadways: integer (nullable = true)
 |-- Hillshade_9am: integer (nullable = true)
 |-- Hillshade_Noon: integer (nullable = true)
 |-- Hillshade_3pm: integer (nullable = true)
 |-- Horizontal_Distance_To_Fire_Points: integer (nullable = true)
 |-- Cover_Type: double (nullable = true)
 |-- wilderness: string (nullable = true)
 |-- soil: string (nullable = true)



In [72]:
unenc_train_data.groupBy("wilderness").count().show()

+----------+------+
|wilderness| count|
+----------+------+
|         3| 33315|
|         0|234716|
|         1| 26840|
|         2|228257|
+----------+------+



From here - same process as above van be used to tune the hyperparameters of the decision tree model built on this data and to choose and evaluate the best model. However the two numeric columns have nothing about them that indicates that they're actually an encoding of categorical values. To treat them as numbers is not correct, as their order is meaningless. Internally, MLlib can store additional metadata about each column. The details of this data are generally hidden from the caller but include information such as whether the column encodes a categorical value and how many distinct values it takes on. To add this metadata, its necessary to put the data through VectorIndexer. Its job is to turn input into properly labelled categorical feature columns.

In [73]:
from pyspark.ml.feature import VectorIndexer

In [74]:
cols = unenc_train_data.columns

In [75]:
inputCols = [c for c in cols if c!='Cover_Type']

In [76]:
assembler = VectorAssembler().setInputCols(inputCols).setOutputCol("featureVector")

In [79]:
indexer = VectorIndexer()\
                        .setMaxCategories(40)\
                        .setInputCol("featureVector").setOutputCol("indexedVector")

In [80]:
classifier = DecisionTreeClassifier().setLabelCol("Cover_Type")\
                                     .setFeaturesCol("indexedVector")\
                                     .setPredictionCol("prediction")

In [81]:
pipeline = Pipeline().setStages([assembler, indexer, classifier])

.....train and eval metrics
..... hyperparameter search//

**Random Forests**

In [82]:
from pyspark.ml.classification import RandomForestClassifier

In [83]:
classifier = RandomForestClassifier(seed=1234, labelCol="Cover_Type",
                                    featuresCol="indexedVector",predictionCol="prediction")

Note - this classifier has another hyperparameter: the number of trees to build. Like the max bins hyperparameter, higher values should give better results up to a point. The cost, however, is that building many trees of course takes many times longer than building one. 
The accuracy of the best random forest model produced from a similar tuning process is 95% off the bat - about 2% better already

...apply same steps as with trees. 

Random forests are appealing in the context of big data becaue trees are supposed to be built independently, and big data technologies like spark and mapreduce inherently need data-parallel problems - where parts of the overall solution can be computed independently on parts of the data. 

In [84]:
spark.stop()