In [3]:
import pyspark
import os
import sys
from pyspark import SparkContext
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.config("spark.driver.memory", "16g").appName('chapter_4').getOrCreate()



In [5]:
data_without_header = spark.read.option("inferSchema", True).option("header", False).csv("covtype.data")
data_without_header.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- _c8: integer (nullable = true)
 |-- _c9: integer (nullable = true)
 |-- _c10: integer (nullable = true)
 |-- _c11: integer (nullable = true)
 |-- _c12: integer (nullable = true)
 |-- _c13: integer (nullable = true)
 |-- _c14: integer (nullable = true)
 |-- _c15: integer (nullable = true)
 |-- _c16: integer (nullable = true)
 |-- _c17: integer (nullable = true)
 |-- _c18: integer (nullable = true)
 |-- _c19: integer (nullable = true)
 |-- _c20: integer (nullable = true)
 |-- _c21: integer (nullable = true)
 |-- _c22: integer (nullable = true)
 |-- _c23: integer (nullable = true)
 |-- _c24: integer (nullable = true)
 |-- _c25: integer (nullable = true)
 |-- _c26: integer (nullable = true)
 |-- _

In [7]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col
colnames = ["Elevation", "Aspect", "Slope", \
"Horizontal_Distance_To_Hydrology", \
"Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
\
"Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm", \
"Horizontal_Distance_To_Fire_Points"] + \
[f"Wilderness_Area_{i}" for i in range(4)] + \
[f"Soil_Type_{i}" for i in range(40)] + \
["Cover_Type"]
data = data_without_header.toDF(*colnames).\
withColumn("Cover_Type",
col("Cover_Type").cast(DoubleType()))
data.head()

Row(Elevation=2596, Aspect=51, Slope=3, Horizontal_Distance_To_Hydrology=258, Vertical_Distance_To_Hydrology=0, Horizontal_Distance_To_Roadways=510, Hillshade_9am=221, Hillshade_Noon=232, Hillshade_3pm=148, Horizontal_Distance_To_Fire_Points=6279, Wilderness_Area_0=1, Wilderness_Area_1=0, Wilderness_Area_2=0, Wilderness_Area_3=0, Soil_Type_0=0, Soil_Type_1=0, Soil_Type_2=0, Soil_Type_3=0, Soil_Type_4=0, Soil_Type_5=0, Soil_Type_6=0, Soil_Type_7=0, Soil_Type_8=0, Soil_Type_9=0, Soil_Type_10=0, Soil_Type_11=0, Soil_Type_12=0, Soil_Type_13=0, Soil_Type_14=0, Soil_Type_15=0, Soil_Type_16=0, Soil_Type_17=0, Soil_Type_18=0, Soil_Type_19=0, Soil_Type_20=0, Soil_Type_21=0, Soil_Type_22=0, Soil_Type_23=0, Soil_Type_24=0, Soil_Type_25=0, Soil_Type_26=0, Soil_Type_27=0, Soil_Type_28=1, Soil_Type_29=0, Soil_Type_30=0, Soil_Type_31=0, Soil_Type_32=0, Soil_Type_33=0, Soil_Type_34=0, Soil_Type_35=0, Soil_Type_36=0, Soil_Type_37=0, Soil_Type_38=0, Soil_Type_39=0, Cover_Type=5.0)

In [8]:
(train_data, test_data) = data.randomSplit([0.9, 0.1])
train_data.cache()
test_data.cache()

DataFrame[Elevation: int, Aspect: int, Slope: int, Horizontal_Distance_To_Hydrology: int, Vertical_Distance_To_Hydrology: int, Horizontal_Distance_To_Roadways: int, Hillshade_9am: int, Hillshade_Noon: int, Hillshade_3pm: int, Horizontal_Distance_To_Fire_Points: int, Wilderness_Area_0: int, Wilderness_Area_1: int, Wilderness_Area_2: int, Wilderness_Area_3: int, Soil_Type_0: int, Soil_Type_1: int, Soil_Type_2: int, Soil_Type_3: int, Soil_Type_4: int, Soil_Type_5: int, Soil_Type_6: int, Soil_Type_7: int, Soil_Type_8: int, Soil_Type_9: int, Soil_Type_10: int, Soil_Type_11: int, Soil_Type_12: int, Soil_Type_13: int, Soil_Type_14: int, Soil_Type_15: int, Soil_Type_16: int, Soil_Type_17: int, Soil_Type_18: int, Soil_Type_19: int, Soil_Type_20: int, Soil_Type_21: int, Soil_Type_22: int, Soil_Type_23: int, Soil_Type_24: int, Soil_Type_25: int, Soil_Type_26: int, Soil_Type_27: int, Soil_Type_28: int, Soil_Type_29: int, Soil_Type_30: int, Soil_Type_31: int, Soil_Type_32: int, Soil_Type_33: int, S

In [9]:
from pyspark.ml.feature import VectorAssembler
input_cols = colnames[:-1]
vector_assembler = VectorAssembler(inputCols=input_cols,
outputCol="featureVector")
assembled_train_data = vector_assembler.transform(train_data)
assembled_train_data.select("featureVector").show(truncate = False)

+-----------------------------------------------------------------------------------------------------+
|featureVector                                                                                        |
+-----------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1863.0,37.0,17.0,120.0,18.0,90.0,217.0,202.0,115.0,769.0,1.0,1.0])  |
|(54,[0,1,2,5,6,7,8,9,13,18],[1874.0,18.0,14.0,90.0,208.0,209.0,135.0,793.0,1.0,1.0])                 |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1879.0,28.0,19.0,30.0,12.0,95.0,209.0,196.0,117.0,778.0,1.0,1.0])   |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1888.0,33.0,22.0,150.0,46.0,108.0,209.0,185.0,103.0,735.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1889.0,353.0,30.0,95.0,39.0,67.0,153.0,172.0,146.0,600.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1896.0,337.0,12.0,30.0,6.0,175.0,195.0,224.0,168.0,732.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1898.0,34.0,23.0,175.0,56.0,13

In [10]:
from pyspark.ml.classification import DecisionTreeClassifier
classifier = DecisionTreeClassifier(seed = 1234, labelCol="Cover_Type",
featuresCol="featureVector",
predictionCol="prediction")
model = classifier.fit(assembled_train_data)
print(model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ffd2f0555da7, depth=5, numNodes=43, numClasses=8, numFeatures=54
  If (feature 0 <= 3048.5)
   If (feature 0 <= 2501.5)
    If (feature 3 <= 15.0)
     If (feature 12 <= 0.5)
      If (feature 0 <= 2348.5)
       Predict: 4.0
      Else (feature 0 > 2348.5)
       Predict: 2.0
     Else (feature 12 > 0.5)
      Predict: 6.0
    Else (feature 3 > 15.0)
     If (feature 16 <= 0.5)
      Predict: 3.0
     Else (feature 16 > 0.5)
      If (feature 9 <= 1307.5)
       Predict: 3.0
      Else (feature 9 > 1307.5)
       Predict: 4.0
   Else (feature 0 > 2501.5)
    If (feature 17 <= 0.5)
     If (feature 0 <= 2952.5)
      If (feature 15 <= 0.5)
       Predict: 2.0
      Else (feature 15 > 0.5)
       Predict: 3.0
     Else (feature 0 > 2952.5)
      Predict: 2.0
    Else (feature 17 > 0.5)
     If (feature 0 <= 2712.5)
      Predict: 3.0
     Else (feature 0 > 2712.5)
      If (feature 5 <= 1251.0)
       Predict: 5.0
      Else (f

In [11]:
import pandas as pd
pd.DataFrame(model.featureImportances.toArray(),
index=input_cols, columns=['importance']).\
sort_values(by="importance", ascending=False)

Unnamed: 0,importance
Elevation,0.839072
Soil_Type_3,0.035492
Soil_Type_1,0.030548
Hillshade_Noon,0.026184
Horizontal_Distance_To_Hydrology,0.022362
Soil_Type_31,0.017996
Wilderness_Area_2,0.015323
Horizontal_Distance_To_Roadways,0.004583
Soil_Type_2,0.003516
Hillshade_9am,0.002662


In [13]:
predictions = model.transform(assembled_train_data)
predictions.select("Cover_Type", "prediction", "probability").\
show(10, truncate = False)

+----------+----------+------------------------------------------------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                                                         |
+----------+----------+------------------------------------------------------------------------------------------------------------------------------------+
|6.0       |3.0       |[0.0,3.128519584532599E-5,0.06901514203478913,0.6083406332123639,0.020241521711925916,0.0017519709673382556,0.30061944687773745,0.0]|
|6.0       |4.0       |[0.0,0.0,0.005867014341590613,0.3011734028683181,0.5964797913950456,0.0,0.09647979139504563,0.0]                                    |
|6.0       |3.0       |[0.0,3.128519584532599E-5,0.06901514203478913,0.6083406332123639,0.020241521711925916,0.0017519709673382556,0.30061944687773745,0.0]|
|6.0       |3.0       |[0.0,3.128519584532599E-5,0.0690151

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type",
predictionCol="prediction")
evaluator.setMetricName("accuracy").evaluate(predictions)
evaluator.setMetricName("f1").evaluate(predictions)

0.6850828367432688

In [15]:
confusion_matrix = predictions.groupBy("Cover_Type").\
pivot("prediction", range(1,8)).count().\
na.fill(0.0).\
orderBy("Cover_Type")
confusion_matrix.show()

+----------+------+------+-----+----+---+---+-----+
|Cover_Type|     1|     2|    3|   4|  5|  6|    7|
+----------+------+------+-----+----+---+---+-----+
|       1.0|119446| 65801|  102|   0| 20|  7| 5333|
|       2.0| 42482|207574| 3663|   9|360| 57|  786|
|       3.0|     0|  3920|27696| 473| 22|125|    0|
|       4.0|     0|    96| 1205|1173|  0|  0|    0|
|       5.0|     0|  7776|  347|   0|444|  0|    0|
|       6.0|     0|  4511|10438| 148|  6|503|    0|
|       7.0|  7789|   316|    0|   0|  0|  0|10362|
+----------+------+------+-----+----+---+---+-----+



In [17]:
from pyspark.sql import DataFrame
def class_probabilities(data):
    total = data.count()
    return data.groupBy("Cover_Type").count().\
    orderBy("Cover_Type").\
    select(col("count").cast(DoubleType())).\
    withColumn("count_proportion", col("count")/total).\
    select("count_proportion").collect()
train_prior_probabilities = class_probabilities(train_data)
test_prior_probabilities = class_probabilities(test_data)
train_prior_probabilities

[Row(count_proportion=0.36465133176542575),
 Row(count_proportion=0.4874490908047955),
 Row(count_proportion=0.06163788982580929),
 Row(count_proportion=0.004730491978814127),
 Row(count_proportion=0.016380810340541885),
 Row(count_proportion=0.029839958699019103),
 Row(count_proportion=0.03531042658559437)]

In [18]:
train_prior_probabilities = [p[0] for p in train_prior_probabilities]
test_prior_probabilities = [p[0] for p in test_prior_probabilities]
sum([train_p * cv_p for train_p, cv_p in zip(train_prior_probabilities,
test_prior_probabilities)])

0.3773114942184577

In [19]:
from pyspark.ml import Pipeline
assembler = VectorAssembler(inputCols=input_cols, outputCol="featureVector")
classifier = DecisionTreeClassifier(seed=1234, labelCol="Cover_Type",
featuresCol="featureVector",
predictionCol="prediction")
pipeline = Pipeline(stages=[assembler, classifier])

In [20]:
from pyspark.ml.tuning import ParamGridBuilder
paramGrid = ParamGridBuilder(). \
addGrid(classifier.impurity, ["gini", "entropy"]). \
addGrid(classifier.maxDepth, [1, 20]). \
addGrid(classifier.maxBins, [40, 300]). \
addGrid(classifier.minInfoGain, [0.0, 0.05]). \
build()
multiclassEval = MulticlassClassificationEvaluator(). \
setLabelCol("Cover_Type"). \
setPredictionCol("prediction"). \
setMetricName("accuracy")

In [21]:
from pyspark.ml.tuning import TrainValidationSplit
validator = TrainValidationSplit(seed=1234,
estimator=pipeline,
evaluator=multiclassEval,
estimatorParamMaps=paramGrid,
trainRatio=0.9)
validator_model = validator.fit(train_data)

In [22]:
from pprint import pprint
best_model = validator_model.bestModel
pprint(best_model.stages[1].extractParamMap())

{Param(parent='DecisionTreeClassifier_f87d122ed8ec', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,
 Param(parent='DecisionTreeClassifier_f87d122ed8ec', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 20,
 Param(parent='DecisionTreeClassifier_f87d122ed8ec', name='featuresCol', doc='features column name.'): 'featureVector',
 Param(parent='DecisionTreeClassifier_f87d122ed8ec', name='labelCol', doc='label column name.'): 'Cover_Type',
 Param(parent='DecisionTreeClassifier_f87d122ed8ec', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split cause

In [None]:
validator_model = validator.fit(train_data)
metrics = validator_model.validationMetrics
params = validator_model.getEstimatorParamMaps()
metrics_and_params = list(zip(metrics, params))
metrics_and_params.sort(key=lambda x: x[0], reverse=True)
metrics_and_params

In [None]:
metrics.sort(reverse=True)
print(metrics[0])

In [None]:
multiclassEval.evaluate(best_model.transform(test_data))

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
def unencode_one_hot(data):
    wilderness_cols = ['Wilderness_Area_' + str(i) for i in range(4)]
    wilderness_assembler = VectorAssembler().\
                            setInputCols(wilderness_cols).\
                            setOutputCol("wilderness")
    unhot_udf = udf(lambda v: v.toArray().tolist().index(1))
    with_wilderness = wilderness_assembler.transform(data).\
      drop(*wilderness_cols).\
      withColumn("wilderness", unhot_udf(col("wilderness")).cast(IntegerType()))
    soil_cols = ['Soil_Type_' + str(i) for i in range(40)]
    soil_assembler = VectorAssembler().\
                    setInputCols(soil_cols).\
                    setOutputCol("soil")
    with_soil = soil_assembler.\
                transform(with_wilderness).\
                drop(*soil_cols).\
                withColumn("soil", unhot_udf(col("soil")).cast(IntegerType()))
    return with_soil

In [None]:
unenc_train_data = unencode_one_hot(train_data)
unenc_train_data.printSchema()

In [None]:
unenc_train_data.groupBy('wilderness').count().show()

In [None]:
from pyspark.ml.feature import VectorIndexer
cols = unenc_train_data.columns
input_cols = [c for c in cols if c!='Cover_Type']
assembler = VectorAssembler().setInputCols(input_cols).setOutputCol("featureVector")
indexer = VectorIndexer().\
setMaxCategories(40).\
setInputCol("featureVector").setOutputCol("indexedVector")
classifier = DecisionTreeClassifier().setLabelCol("Cover_Type").\
                                        setFeaturesCol("indexedVector").\
                                        setPredictionCol("prediction")
pipeline = Pipeline().setStages([assembler, indexer, classifier])

In [None]:
from pyspark.ml.classification import RandomForestClassifier
classifier = RandomForestClassifier(seed=1234, labelCol="Cover_Type",
featuresCol="indexedVector",predictionCol="prediction")

In [None]:
unenc_train_data.columns

In [None]:
######## LONGER TIME ##################################
cols = unenc_train_data.columns
input_cols = [c for c in cols if c!='Cover_Type']
assembler = VectorAssembler().setInputCols(input_cols).
,→setOutputCol("featureVector")
indexer = VectorIndexer().\
setMaxCategories(40).\
setInputCol("featureVector").setOutputCol("indexedVector")
pipeline = Pipeline().setStages([assembler, indexer, classifier])
paramGrid = ParamGridBuilder(). \
addGrid(classifier.impurity, ["gini", "entropy"]). \
addGrid(classifier.maxDepth, [1, 20]). \
addGrid(classifier.maxBins, [40, 300]). \
addGrid(classifier.minInfoGain, [0.0, 0.05]). \
build()
multiclassEval = MulticlassClassificationEvaluator(). \
setLabelCol("Cover_Type"). \
setPredictionCol("prediction"). \
setMetricName("accuracy")
validator = TrainValidationSplit(seed=1234,
estimator=pipeline,
evaluator=multiclassEval,
estimatorParamMaps=paramGrid,
trainRatio=0.9)
validator_model = validator.fit(unenc_train_data)
best_model = validator_model.bestModel

In [None]:
forest_model = best_model.stages[2]
feature_importance_list = list(zip(input_cols,
forest_model.featureImportances.toArray()))
feature_importance_list.sort(key=lambda x: x[1], reverse=True)
pprint(feature_importance_list)

In [None]:
unenc_test_data = unencode_one_hot(test_data)
best_model.transform(unenc_test_data.drop("Cover_Type")).\
select("prediction").show(1)