# PySpark Cookbook

### Tomasz Drabas, Denny Lee
#### Version: 0.1
#### Date: 3/10/2018

# Loading the data

In [1]:
import pyspark.sql.functions as func
forest_path = '../data/forest_coverage_type.csv'

forest = spark.read.csv(
    forest_path
    , header=True
    , inferSchema=True
)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,,pyspark,idle,,,✔


SparkSession available as 'spark'.


In [2]:
forest.printSchema()

root
 |-- Elevation: integer (nullable = true)
 |-- Aspect: integer (nullable = true)
 |-- Slope: integer (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: integer (nullable = true)
 |-- Vertical_Distance_To_Hydrology: integer (nullable = true)
 |-- Horizontal_Distance_To_Roadways: integer (nullable = true)
 |-- Hillshade_9am: integer (nullable = true)
 |-- Hillshade_Noon: integer (nullable = true)
 |-- Hillshade_3pm: integer (nullable = true)
 |-- Horizontal_Distance_To_Fire_Points: integer (nullable = true)
 |-- Wilderness_Area_Rawah: integer (nullable = true)
 |-- Wilderness_Area_Neota: integer (nullable = true)
 |-- Wilderness_Area_Comanche: integer (nullable = true)
 |-- Wilderness_Area_CacheLaPoudre: integer (nullable = true)
 |-- Soil_type_2702: integer (nullable = true)
 |-- Soil_type_2703: integer (nullable = true)
 |-- Soil_type_2704: integer (nullable = true)
 |-- Soil_type_2705: integer (nullable = true)
 |-- Soil_type_2706: integer (nullable = true)
 |-- Soil_type_2

# Introducing Transformers

List of most popular **Transformers**
* Binarizer
* Bucketizer
* ChiSqSelector
* CountVectorizer
* DCT
* ElementwiseProduct
* HashingTF
* IDF
* IndexToString
* MaxAbsScaler
* MinMaxScaler
* NGram
* Normalizer
* OneHotEncoder
* PCA
* PolynomialExpansion
* QuantileDiscretizer
* RegexTokenizer
* RFormula
* SQLTransformer
* StandardScaler
* StopWordsRemover
* StringIndexer
* Tokenizer
* VectorAssembler
* VectorIndexer
* VectorSlicer
* Word2Vec

In [3]:
import pyspark.ml.feature as feat
import pyspark.sql.functions as f
import numpy as np

buckets_no = 10

dist_min_max = (
    forest.agg(
        f.min('Horizontal_Distance_To_Hydrology').alias('min'), 
        f.max('Horizontal_Distance_To_Hydrology').alias('max')
    )
    .rdd
    .map(lambda row: (row.min, row.max))
    .collect()[0]
)

rng = dist_min_max[1] - dist_min_max[0]

splits = list(np.arange(dist_min_max[0], dist_min_max[1], rng / (buckets_no + 1)))

bucketizer = feat.Bucketizer(
    splits=splits
    , inputCol='Horizontal_Distance_To_Hydrology'
    , outputCol='Horizontal_Distance_To_Hydrology_Bkt'
)

bucketizer.transform(forest).select('Horizontal_Distance_To_Hydrology','Horizontal_Distance_To_Hydrology_Bkt').show(5)

+--------------------------------+------------------------------------+
|Horizontal_Distance_To_Hydrology|Horizontal_Distance_To_Hydrology_Bkt|
+--------------------------------+------------------------------------+
|                             258|                                 2.0|
|                             212|                                 1.0|
|                             268|                                 2.0|
|                             242|                                 1.0|
|                             153|                                 1.0|
+--------------------------------+------------------------------------+
only showing top 5 rows

In [10]:
vectorAssembler = feat.VectorAssembler(inputCols=forest.columns, outputCol='feat')
pca = feat.PCA(k=5, inputCol=vectorAssembler.getOutputCol(), outputCol='pca_feat')
pca.fit(vectorAssembler.transform(forest)).transform(forest).show(5)

'Field "feat" does not exist.'
Traceback (most recent call last):
  File "/opt/spark/python/lib/pyspark.zip/pyspark/ml/base.py", line 105, in transform
    return self._transform(dataset)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/ml/wrapper.py", line 281, in _transform
    return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
  File "/opt/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 79, in deco
    raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.IllegalArgumentException: 'Field "feat" does not exist.'



# Introducing Estimators

List of most popular **Estimators**
1. Classification
 * LinearSVC
 * LogisticRegression 
 * DecisionTreeClassifier
 * GBTClassifier
 * RandomForestClassifier
 * NaiveBayes
 * MultilayerPerceptronClassifier
 * OneVsRest
2. Regression
 * AFTSurvivalRegression
 * DecisionTreeRegressor
 * GBTRegressor
 * GeneralizedLinearRegression
 * IsotonicRegression
 * LinearRegression
 * RandomForestRegressor
3. Clustering
 * BisectingKMeans
 * Kmeans
 * GaussianMixture
 * LDA

In [11]:
forest.select('CoverType').groupBy('CoverType').count().show()

+---------+------+
|CoverType| count|
+---------+------+
|        1|211840|
|        6| 17367|
|        3| 35754|
|        5|  9493|
|        4|  2747|
|        7| 20510|
|        2|283301|
+---------+------+

In [12]:
import pyspark.ml.classification as cl

vectorAssembler = feat.VectorAssembler(inputCols=forest.columns[0:-1], outputCol='features')

fir_dataset = (
    vectorAssembler
    .transform(forest)
    .withColumn('label', (f.col('CoverType') == 1).cast('integer'))
    .select('label', 'features')
)

svc_obj = cl.LinearSVC(maxIter=10, regParam=0.01)
svc_model = svc_obj.fit(fir_dataset)

svc_model.coefficients

DenseVector([-0.0001, -0.0, -0.0023, -0.0, -0.0001, 0.0, -0.001, -0.0017, -0.0003, -0.0, 0.0, 0.0401, -0.0071, -0.0958, -0.0901, -0.0653, -0.0655, -0.0437, -0.0928, -0.0848, -0.0211, -0.0045, -0.0498, -0.0829, -0.0522, -0.0325, -0.0263, -0.0923, -0.0889, -0.0275, -0.0606, -0.0595, 0.0341, -0.003, 0.0822, 0.0607, 0.0351, 0.0093, 0.0048, -0.0154, 0.0422, -0.0673, -0.0039, -0.0142, 0.0036, 0.0078, 0.0, -0.0117, 0.0283, -0.0002, -0.0463, 0.0394, 0.0292, 0.0358])

In [13]:
import pyspark.ml.regression as rg

vectorAssembler = feat.VectorAssembler(inputCols=forest.columns[1:], outputCol='features')

elevation_dataset = (
    vectorAssembler
    .transform(forest)
    .withColumn('label', f.col('Elevation').cast('float'))
    .select('label', 'features')
)
    
lr_obj = rg.LinearRegression(maxIter=10, regParam=0.01, elasticNetParam=1.00)
lr_model = lr_obj.fit(elevation_dataset)

lr_model.coefficients

DenseVector([0.0309, 0.6522, 0.1911, 0.1424, 0.0342, 0.7402, 1.053, -0.0017, -0.0041, 2.7163, 189.0362, 27.8238, -265.8505, -407.4379, -346.0612, -364.3841, -302.6788, -400.5852, -212.9918, -126.1329, -117.7423, -312.0478, -248.7118, -221.4788, -155.1459, -84.5129, -398.0433, -387.8102, -179.4485, -261.3875, -337.7875, 48.0629, -94.7813, 149.8043, 135.144, 80.0901, 64.3659, 124.0233, -115.0126, 119.1285, -181.7498, 10.8056, -42.7849, 65.5441, 102.2562, 36.9865, -48.1163, 379.2091, 256.0169, 497.1714, 313.0607, 337.172, 397.0758, -14.4551])

In [14]:
summary = lr_model.summary

summary.r2, summary.rootMeanSquaredError, summary.meanAbsoluteError

(0.7860412464754236, 129.50871925702438, 103.34079732698483)

# Introducing Pipelines

In [15]:
from pyspark.ml import Pipeline

vectorAssembler = feat.VectorAssembler(inputCols=forest.columns[1:], outputCol='features')
lr_obj = rg.GeneralizedLinearRegression(
    labelCol='Elevation'
    , maxIter=10
    , regParam=0.01
    , link='identity'
    , linkPredictionCol="p"
)

pip = Pipeline(stages=[vectorAssembler, lr_obj])

pip.fit(forest).transform(forest).select('Elevation', 'prediction').show(5)

+---------+------------------+
|Elevation|        prediction|
+---------+------------------+
|     2596|2840.7801831411316|
|     2590|2828.7464246669683|
|     2804| 2842.761272955131|
|     2785| 2966.057500325109|
|     2595|2817.1687155114637|
+---------+------------------+
only showing top 5 rows

# Selecting the most predictable features

In [20]:
vectorAssembler = feat.VectorAssembler(inputCols=forest.columns[0:-1], outputCol='features')
selector = feat.ChiSqSelector(labelCol='CoverType', numTopFeatures=10, outputCol='selected')
selector.fit(vectorAssembler.transform(forest)).transform(vectorAssembler.transform(forest)).select('selected').show(5)

+--------------------+
|            selected|
+--------------------+
|(10,[0,1,2,3,5,6,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 5 rows

# Predicting forest coverage type

In [21]:
forest_train, forest_test = forest.randomSplit([0.7, 0.3], seed=666)

vectorAssembler = feat.VectorAssembler(inputCols=forest.columns[0:-1], outputCol='features')
selector = feat.ChiSqSelector(labelCol='CoverType', numTopFeatures=10, outputCol='selected')
logReg_obj = cl.LogisticRegression(
    labelCol='CoverType'
    , featuresCol=selector.getOutputCol()
    , regParam=0.01
    , elasticNetParam=1.0
    , family='multinomial'
)

pipeline = Pipeline(stages=[vectorAssembler, selector, logReg_obj])
pModel = pipeline.fit(forest_train)

In [22]:
results_logReg = pModel.transform(forest_test).select('CoverType', 'probability', 'prediction')
results_logReg.show(5)

+---------+--------------------+----------+
|CoverType|         probability|prediction|
+---------+--------------------+----------+
|        3|[8.67479362381023...|       3.0|
|        3|[9.19887826242121...|       3.0|
|        6|[9.00292703965869...|       3.0|
|        6|[1.06415417198863...|       3.0|
|        6|[1.04354579793880...|       3.0|
+---------+--------------------+----------+
only showing top 5 rows

In [23]:
import pyspark.ml.evaluation as ev

evaluator = ev.MulticlassClassificationEvaluator(
    predictionCol='prediction'
    , labelCol='CoverType')

(
    evaluator.evaluate(results_logReg)
    , evaluator.evaluate(results_logReg, {evaluator.metricName: 'weightedPrecision'})
    , evaluator.evaluate(results_logReg, {evaluator.metricName: 'accuracy'})
)

(0.6007121761095855, 0.590858268546363, 0.6343096137892893)

In [24]:
rf_obj = cl.RandomForestClassifier(
    labelCol='CoverType'
    , featuresCol=selector.getOutputCol()
    , minInstancesPerNode=10
    , numTrees=10
)

pipeline = Pipeline(stages=[vectorAssembler, selector, rf_obj])
pModel = pipeline.fit(forest_train)

In [None]:
results_rf = pModel.transform(forest_test).select('CoverType', 'probability', 'prediction')
results_rf.show(5)

+---------+--------------------+----------+
|CoverType|         probability|prediction|
+---------+--------------------+----------+
|        3|[0.0,0.0180630802...|       3.0|
|        3|[0.0,0.0180630802...|       3.0|
|        6|[0.0,0.0180630802...|       3.0|
|        6|[0.0,0.0077261652...|       3.0|
|        6|[0.0,0.0077261652...|       3.0|
+---------+--------------------+----------+
only showing top 5 rows

In [None]:
(
    evaluator.evaluate(results_rf)
    , evaluator.evaluate(results_rf, {evaluator.metricName: 'weightedPrecision'})
    , evaluator.evaluate(results_rf, {evaluator.metricName: 'accuracy'})
)

# Estimating forest elevation

# Clustering forest cover type

# Tuning hyper parameters

# Calculating performance characteristics

# Extracting features

# Discretizing continuous variables

# Standardizing continuous variables

# Topic mining