In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('dogFoodTree').getOrCreate()

## Import data

In [2]:
data = spark.read.csv('dog_food.csv', inferSchema=True, header=True)
data.printSchema()
print('Number of samples:', data.count())
data.show(5)

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)

Number of samples: 490
+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 5 rows



In [3]:
# note 'filter' and 'where' are aliases
# also note that both syntax styles (sql and python dataframe) work

print('Number of non-spoiled samples:', data.filter('Spoiled=0').count())
data.filter('Spoiled=0').show(5)

print('-'*50)

print('Number of spoiled samples:', data.where(data['Spoiled']==1).count())
data.where(data['Spoiled']==1).show(5)

Number of non-spoiled samples: 350
+---+---+---+---+-------+
|  A|  B|  C|  D|Spoiled|
+---+---+---+---+-------+
|  4|  2|8.0|  9|    0.0|
|  4|  8|9.0|  1|    0.0|
| 10|  8|8.0|  6|    0.0|
|  8|  6|9.0|  4|    0.0|
|  7|  2|7.0|  8|    0.0|
+---+---+---+---+-------+
only showing top 5 rows

--------------------------------------------------
Number of spoiled samples: 140
+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 5 rows



## Prepare data to build model

In [4]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['A','B','C','D'], outputCol='features')

In [5]:
data_feed = assembler.transform(data).select(['features','Spoiled'])
data_feed.printSchema()
data_feed.show(5)

root
 |-- features: vector (nullable = true)
 |-- Spoiled: double (nullable = true)

+------------------+-------+
|          features|Spoiled|
+------------------+-------+
|[4.0,2.0,12.0,3.0]|    1.0|
|[5.0,6.0,12.0,7.0]|    1.0|
|[6.0,2.0,13.0,6.0]|    1.0|
|[4.0,2.0,12.0,1.0]|    1.0|
|[4.0,2.0,12.0,3.0]|    1.0|
+------------------+-------+
only showing top 5 rows



## Build some models

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

In [17]:
dtc = DecisionTreeClassifier(labelCol='Spoiled', maxDepth=5)
rfc = RandomForestClassifier(labelCol='Spoiled', maxDepth=5, numTrees=50)

In [18]:
dtc_model = dtc.fit(data_feed)
rfc_model = rfc.fit(data_feed)

In [19]:
dtc_model.featureImportances

SparseVector(4, {0: 0.0026, 1: 0.0089, 2: 0.9686, 3: 0.0199})

In [20]:
rfc_model.featureImportances

SparseVector(4, {0: 0.0224, 1: 0.0276, 2: 0.9243, 3: 0.0257})

## Check correlation to confirm (column C should show the highest correlation with 'Spoiled' according to 'featureImportances'

In [21]:
for col in ['A', 'B', 'C', 'D']:
    print('Covarinace between %s and %s: %.4f:' % (col, 'Spoiled', data.corr(col,'Spoiled')))

Covarinace between A and Spoiled: 0.0600:
Covarinace between B and Spoiled: -0.0865:
Covarinace between C and Spoiled: 0.8586:
Covarinace between D and Spoiled: -0.0161:


## Try fit and evaluate and test_data

In [22]:
dtc_results = dtc_model.transform(data_feed)
rfc_results = rfc_model.transform(data_feed)

In [23]:
print('DTC sample results: ')
dtc_results.show(5)

print('-'*50)

print('RFC sample results: ')
rfc_results.show(5)

DTC sample results: 
+------------------+-------+-------------+-----------+----------+
|          features|Spoiled|rawPrediction|probability|prediction|
+------------------+-------+-------------+-----------+----------+
|[4.0,2.0,12.0,3.0]|    1.0|   [0.0,94.0]|  [0.0,1.0]|       1.0|
|[5.0,6.0,12.0,7.0]|    1.0|   [0.0,94.0]|  [0.0,1.0]|       1.0|
|[6.0,2.0,13.0,6.0]|    1.0|   [0.0,94.0]|  [0.0,1.0]|       1.0|
|[4.0,2.0,12.0,1.0]|    1.0|   [0.0,94.0]|  [0.0,1.0]|       1.0|
|[4.0,2.0,12.0,3.0]|    1.0|   [0.0,94.0]|  [0.0,1.0]|       1.0|
+------------------+-------+-------------+-----------+----------+
only showing top 5 rows

--------------------------------------------------
RFC sample results: 
+------------------+-------+--------------------+--------------------+----------+
|          features|Spoiled|       rawPrediction|         probability|prediction|
+------------------+-------+--------------------+--------------------+----------+
|[4.0,2.0,12.0,3.0]|    1.0|[0.82040440179

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [25]:
acc_eval = MulticlassClassificationEvaluator(labelCol='Spoiled', metricName='accuracy')
f1_eval = MulticlassClassificationEvaluator(labelCol='Spoiled', metricName='f1')

In [26]:
print('DTC model accuracy:', acc_eval.evaluate(dtc_results))
print('DTC model F1-score:', f1_eval.evaluate(dtc_results))

print('-'*50)

print('RFC model accuracy:', acc_eval.evaluate(rfc_results))
print('RFC model F1-score:', f1_eval.evaluate(rfc_results))

DTC model accuracy: 0.9857142857142858
DTC model F1-score: 0.9856350741457125
--------------------------------------------------
RFC model accuracy: 0.9877551020408163
RFC model F1-score: 0.987701157538114


## DTC already gives good accuracy and F1-score --> add more tree does not really help

## No hyperparameter tuning of RFC

## Evaluate only train_data cannot show if the model overfits the data