In [1]:
import findspark
findspark.init()

import pyspark
import random
from pathlib import Path
from pyspark.sql import SparkSession

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier,GBTClassifier,DecisionTreeClassifier)

In [3]:
# Reuse the active Spark session if there is one; otherwise start a new
# session registered under the application name 'tree3'.
spark = (
    SparkSession.builder
    .appName('tree3')
    .getOrCreate()
)

In [4]:
# Directory that holds dog_food.csv.
# Raw string: the Windows separators must stay literal backslashes. In a normal
# string literal, sequences such as "\P", "\e" and "\S" are invalid escapes that
# raise a warning today and are scheduled to become a syntax error.
# NOTE(review): this is a machine-specific absolute path; adjust it (or make it
# configurable) before running the notebook elsewhere.
path = Path(r'D:\Python environments\Pyspark_env\environments\Data\Python-and-Spark-for-Big-Data-master\Spark_for_Machine_Learning\Tree_Methods')

In [5]:
# Read the dog food dataset: the first row supplies the column names and
# Spark infers each column's type from the values.
csv_file = f'{path}/dog_food.csv'
data = spark.read.csv(csv_file, header=True, inferSchema=True)

In [7]:
# Preview the raw rows (20 rows, which is show()'s default row count).
data.show(n=20)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
# List the column names so the feature columns can be told apart from the
# 'Spoiled' label column.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [10]:
# Combine the four input columns into one vector column called 'features',
# which is the input format the tree classifiers consume.
feature_cols = ['A', 'B', 'C', 'D']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [11]:
# Apply the assembler: returns the original columns plus the new 'features'
# vector column.
output=assembler.transform(data)

In [12]:
# Check the assembled 'features' column next to the source columns
# (20 rows, which is show()'s default row count).
output.show(n=20)

+---+---+----+---+-------+-------------------+
|  A|  B|   C|  D|Spoiled|           features|
+---+---+----+---+-------+-------------------+
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
|  5|  6|12.0|  7|    1.0| [5.0,6.0,12.0,7.0]|
|  6|  2|13.0|  6|    1.0| [6.0,2.0,13.0,6.0]|
|  4|  2|12.0|  1|    1.0| [4.0,2.0,12.0,1.0]|
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
| 10|  3|13.0|  9|    1.0|[10.0,3.0,13.0,9.0]|
|  8|  5|14.0|  5|    1.0| [8.0,5.0,14.0,5.0]|
|  5|  8|12.0|  8|    1.0| [5.0,8.0,12.0,8.0]|
|  6|  5|12.0|  9|    1.0| [6.0,5.0,12.0,9.0]|
|  3|  3|12.0|  1|    1.0| [3.0,3.0,12.0,1.0]|
|  9|  8|11.0|  3|    1.0| [9.0,8.0,11.0,3.0]|
|  1| 10|12.0|  3|    1.0|[1.0,10.0,12.0,3.0]|
|  1|  5|13.0| 10|    1.0|[1.0,5.0,13.0,10.0]|
|  2| 10|12.0|  6|    1.0|[2.0,10.0,12.0,6.0]|
|  1| 10|11.0|  4|    1.0|[1.0,10.0,11.0,4.0]|
|  5|  3|12.0|  2|    1.0| [5.0,3.0,12.0,2.0]|
|  4|  9|11.0|  8|    1.0| [4.0,9.0,11.0,8.0]|
|  5|  1|11.0|  1|    1.0| [5.0,1.0,11.0,1.0]|
|  4|  9|12.0

In [13]:
from pyspark.ml.classification import (RandomForestClassifier,GBTClassifier,DecisionTreeClassifier)

In [15]:
# Keep only what the model needs: the feature vector and the label.
final_data = output.select(['features', 'Spoiled'])

In [16]:
# Split into roughly 70% training / 30% test rows.
# A fixed seed keeps the split (and therefore every metric computed below)
# repeatable across kernel restarts; without it each run samples differently.
# NOTE(review): repeatability also assumes the input rows arrive in the same
# order/partitioning on every run.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [17]:
# Random forest classifier that learns 'Spoiled' from the assembled
# 'features' vector. The seed is fixed so that training is repeatable
# between runs instead of depending on a per-session default.
rfc = RandomForestClassifier(
    featuresCol='features',
    labelCol='Spoiled',
    seed=42,
)

In [18]:
# Train the random forest on the training split only; the test split is kept
# aside for evaluation.
rfc_model=rfc.fit(train)

In [25]:
# Importance score for each of the 4 features in the fitted forest.
# In the recorded run index 2 dominates; presumably indices follow the order
# of the assembled vector (A, B, C, D), which would make that column C.
rfc_model.featureImportances

SparseVector(4, {0: 0.0199, 1: 0.0214, 2: 0.9321, 3: 0.0266})

In [19]:
# Score the held-out test split; the result keeps 'features' and 'Spoiled'
# and adds the rawPrediction, probability and prediction columns.
pred_and_labels=rfc_model.transform(test)

In [20]:
# Compare predictions with the true labels (20 rows, which is show()'s
# default row count).
pred_and_labels.show(n=20)

+-------------------+-------+--------------------+--------------------+----------+
|           features|Spoiled|       rawPrediction|         probability|prediction|
+-------------------+-------+--------------------+--------------------+----------+
| [1.0,1.0,12.0,4.0]|    1.0|[0.16666666666666...|[0.00833333333333...|       1.0|
|  [1.0,4.0,8.0,7.0]|    0.0|[19.9575552527878...|[0.99787776263939...|       0.0|
|  [1.0,4.0,9.0,3.0]|    0.0|[19.8299638160497...|[0.99149819080248...|       0.0|
| [1.0,5.0,8.0,10.0]|    0.0|[19.9627635861211...|[0.99813817930605...|       0.0|
|[1.0,5.0,12.0,10.0]|    1.0|[0.02777777777777...|[0.00138888888888...|       1.0|
|  [1.0,6.0,7.0,8.0]|    0.0|[19.9575552527878...|[0.99787776263939...|       0.0|
|  [1.0,6.0,8.0,3.0]|    0.0|[19.8299638160497...|[0.99149819080248...|       0.0|
| [1.0,7.0,11.0,9.0]|    1.0|[0.02777777777777...|[0.00138888888888...|       1.0|
|  [1.0,8.0,8.0,7.0]|    0.0|[19.9575552527878...|[0.99787776263939...|       0.0|
| [1

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [22]:
# Evaluator that reports accuracy by comparing the model's 'prediction'
# column with the true 'Spoiled' label.
my_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Spoiled',
    predictionCol='prediction',
)


In [24]:
# Accuracy of the random forest on the held-out test predictions.
my_eval.evaluate(pred_and_labels)

0.9523809523809523