###### Use findspark to tell Python where the local Spark installation is located

In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook runs on machines other than the author's; fall back to the original local
# install path when the variable is unset or empty.
spark_home = os.environ.get('SPARK_HOME') or (
    r'C:\Users\q1011812\Downloads\spark-3.0.1-bin-hadoop3.2\spark-3.0.1-bin-hadoop3.2'
)
findspark.init(spark_home)

In [2]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('Trees-project')
    .getOrCreate()
)

In [4]:
# Read the CSV, taking column names from the header row and asking Spark to
# infer each column's type, then print the resulting schema.
dataset = spark.read.csv(
    'dog_food.csv',
    header=True,
    inferSchema=True,
)
dataset.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [5]:
# Preview the first four rows of the loaded data.
dataset.show(n=4)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
+---+---+----+---+-------+
only showing top 4 rows



In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier,
                                       GBTClassifier,
                                       DecisionTreeClassifier)
from pyspark.ml.feature import VectorAssembler

In [7]:
# Assembler that packs the four input columns into one vector column, 'features'.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol='features',
)

In [8]:
# Apply the assembler and rebind `dataset` to the result (the original frame plus 'features').
# NOTE(review): because `dataset` is overwritten, re-running this cell presumably fails once
# the 'features' column already exists — confirm; a new variable name would avoid that.
dataset = assembler.transform(dataset)

In [9]:
# Show the column names after the assembler step.
dataset.columns

['A', 'B', 'C', 'D', 'Spoiled', 'features']

In [10]:
# Keep only the assembled feature vector and the label column for modelling.
dataset = dataset.select('features', 'Spoiled')

# Random Forest Classifier

In [11]:
# Random forest of 100 trees that predicts the 'Spoiled' label from the 'features' vector.
# A fixed seed makes the forest's random sampling, and therefore the metrics and
# feature importances reported below, reproducible across kernel restarts.
rfc = RandomForestClassifier(
    labelCol='Spoiled',
    featuresCol='features',
    numTrees=100,
    seed=42,
)

In [12]:
# Fit the random forest on the full dataset (no train/test split is made in this notebook).
rfc_model = rfc.fit(dataset)

In [13]:
# Score the same dataset the model was fitted on, so any metric computed from
# `rfc_preds` is a training-set metric rather than a held-out estimate.
rfc_preds = rfc_model.transform(dataset)

In [14]:
# Preview the first five scored rows.
rfc_preds.show(n=5)

+------------------+-------+--------------------+--------------------+----------+
|          features|Spoiled|       rawPrediction|         probability|prediction|
+------------------+-------+--------------------+--------------------+----------+
|[4.0,2.0,12.0,3.0]|    1.0|[2.31102149850368...|[0.02311021498503...|       1.0|
|[5.0,6.0,12.0,7.0]|    1.0|[2.92764645663940...|[0.02927646456639...|       1.0|
|[6.0,2.0,13.0,6.0]|    1.0|[3.01068912018369...|[0.03010689120183...|       1.0|
|[4.0,2.0,12.0,1.0]|    1.0|[2.54748274967081...|[0.02547482749670...|       1.0|
|[4.0,2.0,12.0,3.0]|    1.0|[2.31102149850368...|[0.02311021498503...|       1.0|
+------------------+-------+--------------------+--------------------+----------+
only showing top 5 rows



In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Binary evaluator scored against the 'Spoiled' label; no metricName is passed,
# so the evaluator's default metric is used.
bin_eval = BinaryClassificationEvaluator(
    labelCol='Spoiled',
)

In [16]:
# Report the binary evaluator's score for the random forest predictions.
print("RFC")
print(bin_eval.evaluate(dataset=rfc_preds))

RFC
0.9993061224489797


In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator that reports plain accuracy against the 'Spoiled' label.
acc_eval = MulticlassClassificationEvaluator(
    labelCol='Spoiled',
    metricName='accuracy',
)

In [18]:
# Report the accuracy of the random forest predictions.
print("RFC")
print(acc_eval.evaluate(dataset=rfc_preds))

RFC
0.9857142857142858


In [19]:
# Display the fitted forest's per-feature importances; indices follow the
# assembler's inputCols order (0='A', 1='B', 2='C', 3='D').
rfc_model.featureImportances

SparseVector(4, {0: 0.023, 1: 0.0215, 2: 0.9319, 3: 0.0236})

### Conclusion: Feature C has by far the highest importance (about 0.93), so it is the feature most likely causing the spoilage