# Import libraries

In [1]:
from pyspark.sql import SparkSession
# Reuse an existing session if one is running; otherwise start one for this app.
spark = (
    SparkSession.builder
    .appName("Tree Model")
    .getOrCreate()
)

In [2]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import VectorAssembler, VectorIndexer

# Load and verify data

In [3]:
# First line of the CSV is the header; let Spark infer the column types.
data = spark.read.csv(
    'dog_food.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Show the column names and the types Spark inferred from the CSV.
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: integer (nullable = true)



In [5]:
# Peek at the first three rows (returned as a list of Row objects).
data.head(3)

[Row(A=4, B=2, C=12.0, D=3, Spoiled=1),
 Row(A=5, B=6, C=12.0, D=7, Spoiled=1),
 Row(A=6, B=2, C=13.0, D=6, Spoiled=1)]

In [6]:
# Print each field of the first row on its own line.
first_row = data.head(1)[0]
for item in first_row:
    print(item)

4
2
12.0
3
1


In [7]:
# Summary statistics (count, mean, stddev, min, max) for every column.
data.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                  0|
|    max|                10|                10|              14.0|                10|                  1|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [8]:
# List the column names: 'A'-'D' are assembled as features below and
# 'Spoiled' is used as the label.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

# Data Preprocessing

In [9]:
# Pack the four input columns into a single vector column named 'features',
# which is the column the classifiers below read via featuresCol.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol='features',
)
output = assembler.transform(data)

In [10]:
# Keep only what the classifiers consume: the assembled vector and the label.
final_data = output.select(['features', 'Spoiled'])

# Build Models and Check Feature Importances

## DecisionTreeClassifier

In [12]:
# Fit a single decision tree on the full dataset; the goal here is to inspect
# which feature drives the 'Spoiled' label, not to evaluate predictions.
classifier = DecisionTreeClassifier(
    labelCol='Spoiled',
    featuresCol='features',
)
model = classifier.fit(final_data)

# Importance per feature index (0..3 correspond to columns A..D).
model.featureImportances

SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})

## RandomForestClassifier

In [13]:
# Fit a 150-tree random forest on the full dataset to inspect feature
# importances. A random forest draws bootstrap samples and random feature
# subsets, so the seed is pinned to keep the reported importances repeatable
# under Restart & Run All.
classifier = RandomForestClassifier(
    numTrees=150,
    labelCol='Spoiled',
    featuresCol='features',
    seed=42,
)
model = classifier.fit(final_data)

# Importance per feature index (0..3 correspond to columns A..D).
model.featureImportances

SparseVector(4, {0: 0.0196, 1: 0.0202, 2: 0.9385, 3: 0.0217})

## GBTClassifier

In [14]:
# Fit gradient-boosted trees on the full dataset, again only to compare the
# feature importances against the other tree models.
classifier = GBTClassifier(
    labelCol='Spoiled',
    featuresCol='features',
)
model = classifier.fit(final_data)

# Importance per feature index (0..3 correspond to columns A..D).
model.featureImportances

SparseVector(4, {0: 0.0296, 1: 0.0383, 2: 0.8286, 3: 0.1034})