In [1]:
from pyspark.sql import SparkSession 

# Obtain the Spark session used by every cell below (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName('dog_food_spoild_pred')
    .getOrCreate()
)

22/06/06 22:48:23 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.8.100 instead (on interface wlp1s0)
22/06/06 22:48:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/06 22:48:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read the CSV with its first row as the header and let Spark infer column types.
df = spark.read.csv(
    'dog_food.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Show each column's name and the type Spark inferred while reading the CSV.
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [4]:
# Peek at a single row to see what the raw data looks like.
df.show(n=1)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 1 row



### How can we predict whether the food is spoiled or not?

- We can use a per-feature weight (a coefficient or importance score) that measures how much each feature contributes to the `Spoiled` column.
- With that we can understand the **importance** or **predictive power** of each feature.
- Tree-based classifiers expose an attribute called **`featureImportances`** that can help us spot the important features.

In [5]:
from pyspark.ml.feature import VectorAssembler 

In [6]:
# List the column names so the feature columns can be picked out for the assembler.
df.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [7]:
# Combine the four input columns A-D into one vector column called 'features'.
assembler = VectorAssembler(
    inputCols=['A', 'B', 'C', 'D'],
    outputCol='features',
)

In [8]:
# Run the assembler over df; the result carries the original columns plus 'features'.
output = assembler.transform(dataset=df)

In [9]:
# import Random Forest 

from pyspark.ml.classification import RandomForestClassifier 

In [10]:
# Random forest classifier that predicts 'Spoiled' from the assembled 'features' vector.
# Random forests are trained with randomized sampling, so the seed is pinned to keep the
# fitted model and its feature importances stable across Restart & Run All.
rfc = RandomForestClassifier(
    labelCol='Spoiled',
    featuresCol='features',
    seed=42,
)

In [12]:
# Sanity check: confirm the 'features' vector column was added next to the original columns.
output.show(n=1)

+---+---+----+---+-------+------------------+
|  A|  B|   C|  D|Spoiled|          features|
+---+---+----+---+-------+------------------+
|  4|  2|12.0|  3|    1.0|[4.0,2.0,12.0,3.0]|
+---+---+----+---+-------+------------------+
only showing top 1 row



In [13]:
# Keep only the two columns the classifier is configured to read:
# the feature vector and the 'Spoiled' label.

final_df = output.select(
    'features',
    'Spoiled',
)

In [14]:
# Confirm the training frame holds just the feature vector and the label.
final_df.show(n=1)

+------------------+-------+
|          features|Spoiled|
+------------------+-------+
|[4.0,2.0,12.0,3.0]|    1.0|
+------------------+-------+
only showing top 1 row



In [15]:
# Fit the random forest on the full (features, Spoiled) frame. No train/test split is
# made here because the fitted model is only inspected for its feature importances.

rfc_model = rfc.fit(dataset=final_df)

22/06/06 23:27:11 WARN BlockManager: Asked to remove block broadcast_25_piece0, which does not exist


In [16]:
# Inspect the fitted model's per-feature importances. Vector indices follow the
# assembler's inputCols order: 0 -> A, 1 -> B, 2 -> C, 3 -> D.

rfc_model.featureImportances

SparseVector(4, {0: 0.0239, 1: 0.016, 2: 0.937, 3: 0.023})

## Result:

Chemical C has an importance of 0.937, far higher than A, B, and D (each below 0.03), which means it is the most important feature for predicting spoilage.
