In [1]:
from pyspark.sql import SparkSession
# Start the notebook's Spark session, reusing one if it already exists.
spark = (
    SparkSession.builder
    .appName('dt')
    .getOrCreate()
)

In [2]:
# Load every row and column of the `dog_food` table into a DataFrame.
df = spark.sql(
    'select * from dog_food'
)

In [3]:
# Print summary statistics for the columns of the raw data.
summary_stats = df.describe()
summary_stats.show()

In [4]:
# Show how many rows fall under each value of the 'Spoiled' label
# (used as the label column for the classifier further down).
spoiled_counts = df.groupBy('Spoiled').count()
spoiled_counts.show()

In [5]:
# Preview the first five rows of the raw data.
df.show(n=5)

In [6]:
from pyspark.ml.feature import VectorAssembler
# Pack the four chemical columns into a single vector column named
# 'features', then keep only that vector and the 'Spoiled' label.
feature_columns = ['A', 'B', 'C', 'D']
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)
output = assembler.transform(df)
final_data = output.select('features', 'Spoiled')

In [7]:
# Split the assembled data roughly 70/30 into training and hold-out sets.
# A fixed seed (the same value the CrossValidator uses) keeps the split
# stable between runs so the reported metrics can be reproduced.
train, test = final_data.randomSplit([0.7, 0.3], seed=100)

In [8]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [9]:
# Grid-search cross-validation setup.
# The forest is given an explicit seed (matching the CrossValidator's) so that
# tree construction, and therefore the search result, is repeatable across runs.
rfc = RandomForestClassifier(
    featuresCol='features',
    labelCol='Spoiled',
    predictionCol='prediction',
    seed=100,
)

# Candidate forest sizes to compare.
grid = ParamGridBuilder().addGrid(rfc.numTrees, [100, 150, 200]).build()

# Each candidate is scored by F1 on the 'Spoiled' label.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='Spoiled',
    metricName='f1',
)

# 5-fold cross-validation over the grid, fitting up to two models in parallel.
cv = CrossValidator(
    estimator=rfc,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=5,
    seed=100,
    parallelism=2,
)

In [10]:
# Run the cross-validated grid search on the training split.
rfc_cv_model = cv.fit(dataset=train)

In [11]:
# Apply the fitted cross-validation model to the hold-out split.
pred = rfc_cv_model.transform(dataset=test)

In [12]:
# Preview the first five rows of the hold-out predictions.
pred.show(n=5)

In [13]:
# Score the hold-out predictions with the same F1 evaluator used during
# cross-validation, then display the value as the cell output.
eval_f1 = evaluator.evaluate(dataset=pred)
eval_f1

In [14]:
# The fitted CrossValidatorModel does not expose featureImportances directly,
# but its `bestModel` attribute holds the winning random forest, which does.
# Reusing it avoids training a second forest with a hand-picked numTrees and
# guarantees the importances describe the model that was actually selected.
model_rfc = rfc_cv_model.bestModel

In [15]:
# Display the forest's per-feature importance vector; positions follow the
# order of the columns assembled into 'features' ('A', 'B', 'C', 'D').
feature_importances = model_rfc.featureImportances
feature_importances

In [16]:
# The feature importance for chemical 'C' is high, so 'C' appears to be what is spoiling the dog food.