In [1]:
import findspark
findspark.init(findspark.find())

In [2]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Predict Iris Species").getOrCreate()

22/07/26 21:10:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
df = spark.read.csv("IRIS.csv",inferSchema=True,header=True)

                                                                                

In [4]:
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [6]:
from pyspark.sql.functions import col
features = df.drop('species')
features = features.select(*(col(c).cast('float').alias(c) for c in features.columns))
features.printSchema()

root
 |-- sepal_length: float (nullable = true)
 |-- sepal_width: float (nullable = true)
 |-- petal_length: float (nullable = true)
 |-- petal_width: float (nullable = true)



In [7]:
from pyspark.sql.functions import col, count, isnan, when
features.select([count(when(col(c).isNull(), c)).alias(c) for c in features.columns]).show()

+------------+-----------+------------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|
+------------+-----------+------------+-----------+
|           0|          0|           0|          0|
+------------+-----------+------------+-----------+



In [8]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
#let's assemble our features together using vectorAssembler
assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol="features")
output = assembler.transform(df).select('features','species')
output.show()

+-----------------+-----------+
|         features|    species|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
|[5.4,3.9,1.7,0.4]|Iris-setosa|
|[4.6,3.4,1.4,0.3]|Iris-setosa|
|[5.0,3.4,1.5,0.2]|Iris-setosa|
|[4.4,2.9,1.4,0.2]|Iris-setosa|
|[4.9,3.1,1.5,0.1]|Iris-setosa|
|[5.4,3.7,1.5,0.2]|Iris-setosa|
|[4.8,3.4,1.6,0.2]|Iris-setosa|
|[4.8,3.0,1.4,0.1]|Iris-setosa|
|[4.3,3.0,1.1,0.1]|Iris-setosa|
|[5.8,4.0,1.2,0.2]|Iris-setosa|
|[5.7,4.4,1.5,0.4]|Iris-setosa|
|[5.4,3.9,1.3,0.4]|Iris-setosa|
|[5.1,3.5,1.4,0.3]|Iris-setosa|
|[5.7,3.8,1.7,0.3]|Iris-setosa|
|[5.1,3.8,1.5,0.3]|Iris-setosa|
+-----------------+-----------+
only showing top 20 rows



In [9]:
label_stringIdx = StringIndexer(inputCol = 'species', outputCol = 'labelIndex')
df = label_stringIdx.fit(output).transform(output)
df.show()

+-----------------+-----------+----------+
|         features|    species|labelIndex|
+-----------------+-----------+----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|       0.0|
|[4.9,3.0,1.4,0.2]|Iris-setosa|       0.0|
|[4.7,3.2,1.3,0.2]|Iris-setosa|       0.0|
|[4.6,3.1,1.5,0.2]|Iris-setosa|       0.0|
|[5.0,3.6,1.4,0.2]|Iris-setosa|       0.0|
|[5.4,3.9,1.7,0.4]|Iris-setosa|       0.0|
|[4.6,3.4,1.4,0.3]|Iris-setosa|       0.0|
|[5.0,3.4,1.5,0.2]|Iris-setosa|       0.0|
|[4.4,2.9,1.4,0.2]|Iris-setosa|       0.0|
|[4.9,3.1,1.5,0.1]|Iris-setosa|       0.0|
|[5.4,3.7,1.5,0.2]|Iris-setosa|       0.0|
|[4.8,3.4,1.6,0.2]|Iris-setosa|       0.0|
|[4.8,3.0,1.4,0.1]|Iris-setosa|       0.0|
|[4.3,3.0,1.1,0.1]|Iris-setosa|       0.0|
|[5.8,4.0,1.2,0.2]|Iris-setosa|       0.0|
|[5.7,4.4,1.5,0.4]|Iris-setosa|       0.0|
|[5.4,3.9,1.3,0.4]|Iris-setosa|       0.0|
|[5.1,3.5,1.4,0.3]|Iris-setosa|       0.0|
|[5.7,3.8,1.7,0.3]|Iris-setosa|       0.0|
|[5.1,3.8,1.5,0.3]|Iris-setosa|       0.0|
+----------

In [10]:
train,test = df.randomSplit([0.7, 0.3])
train.show()
test.show()

+-----------------+---------------+----------+
|         features|        species|labelIndex|
+-----------------+---------------+----------+
|[4.4,2.9,1.4,0.2]|    Iris-setosa|       0.0|
|[4.4,3.0,1.3,0.2]|    Iris-setosa|       0.0|
|[4.4,3.2,1.3,0.2]|    Iris-setosa|       0.0|
|[4.5,2.3,1.3,0.3]|    Iris-setosa|       0.0|
|[4.6,3.2,1.4,0.2]|    Iris-setosa|       0.0|
|[4.6,3.6,1.0,0.2]|    Iris-setosa|       0.0|
|[4.7,3.2,1.6,0.2]|    Iris-setosa|       0.0|
|[4.8,3.0,1.4,0.3]|    Iris-setosa|       0.0|
|[4.9,2.5,4.5,1.7]| Iris-virginica|       2.0|
|[4.9,3.0,1.4,0.2]|    Iris-setosa|       0.0|
|[4.9,3.1,1.5,0.1]|    Iris-setosa|       0.0|
|[4.9,3.1,1.5,0.1]|    Iris-setosa|       0.0|
|[5.0,2.0,3.5,1.0]|Iris-versicolor|       1.0|
|[5.0,2.3,3.3,1.0]|Iris-versicolor|       1.0|
|[5.0,3.2,1.2,0.2]|    Iris-setosa|       0.0|
|[5.0,3.3,1.4,0.2]|    Iris-setosa|       0.0|
|[5.0,3.4,1.6,0.4]|    Iris-setosa|       0.0|
|[5.0,3.5,1.3,0.3]|    Iris-setosa|       0.0|
|[5.0,3.5,1.6

In [11]:
from pyspark.ml.classification import DecisionTreeClassifier


rf = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'labelIndex')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)
predictions.select('features','labelIndex', 'rawPrediction', 'prediction', 'probability').show(500)

+-----------------+----------+--------------+----------+--------------------+
|         features|labelIndex| rawPrediction|prediction|         probability|
+-----------------+----------+--------------+----------+--------------------+
|[4.3,3.0,1.1,0.1]|       0.0|[27.0,0.0,0.0]|       0.0|       [1.0,0.0,0.0]|
|[4.6,3.1,1.5,0.2]|       0.0|[27.0,0.0,0.0]|       0.0|       [1.0,0.0,0.0]|
|[4.6,3.4,1.4,0.3]|       0.0|[27.0,0.0,0.0]|       0.0|       [1.0,0.0,0.0]|
|[4.7,3.2,1.3,0.2]|       0.0|[27.0,0.0,0.0]|       0.0|       [1.0,0.0,0.0]|
|[4.8,3.0,1.4,0.1]|       0.0|[27.0,0.0,0.0]|       0.0|       [1.0,0.0,0.0]|
|[4.8,3.1,1.6,0.2]|       0.0|[27.0,0.0,0.0]|       0.0|       [1.0,0.0,0.0]|
|[4.8,3.4,1.6,0.2]|       0.0|[27.0,0.0,0.0]|       0.0|       [1.0,0.0,0.0]|
|[4.8,3.4,1.9,0.2]|       0.0|[27.0,0.0,0.0]|       0.0|       [1.0,0.0,0.0]|
|[4.9,2.4,3.3,1.0]|       1.0| [0.0,0.0,1.0]|       2.0|       [0.0,0.0,1.0]|
|[4.9,3.1,1.5,0.1]|       0.0|[27.0,0.0,0.0]|       0.0|       [

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.9417366946778711
Test Error = 0.05826330532212887


In [16]:
predictions = rfModel.transform(test)
predictions.show(10)
predictions = predictions.withColumnRenamed("labelIndex","label")
predictions.show(10)

+-----------------+---------------+----------+--------------+-------------+----------+
|         features|        species|labelIndex| rawPrediction|  probability|prediction|
+-----------------+---------------+----------+--------------+-------------+----------+
|[4.3,3.0,1.1,0.1]|    Iris-setosa|       0.0|[27.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.1,1.5,0.2]|    Iris-setosa|       0.0|[27.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.4,1.4,0.3]|    Iris-setosa|       0.0|[27.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.7,3.2,1.3,0.2]|    Iris-setosa|       0.0|[27.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.0,1.4,0.1]|    Iris-setosa|       0.0|[27.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.1,1.6,0.2]|    Iris-setosa|       0.0|[27.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.6,0.2]|    Iris-setosa|       0.0|[27.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.9,0.2]|    Iris-setosa|       0.0|[27.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.9,2.4,3.3,1.0]|Iris-versicolor|       1

In [17]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

preds_and_labels = predictions.select(['prediction','label']).withColumn('label', F.col('label').cast(FloatType())).orderBy('prediction')
preds_and_labels = preds_and_labels.select(['prediction','label'])
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())

[[23.  0.  0.]
 [ 0. 16.  2.]
 [ 0.  1.  9.]]


In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
evaluator = BinaryClassificationEvaluator()
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
#print('Area Under ROC', evaluator.evaluate(predictions))


# Get metrics
acc = evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: "f1"})
weightedPrecision = evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: "weightedPrecision"})
weightedRecall = evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: "weightedRecall"})
#auc = evaluator.evaluate(predictionAndTarget)

print('Precision', weightedPrecision)
print('Accuracy', acc)
print('F1-Score', f1)
print('Recall', weightedRecall)

Precision 0.9435881304393414
Accuracy 0.9411764705882353
F1-Score 0.9417366946778711
Recall 0.9411764705882353


In [19]:
spark.stop()