In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *


In [31]:
# Build the session in two steps: configure the application name, then
# obtain the session object that every later cell reads from.
session_builder = SparkSession.builder.appName("iris_clf")
spark = session_builder.getOrCreate()

In [32]:
# Read the iris CSV, treating the first row as the header and letting Spark
# infer the column types, then preview the first five rows.
csv_options = {'header': True, 'inferSchema': True}
df = spark.read.csv('iris.csv', **csv_options)
df.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [33]:
# Print the column names and inferred types of the loaded DataFrame.
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- variety: string (nullable = true)



In [34]:
# Explicit schema for iris.csv: four numeric measurements plus the class label.
# The label field is named "variety" so it matches the CSV header and the
# column name the StringIndexer cell reads; nullability is spelled out on
# every field instead of on the last one only.
schema = StructType([
    StructField("sepal_length", DoubleType(), True),
    StructField("sepal_width", DoubleType(), True),
    StructField("petal_length", DoubleType(), True),
    StructField("petal_width", DoubleType(), True),
    StructField("variety", StringType(), True),
])

In [35]:
# Re-read the CSV with the explicit schema instead of schema inference, and
# preview that DataFrame (df2) rather than the earlier inferred-schema df, so
# the effect of the explicit schema is actually inspected.
df2 = spark.read.csv('iris.csv', header=True, schema=schema)
df2.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [36]:
from pyspark.ml.feature import VectorAssembler

In [37]:
# Pack the four numeric measurement columns into a single vector column named
# 'features', which is the column the classifier is configured to read.
input_col = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
vectorizer = VectorAssembler(inputCols=input_col, outputCol='features')

# This cell rebinds df to its own transformed result, so a second run would
# find 'features' already present and fail. Dropping it first keeps the cell
# re-runnable; drop() is a no-op when the column does not exist.
df = vectorizer.transform(df.drop('features'))

df.show(5)

+------------+-----------+------------+-----------+-------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|         features|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| Setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| Setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| Setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| Setosa|[5.0,3.6,1.4,0.2]|
+------------+-----------+------------+-----------+-------+-----------------+
only showing top 5 rows



In [38]:
from pyspark.ml.feature import StringIndexer

In [39]:
# Encode the string class label 'variety' as a numeric index column.
indexer = StringIndexer(inputCol='variety', outputCol='indexed_type')

# This cell rebinds df to its own output, so remove any 'indexed_type' column
# left by a previous run before fitting; drop() is a no-op when it is absent.
df = df.drop('indexed_type')
df = indexer.fit(df).transform(df)
df.show(5)

+------------+-----------+------------+-----------+-------+-----------------+------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|         features|indexed_type|
+------------+-----------+------------+-----------+-------+-----------------+------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|[5.1,3.5,1.4,0.2]|         0.0|
|         4.9|        3.0|         1.4|        0.2| Setosa|[4.9,3.0,1.4,0.2]|         0.0|
|         4.7|        3.2|         1.3|        0.2| Setosa|[4.7,3.2,1.3,0.2]|         0.0|
|         4.6|        3.1|         1.5|        0.2| Setosa|[4.6,3.1,1.5,0.2]|         0.0|
|         5.0|        3.6|         1.4|        0.2| Setosa|[5.0,3.6,1.4,0.2]|         0.0|
+------------+-----------+------------+-----------+-------+-----------------+------------+
only showing top 5 rows



In [40]:
# Seeded 80/20 split of the prepared DataFrame into training and test parts,
# followed by a preview of the training part.
split_weights = [0.8, 0.2]
df_train, df_test = df.randomSplit(split_weights, seed=0)

df_train.show()

+------------+-----------+------------+-----------+----------+-----------------+------------+
|sepal_length|sepal_width|petal_length|petal_width|   variety|         features|indexed_type|
+------------+-----------+------------+-----------+----------+-----------------+------------+
|         4.3|        3.0|         1.1|        0.1|    Setosa|[4.3,3.0,1.1,0.1]|         0.0|
|         4.4|        2.9|         1.4|        0.2|    Setosa|[4.4,2.9,1.4,0.2]|         0.0|
|         4.4|        3.0|         1.3|        0.2|    Setosa|[4.4,3.0,1.3,0.2]|         0.0|
|         4.4|        3.2|         1.3|        0.2|    Setosa|[4.4,3.2,1.3,0.2]|         0.0|
|         4.5|        2.3|         1.3|        0.3|    Setosa|[4.5,2.3,1.3,0.3]|         0.0|
|         4.6|        3.1|         1.5|        0.2|    Setosa|[4.6,3.1,1.5,0.2]|         0.0|
|         4.6|        3.2|         1.4|        0.2|    Setosa|[4.6,3.2,1.4,0.2]|         0.0|
|         4.6|        3.4|         1.4|        0.3|    Setos

In [41]:
# Preview the test split produced by the randomSplit cell.
df_test.show()

+------------+-----------+------------+-----------+----------+-----------------+------------+
|sepal_length|sepal_width|petal_length|petal_width|   variety|         features|indexed_type|
+------------+-----------+------------+-----------+----------+-----------------+------------+
|         4.9|        3.1|         1.5|        0.2|    Setosa|[4.9,3.1,1.5,0.2]|         0.0|
|         4.9|        3.6|         1.4|        0.1|    Setosa|[4.9,3.6,1.4,0.1]|         0.0|
|         5.0|        3.4|         1.6|        0.4|    Setosa|[5.0,3.4,1.6,0.4]|         0.0|
|         5.0|        3.5|         1.3|        0.3|    Setosa|[5.0,3.5,1.3,0.3]|         0.0|
|         5.0|        3.5|         1.6|        0.6|    Setosa|[5.0,3.5,1.6,0.6]|         0.0|
|         5.1|        3.5|         1.4|        0.3|    Setosa|[5.1,3.5,1.4,0.3]|         0.0|
|         5.1|        3.7|         1.5|        0.4|    Setosa|[5.1,3.7,1.5,0.4]|         0.0|
|         5.1|        3.8|         1.6|        0.2|    Setos

In [42]:
from pyspark.ml.classification import RandomForestClassifier

In [43]:
# Random forest over the assembled 'features' vector and the indexed label.
# The seed is pinned, like the train/test split, so repeated runs build the
# same forest and report the same metrics.
rf_clf = RandomForestClassifier(
    featuresCol='features',
    labelCol='indexed_type',
    seed=0,
)

In [44]:
# Train on the training split.
# NOTE(review): this rebinds rf_clf from the estimator to the object returned
# by fit(); re-running this cell on its own would call fit() on that returned
# object instead of on the estimator. Consider a separate name for the result.
rf_clf = rf_clf.fit(df_train)

In [45]:
# Score the test split. Because df_test is rebound to its own scored output,
# prediction columns from an earlier run are dropped first so a re-run does
# not collide with them; drop() ignores names that are not present.
prediction_cols = ('rawPrediction', 'probability', 'prediction')
df_test = rf_clf.transform(df_test.drop(*prediction_cols))
df_test.show(5)

+------------+-----------+------------+-----------+-------+-----------------+------------+--------------+---------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|variety|         features|indexed_type| rawPrediction|    probability|prediction|
+------------+-----------+------------+-----------+-------+-----------------+------------+--------------+---------------+----------+
|         4.9|        3.1|         1.5|        0.2| Setosa|[4.9,3.1,1.5,0.2]|         0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|         4.9|        3.6|         1.4|        0.1| Setosa|[4.9,3.6,1.4,0.1]|         0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|         5.0|        3.4|         1.6|        0.4| Setosa|[5.0,3.4,1.6,0.4]|         0.0|[19.0,1.0,0.0]|[0.95,0.05,0.0]|       0.0|
|         5.0|        3.5|         1.3|        0.3| Setosa|[5.0,3.5,1.3,0.3]|         0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|         5.0|        3.5|         1.6|        0.6| Setosa|[5.0,3.5,1

In [46]:
# Show only the columns needed to compare the true label with the model's
# output: class name, class probabilities, indexed label and prediction.
comparison_cols = ['variety', 'probability', 'indexed_type', 'prediction']
df_test.select(*comparison_cols).show()

+----------+--------------------+------------+----------+
|   variety|         probability|indexed_type|prediction|
+----------+--------------------+------------+----------+
|    Setosa|       [1.0,0.0,0.0]|         0.0|       0.0|
|    Setosa|       [1.0,0.0,0.0]|         0.0|       0.0|
|    Setosa|     [0.95,0.05,0.0]|         0.0|       0.0|
|    Setosa|       [1.0,0.0,0.0]|         0.0|       0.0|
|    Setosa|       [0.7,0.3,0.0]|         0.0|       0.0|
|    Setosa|       [1.0,0.0,0.0]|         0.0|       0.0|
|    Setosa|     [0.95,0.05,0.0]|         0.0|       0.0|
|    Setosa|       [1.0,0.0,0.0]|         0.0|       0.0|
|    Setosa|       [1.0,0.0,0.0]|         0.0|       0.0|
|Versicolor|       [0.0,1.0,0.0]|         1.0|       1.0|
|Versicolor|       [0.0,1.0,0.0]|         1.0|       1.0|
| Virginica|       [0.0,0.1,0.9]|         2.0|       2.0|
|Versicolor|       [0.0,1.0,0.0]|         1.0|       1.0|
| Virginica|       [0.0,0.1,0.9]|         2.0|       2.0|
|    Setosa|  

In [47]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [48]:
# Ask for accuracy explicitly: when metricName is omitted the evaluator
# reports F1, which would not match the accuracy figure this notebook prints.
criterion = MulticlassClassificationEvaluator(
    labelCol='indexed_type',
    predictionCol='prediction',
    metricName='accuracy',
)

In [49]:
# Request the accuracy metric for this call via a param override, so the
# printed label is truthful whatever metricName the evaluator was built with
# (its default metric is F1). Also fixes the misspelled label.
acc = criterion.evaluate(df_test, {criterion.metricName: 'accuracy'})
print(f'Accuracy: {acc*100}%')

Accurcy: 100.0%
