### Spark for Machine Learning & AI
### 04 Classification

In [1]:
import findspark
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [2]:
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

### prepocessing

In [8]:
iris_df = spark.read.csv("./iris.data", inferSchema=True)
iris_df.take(1)

[Row(_c0=5.1, _c1=3.5, _c2=1.4, _c3=0.2, _c4='Iris-setosa')]

In [9]:
iris_df=iris_df.select(col("_c0").alias("sepal_length"),
                       col("_c1").alias("sepal_width"),
                       col("_c2").alias("petal_length"),
                       col("_c3").alias("petal_width"),
                       col("_c4").alias("species")
                      )
iris_df.take(1)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='Iris-setosa')]

In [10]:
vectorAssembler=VectorAssembler(inputCols=["sepal_length","sepal_width","petal_length","petal_width"],
                                outputCol="features")
viris_df=vectorAssembler.transform(iris_df)
viris_df.take(1)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([5.1, 3.5, 1.4, 0.2]))]

In [11]:
indexer=StringIndexer(inputCol="species",outputCol="label")
iviris_df=indexer.fit(viris_df).transform(viris_df)
iviris_df.show(1)

+------------+-----------+------------+-----------+-----------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|label|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
only showing top 1 row



### Naive Bayes Classification

In [12]:
iviris_df

DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string, features: vector, label: double]

In [13]:
iviris_df.take(1)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([5.1, 3.5, 1.4, 0.2]), label=0.0)]

In [15]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [17]:
splits=iviris_df.randomSplit([0.6,0.4],1)
train_df=splits[0]
test_df=splits[1]

In [18]:
train_df.count()

98

In [19]:
test_df.count()

52

In [20]:
iviris_df.count()

150

In [24]:
nb=NaiveBayes(modelType="multinomial")
nbmodel=nb.fit(train_df)
predictions_df=nbmodel.transform(test_df)
predictions_df.take(1)

[Row(sepal_length=4.3, sepal_width=3.0, petal_length=1.1, petal_width=0.1, species='Iris-setosa', features=DenseVector([4.3, 3.0, 1.1, 0.1]), label=0.0, rawPrediction=DenseVector([-9.9894, -11.3476, -11.902]), probability=DenseVector([0.7118, 0.183, 0.1051]), prediction=0.0)]

In [25]:
evaluator=MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accurcay")
nbaccuarcy=evaluator.evaluate(predictions_df)
nbaccuarcy

IllegalArgumentException: MulticlassClassificationEvaluator_985720786975 parameter metricName given invalid value accurcay.