##### Fernando Amaral
##### NaiveBayes

In [None]:
# Naive Bayes classification example: Iris dataset with PySpark ML

In [1]:
# findspark.init() locates the Spark installation and adds its Python
# libraries to sys.path, so it must run BEFORE anything from pyspark is
# imported; otherwise the import only works if pyspark is already installed
# on the path by other means.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() reuses an existing session if there is one; otherwise a new
# session is started with the application name "naive".
spark = SparkSession.builder.appName("naive").getOrCreate()

In [2]:
# Load the Iris CSV: the first row is the header and column types are
# inferred from the data rather than read as plain strings.
iris = spark.read.csv(
    "iris.csv",
    sep=",",
    header=True,
    inferSchema=True,
)
# Sanity check: row count and a peek at the first records.
print(iris.count())
iris.show(n=5)

150
+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [3]:
# Spark ML estimators take all predictors packed into a single vector column.
# RFormula builds that column ("independente") and a numeric label column
# ("dependente") from the string-valued "class" column.
from pyspark.ml.feature import RFormula
Rformula = RFormula(
    formula="class ~ .",  # "class" is the target; every other column is a feature
    featuresCol="independente",
    labelCol="dependente",
)
irisrf = Rformula.fit(iris).transform(iris)
irisrf.select("independente", "dependente").show(n=5)

+-----------------+----------+
|     independente|dependente|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]|       0.0|
|[4.9,3.0,1.4,0.2]|       0.0|
|[4.7,3.2,1.3,0.2]|       0.0|
|[4.6,3.1,1.5,0.2]|       0.0|
|[5.0,3.6,1.4,0.2]|       0.0|
+-----------------+----------+
only showing top 5 rows



In [4]:
# Train/test split: roughly 70% of the rows for training, 30% for testing.
# Without a seed, randomSplit draws a different partition on every run, which
# makes the counts below and the accuracy reported later non-repeatable.
# A fixed seed keeps Restart & Run All results stable for the same input.
irisTreino, irisTeste = irisrf.randomSplit([0.7, 0.3], seed=42)
print(irisTreino.count())
print(irisTeste.count())

110
40


In [5]:
# Fit a Naive Bayes classifier on the training split.
# NOTE(review): the "multinomial" variant is normally intended for count-like
# features, while these inputs are continuous measurements. If the installed
# Spark version supports modelType="gaussian", it is probably a better fit.
# TODO confirm against the Spark version in use.
from pyspark.ml.classification import NaiveBayes
naive = NaiveBayes(
    featuresCol="independente",
    labelCol="dependente",
    modelType="multinomial",
    smoothing=1.0,
)
modelo = naive.fit(irisTreino)

In [6]:
# Score the held-out rows with the fitted model, then compare the true label
# ("dependente") against the model's "prediction" column side by side.
previsao = modelo.transform(irisTeste)
previsao.select("dependente", "prediction").show(n=20)

+----------+----------+
|dependente|prediction|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       2.0|       1.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       2.0|       2.0|
|       0.0|       0.0|
|       2.0|       1.0|
+----------+----------+
only showing top 20 rows



In [8]:
# Measure accuracy on the test predictions by comparing the "prediction"
# column with the "dependente" label column.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
performance = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="dependente",
    predictionCol="prediction",
)
acuracia = performance.evaluate(previsao)
print(acuracia)

0.75
