## Atividade

Criar e aplicar um modelo de classificacao Naive Bayes para o arquivo *iris.csv*.

In [23]:
# Carregando libs e iniciando Session
from pyspark.sql import SparkSession, functions as func
from pyspark.sql.types import *
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import RFormula

spark = SparkSession.builder.getOrCreate()

In [24]:
# Importando dados
iris = spark.read.csv('../arquivos/iris.csv', inferSchema=True, header=True)
iris.show(3)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 3 rows



In [25]:
# Transformando os dados
form = RFormula(formula="class ~ .", featuresCol="features", labelCol="label", handleInvalid="skip")
iris_t = form.fit(iris).transform(iris)
iris_t.select('features','label').show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
+-----------------+-----+
only showing top 5 rows



In [26]:
# Separando a base em treino e teste
iris_treino, iris_teste = iris_t.randomSplit([0.7, 0.3])
print(iris_treino.count())
print(iris_teste.count())

105
45


In [27]:
# Verificando separacao dos labels (labels concentrados em teste ou treino podem ser um problema para o modelo)
iris_treino.groupBy('label').count().show()
iris_teste.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   32|
|  1.0|   40|
|  2.0|   33|
+-----+-----+

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   18|
|  1.0|   10|
|  2.0|   17|
+-----+-----+



In [28]:
# Criando o modelo
nbc = NaiveBayes(labelCol='label', featuresCol='features')
modelo = nbc.fit(iris_treino)

In [29]:
# Obtendo previsoes
previsao = modelo.transform(iris_teste)
previsao.select('prediction', 'label').show(5)

+----------+-----+
|prediction|label|
+----------+-----+
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
+----------+-----+
only showing top 5 rows



In [30]:
# Avaliando o modelo
avaliador = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
print(avaliador.evaluate(previsao))

0.6444444444444445
