##### Fernando Amaral
##### CrossValidator

In [None]:
# Tuning + Pipeline with CrossValidator

In [1]:
# Locate the local Spark installation first: findspark.init() has to run
# before pyspark is imported, otherwise the import below can fail on machines
# where pyspark is not already on sys.path.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an existing session if one is already running in this kernel.
spark = SparkSession.builder.appName("tunningcross").getOrCreate()

In [2]:
# Load the iris CSV (header row present, column types inferred).
iris = spark.read.csv("iris.csv", header=True, inferSchema=True, sep=",")

# 70/30 train/test split. A fixed seed keeps the partition (and therefore the
# reported accuracy) reproducible across Restart & Run All.
irisTreino, irisTeste = iris.randomSplit([0.7, 0.3], seed=42)

In [3]:
from pyspark.ml.feature import VectorAssembler

# Pipeline stage 1: pack the four measurement columns into the single vector
# column "independente" that the classifier reads as its features.
vector = VectorAssembler(
    inputCols=[
        "sepallength",
        "sepalwidth",
        "petallength",
        "petalwidth",
    ],
    outputCol="independente",
)

In [4]:
from pyspark.ml.feature import StringIndexer

# Pipeline stage 2: encode the string column "class" as the numeric "label"
# column used by the classifier and the evaluator.
indexer = StringIndexer(
    inputCol="class",
    outputCol="label",
)

In [5]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Pipeline stage 3: multilayer perceptron.
# layers = [inputs, hidden..., outputs]: 4 input features, hidden layers of
# 5 and 4 units, 3 output classes. maxIter and layers are only starting
# values; the parameter grid overrides them during cross-validation.
# The explicit seed makes weight initialisation reproducible between runs.
mlp = MultilayerPerceptronClassifier(
    maxIter=100,
    layers=[4, 5, 4, 3],
    featuresCol="independente",
    labelCol="label",
    seed=42,
)

In [6]:
from pyspark.ml import Pipeline

# Chain the stages in execution order: assemble features, index the label,
# then train the classifier.
pipeline = Pipeline(
    stages=[
        vector,
        indexer,
        mlp,
    ]
)

In [9]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Metric used to rank the candidate models: plain accuracy on the "label" column.
performance = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")

# Hyper-parameters to search: 3 iteration budgets x 2 network topologies
# = 6 candidate configurations.
grid = (
    ParamGridBuilder()
    .addGrid(mlp.maxIter, [10, 100, 1000])
    .addGrid(mlp.layers, [[4, 5, 4, 3], [4, 4, 3]])
    .build()
)

# 3-fold cross-validation over the whole pipeline. The seed fixes the fold
# assignment so the selected model does not change from run to run.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=performance,
    numFolds=3,
    seed=42,
)

In [10]:
# Run the cross-validated grid search on the training split; the fitted
# result is used below to score the test split.
modelo = crossval.fit(irisTreino)

In [11]:
# Apply the fitted cross-validation model to the held-out test split.
previsao = modelo.transform(irisTeste)

# Preview the model's output columns (first rows only).
previsao.select(
    "rawprediction",
    "probability",
    "prediction",
).show()

+--------------------+--------------------+----------+
|       rawprediction|         probability|prediction|
+--------------------+--------------------+----------+
|[-65.012837093793...|[2.56470034230802...|       2.0|
|[-65.012837093792...|[2.56470034230925...|       2.0|
|[-65.012837093792...|[2.56470034230889...|       2.0|
|[-65.012837093792...|[2.56470034230889...|       2.0|
|[-65.012837093792...|[2.56470034230860...|       2.0|
|[-65.012837093792...|[2.56470034230860...|       2.0|
|[-65.012837093793...|[2.56470034230831...|       2.0|
|[-65.012837093792...|[2.56470034230947...|       2.0|
|[-65.012837093792...|[2.56470034230925...|       2.0|
|[-65.012837093792...|[2.56470034230925...|       2.0|
|[-65.012837093792...|[2.56470034230925...|       2.0|
|[-65.012837093792...|[2.56470034230925...|       2.0|
|[-65.012837093792...|[2.56470034230918...|       2.0|
|[-7.1440309466339...|[1.21456141799657...|       1.0|
|[-65.012837093792...|[2.56470034230925...|       2.0|
|[-65.0128

In [12]:
# Accuracy of the test-split predictions: compares the "prediction" column
# against the indexed "label" column.
performance = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
acuracia = performance.evaluate(previsao)
print(acuracia)

0.9782608695652174
