##### Fernando Amaral
##### TrainValidationSplit

In [None]:
# Tuning + Pipeline with TrainValidationSplit

In [1]:
# findspark must run before any pyspark import: it locates the Spark
# installation and adds its Python libraries to sys.path, so importing
# pyspark first only works if it happens to be importable already.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an existing session if one is active, otherwise start a new one.
spark = SparkSession.builder.appName("tunningtrain").getOrCreate()

In [2]:
# Load the iris dataset; the header row supplies the column names and the
# column types are inferred from the data.
iris = spark.read.csv("iris.csv", header=True, inferSchema=True, sep=",")

# 70/30 train/test split. A fixed seed keeps the split (and therefore the
# downstream metrics) repeatable across Restart & Run All.
irisTreino, irisTeste = iris.randomSplit([0.7, 0.3], seed=42)

In [3]:
from pyspark.ml.feature import VectorAssembler

# Pack the four numeric measurements into one vector column, "independente".
vector = VectorAssembler(
    outputCol="independente",
    inputCols=[
        "sepallength",
        "sepalwidth",
        "petallength",
        "petalwidth",
    ],
)

In [4]:
from pyspark.ml.feature import StringIndexer

# Encode the string "class" column as a numeric "label" column.
indexer = StringIndexer(
    outputCol="label",
    inputCol="class",
)

In [5]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Base classifier: 4 inputs, hidden layers of 5 and 4 units, 3 outputs.
# maxIter and layers set here are overridden by any parameter grid that
# tunes them. The explicit seed fixes weight initialisation so training is
# repeatable; without it the default seed may differ between sessions.
mlp = MultilayerPerceptronClassifier(
    maxIter=100,
    layers=[4, 5, 4, 3],
    featuresCol="independente",
    labelCol="label",
    seed=42,
)

In [6]:
from pyspark.ml import Pipeline

# Chain the steps in order: assemble features -> index label -> classifier.
pipeline = Pipeline(
    stages=[
        vector,
        indexer,
        mlp,
    ]
)

In [7]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator comparing the "label" column with the predictions.
performance = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
)

# Hyperparameters to search: 3 iteration budgets x 2 layer layouts,
# i.e. 6 candidate configurations.
grid = (
    ParamGridBuilder()
    .addGrid(mlp.maxIter, [10, 100, 1000])
    .addGrid(mlp.layers, [[4, 5, 4, 3], [4, 4, 3]])
    .build()
)

# 80% of the data passed to fit() trains each candidate and the remaining
# 20% validates it. The seed fixes that internal split so the selected
# model does not change from run to run.
trainval = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=performance,
    trainRatio=0.8,
    seed=42,
)

In [8]:
# Run the hyperparameter search on the training split; the fitted result is
# kept in `modelo` for scoring.
modelo = trainval.fit(dataset=irisTreino)

In [9]:
# Score the held-out test split with the fitted model.
previsao = modelo.transform(irisTeste)

# Preview the raw scores, class probabilities and predicted label.
(
    previsao
    .select("rawprediction", "probability", "prediction")
    .show()
)

+--------------------+--------------------+----------+
|       rawprediction|         probability|prediction|
+--------------------+--------------------+----------+
|[-13.097934461929...|[1.53710143356577...|       1.0|
|[-13.097934461509...|[1.53710143477544...|       1.0|
|[-13.097934462052...|[1.53710143321350...|       1.0|
|[-13.097934460936...|[1.53710143642194...|       1.0|
|[60.1396548507122...|[1.0,3.4823515854...|       0.0|
|[-13.097934461819...|[1.53710143388331...|       1.0|
|[59.2305062203538...|[1.0,4.7310136727...|       0.0|
|[-13.097934461397...|[1.53710143509726...|       1.0|
|[-13.097934461921...|[1.53710143358949...|       1.0|
|[-13.097934461971...|[1.53710143344432...|       1.0|
|[-13.097934461996...|[1.53710143337254...|       1.0|
|[60.1436410398455...|[1.0,3.5129635671...|       0.0|
|[-13.097934461917...|[1.53710143360159...|       1.0|
|[-13.097934461942...|[1.53710143352872...|       1.0|
|[-13.097934461975...|[1.53710143343268...|       1.0|
|[48.61539

In [10]:
# Measure accuracy of the test-set predictions against the indexed labels.
performance = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
acuracia = performance.evaluate(previsao)
print(acuracia)

1.0
