##### Fernando Amaral
##### Pipelines

In [None]:
# Pipeline example

In [1]:
# findspark must run BEFORE pyspark is imported: it locates the Spark
# installation and adds its Python libraries to sys.path, which is what
# makes the pyspark imports below resolvable in the first place.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name "pipeline".
spark = SparkSession.builder.appName("pipeline").getOrCreate()

In [2]:
# Load the Iris dataset: the first line of the file provides the column
# names and the column types are inferred from the data.
iris = spark.read.csv("iris.csv", header=True, inferSchema=True, sep=",")

# 70% training / 30% test split. The fixed seed keeps the partition (and
# therefore every metric computed further down) repeatable between runs.
irisTreino, irisTeste = iris.randomSplit([0.7, 0.3], seed=42)

In [3]:
# The stages are only declared here; fitting and transforming are left
# to the pipeline later on.
# Spark ML estimators expect all features packed into one vector column.
from pyspark.ml.feature import VectorAssembler

vector = VectorAssembler(
    inputCols=[
        "sepallength",
        "sepalwidth",
        "petallength",
        "petalwidth",
    ],
    outputCol="independente",
)

In [4]:
# Encode the string column "class" as a numeric index column, which is
# used as the label by the classifier.
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(
    inputCol="class",
    outputCol="dependente",
)

In [5]:
# Model: a feed-forward neural network classifier.
from pyspark.ml.classification import MultilayerPerceptronClassifier

# layers = [inputs, hidden, hidden, outputs]: 4 input units (one per
# assembled feature), hidden layers of 5 and 4 units, and 3 output units
# (one per Iris class).
# The seed fixes the random weight initialisation so that training gives
# the same model on every run.
mlp = MultilayerPerceptronClassifier(
    maxIter=100,
    layers=[4, 5, 4, 3],
    featuresCol="independente",
    labelCol="dependente",
    seed=42,
)

In [6]:
# Build the pipeline.
from pyspark.ml import Pipeline

# Stages run in the order listed: assemble the feature vector, index the
# label, then train the classifier on the result.
pipeline = Pipeline(
    stages=[
        vector,
        indexer,
        mlp,
    ]
)

# Fitting on the training split returns the fitted pipeline model.
modelo = pipeline.fit(irisTreino)

In [7]:
# Run the held-out test split through every fitted stage of the pipeline;
# the result carries the model's output columns alongside the inputs.
previsao = modelo.transform(dataset=irisTeste)

In [8]:
# Inspect the features, the indexed label and the model outputs side by side.
previsao.select(
    "independente",
    "dependente",
    "rawprediction",
    "probability",
    "prediction",
).show()

+-----------------+----------+--------------------+--------------------+----------+
|     independente|dependente|       rawprediction|         probability|prediction|
+-----------------+----------+--------------------+--------------------+----------+
|[4.4,2.9,1.4,0.2]|       0.0|[51.6517777683934...|[0.99999999999994...|       0.0|
|[4.6,3.1,1.5,0.2]|       0.0|[51.6517777676660...|[0.99999999999994...|       0.0|
|[4.7,3.2,1.3,0.2]|       0.0|[51.6517777675897...|[0.99999999999994...|       0.0|
|[4.8,3.1,1.6,0.2]|       0.0|[51.6517777673324...|[0.99999999999994...|       0.0|
|[4.9,2.5,4.5,1.7]|       1.0|[-226.80564350423...|[7.75227398846502...|       1.0|
|[4.9,3.1,1.5,0.1]|       0.0|[51.6517777674654...|[0.99999999999994...|       0.0|
|[4.9,3.1,1.5,0.1]|       0.0|[51.6517777674654...|[0.99999999999994...|       0.0|
|[5.0,2.3,3.3,1.0]|       2.0|[-145.94387756418...|[3.89891160274124...|       2.0|
|[5.0,3.0,1.6,0.2]|       0.0|[51.6517777669313...|[0.99999999999994...|    

In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the predicted class against the indexed label on the test
# predictions and report the accuracy.
performance = MulticlassClassificationEvaluator(
    labelCol="dependente",
    predictionCol="prediction",
    metricName="accuracy",
)
acuracia = performance.evaluate(previsao)
print(acuracia)

0.9795918367346939
