##### Fernando Amaral
##### MultilayerPerceptronClassifier

In [None]:
# Multilayer Perceptron Example

In [1]:
# findspark.init() must run BEFORE pyspark is imported: it locates the Spark
# installation and adds pyspark to sys.path, so importing pyspark first would
# fail on machines where pyspark is not already importable.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start one named "mlp".
spark = SparkSession.builder.appName("mlp").getOrCreate()

In [2]:
# Load iris.csv into a DataFrame: the first row holds the column names,
# column types are inferred from the data, and fields are comma-separated.
csv_reader = spark.read
iris = csv_reader.csv("iris.csv", sep=",", header=True, inferSchema=True)
# Preview the first five rows.
iris.show(n=5)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [5]:
# Spark ML estimators read their inputs from a single vector column, so the
# four numeric measurement columns are packed into one vector column named
# "independente".
from pyspark.ml.feature import VectorAssembler

feature_cols = ["sepallength", "sepalwidth", "petallength", "petalwidth"]
asb = VectorAssembler(outputCol="independente", inputCols=feature_cols)
irisas = asb.transform(iris)
# Preview the first five rows, now including the assembled vector column.
irisas.show(n=5)

+-----------+----------+-----------+----------+-----------+-----------------+
|sepallength|sepalwidth|petallength|petalwidth|      class|     independente|
+-----------+----------+-----------+----------+-----------+-----------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+-----------+----------+-----------+----------+-----------+-----------------+
only showing top 5 rows



In [6]:
# Encode the string label column "class" as a numeric index column
# "dependente", which is what the classifier uses as its label.
from pyspark.ml.feature import StringIndexer
ind = StringIndexer(inputCol="class", outputCol="dependente")
# This cell reassigns `irisas`, so re-running it would otherwise fail because
# "dependente" already exists. DataFrame.drop is a no-op when the column is
# absent, so dropping it first makes the cell safe to run repeatedly.
irisas = irisas.drop("dependente")
irisas = ind.fit(irisas).transform(irisas)
irisas.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|     independente|dependente|
+-----------+----------+-----------+----------+-----------+-----------------+----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|       0.0|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|       0.0|
+-----------+----------+-----------+----------+-----------+-----------------+----------+
only showing top 5 rows



In [7]:
# Train/test split: roughly 70% of the rows for training, 30% for testing.
# A fixed seed keeps the split (and every metric computed from it) the same
# across kernel restarts; without it each run samples different rows.
irisTreino, irisTeste = irisas.randomSplit([0.7, 0.3], seed=42)
# Row counts of each partition (the split is random, so the proportions are
# only approximately 70/30).
print(irisTreino.count())
print(irisTeste.count())

98
52


In [9]:
# Fit the multilayer perceptron.
# `layers` lists the layer sizes from input to output: 4 inputs (one per
# assembled feature column), two hidden layers with 5 and 4 units, and
# 3 outputs (one per class -- assumes the label has three distinct values).
# maxIter is deliberately small here; a later cell refits with more iterations.
from pyspark.ml.classification import MultilayerPerceptronClassifier
mlp = MultilayerPerceptronClassifier(
    maxIter=10,
    layers=[4, 5, 4, 3],
    featuresCol="independente",
    labelCol="dependente",
    seed=42,  # fixed seed so training gives the same result on every run
)
modelo = mlp.fit(irisTreino)

In [10]:
# Score the held-out rows with the fitted model, then compare the true label
# index against the predicted one for the first five rows.
previsao = modelo.transform(irisTeste)
previsao.select(["dependente", "prediction"]).show(n=5)

+----------+----------+
|dependente|prediction|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       2.0|       2.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [14]:
# Measure accuracy of the predictions against the true label column.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

performance = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="dependente",
)
acuracia = performance.evaluate(previsao)
print(acuracia)

0.6153846153846154


In [15]:
# Print the hyperparameters recorded on the fitted model, one per line:
# maximum iterations, layer sizes, and step size.
print(
    modelo.getMaxIter(),
    modelo.getLayers(),
    modelo.getStepSize(),
    sep="\n",
)

10
[4, 5, 4, 3]
0.03


In [16]:
# Param map used to override maxIter for a single fit() call.
# The key is the *estimator's* Param (mlp.maxIter) because the map is passed
# to mlp.fit(); keying on the fitted model's Param would only work as long as
# the model and the estimator happen to share the same uid.
parunico = {mlp.maxIter: 1000}

In [17]:
# Refit on the same training data, applying the override map for this call
# only; `modelo` now refers to the newly fitted model.
modelo = mlp.fit(irisTreino, params=parunico)

In [18]:
# Print the refitted model's hyperparameters again to confirm the override:
# maximum iterations, layer sizes, and step size, one per line.
for getter in (modelo.getMaxIter, modelo.getLayers, modelo.getStepSize):
    print(getter())

1000
[4, 5, 4, 3]
0.03


In [19]:
# Score the held-out rows with the refitted model and show the true label
# index next to the prediction for the first five rows.
previsao = modelo.transform(irisTeste)
rotulo_vs_previsao = previsao.select("dependente", "prediction")
rotulo_vs_previsao.show(n=5)

+----------+----------+
|dependente|prediction|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       2.0|       2.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [20]:
# Accuracy of the refitted model's predictions on the held-out rows.
performance = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="dependente",
)
acuracia = performance.evaluate(previsao)
print(acuracia)

0.9423076923076923
