##### Fernando Amaral
##### KMeans

In [None]:
# Kmenas cluster
# we will evaluate the cluster by comparing it to the class (class column)

In [1]:
import findspark, pyspark
from pyspark.sql import SparkSession
findspark.init()
spark = SparkSession.builder.appName("kmeans").getOrCreate()

In [2]:
iris = spark.read.csv("iris.csv", header=True, inferSchema=True, sep=",")
iris.show(5)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [3]:
from pyspark.ml.clustering import KMeans

In [4]:
# we need a column with the features vectorized
from pyspark.ml.feature import VectorAssembler
asb = VectorAssembler(inputCols=["sepallength","sepalwidth","petallength","petalwidth"], outputCol="features")
irisas = asb.transform(iris)
irisas.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+
|sepallength|sepalwidth|petallength|petalwidth|      class|         features|
+-----------+----------+-----------+----------+-----------+-----------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+-----------+----------+-----------+----------+-----------+-----------------+
only showing top 5 rows



In [5]:
# transform class to a index (number)
from pyspark.ml.feature import StringIndexer
ind = StringIndexer(inputCol="class",outputCol="classtrans")
irisas = ind.fit(irisas).transform(irisas)
irisas.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|         features|classtrans|
+-----------+----------+-----------+----------+-----------+-----------------+----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|       0.0|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|       0.0|
+-----------+----------+-----------+----------+-----------+-----------------+----------+
only showing top 5 rows



In [6]:
# change class type to integer to be the same type as the cluster
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType
irisas = irisas.withColumn("classtrans", irisas["classtrans"].cast(IntegerType()))
irisas.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|         features|classtrans|
+-----------+----------+-----------+----------+-----------+-----------------+----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|         0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|         0|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|         0|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|         0|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|         0|
+-----------+----------+-----------+----------+-----------+-----------------+----------+
only showing top 5 rows



In [7]:
# create and fit the cluster
km = KMeans(predictionCol="grupo", maxIter=100, k=3, featuresCol="features")
modelo = km.fit(irisas)

In [None]:
grupos = modelo.transform(irisas)

In [9]:
# show cluster, column grupo
grupos.show(150)

+-----------+----------+-----------+----------+---------------+-----------------+----------+-----+
|sepallength|sepalwidth|petallength|petalwidth|          class|         features|classtrans|grupo|
+-----------+----------+-----------+----------+---------------+-----------------+----------+-----+
|        5.1|       3.5|        1.4|       0.2|    Iris-setosa|[5.1,3.5,1.4,0.2]|         0|    1|
|        4.9|       3.0|        1.4|       0.2|    Iris-setosa|[4.9,3.0,1.4,0.2]|         0|    1|
|        4.7|       3.2|        1.3|       0.2|    Iris-setosa|[4.7,3.2,1.3,0.2]|         0|    1|
|        4.6|       3.1|        1.5|       0.2|    Iris-setosa|[4.6,3.1,1.5,0.2]|         0|    1|
|        5.0|       3.6|        1.4|       0.2|    Iris-setosa|[5.0,3.6,1.4,0.2]|         0|    1|
|        5.4|       3.9|        1.7|       0.4|    Iris-setosa|[5.4,3.9,1.7,0.4]|         0|    1|
|        4.6|       3.4|        1.4|       0.3|    Iris-setosa|[4.6,3.4,1.4,0.3]|         0|    1|
|        5

In [10]:
#confusion matriz
#higest matchs are fits
classe = grupos.select("classtrans").collect()
agrupado = grupos.select("grupo").collect()
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(classe,agrupado)
print(cm)

[[ 0 50  0]
 [47  0  3]
 [14  0 36]]


In [11]:
# manual evaluate the performance
acuracia = (cm[0,1]+cm[1,0]+cm[2,2]) / 150
print(acuracia)

0.8866666666666667
