##### Fernando Amaral
##### BisectingKMeans

In [None]:
# hierarchical cluster

In [1]:
import findspark, pyspark
from pyspark.sql import SparkSession
findspark.init()
spark = SparkSession.builder.appName("hierarchical").getOrCreate()

In [2]:
iris = spark.read.csv("iris.csv", header=True, inferSchema=True, sep=",")

In [4]:
from pyspark.ml.clustering import BisectingKMeans

In [5]:
# we need a column with the features vectorized
from pyspark.ml.feature import VectorAssembler
asb = VectorAssembler(inputCols=["sepallength","sepalwidth","petallength","petalwidth"], outputCol="features")
irisas = asb.transform(iris)
irisas.show(5)

+-----------+----------+-----------+----------+-----------+-----------------+
|sepallength|sepalwidth|petallength|petalwidth|      class|         features|
+-----------+----------+-----------+----------+-----------+-----------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+-----------+----------+-----------+----------+-----------+-----------------+
only showing top 5 rows



In [6]:
# create and fit
bkm = BisectingKMeans(featuresCol="features", predictionCol="cluster", k=3)
model = bkm.fit(irisas)

In [9]:
clusters = model.transform(irisas)
clusters.show(150)

+-----------+----------+-----------+----------+---------------+-----------------+-------+
|sepallength|sepalwidth|petallength|petalwidth|          class|         features|cluster|
+-----------+----------+-----------+----------+---------------+-----------------+-------+
|        5.1|       3.5|        1.4|       0.2|    Iris-setosa|[5.1,3.5,1.4,0.2]|      0|
|        4.9|       3.0|        1.4|       0.2|    Iris-setosa|[4.9,3.0,1.4,0.2]|      0|
|        4.7|       3.2|        1.3|       0.2|    Iris-setosa|[4.7,3.2,1.3,0.2]|      0|
|        4.6|       3.1|        1.5|       0.2|    Iris-setosa|[4.6,3.1,1.5,0.2]|      0|
|        5.0|       3.6|        1.4|       0.2|    Iris-setosa|[5.0,3.6,1.4,0.2]|      0|
|        5.4|       3.9|        1.7|       0.4|    Iris-setosa|[5.4,3.9,1.7,0.4]|      0|
|        4.6|       3.4|        1.4|       0.3|    Iris-setosa|[4.6,3.4,1.4,0.3]|      0|
|        5.0|       3.4|        1.5|       0.2|    Iris-setosa|[5.0,3.4,1.5,0.2]|      0|
|        4

In [13]:
# evaluate cluster performance
from pyspark.ml.evaluation import ClusteringEvaluator
metrica = ClusteringEvaluator(predictionCol="cluster")
silhouette = metrica.evaluate(clusters)
print(silhouette)

0.7231544457999555
