# K-means

In [1]:
# Inicializando Spark
import findspark
findspark.init('/home/macaubas/spark-3.2.1-bin-hadoop3.2')
import pyspark

from pyspark.sql import SparkSession 

spark = SparkSession.builder.appName("clustering").getOrCreate()

22/05/26 17:07:55 WARN Utils: Your hostname, macaubas-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
22/05/26 17:07:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/26 17:07:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [30]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler

## Exemplo pyspark

In [2]:
dataset = spark.read.format('libsvm').load('sample_kmeans_data.txt')

dataset.show()

22/05/26 17:09:29 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
                                                                                

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [4]:
final_data = dataset.select('features')

final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



## Criando modelo

In [6]:
# 2 grupos com random seed 1
kmeans = KMeans().setK(2).setSeed(1)

In [7]:
# Fitting
model = kmeans.fit(final_data)

In [13]:
# Previsão
prev = model.transform(final_data)

## Avalliando

In [15]:
# Silhueta com distância euclidiana ao quadrado
evaluator = ClusteringEvaluator()
silhueta = evaluator.evaluate(prev)

print(silhueta)

0.9997530305375207


In [17]:
# Centro dos grupos
centers = model.clusterCenters()
centers

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

In [18]:
prev.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



## Reaplicando o exemplo para 3 grupos

In [19]:
# 2 grupos com random seed 1
kmeans = KMeans().setK(3).setSeed(1)

# Fitting
model = kmeans.fit(final_data)

# Previsão
prev = model.transform(final_data)

# Silhueta com distância euclidiana ao quadrado
evaluator = ClusteringEvaluator()
silhueta = evaluator.evaluate(prev)

print(silhueta)

# Centro dos grupos
centers = model.clusterCenters()
centers

0.6248737134600261


[array([9.1, 9.1, 9.1]), array([0.05, 0.05, 0.05]), array([0.2, 0.2, 0.2])]

In [20]:
prev.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



### A shilueta reduziu em decorrência de aumentar a quantidade de grupos

-----------------------

## Exemplo prático

Sabe-se que existem 3 grupos de sementes.

In [21]:
seeds = spark.read.csv('seeds_dataset.csv', header=True, inferSchema=True)

seeds.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [22]:
# Visualizando base de dados das sementes
seeds.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|             5.175|
|14.38|    14.21|     0.8951|             5.386|             3.312|   2.4619999999999997|             4.956|
|14.69|    14.49|  

In [23]:
# Estatística descritiva
seeds.describe().show()

+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|summary|              area|         perimeter|         compactness|   length_of_kernel|   width_of_kernel|asymmetry_coefficient|   length_of_groove|
+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|  count|               210|               210|                 210|                210|               210|                  210|                210|
|   mean|14.847523809523816|14.559285714285718|  0.8709985714285714|  5.628533333333335| 3.258604761904762|   3.7001999999999997|  5.408071428571429|
| stddev|2.9096994306873647|1.3059587265640225|0.023629416583846364|0.44306347772644983|0.3777144449065867|   1.5035589702547392|0.49148049910240543|
|    min|             10.59|             12.41|              0.8081|              4.899|            

In [24]:
for e in seeds.head(5):
    print(e)
    print('\n')

Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)


Row(area=14.88, perimeter=14.57, compactness=0.8811, length_of_kernel=5.553999999999999, width_of_kernel=3.333, asymmetry_coefficient=1.018, length_of_groove=4.956)


Row(area=14.29, perimeter=14.09, compactness=0.905, length_of_kernel=5.291, width_of_kernel=3.3369999999999997, asymmetry_coefficient=2.699, length_of_groove=4.825)


Row(area=13.84, perimeter=13.94, compactness=0.8955, length_of_kernel=5.324, width_of_kernel=3.3789999999999996, asymmetry_coefficient=2.259, length_of_groove=4.805)


Row(area=16.14, perimeter=14.99, compactness=0.9034, length_of_kernel=5.6579999999999995, width_of_kernel=3.562, asymmetry_coefficient=1.355, length_of_groove=5.175)




## Preparando os dados

In [26]:
seeds.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [29]:
# Criando assembler
assembler = VectorAssembler(inputCols=seeds.columns,
                           outputCol='features')

# Dados finais
seeds_final = assembler.transform(seeds)
seeds_final = seeds_final.select('features')

seeds_final.show()

+--------------------+
|            features|
+--------------------+
|[15.26,14.84,0.87...|
|[14.88,14.57,0.88...|
|[14.29,14.09,0.90...|
|[13.84,13.94,0.89...|
|[16.14,14.99,0.90...|
|[14.38,14.21,0.89...|
|[14.69,14.49,0.87...|
|[14.11,14.1,0.891...|
|[16.63,15.46,0.87...|
|[16.44,15.25,0.88...|
|[15.26,14.85,0.86...|
|[14.03,14.16,0.87...|
|[13.89,14.02,0.88...|
|[13.78,14.06,0.87...|
|[13.74,14.05,0.87...|
|[14.59,14.28,0.89...|
|[13.99,13.83,0.91...|
|[15.69,14.75,0.90...|
|[14.7,14.21,0.915...|
|[12.72,13.57,0.86...|
+--------------------+
only showing top 20 rows



## Normalizando os dados para mesma escala

Kmeans é sensível a escalas diferentes

In [31]:
scaler = StandardScaler(inputCol='features',
                       outputCol='scaledFeatures')

In [54]:
# Fitting
scaler_model = scaler.fit(seeds_final)

# Finalizando dados finais
final_data = scaler_model.transform(seeds_final)

final_data.show()


+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[15.26,14.84,0.87...|[5.24452795332028...|
|[14.88,14.57,0.88...|[5.11393027165175...|
|[14.29,14.09,0.90...|[4.91116018695588...|
|[13.84,13.94,0.89...|[4.75650503761158...|
|[16.14,14.99,0.90...|[5.54696468981581...|
|[14.38,14.21,0.89...|[4.94209121682475...|
|[14.69,14.49,0.87...|[5.04863143081749...|
|[14.11,14.1,0.891...|[4.84929812721816...|
|[16.63,15.46,0.87...|[5.71536696354628...|
|[16.44,15.25,0.88...|[5.65006812271202...|
|[15.26,14.85,0.86...|[5.24452795332028...|
|[14.03,14.16,0.87...|[4.82180387844584...|
|[13.89,14.02,0.88...|[4.77368894309428...|
|[13.78,14.06,0.87...|[4.73588435103234...|
|[13.74,14.05,0.87...|[4.72213722664617...|
|[14.59,14.28,0.89...|[5.01426361985209...|
|[13.99,13.83,0.91...|[4.80805675405968...|
|[15.69,14.75,0.90...|[5.39230954047151...|
|[14.7,14.21,0.915...|[5.05206821191403...|
|[12.72,13.57,0.86...|[4.3715855

In [55]:
final_data.head(1)

[Row(features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

## Criando o modelo

In [58]:
kmeans = KMeans(k = 3,featuresCol = 'scaledFeatures').setSeed(1)

In [59]:
model = kmeans.fit(final_data)

In [60]:
prev = model.transform(final_data)

In [61]:
# Silhueta com distância euclidiana ao quadrado
evaluator = ClusteringEvaluator()
silhueta = evaluator.evaluate(prev)

print(silhueta)

# Centro dos grupos
print('\nCentros dos clusters:')
centers = model.clusterCenters()
print(centers)

0.630000103338996

Centros dos clusters:
[array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
        1.80061978, 10.41913733]), array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
        3.15410901, 10.38031464]), array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
        2.41585013, 12.29286107])]


In [62]:
prev.show()

+--------------------+--------------------+----------+
|            features|      scaledFeatures|prediction|
+--------------------+--------------------+----------+
|[15.26,14.84,0.87...|[5.24452795332028...|         0|
|[14.88,14.57,0.88...|[5.11393027165175...|         0|
|[14.29,14.09,0.90...|[4.91116018695588...|         0|
|[13.84,13.94,0.89...|[4.75650503761158...|         0|
|[16.14,14.99,0.90...|[5.54696468981581...|         0|
|[14.38,14.21,0.89...|[4.94209121682475...|         0|
|[14.69,14.49,0.87...|[5.04863143081749...|         0|
|[14.11,14.1,0.891...|[4.84929812721816...|         0|
|[16.63,15.46,0.87...|[5.71536696354628...|         2|
|[16.44,15.25,0.88...|[5.65006812271202...|         0|
|[15.26,14.85,0.86...|[5.24452795332028...|         0|
|[14.03,14.16,0.87...|[4.82180387844584...|         0|
|[13.89,14.02,0.88...|[4.77368894309428...|         0|
|[13.78,14.06,0.87...|[4.73588435103234...|         0|
|[13.74,14.05,0.87...|[4.72213722664617...|         0|
|[14.59,14