In [1]:
from pyspark.sql import SparkSession
# Configure the application name, then start (or reuse) the Spark session for this notebook
session_builder = SparkSession.builder.appName('clustering')
spark = session_builder.getOrCreate()

## Import data

This is a real data set of wheat seed measurements from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/seeds
The examined group comprised kernels belonging to three different varieties of wheat: Kama, Rosa and Canadian, 70 elements each, randomly selected for the experiment. The data set can be used for the tasks of classification and cluster analysis.

In [2]:
# Read the seeds CSV: the first row supplies column names and column types are inferred
csv_reader = spark.read
dataset = csv_reader.csv(path='seeds_dataset.csv', inferSchema=True, header=True)

In [3]:
# Print the column names and the types inferred while reading the CSV
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [4]:
# Report the size of the dataset, then preview its first five rows
row_count = dataset.count()
print('No. of rows:', row_count)
dataset.show(n=5)

No. of rows: 210
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|            5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|           4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|           4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|           4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|           5.175|
+-----+---------+-----------+------------------+------------------+---------------------+----------------+
only showing top 5 rows

## Preprocess data and build a model

In [5]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

In [8]:
# Every column of the dataset is used as an input feature and packed
# into a single vector column named 'features'
feature_columns = dataset.columns
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [9]:
# Append the assembled 'features' vector column to the original columns
data = assembler.transform(dataset)
# Show the schema of the transformed DataFrame, including the new vector column
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [11]:
# Data scaling
from pyspark.ml.feature import StandardScaler

In [12]:
# Scale each feature to unit standard deviation without re-centering its mean to 0.
# withMean=False and withStd=True are StandardScaler's defaults, spelled out here.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=False,
    withStd=True,
)

In [13]:
# Remove any 'scaledFeatures' column left over from a previous run of this cell so the
# cell can be re-executed safely: without this, fitting fails because the scaler's
# output column already exists. DataFrame.drop is a no-op when the column is absent,
# so the first run behaves exactly as before.
unscaled = data.drop('scaledFeatures')
# Fit the scaler on the feature vectors, then append the 'scaledFeatures' column
data = scaler.fit(unscaled).transform(unscaled)

In [56]:
# Keep only the raw and scaled feature vectors, then preview the first five rows
vector_columns = ['features', 'scaledFeatures']
data = data.select(*vector_columns)
data.show(n=5)

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[15.26,14.84,0.87...|[5.24452795332028...|
|[14.88,14.57,0.88...|[5.11393027165175...|
|[14.29,14.09,0.90...|[4.91116018695588...|
|[13.84,13.94,0.89...|[4.75650503761158...|
|[16.14,14.99,0.90...|[5.54696468981581...|
+--------------------+--------------------+
only showing top 5 rows



In [57]:
# Cluster on the scaled feature vectors into k=3 groups (the data set contains three
# wheat varieties). The seed is pinned explicitly rather than left to the library
# default so that centroid initialisation, and therefore cluster numbering and sizes,
# is reproducible when the notebook is re-run.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=42)

In [58]:
# Fit the k-means estimator to `data`; the fitted model is used for all checks below
kmeans_model = kmeans.fit(data)

## Check the predictions

In [82]:
# Inspect the fitted model through its training summary
fitted_predictions = kmeans_model.summary.predictions
fitted_predictions.show(5)

print('-'*50)

# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in 3.0; the training
# summary reports the cost on the training data as `trainingCost`.
try:
    training_cost = kmeans_model.summary.trainingCost
except AttributeError:
    # Older Spark releases without `trainingCost` still provide computeCost
    training_cost = kmeans_model.computeCost(data)
print('Sum of squared distance to centroids:', training_cost)

print('-'*50)

# Iterate over however many centroids the model has instead of hard-coding 3,
# and fetch the list of centres once rather than on every iteration
for i, centroid in enumerate(kmeans_model.clusterCenters()):
    print('Locations of centroid no. %d:' % i, centroid)

print('-'*50)
print('Number of members in each cluster: ')
fitted_predictions.groupBy('prediction').count().orderBy('prediction').show()

+--------------------+--------------------+----------+
|            features|      scaledFeatures|prediction|
+--------------------+--------------------+----------+
|[15.26,14.84,0.87...|[5.24452795332028...|         1|
|[14.88,14.57,0.88...|[5.11393027165175...|         1|
|[14.29,14.09,0.90...|[4.91116018695588...|         1|
|[13.84,13.94,0.89...|[4.75650503761158...|         1|
|[16.14,14.99,0.90...|[5.54696468981581...|         1|
+--------------------+--------------------+----------+
only showing top 5 rows

--------------------------------------------------
Sum of squared distance to centroids: 428.96020346563427
--------------------------------------------------
Locations of centroid no. 0: [  4.06133795  10.13721767  35.82681204  11.81771972   7.5087187
   3.25852121  10.4215732 ]
Locations of centroid no. 1: [  4.89953844  10.91313605  37.26599347  12.37677589   8.5736587
   1.80144511  10.362193  ]
Locations of centroid no. 2: [  6.3407095   12.39263108  37.41143125  13.9289

In [84]:
# Alternatively, assign clusters with transform(). On the training data this gives the
# same result as the summary, but it is the route to use when clustering a new dataset.
clustered = kmeans_model.transform(data)
clustered.show(n=5)

print('-' * 50)
print('Number of members in each cluster: ')
cluster_sizes = clustered.groupBy('prediction').count()
cluster_sizes.orderBy('prediction').show()

+--------------------+--------------------+----------+
|            features|      scaledFeatures|prediction|
+--------------------+--------------------+----------+
|[15.26,14.84,0.87...|[5.24452795332028...|         1|
|[14.88,14.57,0.88...|[5.11393027165175...|         1|
|[14.29,14.09,0.90...|[4.91116018695588...|         1|
|[13.84,13.94,0.89...|[4.75650503761158...|         1|
|[16.14,14.99,0.90...|[5.54696468981581...|         1|
+--------------------+--------------------+----------+
only showing top 5 rows

--------------------------------------------------
Number of members in each cluster: 
+----------+-----+
|prediction|count|
+----------+-----+
|         0|   66|
|         1|   76|
|         2|   68|
+----------+-----+

